| version 1.1 | version 1.2 |
|---|---|
| | |
| | |
| void NAMD_die(const char *); | void NAMD_die(const char *); |
| | |
| texture<float2, 1, cudaReadModeElementType> vdwCoefTableTexture; | |
| | |
| #ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
| texture<float4, 1, cudaReadModeElementType> force_table; | |
| texture<float4, 1, cudaReadModeElementType> energy_table; | |
| #endif | |
| | |
| #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k | #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k |
| __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]; | __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]; |
| | |
| ... | ... |
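The organizing change in version 1.2 is visible right here: the file-scope texture references of 1.1 (and their `DISABLE_CUDA_TEXTURE_OBJECTS` fallback) disappear, and the kernels instead receive `cudaTextureObject_t` handles as ordinary arguments. A minimal sketch of the two styles, with hypothetical names (`myTexRef`, `myTex`, `kernelRef`, `kernelObj`), assuming nothing beyond the CUDA runtime:

```cuda
// Version 1.1 style: a file-scope texture reference, bound at runtime
// with cudaBindTexture*(). Kernels name the reference directly, so the
// binding is hidden global state that cannot be passed as a parameter.
texture<float2, 1, cudaReadModeElementType> myTexRef;

__global__ void kernelRef(int i, float2* out) {
  out[0] = tex1Dfetch(myTexRef, i);
}

// Version 1.2 style: a texture object is plain data. It is created once
// (cudaCreateTextureObject) and passed like any other argument, so there
// is no bind/unbind pairing and no #ifdef fallback to maintain.
__global__ void kernelObj(cudaTextureObject_t myTex, int i, float2* out) {
  out[0] = tex1Dfetch<float2>(myTex, i);
}
```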
| void calcForceEnergy(const float r2, const float qi, const float qj, | void calcForceEnergy(const float r2, const float qi, const float qj, |
| const float dx, const float dy, const float dz, | const float dx, const float dy, const float dz, |
| const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable, | const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | cudaTextureObject_t vdwCoefTableTex, |
| cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, | cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, |
| #endif | |
| float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow, | float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow, |
| float& energyVdw, float& energyElec, float& energySlow) { | float& energyVdw, float& energyElec, float& energySlow) { |
| | |
| ... | ... |
| #if __CUDA_ARCH__ >= 350 | #if __CUDA_ARCH__ >= 350 |
| float2 ljab = __ldg(&vdwCoefTable[vdwIndex]); | float2 ljab = __ldg(&vdwCoefTable[vdwIndex]); |
| #else | #else |
| float2 ljab = tex1Dfetch(vdwCoefTableTexture, vdwIndex); | float2 ljab = tex1Dfetch&lt;float2&gt;(vdwCoefTableTex, vdwIndex); |
| #endif | #endif |
| | |
| float rinv = rsqrtf(r2); | float rinv = rsqrtf(r2); |
| float4 ei; | float4 ei; |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| float4 fi = tex1D<float4>(forceTableTex, rinv); | float4 fi = tex1D<float4>(forceTableTex, rinv); |
| if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv); | if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv); |
| #else | |
| float4 fi = tex1D(force_table, rinv); | |
| if (doEnergy) ei = tex1D(energy_table, rinv); | |
| #endif | |
| | |
| float fSlow = qi * qj; | float fSlow = qi * qj; |
| float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x; | float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x; |
| ... | ... |
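On sm_35 and newer, both versions bypass textures entirely for the LJ coefficients: `__ldg()` routes a plain global load through the read-only (texture) cache, so the texture fetch above is only the pre-Kepler fallback. A self-contained sketch of the idiom:

```cuda
#include <cuda_runtime.h>

// __ldg() issues a global load through the read-only (texture) cache.
// It exists on sm_35+ only, hence the __CUDA_ARCH__ guard, and it needs
// none of the texture reference/object plumbing shown above.
__global__ void ldgDemo(const float2* __restrict__ table, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
#if __CUDA_ARCH__ >= 350
  float2 ab = __ldg(&table[i]);  // read-only cache path
#else
  float2 ab = table[i];          // plain load on older architectures
#endif
  out[i] = ab.x + ab.y;
}
```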
| const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls, | const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls, |
| const int* __restrict__ tileJatomStart, | const int* __restrict__ tileJatomStart, |
| const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes, | const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes, |
| const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
| const float4* __restrict__ xyzq, const float cutoff2, | const float4* __restrict__ xyzq, const float cutoff2, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | cudaTextureObject_t vdwCoefTableTex, |
| cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, | cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, |
| #endif | |
| // ---------- | // ---------- |
| // doPairlist | // doPairlist |
| const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs, | const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs, |
| ... | ... |
| patchInd = tmp.patchInd; | patchInd = tmp.patchInd; |
| patchNumList = tmp.patchNumList; | patchNumList = tmp.patchNumList; |
| | |
| float shx = tmp.offsetXYZ.x*latticeX; | float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x; |
| float shy = tmp.offsetXYZ.y*latticeY; | float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y; |
| float shz = tmp.offsetXYZ.z*latticeZ; | float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z; |
| | |
| int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize; | int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize; |
| if (doPairlist) { | if (doPairlist) { |
| ... | ... |
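The `latticeX/latticeY/latticeZ` to `lata/latb/latc` change generalizes the periodic-image shift from an orthorhombic box to an arbitrary triclinic cell: the image offset now multiplies the full matrix of lattice vectors, `sh = offset.x*a + offset.y*b + offset.z*c`, which is exactly what the `shx/shy/shz` lines spell out per component. A standalone sketch of that transform (hypothetical helper name `imageShift`):

```cuda
#include <cuda_runtime.h>

// Periodic-image shift for a triclinic cell: a linear combination of the
// three lattice vectors weighted by the image offset. For an orthorhombic
// box, lata = (Lx,0,0), latb = (0,Ly,0), latc = (0,0,Lz), and this
// collapses to the scalar latticeX/Y/Z arithmetic of version 1.1.
__host__ __device__ inline float3 imageShift(float3 offset,
    float3 lata, float3 latb, float3 latc) {
  float3 sh;
  sh.x = offset.x*lata.x + offset.y*latb.x + offset.z*latc.x;
  sh.y = offset.x*lata.y + offset.y*latb.y + offset.z*latc.y;
  sh.z = offset.x*lata.z + offset.y*latb.z + offset.z*latc.z;
  return sh;
}
```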
| calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, |
| vdwtypei, vdwtypej, | vdwtypei, vdwtypej, |
| vdwCoefTable, | vdwCoefTable, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | vdwCoefTableTex, forceTableTex, energyTableTex, |
| forceTableTex, energyTableTex, | |
| #endif | |
| iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); |
| } | } |
| } | } |
| ... | ... |
| } // t | } // t |
| } else { | } else { |
| // Just compute forces | // Just compute forces |
| /* | |
| if (self) { | |
| shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); | |
| excl >>= 1; | |
| } | |
| for (;t < WARPSIZE;t++) { | |
| int j = (t + wid) & (WARPSIZE-1); | |
| | |
| float dx = __shfl(xyzq_j.x,j) - xyzq_i.x; | |
| float dy = __shfl(xyzq_j.y,j) - xyzq_i.y; | |
| float dz = __shfl(xyzq_j.z,j) - xyzq_i.z; | |
| | |
| float r2 = dx*dx + dy*dy + dz*dz; | |
| | |
| if ((excl & 1) && r2 < cutoff2) { | |
| calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | |
| vdwtypei, vdwtypej, vdwCoefTable, | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| forceTableTex, energyTableTex, | |
| #endif | |
| iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | |
| } // ((excl & 1) && r2 < cutoff2) | |
| excl >>= 1; | |
| shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); | |
| shuffleNext<doSlow>(jforce, jforceSlow); | |
| } // t | |
| */ | |
| if (self) { | if (self) { |
| excl >>= 1; | excl >>= 1; |
| xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1)); | xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1)); |
| ... | ... |
| calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, |
| vdwtypei, vdwtypej, | vdwtypei, vdwtypej, |
| vdwCoefTable, | vdwCoefTable, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | vdwCoefTableTex, forceTableTex, energyTableTex, |
| forceTableTex, energyTableTex, | |
| #endif | |
| iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); |
| } // (r2 < cutoff2) | } // (r2 < cutoff2) |
| } // (excl & 1) | } // (excl & 1) |
| ... | ... |
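Both the deleted comment block and the code that survives use the same warp-level pattern: each lane holds one j atom, and `__shfl` rotates those values around the warp so every i sees every j with no shared memory. A minimal sketch of that rotation (using the pre-CUDA-9 `__shfl` this code is written against; CUDA 9 renames it `__shfl_sync` with an explicit lane mask):

```cuda
#define WARPSIZE 32

// Each lane owns one j value; rotating it around the warp WARPSIZE times
// lets every lane "meet" every j without shared memory. This is the
// pattern behind both the commented-out t-loop and the surviving
// self-tile shuffle above.
__global__ void warpAllPairs(const float* __restrict__ xj, float* acc) {
  float myJ = xj[threadIdx.x & (WARPSIZE-1)];
  float sum = 0.f;
  for (int t = 0; t < WARPSIZE; t++) {
    sum += myJ;                                         // interact with current j
    myJ = __shfl(myJ, (threadIdx.x+1) & (WARPSIZE-1));  // rotate j by one lane
  }
  acc[threadIdx.x] = sum;
}
```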
| // ############################################################################################## | // ############################################################################################## |
| // ############################################################################################## | // ############################################################################################## |
| | |
| CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, bool doStreaming) : deviceID(deviceID), doStreaming(doStreaming) { | CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables, |
| | bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) { |
| | |
| cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| | |
| overflowExclusions = NULL; | overflowExclusions = NULL; |
| ... | ... |
| vdwTypes = NULL; | vdwTypes = NULL; |
| vdwTypesSize = 0; | vdwTypesSize = 0; |
| | |
| vdwCoefTable = NULL; | |
| vdwCoefTableWidth = 0; | |
| vdwCoefTableSize = 0; | |
| | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| forceTableTexActive = false; | |
| energyTableTexActive = false; | |
| #endif | |
| | |
| vdwCoefTableTextureBound = false; | |
| #ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
| force_table_bound = false; | |
| energy_table_bound = false; | |
| #endif | |
| | |
| patchNumCount = NULL; | patchNumCount = NULL; |
| patchNumCountSize = 0; | patchNumCountSize = 0; |
| | |
| ... | ... |
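Version 1.2 also moves table ownership out of this class: the vdW coefficient table and the force/energy texture objects now live in a shared `CudaNonbondedTables` instance held by reference, so one device-wide copy serves every kernel object and all of the bind/unbind bookkeeping below disappears. A rough sketch of the ownership pattern (the member list is an assumption, not the actual class body):

```cuda
class CudaNonbondedTables;  // owns vdwCoefTable + force/energy texture objects

class CudaComputeNonbondedKernel {
public:
  CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& tables,
                             bool doStreaming)
    : deviceID(deviceID), cudaNonbondedTables(tables), doStreaming(doStreaming) {}
private:
  int deviceID;
  CudaNonbondedTables& cudaNonbondedTables;  // shared, not owned
  bool doStreaming;
};
```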
| if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff); | if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff); |
| if (atomIndex != NULL) deallocate_device<int>(&atomIndex); | if (atomIndex != NULL) deallocate_device<int>(&atomIndex); |
| if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes); | if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes); |
| if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable); | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| if (forceTableTexActive) cudaCheck(cudaDestroyTextureObject(forceTableTex)); | |
| if (energyTableTexActive) cudaCheck(cudaDestroyTextureObject(energyTableTex)); | |
| #else | |
| if (force_table_bound) cudaCheck(cudaUnbindTexture(force_table)); | |
| if (energy_table_bound) cudaCheck(cudaUnbindTexture(energy_table)); | |
| #endif | |
| if (vdwCoefTableTextureBound) cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); | |
| if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount); | if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount); |
| if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue); | if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue); |
| } | } |
| ... | ... |
| void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel, | void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel, |
| const int atomStorageSize, const bool doPairlist, | const int atomStorageSize, const bool doPairlist, |
| const bool doEnergy, const bool doVirial, const bool doSlow, | const bool doEnergy, const bool doVirial, const bool doSlow, |
| const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
| const float4* h_xyzq, const float cutoff2, | const float4* h_xyzq, const float cutoff2, |
| float4* d_forces, float4* d_forcesSlow, | float4* d_forces, float4* d_forcesSlow, |
| float4* h_forces, float4* h_forcesSlow, | float4* h_forces, float4* h_forcesSlow, |
| ... | ... |
| int nleft = tlKernel.getNumTileLists() - start; | int nleft = tlKernel.getNumTileLists() - start; |
| int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1); | int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1); |
| | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ | #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ |
| nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ | nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ |
| <<< nblock, nthread, shMemSize, stream >>> \ | <<< nblock, nthread, shMemSize, stream >>> \ |
| (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ | (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ |
| vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \ | cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \ |
| forceTableTex, energyTableTex, \ | vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \ |
| | cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \ |
| atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ | atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ |
| tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ | tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ |
| tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ | tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ |
| numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ | numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ |
| outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true | outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true |
| #else // DISABLE_CUDA_TEXTURE_OBJECTS | |
| #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ | |
| nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ | |
| <<< nblock, nthread, shMemSize, stream >>> \ | |
| (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ | |
| vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \ | |
| atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ | |
| tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ | |
| tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ | |
| numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ | |
| outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true | |
| #endif // DISABLE_CUDA_TEXTURE_OBJECTS | |
| | |
| bool called = false; | bool called = false; |
| | |
| ... | ... |
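The `CALL` macro is presumably expanded once per combination of the boolean template parameters, letting the compiler specialize the kernel and strip the untaken branches, with `called` catching a combination nobody expanded. A reduced sketch of that dispatch idiom (two flags instead of five; names hypothetical):

```cuda
#include <cstdio>

template <bool DOENERGY, bool DOSLOW>
__global__ void forceKernel(float* out) {
  // Branches on template booleans are resolved at compile time, so each
  // instantiation contains only the code it actually needs.
  if (DOENERGY) out[0] += 1.f;
  if (DOSLOW)   out[1] += 1.f;
}

#define CALL(E, S) forceKernel<E, S><<<1, 32>>>(d_out); called = true

void dispatch(bool doEnergy, bool doSlow, float* d_out) {
  bool called = false;
  if (!doEnergy && !doSlow) { CALL(false, false); }
  if (!doEnergy &&  doSlow) { CALL(false, true);  }
  if ( doEnergy && !doSlow) { CALL(true,  false); }
  if ( doEnergy &&  doSlow) { CALL(true,  true);  }
  if (!called) fprintf(stderr, "dispatch: no kernel instance expanded\n");
}
#undef CALL
```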
| reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions); | reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions); |
| copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions); | copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions); |
| } | } |
| | |
| void CudaComputeNonbondedKernel::bindVdwCoefTable(float2* h_vdwCoefTable, int vdwCoefTableWidthIn) { | |
| vdwCoefTableWidth = vdwCoefTableWidthIn; | |
| | |
| int vdwCoefTableArea = vdwCoefTableWidth*vdwCoefTableWidth; | |
| | |
| reallocate_device<float2>(&vdwCoefTable, &vdwCoefTableSize, vdwCoefTableArea); | |
| copy_HtoD_sync<float2>(h_vdwCoefTable, vdwCoefTable, vdwCoefTableArea); | |
| | |
| if (vdwCoefTableTextureBound) { | |
| cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); | |
| vdwCoefTableTextureBound = false; | |
| } | |
| | |
| vdwCoefTableTexture.normalized = false; | |
| vdwCoefTableTexture.addressMode[0] = cudaAddressModeClamp; | |
| vdwCoefTableTexture.filterMode = cudaFilterModePoint; | |
| cudaCheck(cudaBindTexture((size_t*)0, vdwCoefTableTexture, vdwCoefTable, vdwCoefTableArea*sizeof(float2))); | |
| vdwCoefTableTextureBound = true; | |
| } | |
| | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| void CudaComputeNonbondedKernel::bindTextureObject(int tableSize, float4* h_table, cudaArray_t& array, cudaTextureObject_t& tableTex) { | |
| cudaChannelFormatDesc desc; | |
| desc.x = sizeof(float)*8; | |
| desc.y = sizeof(float)*8; | |
| desc.z = sizeof(float)*8; | |
| desc.w = sizeof(float)*8; | |
| desc.f = cudaChannelFormatKindFloat; | |
| cudaCheck(cudaMallocArray(&array, &desc, tableSize, 1)); | |
| cudaCheck(cudaMemcpyToArray(array, 0, 0, h_table, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
| | |
| cudaResourceDesc resDesc; | |
| memset(&resDesc, 0, sizeof(resDesc)); | |
| resDesc.resType = cudaResourceTypeArray; | |
| resDesc.res.array.array = array; | |
| | |
| cudaTextureDesc texDesc; | |
| memset(&texDesc, 0, sizeof(texDesc)); | |
| texDesc.addressMode[0] = cudaAddressModeClamp; | |
| texDesc.filterMode = cudaFilterModeLinear; | |
| texDesc.normalizedCoords = 1; | |
| | |
| cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL)); | |
| } | |
| #endif | |
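The hand-filled descriptor above (four 32-bit float channels, `cudaChannelFormatKindFloat`) is exactly what the runtime helper returns, and the `normalizedCoords = 1` plus linear filtering settings are what let the kernel sample with `tex1D<float4>(forceTableTex, rinv)` and get hardware interpolation over [0, 1]. The equivalent descriptor, for reference:

```cuda
#include <cuda_runtime.h>

// Same format as the field-by-field desc above: float4, 32 bits/channel.
cudaChannelFormatDesc makeTableDesc() {
  return cudaCreateChannelDesc<float4>();
}
```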
| | |
| void CudaComputeNonbondedKernel::bindForceAndEnergyTable(int tableSize, | |
| float4* h_forceTable, | |
| float4* h_energyTable) { | |
| | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| | |
| if (forceTableTexActive) { | |
| cudaCheck(cudaFreeArray(forceArray)); | |
| cudaCheck(cudaDestroyTextureObject(forceTableTex)); | |
| } | |
| | |
| if (energyTableTexActive) { | |
| cudaCheck(cudaFreeArray(energyArray)); | |
| cudaCheck(cudaDestroyTextureObject(energyTableTex)); | |
| } | |
| forceTableTex = 0; | |
| energyTableTex = 0; | |
| | |
| bindTextureObject(tableSize, h_forceTable, forceArray, forceTableTex); | |
| bindTextureObject(tableSize, h_energyTable, energyArray, energyTableTex); | |
| forceTableTexActive = true; | |
| energyTableTexActive = true; | |
| | |
| #else // DISABLE_CUDA_TEXTURE_OBJECTS | |
| | |
| if (force_table_bound) { | |
| cudaCheck(cudaFreeArray(forceArray)); | |
| } | |
| cudaCheck(cudaMallocArray(&forceArray, &force_table.channelDesc, tableSize, 1)); | |
| | |
| if (energy_table_bound) { | |
| cudaCheck(cudaFreeArray(energyArray)); | |
| } | |
| cudaCheck(cudaMallocArray(&energyArray, &energy_table.channelDesc, tableSize, 1)); | |
| | |
| cudaCheck(cudaMemcpyToArray(forceArray, 0, 0, h_forceTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
| cudaCheck(cudaMemcpyToArray(energyArray, 0, 0, h_energyTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
| | |
| force_table.normalized = true; | |
| force_table.addressMode[0] = cudaAddressModeClamp; | |
| force_table.addressMode[1] = cudaAddressModeClamp; | |
| force_table.filterMode = cudaFilterModeLinear; | |
| | |
| energy_table.normalized = true; | |
| energy_table.addressMode[0] = cudaAddressModeClamp; | |
| energy_table.addressMode[1] = cudaAddressModeClamp; | |
| energy_table.filterMode = cudaFilterModeLinear; | |
| | |
| cudaCheck(cudaBindTextureToArray(force_table, forceArray)); | |
| cudaCheck(cudaBindTextureToArray(energy_table, energyArray)); | |
| | |
| force_table_bound = true; | |
| energy_table_bound = true; | |
| #endif // DISABLE_CUDA_TEXTURE_OBJECTS | |
| } | |