version 1.1 | version 1.2 |
---|
| |
| |
void NAMD_die(const char *); | void NAMD_die(const char *); |
| |
texture<float2, 1, cudaReadModeElementType> vdwCoefTableTexture; | |
| |
#ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
texture<float4, 1, cudaReadModeElementType> force_table; | |
texture<float4, 1, cudaReadModeElementType> energy_table; | |
#endif | |
| |
#define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k | #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k |
__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]; | __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]; |
| |
| |
void calcForceEnergy(const float r2, const float qi, const float qj, | void calcForceEnergy(const float r2, const float qi, const float qj, |
const float dx, const float dy, const float dz, | const float dx, const float dy, const float dz, |
const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable, | const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable, |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | cudaTextureObject_t vdwCoefTableTex, |
cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, | cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, |
#endif | |
float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow, | float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow, |
float& energyVdw, float& energyElec, float& energySlow) { | float& energyVdw, float& energyElec, float& energySlow) { |
| |
| |
#if __CUDA_ARCH__ >= 350 | #if __CUDA_ARCH__ >= 350 |
float2 ljab = __ldg(&vdwCoefTable[vdwIndex]); | float2 ljab = __ldg(&vdwCoefTable[vdwIndex]); |
#else | #else |
float2 ljab = tex1Dfetch(vdwCoefTableTexture, vdwIndex); | float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex); |
#endif | #endif |
| |
float rinv = rsqrtf(r2); | float rinv = rsqrtf(r2); |
float4 ei; | float4 ei; |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
float4 fi = tex1D<float4>(forceTableTex, rinv); | float4 fi = tex1D<float4>(forceTableTex, rinv); |
if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv); | if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv); |
#else | |
float4 fi = tex1D(force_table, rinv); | |
if (doEnergy) ei = tex1D(energy_table, rinv); | |
#endif | |
| |
float fSlow = qi * qj; | float fSlow = qi * qj; |
float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x; | float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x; |
| |
const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls, | const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls, |
const int* __restrict__ tileJatomStart, | const int* __restrict__ tileJatomStart, |
const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes, | const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes, |
const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
const float4* __restrict__ xyzq, const float cutoff2, | const float4* __restrict__ xyzq, const float cutoff2, |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | cudaTextureObject_t vdwCoefTableTex, |
cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, | cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, |
#endif | |
// ---------- | // ---------- |
// doPairlist | // doPairlist |
const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs, | const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs, |
| |
patchInd = tmp.patchInd; | patchInd = tmp.patchInd; |
patchNumList = tmp.patchNumList; | patchNumList = tmp.patchNumList; |
| |
float shx = tmp.offsetXYZ.x*latticeX; | float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x; |
float shy = tmp.offsetXYZ.y*latticeY; | float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y; |
float shz = tmp.offsetXYZ.z*latticeZ; | float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z; |
| |
int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize; | int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize; |
if (doPairlist) { | if (doPairlist) { |
| |
calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, |
vdwtypei, vdwtypej, | vdwtypei, vdwtypej, |
vdwCoefTable, | vdwCoefTable, |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | vdwCoefTableTex, forceTableTex, energyTableTex, |
forceTableTex, energyTableTex, | |
#endif | |
iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); |
} | } |
} | } |
| |
} // t | } // t |
} else { | } else { |
// Just compute forces | // Just compute forces |
/* | |
if (self) { | |
shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); | |
excl >>= 1; | |
} | |
for (;t < WARPSIZE;t++) { | |
int j = (t + wid) & (WARPSIZE-1); | |
| |
float dx = __shfl(xyzq_j.x,j) - xyzq_i.x; | |
float dy = __shfl(xyzq_j.y,j) - xyzq_i.y; | |
float dz = __shfl(xyzq_j.z,j) - xyzq_i.z; | |
| |
float r2 = dx*dx + dy*dy + dz*dz; | |
| |
if ((excl & 1) && r2 < cutoff2) { | |
calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | |
vdwtypei, vdwtypej, vdwCoefTable, | |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
forceTableTex, energyTableTex, | |
#endif | |
iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | |
} // ((excl & 1) && r2 < cutoff2) | |
excl >>= 1; | |
shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); | |
shuffleNext<doSlow>(jforce, jforceSlow); | |
} // t | |
*/ | |
if (self) { | if (self) { |
excl >>= 1; | excl >>= 1; |
xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1)); | xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1)); |
| |
calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, | calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, |
vdwtypei, vdwtypej, | vdwtypei, vdwtypej, |
vdwCoefTable, | vdwCoefTable, |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | vdwCoefTableTex, forceTableTex, energyTableTex, |
forceTableTex, energyTableTex, | |
#endif | |
iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); | iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); |
} // (r2 < cutoff2) | } // (r2 < cutoff2) |
} // (excl & 1) | } // (excl & 1) |
| |
// ############################################################################################## | // ############################################################################################## |
// ############################################################################################## | // ############################################################################################## |
| |
CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, bool doStreaming) : deviceID(deviceID), doStreaming(doStreaming) { | CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables, |
| bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) { |
| |
cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| |
overflowExclusions = NULL; | overflowExclusions = NULL; |
| |
vdwTypes = NULL; | vdwTypes = NULL; |
vdwTypesSize = 0; | vdwTypesSize = 0; |
| |
vdwCoefTable = NULL; | |
vdwCoefTableWidth = 0; | |
vdwCoefTableSize = 0; | |
| |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
forceTableTexActive = false; | |
energyTableTexActive = false; | |
#endif | |
| |
vdwCoefTableTextureBound = false; | |
#ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
force_table_bound = false; | |
energy_table_bound = false; | |
#endif | |
| |
patchNumCount = NULL; | patchNumCount = NULL; |
patchNumCountSize = 0; | patchNumCountSize = 0; |
| |
| |
if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff); | if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff); |
if (atomIndex != NULL) deallocate_device<int>(&atomIndex); | if (atomIndex != NULL) deallocate_device<int>(&atomIndex); |
if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes); | if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes); |
if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable); | |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
if (forceTableTexActive) cudaCheck(cudaDestroyTextureObject(forceTableTex)); | |
if (energyTableTexActive) cudaCheck(cudaDestroyTextureObject(energyTableTex)); | |
#else | |
if (force_table_bound) cudaCheck(cudaUnbindTexture(force_table)); | |
if (energy_table_bound) cudaCheck(cudaUnbindTexture(energy_table)); | |
#endif | |
if (vdwCoefTableTextureBound) cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); | |
if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount); | if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount); |
if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue); | if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue); |
} | } |
| |
void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel, | void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel, |
const int atomStorageSize, const bool doPairlist, | const int atomStorageSize, const bool doPairlist, |
const bool doEnergy, const bool doVirial, const bool doSlow, | const bool doEnergy, const bool doVirial, const bool doSlow, |
const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
const float4* h_xyzq, const float cutoff2, | const float4* h_xyzq, const float cutoff2, |
float4* d_forces, float4* d_forcesSlow, | float4* d_forces, float4* d_forcesSlow, |
float4* h_forces, float4* h_forcesSlow, | float4* h_forces, float4* h_forcesSlow, |
| |
int nleft = tlKernel.getNumTileLists() - start; | int nleft = tlKernel.getNumTileLists() - start; |
int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1); | int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1); |
| |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ | #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ |
nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ | nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ |
<<< nblock, nthread, shMemSize, stream >>> \ | <<< nblock, nthread, shMemSize, stream >>> \ |
(start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ | (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ |
vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \ | cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \ |
forceTableTex, energyTableTex, \ | vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \ |
| cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \ |
atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ | atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ |
tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ | tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ |
tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ | tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ |
numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ | numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ |
outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true | outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true |
#else // DISABLE_CUDA_TEXTURE_OBJECTS | |
#define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ | |
nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ | |
<<< nblock, nthread, shMemSize, stream >>> \ | |
(start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ | |
vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \ | |
atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ | |
tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ | |
tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ | |
numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ | |
outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true | |
#endif // DISABLE_CUDA_TEXTURE_OBJECTS | |
| |
bool called = false; | bool called = false; |
| |
| |
reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions); | reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions); |
copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions); | copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions); |
} | } |
| |
void CudaComputeNonbondedKernel::bindVdwCoefTable(float2* h_vdwCoefTable, int vdwCoefTableWidthIn) { | |
vdwCoefTableWidth = vdwCoefTableWidthIn; | |
| |
int vdwCoefTableArea = vdwCoefTableWidth*vdwCoefTableWidth; | |
| |
reallocate_device<float2>(&vdwCoefTable, &vdwCoefTableSize, vdwCoefTableArea); | |
copy_HtoD_sync<float2>(h_vdwCoefTable, vdwCoefTable, vdwCoefTableArea); | |
| |
if (vdwCoefTableTextureBound) { | |
cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); | |
vdwCoefTableTextureBound = false; | |
} | |
| |
vdwCoefTableTexture.normalized = false; | |
vdwCoefTableTexture.addressMode[0] = cudaAddressModeClamp; | |
vdwCoefTableTexture.filterMode = cudaFilterModePoint; | |
cudaCheck(cudaBindTexture((size_t*)0, vdwCoefTableTexture, vdwCoefTable, vdwCoefTableArea*sizeof(float2))); | |
vdwCoefTableTextureBound = true; | |
} | |
| |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
void CudaComputeNonbondedKernel::bindTextureObject(int tableSize, float4* h_table, cudaArray_t& array, cudaTextureObject_t& tableTex) { | |
cudaChannelFormatDesc desc; | |
desc.x = sizeof(float)*8; | |
desc.y = sizeof(float)*8; | |
desc.z = sizeof(float)*8; | |
desc.w = sizeof(float)*8; | |
desc.f = cudaChannelFormatKindFloat; | |
cudaCheck(cudaMallocArray(&array, &desc, tableSize, 1)); | |
cudaCheck(cudaMemcpyToArray(array, 0, 0, h_table, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
| |
cudaResourceDesc resDesc; | |
memset(&resDesc, 0, sizeof(resDesc)); | |
resDesc.resType = cudaResourceTypeArray; | |
resDesc.res.array.array = array; | |
| |
cudaTextureDesc texDesc; | |
memset(&texDesc, 0, sizeof(texDesc)); | |
texDesc.addressMode[0] = cudaAddressModeClamp; | |
texDesc.filterMode = cudaFilterModeLinear; | |
texDesc.normalizedCoords = 1; | |
| |
cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL)); | |
} | |
#endif | |
| |
void CudaComputeNonbondedKernel::bindForceAndEnergyTable(int tableSize, | |
float4* h_forceTable, | |
float4* h_energyTable) { | |
| |
#ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| |
if (forceTableTexActive) { | |
cudaCheck(cudaFreeArray(forceArray)); | |
cudaCheck(cudaDestroyTextureObject(forceTableTex)); | |
} | |
| |
if (energyTableTexActive) { | |
cudaCheck(cudaFreeArray(energyArray)); | |
cudaCheck(cudaDestroyTextureObject(energyTableTex)); | |
} | |
forceTableTex = 0; | |
energyTableTex = 0; | |
| |
bindTextureObject(tableSize, h_forceTable, forceArray, forceTableTex); | |
bindTextureObject(tableSize, h_energyTable, energyArray, energyTableTex); | |
forceTableTexActive = true; | |
energyTableTexActive = true; | |
| |
#else // DISABLE_CUDA_TEXTURE_OBJECTS | |
| |
if (force_table_bound) { | |
cudaCheck(cudaFreeArray(forceArray)); | |
} | |
cudaCheck(cudaMallocArray(&forceArray, &force_table.channelDesc, tableSize, 1)); | |
| |
if (energy_table_bound) { | |
cudaCheck(cudaFreeArray(energyArray)); | |
} | |
cudaCheck(cudaMallocArray(&energyArray, &energy_table.channelDesc, tableSize, 1)); | |
| |
cudaCheck(cudaMemcpyToArray(forceArray, 0, 0, h_forceTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
cudaCheck(cudaMemcpyToArray(energyArray, 0, 0, h_energyTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); | |
| |
force_table.normalized = true; | |
force_table.addressMode[0] = cudaAddressModeClamp; | |
force_table.addressMode[1] = cudaAddressModeClamp; | |
force_table.filterMode = cudaFilterModeLinear; | |
| |
energy_table.normalized = true; | |
energy_table.addressMode[0] = cudaAddressModeClamp; | |
energy_table.addressMode[1] = cudaAddressModeClamp; | |
energy_table.filterMode = cudaFilterModeLinear; | |
| |
cudaCheck(cudaBindTextureToArray(force_table, forceArray)); | |
cudaCheck(cudaBindTextureToArray(energy_table, energyArray)); | |
| |
force_table_bound = true; | |
energy_table_bound = true; | |
#endif // DISABLE_CUDA_TEXTURE_OBJECTS | |
} | |