Difference for src/CudaComputeNonbondedKernel.cu from version 1.1 to 1.2

version 1.1version 1.2
Line 12
Line 12
  
 void NAMD_die(const char *); void NAMD_die(const char *);
  
 texture<float2, 1, cudaReadModeElementType> vdwCoefTableTexture; 
  
 #ifdef DISABLE_CUDA_TEXTURE_OBJECTS 
 texture<float4, 1, cudaReadModeElementType> force_table; 
 texture<float4, 1, cudaReadModeElementType> energy_table; 
 #endif 
  
 #define MAX_CONST_EXCLUSIONS 2048  // cache size is 8k #define MAX_CONST_EXCLUSIONS 2048  // cache size is 8k
 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]; __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
  
Line 29
Line 22
 void calcForceEnergy(const float r2, const float qi, const float qj, void calcForceEnergy(const float r2, const float qi, const float qj,
   const float dx, const float dy, const float dz,   const float dx, const float dy, const float dz,
   const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,   const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS   cudaTextureObject_t vdwCoefTableTex, 
   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
 #endif 
   float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,   float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
   float& energyVdw, float& energyElec, float& energySlow) {   float& energyVdw, float& energyElec, float& energySlow) {
  
Line 39
Line 31
 #if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
   float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);   float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
 #else #else
   float2 ljab = tex1Dfetch(vdwCoefTableTexture, vdwIndex);   float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex);
 #endif #endif
  
   float rinv = rsqrtf(r2);   float rinv = rsqrtf(r2);
   float4 ei;   float4 ei;
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
   float4 fi = tex1D<float4>(forceTableTex, rinv);   float4 fi = tex1D<float4>(forceTableTex, rinv);
   if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);   if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
 #else 
   float4 fi = tex1D(force_table, rinv); 
   if (doEnergy) ei = tex1D(energy_table, rinv); 
 #endif 
  
   float fSlow = qi * qj;   float fSlow = qi * qj;
   float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;   float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
Line 175
Line 162
   const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,   const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
   const int* __restrict__ tileJatomStart,   const int* __restrict__ tileJatomStart,
   const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,   const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,
   const float latticeX, const float latticeY, const float latticeZ,   const float3 lata, const float3 latb, const float3 latc,
   const float4* __restrict__ xyzq, const float cutoff2,   const float4* __restrict__ xyzq, const float cutoff2,
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS   cudaTextureObject_t vdwCoefTableTex,
   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
 #endif 
   // ----------   // ----------
   // doPairlist   // doPairlist
   const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,   const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
Line 229
Line 215
       patchInd     = tmp.patchInd;       patchInd     = tmp.patchInd;
       patchNumList = tmp.patchNumList;       patchNumList = tmp.patchNumList;
  
       float shx = tmp.offsetXYZ.x*latticeX;       float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
       float shy = tmp.offsetXYZ.y*latticeY;       float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
       float shz = tmp.offsetXYZ.z*latticeZ;       float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
  
       int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;       int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
       if (doPairlist) {       if (doPairlist) {
Line 454
Line 440
                   calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,                   calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
                     vdwtypei, vdwtypej,                     vdwtypei, vdwtypej,
                     vdwCoefTable,                     vdwCoefTable,
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS                     vdwCoefTableTex, forceTableTex, energyTableTex,
                     forceTableTex, energyTableTex, 
 #endif 
                     iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);                     iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
                 }                 }
               }               }
Line 466
Line 450
           } // t           } // t
         } else {         } else {
           // Just compute forces           // Just compute forces
 /*           
           if (self) { 
             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); 
             excl >>= 1; 
           } 
           for (;t < WARPSIZE;t++) { 
             int j = (t + wid) & (WARPSIZE-1); 
  
             float dx = __shfl(xyzq_j.x,j) - xyzq_i.x; 
             float dy = __shfl(xyzq_j.y,j) - xyzq_i.y; 
             float dz = __shfl(xyzq_j.z,j) - xyzq_i.z; 
  
             float r2 = dx*dx + dy*dy + dz*dz; 
  
             if ((excl & 1) && r2 < cutoff2) { 
               calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz, 
                 vdwtypei, vdwtypej, vdwCoefTable, 
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
                 forceTableTex, energyTableTex, 
 #endif 
                 iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow); 
             } // ((excl & 1) && r2 < cutoff2) 
             excl >>= 1; 
             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex); 
             shuffleNext<doSlow>(jforce, jforceSlow); 
           } // t 
 */ 
           if (self) {           if (self) {
             excl >>= 1;             excl >>= 1;
             xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1));             xyzq_j.x = __shfl(xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1));
Line 512
Line 469
                 calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,                 calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
                   vdwtypei, vdwtypej,                   vdwtypei, vdwtypej,
                   vdwCoefTable,                   vdwCoefTable,
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS                   vdwCoefTableTex, forceTableTex, energyTableTex,
                   forceTableTex, energyTableTex, 
 #endif 
                   iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);                   iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
               } // (r2 < cutoff2)               } // (r2 < cutoff2)
             } // (excl & 1)             } // (excl & 1)
Line 972
Line 927
 // ############################################################################################## // ##############################################################################################
 // ############################################################################################## // ##############################################################################################
  
 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, bool doStreaming) : deviceID(deviceID), doStreaming(doStreaming) { CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
    bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
    
   cudaCheck(cudaSetDevice(deviceID));   cudaCheck(cudaSetDevice(deviceID));
  
   overflowExclusions = NULL;   overflowExclusions = NULL;
Line 987
Line 944
   vdwTypes = NULL;   vdwTypes = NULL;
   vdwTypesSize = 0;   vdwTypesSize = 0;
  
   vdwCoefTable = NULL; 
   vdwCoefTableWidth = 0; 
   vdwCoefTableSize = 0; 
  
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
   forceTableTexActive = false; 
   energyTableTexActive = false; 
 #endif 
  
   vdwCoefTableTextureBound = false; 
 #ifdef DISABLE_CUDA_TEXTURE_OBJECTS 
   force_table_bound = false; 
   energy_table_bound = false; 
 #endif 
  
   patchNumCount = NULL;   patchNumCount = NULL;
   patchNumCountSize = 0;   patchNumCountSize = 0;
  
Line 1016
Line 958
   if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);   if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
   if (atomIndex != NULL) deallocate_device<int>(&atomIndex);   if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
   if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);   if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
   if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable); 
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
   if (forceTableTexActive) cudaCheck(cudaDestroyTextureObject(forceTableTex)); 
   if (energyTableTexActive) cudaCheck(cudaDestroyTextureObject(energyTableTex)); 
 #else 
   if (force_table_bound) cudaCheck(cudaUnbindTexture(force_table)); 
   if (energy_table_bound) cudaCheck(cudaUnbindTexture(energy_table)); 
 #endif 
   if (vdwCoefTableTextureBound) cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); 
   if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);   if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
   if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);   if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
 } }
Line 1051
Line 984
 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel, void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
   const int atomStorageSize, const bool doPairlist,   const int atomStorageSize, const bool doPairlist,
   const bool doEnergy, const bool doVirial, const bool doSlow,   const bool doEnergy, const bool doVirial, const bool doSlow,
   const float latticeX, const float latticeY, const float latticeZ,   const float3 lata, const float3 latb, const float3 latc,
   const float4* h_xyzq, const float cutoff2,    const float4* h_xyzq, const float cutoff2, 
   float4* d_forces, float4* d_forcesSlow,   float4* d_forces, float4* d_forcesSlow,
   float4* h_forces, float4* h_forcesSlow,   float4* h_forces, float4* h_forcesSlow,
Line 1106
Line 1039
     int nleft = tlKernel.getNumTileLists() - start;     int nleft = tlKernel.getNumTileLists() - start;
     int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);     int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
  
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
     nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \     nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
   <<< nblock, nthread, shMemSize, stream >>>  \   <<< nblock, nthread, shMemSize, stream >>>  \
   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
     vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \     cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
     forceTableTex, energyTableTex, \     vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
      cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
 #else  // DISABLE_CUDA_TEXTURE_OBJECTS 
 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \ 
   nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \ 
   <<< nblock, nthread, shMemSize, stream >>>  \ 
   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \ 
     vdwCoefTableWidth, vdwCoefTable, vdwTypes, latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, \ 
     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \ 
     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \ 
     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \ 
     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \ 
     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true 
 #endif // DISABLE_CUDA_TEXTURE_OBJECTS 
  
     bool called = false;     bool called = false;
  
Line 1231
Line 1152
   reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);   reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
   copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);   copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
 } }
  
 void CudaComputeNonbondedKernel::bindVdwCoefTable(float2* h_vdwCoefTable, int vdwCoefTableWidthIn) { 
   vdwCoefTableWidth = vdwCoefTableWidthIn; 
  
   int vdwCoefTableArea = vdwCoefTableWidth*vdwCoefTableWidth; 
  
   reallocate_device<float2>(&vdwCoefTable, &vdwCoefTableSize, vdwCoefTableArea); 
   copy_HtoD_sync<float2>(h_vdwCoefTable, vdwCoefTable, vdwCoefTableArea); 
  
   if (vdwCoefTableTextureBound) { 
     cudaCheck(cudaUnbindTexture(vdwCoefTableTexture)); 
     vdwCoefTableTextureBound = false; 
   } 
  
   vdwCoefTableTexture.normalized = false; 
   vdwCoefTableTexture.addressMode[0] = cudaAddressModeClamp; 
   vdwCoefTableTexture.filterMode = cudaFilterModePoint; 
   cudaCheck(cudaBindTexture((size_t*)0, vdwCoefTableTexture, vdwCoefTable, vdwCoefTableArea*sizeof(float2))); 
   vdwCoefTableTextureBound = true; 
 } 
  
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
 void CudaComputeNonbondedKernel::bindTextureObject(int tableSize, float4* h_table, cudaArray_t& array, cudaTextureObject_t& tableTex) { 
   cudaChannelFormatDesc desc; 
   desc.x = sizeof(float)*8; 
   desc.y = sizeof(float)*8; 
   desc.z = sizeof(float)*8; 
   desc.w = sizeof(float)*8; 
   desc.f = cudaChannelFormatKindFloat; 
   cudaCheck(cudaMallocArray(&array, &desc, tableSize, 1)); 
   cudaCheck(cudaMemcpyToArray(array, 0, 0, h_table, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); 
  
   cudaResourceDesc resDesc; 
   memset(&resDesc, 0, sizeof(resDesc)); 
   resDesc.resType = cudaResourceTypeArray; 
   resDesc.res.array.array = array; 
  
   cudaTextureDesc texDesc; 
   memset(&texDesc, 0, sizeof(texDesc)); 
   texDesc.addressMode[0] = cudaAddressModeClamp; 
   texDesc.filterMode = cudaFilterModeLinear; 
   texDesc.normalizedCoords = 1; 
  
   cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL)); 
 } 
 #endif 
  
 void CudaComputeNonbondedKernel::bindForceAndEnergyTable(int tableSize, 
   float4* h_forceTable, 
   float4* h_energyTable) { 
  
 #ifndef DISABLE_CUDA_TEXTURE_OBJECTS 
  
   if (forceTableTexActive) { 
     cudaCheck(cudaFreeArray(forceArray)); 
     cudaCheck(cudaDestroyTextureObject(forceTableTex)); 
   } 
  
   if (energyTableTexActive) { 
     cudaCheck(cudaFreeArray(energyArray)); 
     cudaCheck(cudaDestroyTextureObject(energyTableTex)); 
   } 
   forceTableTex = 0; 
   energyTableTex = 0; 
  
   bindTextureObject(tableSize, h_forceTable, forceArray, forceTableTex); 
   bindTextureObject(tableSize, h_energyTable, energyArray, energyTableTex); 
   forceTableTexActive = true; 
   energyTableTexActive = true; 
  
 #else // DISABLE_CUDA_TEXTURE_OBJECTS 
  
   if (force_table_bound) { 
    cudaCheck(cudaFreeArray(forceArray)); 
   } 
   cudaCheck(cudaMallocArray(&forceArray, &force_table.channelDesc, tableSize, 1)); 
  
   if (energy_table_bound) { 
    cudaCheck(cudaFreeArray(energyArray)); 
   } 
   cudaCheck(cudaMallocArray(&energyArray, &energy_table.channelDesc, tableSize, 1)); 
  
   cudaCheck(cudaMemcpyToArray(forceArray, 0, 0, h_forceTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); 
   cudaCheck(cudaMemcpyToArray(energyArray, 0, 0, h_energyTable, tableSize*sizeof(float4), cudaMemcpyHostToDevice)); 
  
   force_table.normalized = true; 
   force_table.addressMode[0] = cudaAddressModeClamp; 
   force_table.addressMode[1] = cudaAddressModeClamp; 
   force_table.filterMode = cudaFilterModeLinear; 
  
   energy_table.normalized = true; 
   energy_table.addressMode[0] = cudaAddressModeClamp; 
   energy_table.addressMode[1] = cudaAddressModeClamp; 
   energy_table.filterMode = cudaFilterModeLinear; 
  
   cudaCheck(cudaBindTextureToArray(force_table, forceArray)); 
   cudaCheck(cudaBindTextureToArray(energy_table, energyArray)); 
  
   force_table_bound = true; 
   energy_table_bound = true; 
 #endif // DISABLE_CUDA_TEXTURE_OBJECTS 
 } 


Legend:
Removed in v.1.1 
changed lines
 Added in v.1.2



Made by using version 1.53 of cvs2html