Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputeNonbondedCUDAKernel.cu

Go to the documentation of this file.
00001 
00002 #include "ComputeNonbondedCUDAKernel.h"
00003 #include <stdio.h>
00004 
00005 #ifdef NAMD_CUDA
00006 
00007 
00008 __constant__ unsigned int exclusions[MAX_EXCLUSIONS];
00009 
00010 #define SET_EXCL(EXCL,BASE,DIFF) \
00011          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00012 
00013 void cuda_bind_exclusions(const unsigned int *t, int n) {
00014 
00015 #if 0
00016   unsigned int excl[MAX_EXCLUSIONS];
00017 
00018   for ( int i=0; i<MAX_EXCLUSIONS; ++i ) { excl[i] = 0; }
00019 
00020   SET_EXCL(excl,0,2);
00021   SET_EXCL(excl,0,3);
00022   SET_EXCL(excl,0,4);
00023 
00024   cudaMemcpyToSymbol(exclusions, excl, MAX_EXCLUSIONS*sizeof(unsigned int), 0);
00025 #endif
00026 
00027   cudaMemcpyToSymbol(exclusions, t, n*sizeof(unsigned int), 0);
00028   cuda_errcheck("memcpy to exclusions");
00029 }
00030 
00031 
00032 texture<float4, 1, cudaReadModeElementType> force_table;
00033 
00034 void cuda_bind_force_table(const float4 *t) {
00035     static cudaArray *ct;
00036     if ( ! ct ) {
00037       cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
00038       cuda_errcheck("allocating force table");
00039     }     cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);     // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
00040     cuda_errcheck("memcpy to force table");
00041 
00042     force_table.normalized = true;
00043     force_table.addressMode[0] = cudaAddressModeClamp;
00044     force_table.addressMode[1] = cudaAddressModeClamp;
00045     force_table.filterMode = cudaFilterModeLinear;
00046 
00047     cudaBindTextureToArray(force_table, ct);
00048     cuda_errcheck("binding force table to texture");
00049 }
00050 
00051 static int patch_pairs_size;
00052 static patch_pair *patch_pairs;
00053 
00054 static int force_lists_size;
00055 static force_list *force_lists;
00056 
00057 static int force_buffers_size;
00058 static float4 *force_buffers;
00059 static float4 *slow_force_buffers;
00060 
00061 static int atoms_size;
00062 static atom *atoms;
00063 static atom_param *atom_params;
00064 static float4 *forces;
00065 static float4 *slow_forces;
00066 
00067 static int patch_pairs_alloc;
00068 static int force_buffers_alloc;
00069 static int force_lists_alloc;
00070 static int atoms_alloc;
00071 
00072 static int max_atoms_per_patch;
00073 
00074 // static cudaStream_t stream;
00075 cudaStream_t stream;
00076  
00077 void cuda_init() {
00078   forces = 0;
00079   slow_forces = 0;
00080   atom_params = 0;
00081   atoms = 0;
00082   force_buffers = 0;
00083   slow_force_buffers = 0;
00084   force_lists = 0;
00085   patch_pairs = 0;
00086 
00087   patch_pairs_alloc = 0;
00088   force_buffers_alloc = 0;
00089   force_lists_alloc = 0;
00090   atoms_alloc = 0;
00091 
00092   cudaStreamCreate(&stream);
00093   cuda_errcheck("cudaStreamCreate");
00094 }
00095 
00096 void cuda_bind_patch_pairs(const patch_pair *pp, int npp,
00097                         const force_list *fl, int nfl,
00098                         int atoms_size_p, int force_buffers_size_p,
00099                         int max_atoms_per_patch_p) {
00100 
00101   patch_pairs_size = npp;
00102   force_buffers_size = force_buffers_size_p;
00103   force_lists_size = nfl;
00104   atoms_size = atoms_size_p;
00105   max_atoms_per_patch = max_atoms_per_patch_p;
00106 
00107 #if 0
00108  printf("%d %d %d %d %d %d %d %d\n",
00109       patch_pairs_size , patch_pairs_alloc ,
00110       force_buffers_size , force_buffers_alloc ,
00111       force_lists_size , force_lists_alloc ,
00112       atoms_size , atoms_alloc );
00113 #endif
00114 
00115  if ( patch_pairs_size > patch_pairs_alloc ||
00116       force_buffers_size > force_buffers_alloc ||
00117       force_lists_size > force_lists_alloc ||
00118       atoms_size > atoms_alloc ) {
00119 
00120   patch_pairs_alloc = (int) (1.2 * patch_pairs_size);
00121   force_buffers_alloc = (int) (1.2 * force_buffers_size);
00122   force_lists_alloc = (int) (1.2 * force_lists_size);
00123   atoms_alloc = (int) (1.2 * atoms_size);
00124 
00125   if ( forces ) cudaFree(forces);
00126   if ( slow_forces ) cudaFree(slow_forces);
00127   if ( atom_params ) cudaFree(atom_params);
00128   if ( atoms ) cudaFree(atoms);
00129   if ( force_buffers ) cudaFree(force_buffers);
00130   if ( slow_force_buffers ) cudaFree(slow_force_buffers);
00131   if ( force_lists ) cudaFree(force_lists);
00132   if ( patch_pairs ) cudaFree(patch_pairs);
00133   cuda_errcheck("free everything");
00134 
00135 #if 1
00136   int totalmem = patch_pairs_alloc * sizeof(patch_pair) +
00137                 force_lists_alloc * sizeof(force_list) +
00138                 2 * force_buffers_alloc * sizeof(float4) +
00139                 atoms_alloc * sizeof(atom) +
00140                 atoms_alloc * sizeof(atom_param) +
00141                 2 * atoms_alloc * sizeof(float4);
00142   // printf("allocating %d MB of memory on GPU\n", totalmem >> 20);
00143 #endif
00144 
00145   cudaMalloc((void**) &patch_pairs, patch_pairs_alloc * sizeof(patch_pair));
00146   cudaMalloc((void**) &force_lists, force_lists_alloc * sizeof(force_list));
00147   cudaMalloc((void**) &force_buffers, force_buffers_alloc * sizeof(float4));
00148   cudaMalloc((void**) &slow_force_buffers, force_buffers_alloc * sizeof(float4));
00149   cudaMalloc((void**) &atoms, atoms_alloc * sizeof(atom));
00150   cudaMalloc((void**) &atom_params, atoms_alloc * sizeof(atom_param));
00151   cudaMalloc((void**) &forces, atoms_alloc * sizeof(float4));
00152   cudaMalloc((void**) &slow_forces, atoms_alloc * sizeof(float4));
00153   cuda_errcheck("malloc everything");
00154 
00155  }
00156 
00157   cudaMemcpy(patch_pairs, pp, npp * sizeof(patch_pair),
00158                                 cudaMemcpyHostToDevice);
00159   cuda_errcheck("memcpy to patch_pairs");
00160 
00161   cudaMemcpy(force_lists, fl, nfl * sizeof(force_list),
00162                                 cudaMemcpyHostToDevice);
00163   cuda_errcheck("memcpy to force_lists");
00164 }
00165 
00166 void cuda_bind_atom_params(const atom_param *t) {
00167   cudaMemcpyAsync(atom_params, t, atoms_size * sizeof(atom_param),
00168                                 cudaMemcpyHostToDevice, stream);
00169   cuda_errcheck("memcpy to atom_params");
00170 }
00171 
00172 void cuda_bind_atoms(const atom *a) {
00173   cuda_errcheck("before memcpy to atoms");
00174   cudaMemcpyAsync(atoms, a, atoms_size * sizeof(atom),
00175                                 cudaMemcpyHostToDevice, stream);
00176   cuda_errcheck("memcpy to atoms");
00177 }
00178 
00179 void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
00180   // printf("load forces %d %d %d\n",begin,count,atoms_size);
00181   cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
00182                                 cudaMemcpyDeviceToHost, stream);
00183   if ( f_slow ) {
00184     cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
00185                                 cudaMemcpyDeviceToHost, stream);
00186   }
00187   cuda_errcheck("memcpy from forces");
00188 }
00189 
00190 
00191 #if 0
00192 __host__ __device__ static int3 patch_coords_from_id(
00193         dim3 PATCH_GRID, int id) {
00194 
00195   return make_int3( id % PATCH_GRID.x,
00196                 ( id / PATCH_GRID.x ) % PATCH_GRID.y,
00197                 id / ( PATCH_GRID.x * PATCH_GRID.y ) );
00198 }
00199 
00200 __host__ __device__ static int patch_id_from_coords(
00201         dim3 PATCH_GRID, int3 coords) {
00202 
00203   // handles periodic boundaries
00204   int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
00205   int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
00206   int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;
00207 
00208   return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
00209 }
00210 
00211 __host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {
00212 
00213   // int3 coords = patch_coords_from_id(make_uint3(3,3,3), 13 + neighbor);
00214   int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
00215   return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);
00216 
00217 }
00218 #endif
00219  
00220 #define BLOCK_SIZE 64
00221 #define SHARED_SIZE 16
00222 
00223 __global__ static void dev_nonbonded(
00224         const patch_pair *patch_pairs,
00225         const atom *atoms,
00226         const atom_param *atom_params,
00227         float4 *force_buffers,
00228         float4 *slow_force_buffers,
00229         float3 lata, float3 latb, float3 latc,
00230         float cutoff2) {
00231 // call with two blocks per patch_pair
00232 // call with BLOCK_SIZE threads per block
00233 // call with no shared memory
00234 
00235   #define jpqs jpqu.d
00236   __shared__ union {
00237     atom d[SHARED_SIZE];
00238     unsigned int i[BLOCK_SIZE];
00239     float f[BLOCK_SIZE];
00240   } jpqu;
00241 
00242   #define japs japu.d
00243   __shared__ union {
00244     atom_param d[SHARED_SIZE];
00245     unsigned int i[BLOCK_SIZE];
00246   } japu;
00247 
00248   #define myPatchPair pp.pp
00249   __shared__ union { patch_pair pp; unsigned int i[12]; } pp;
00250 
00251   if ( threadIdx.x < (sizeof(patch_pair)>>2) ) {
00252     unsigned int tmp = ((unsigned int*)patch_pairs)[
00253                         (sizeof(patch_pair)>>2)*blockIdx.x+threadIdx.x];
00254     pp.i[threadIdx.x] = tmp;
00255   }
00256   __syncthreads();
00257 
00258   // convert scaled offset with current lattice
00259   if ( threadIdx.x == 0 ) {
00260     float offx = myPatchPair.offset.x * lata.x
00261                + myPatchPair.offset.y * latb.x
00262                + myPatchPair.offset.z * latc.x;
00263     float offy = myPatchPair.offset.x * lata.y
00264                + myPatchPair.offset.y * latb.y
00265                + myPatchPair.offset.z * latc.y;
00266     float offz = myPatchPair.offset.x * lata.z
00267                + myPatchPair.offset.y * latb.z
00268                + myPatchPair.offset.z * latc.z;
00269     myPatchPair.offset.x = offx;
00270     myPatchPair.offset.y = offy;
00271     myPatchPair.offset.z = offz;
00272   }
00273   __syncthreads();
00274 
00275 #if 0
00276 // compute records duplicated so this is no longer needed
00277   if ( blockIdx.y ) {
00278 
00279     if ( myPatchPair.patch1_force_start == myPatchPair.patch2_force_start ) {
00280       return;
00281     }
00282 
00283   } else {  // swap patches
00284 
00285     if ( threadIdx.x == 0 &&
00286          myPatchPair.patch1_force_start != myPatchPair.patch2_force_start ) {
00287 
00288 #undef SWAP
00289 #define SWAP(FIELD1, FIELD2) { \
00290         unsigned int tmp1 = myPatchPair.FIELD1; \
00291         unsigned int tmp2 = myPatchPair.FIELD2; \
00292         if ( tmp1 != tmp2 ) { \
00293           myPatchPair.FIELD1 = tmp2; \
00294           myPatchPair.FIELD2 = tmp1; \
00295         } \
00296       }
00297 
00298       SWAP(patch1_size, patch2_size)
00299       SWAP(patch1_force_size, patch2_force_size)
00300       SWAP(patch1_atom_start, patch2_atom_start)
00301       SWAP(patch1_force_start, patch2_force_start)
00302 
00303       myPatchPair.offset.x *= -1.f;
00304       myPatchPair.offset.y *= -1.f;
00305       myPatchPair.offset.z *= -1.f;
00306 
00307     }
00308 
00309     __syncthreads();
00310 
00311   }
00312 #endif
00313 
00314   for ( int blocki = 0;
00315         blocki < myPatchPair.patch1_force_size;
00316         blocki += BLOCK_SIZE ) {
00317 
00318   __syncthreads();
00319 
00320   atom ipq;
00321   struct {
00322     float sqrt_epsilon;
00323     float half_sigma;
00324     int index; } iap;
00325 
00326   // load patch 1
00327   if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00328     int i = myPatchPair.patch1_atom_start + blocki + threadIdx.x;
00329     float4 tmpa = ((float4*)atoms)[i];
00330 
00331     ipq.position.x = tmpa.x + myPatchPair.offset.x;
00332     ipq.position.y = tmpa.y + myPatchPair.offset.y;
00333     ipq.position.z = tmpa.z + myPatchPair.offset.z;
00334     ipq.charge = tmpa.w;
00335 
00336     uint4 tmpap = ((uint4*)atom_params)[i];
00337 
00338     jpqu.i[threadIdx.x] = tmpap.x;
00339     iap.sqrt_epsilon = jpqu.f[threadIdx.x];
00340     jpqu.i[threadIdx.x] = tmpap.y;
00341     iap.half_sigma = jpqu.f[threadIdx.x];
00342     iap.index = tmpap.z;
00343   }
00344 
00345   float4 ife, ife_slow;
00346   ife.x = 0.f;
00347   ife.y = 0.f;
00348   ife.z = 0.f;
00349   ife.w = 0.f;
00350   ife_slow.x = 0.f;
00351   ife_slow.y = 0.f;
00352   ife_slow.z = 0.f;
00353   ife_slow.w = 0.f;
00354 
00355   for ( int blockj = 0;
00356         blockj < myPatchPair.patch2_size;
00357         blockj += SHARED_SIZE ) {
00358 
00359   int shared_size = myPatchPair.patch2_size - blockj;
00360   if ( shared_size > SHARED_SIZE ) shared_size = SHARED_SIZE;
00361 
00362   // load patch 2
00363   // sync needed because of loop, could avoid with double-buffering
00364   __syncthreads();
00365 
00366   if ( threadIdx.x < 4 * shared_size ) {
00367     int j = myPatchPair.patch2_atom_start + blockj;
00368     jpqu.i[threadIdx.x] = ((unsigned int *)(atoms + j))[threadIdx.x];
00369     japu.i[threadIdx.x] = ((unsigned int *)(atom_params + j))[threadIdx.x];
00370   }
00371   __syncthreads();
00372 
00373   // calc forces on patch 1
00374   if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00375 
00376 // be careful not to use // comments inside macros!
00377 #define FORCE_INNER_LOOP(IPQ,IAP) \
00378     for ( int j = 0; j < shared_size; ++j ) { \
00379       /* actually calculate force */ \
00380       float tmpx = jpqs[j].position.x - IPQ.position.x; \
00381       float tmpy = jpqs[j].position.y - IPQ.position.y; \
00382       float tmpz = jpqs[j].position.z - IPQ.position.z; \
00383       float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz; \
00384       if ( r2 < cutoff2 ) { \
00385         float4 fi = tex1D(force_table, 1.f/sqrt(r2)); \
00386         bool excluded = false; \
00387         int indexdiff = (int)(IAP.index) - (int)(japs[j].index); \
00388         if ( abs(indexdiff) <= (int) japs[j].excl_maxdiff ) { \
00389           indexdiff += japs[j].excl_index; \
00390           excluded = ((exclusions[indexdiff>>5] & (1<<(indexdiff&31))) != 0); \
00391         } \
00392         float e = IAP.half_sigma + japs[j].half_sigma;  /* sigma */ \
00393         e *= e*e;  /* sigma^3 */ \
00394         e *= e;  /* sigma^6 */ \
00395         e *= ( e * fi.z + fi.y );  /* s^12 * fi.z - s^6 * fi.y */ \
00396         e *= IAP.sqrt_epsilon * japs[j].sqrt_epsilon;  /* full L-J */ \
00397         float e_slow = IPQ.charge * jpqs[j].charge; \
00398         e += e_slow * fi.x; \
00399         e_slow *= fi.w; \
00400         if ( ! excluded ) { \
00401           ife.w += r2 * e; \
00402           ife.x += tmpx * e; \
00403           ife.y += tmpy * e; \
00404           ife.z += tmpz * e; \
00405           ife_slow.w += r2 * e_slow; \
00406           ife_slow.x += tmpx * e_slow; \
00407           ife_slow.y += tmpy * e_slow; \
00408           ife_slow.z += tmpz * e_slow; \
00409         } \
00410       }  /* cutoff */ \
00411     }
00412     FORCE_INNER_LOOP(ipq,iap)
00413 
00414   } // if
00415   } // blockj loop
00416 
00417   if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00418     int i_out = myPatchPair.patch1_force_start + blocki + threadIdx.x;
00419     force_buffers[i_out] = ife;
00420     if ( slow_force_buffers ) {
00421       slow_force_buffers[i_out] = ife_slow;
00422     }
00423   }
00424 
00425   } // blocki loop
00426 
00427 }
00428 
00429 
00430 __global__ static void dev_sum_forces(
00431         const force_list *force_lists,
00432         const float4 *force_buffers,
00433         float4 *forces) {
00434 // call with one block per patch
00435 // call multiple of 64 threads per block
00436 // call with no shared memory
00437 
00438   __shared__ force_list myForceList;
00439 
00440   if ( threadIdx.x == 0 ) {
00441     myForceList = force_lists[blockIdx.x];
00442   }
00443   __syncthreads();
00444 
00445   for ( int j = threadIdx.x; j < myForceList.patch_size; j += blockDim.x ) {
00446 
00447     const float4 *fbuf = force_buffers + myForceList.force_list_start + j;
00448     float4 fout;
00449     fout.x = 0.f;
00450     fout.y = 0.f;
00451     fout.z = 0.f;
00452     fout.w = 0.f;
00453     for ( int i=0; i < myForceList.force_list_size; ++i ) {
00454       float4 f = *fbuf;
00455       fout.x += f.x;
00456       fout.y += f.y;
00457       fout.z += f.z;
00458       fout.w += f.w;
00459       fbuf += myForceList.patch_size;
00460     }
00461 
00462     forces[myForceList.force_output_start + j] = fout;
00463 
00464   }
00465 }
00466 
00467 
00468 void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc, float cutoff2,
00469                 int cbegin, int ccount, int pbegin, int pcount, int doSlow) {
00470 
00471  if ( ccount ) {
00472   // printf("%d %d %d\n",cbegin,ccount,patch_pairs_size);
00473   dev_nonbonded<<< ccount, BLOCK_SIZE, 0, stream
00474         >>>(patch_pairs+cbegin,atoms,atom_params,force_buffers,
00475              (doSlow?slow_force_buffers:0), lata, latb, latc, cutoff2);
00476   cuda_errcheck("dev_nonbonded");
00477  }
00478 
00479  if ( pcount ) {
00480   // printf("%d %d %d\n",pbegin,pcount,force_lists_size);
00481   dev_sum_forces<<< pcount, 64, 0, stream
00482         >>>(force_lists+pbegin,force_buffers,forces);
00483   if ( doSlow ) {
00484     dev_sum_forces<<< pcount, 64, 0, stream
00485         >>>(force_lists+pbegin,slow_force_buffers,slow_forces);
00486   }
00487   cuda_errcheck("dev_sum_forces");
00488  }
00489 
00490 }
00491 
00492 
00493 int cuda_stream_finished() {
00494   return ( cudaStreamQuery(stream) == cudaSuccess );
00495 }
00496 
00497 
00498 #endif  // NAMD_CUDA
00499 

Generated on Tue Nov 24 04:07:43 2009 for NAMD by  doxygen 1.3.9.1