ComputePmeCUDAKernel.cu

#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"

void NAMD_die(const char *);

#ifdef NAMD_CUDA

#include <cuda.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
// allow linking, must prevent execution elsewhere
__device__ void atomicAdd(float *, float) { }
#endif

#if 0
#include <stdio.h>
#endif

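// Launch geometry shared by all kernels in this file: each thread block runs
// warps_per_block warps of 32 threads, and each warp handles atoms_per_warp
// atoms, so one block covers atoms_per_block atoms.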
// must run with at least order**2 threads
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)


static const float order_4_coeffs[4][4] = {
  {1, -3,  3, -1},
  {0,  3, -6,  3},
  {0,  3,  0, -3},
  {0,  1,  4,  1}};
// divide by 3! = 6

static const float order_6_coeffs[6][6] = {
  {1, -5,  10, -10,   5,  -1},
  {0,  5, -20,  30, -20,   5},
  {0, 10, -20,   0,  20, -10},
  {0, 10,  20, -60,  20,  10},
  {0,  5,  50,   0, -50,  -5},
  {0,  1,  26,  66,  26,   1}};
// divide by 5! = 120

static const float order_8_coeffs[8][8] = {
  {1,  -7,  21,  -35,    35,   -21,    7,  -1},
  {0,   7, -42,  105,  -140,   105,  -42,   7},
  {0,  21, -84,  105,     0,  -105,   84, -21},
  {0,  35,   0, -315,   560,  -315,    0,  35},
  {0,  35, 280, -665,     0,   665, -280, -35},
  {0,  21, 504,  315, -1680,   315,  504,  21},
  {0,   7, 392, 1715,     0, -1715, -392,  -7},
  {0,   1, 120, 1191,  2416,  1191,  120,   1}};
// divide by 7! = 5040

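// Builds the B-spline interpolation tables for the requested order and copies
// them to the device.  Each column of the order x order "coeffs" table holds,
// highest power first, the power-basis coefficients of one B-spline weight
// (scaled by 1/(order-1)!), ready for Horner evaluation on the GPU; "dcoeffs"
// holds the matching derivative coefficients.  The checksum against the
// unscaled static tables guards against typos in the data above.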
void cuda_init_bspline_coeffs(float **c, float **dc, int order) {
  float *coeffs = new float[order*order];
  float *dcoeffs = new float[order*order];
  double divisor;
  static const float *scoeffs;
  switch ( order ) {
  case 4:
    scoeffs = &order_4_coeffs[0][0];
    divisor = 6;
    break;
  case 6:
    scoeffs = &order_6_coeffs[0][0];
    divisor = 120;
    break;
  case 8:
    scoeffs = &order_8_coeffs[0][0];
    divisor = 5040;
    break;
  default:
    NAMD_die("unsupported PMEInterpOrder");
  }
  double sum = 0;
  for ( int i=0, p=order-1; i<order; ++i,--p ) {
    for ( int j=0; j<order; ++j ) {
      double c = scoeffs[i*order+(order-1-j)];  // reverse order
      sum += c;
      c /= divisor;
      coeffs[i*order+j] = c;
      dcoeffs[i*order+j] = (double)p * c;
      // printf("%d %d %f %f\n", i, j, c, (double)p*c);
    }
  }
  // printf("checksum: %f %f\n", sum, divisor);
  if ( sum != divisor )
    NAMD_die("cuda_init_bspline_coeffs static data checksum error");
  cudaMalloc((void**) c, order*order*sizeof(float));
  cudaMalloc((void**) dc, order*order*sizeof(float));
  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  delete [] coeffs;
  delete [] dcoeffs;
}

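// Shared-memory declarations used by every kernel below: one copy of the
// order x order coefficient table per block, plus a 7-float record per atom
// for each warp (see the ATOM/AX..AK macros further down).  BLOCK_SYNC makes
// the coefficient table visible to all warps before it is used.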
#define BSPLINE_DEFS \
  __shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  BLOCK_SYNC;

// simplify warp-synchronous operations
#define WARP (threadIdx.x>>5)
#define THREAD (threadIdx.x&31)
#define FX bspline_factors[threadIdx.x>>5][0]
#define FY bspline_factors[threadIdx.x>>5][1]
#define FZ bspline_factors[threadIdx.x>>5][2]
#define DFX bspline_dfactors[threadIdx.x>>5][0]
#define DFY bspline_dfactors[threadIdx.x>>5][1]
#define DFZ bspline_dfactors[threadIdx.x>>5][2]

#define FX2 bspline_2factors[threadIdx.x>>5][0]
#define FY2 bspline_2factors[threadIdx.x>>5][1]
#define FZ2 bspline_2factors[threadIdx.x>>5][2]

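// Layout of one atom record in the per-warp shared "atoms" union: fractional
// position x, y, z, the charge q, and the starting grid indices i, j, k
// (stored as floats and converted with casts where needed).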
#define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
#define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>5].a2d[i_atom][6]



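// Batched charge-spreading kernel: blockIdx.y selects a patch, whose atom
// data, atom count, and grid dimensions are fetched through the per-patch
// pointer and size arrays.  Each warp loads up to atoms_per_warp atoms and,
// for every atom, spreads q * Mx * My * Mz onto the order^3 surrounding grid
// points of q_arr, marking the touched (gti,gtj) pencils in f_arr and the
// touched z indices in fz_arr.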
template <int order>
__global__ void cuda_pme_charges_batched_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float ** __restrict__ a_data_ptr, int* n_atoms_ptr, \
  int* K1_ptr, int* K2_ptr, int* K3_ptr
) {

  int patchIndex = blockIdx.y;
  int K1 = K1_ptr[patchIndex];
  int K2 = K2_ptr[patchIndex];
  int K3 = K3_ptr[patchIndex];
  int n_atoms = n_atoms_ptr[patchIndex];

  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      float* a_data = a_data_ptr[patchIndex];
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;

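    // Threads 0..3*order-1 each evaluate one B-spline factor: a Horner pass
    // over one column of the coefficient table at this atom's fractional
    // x, y, or z coordinate.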
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=32 ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)

      fz_arr[gtk] = 1;
    }

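    // Spread the charge: the 32 threads of the warp stride over the order^3
    // grid points surrounding the atom.  The x and y indices wrap
    // periodically; the z offset is folded into gtk (the z arrays are sized
    // K3+order-1, so no wrap is needed there).  atomicAdd is required because
    // other warps and blocks may write the same grid cells.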
    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];

      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}


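// Single-patch variant of the charge-spreading kernel above: atom data and
// grid dimensions come directly from the kernel arguments rather than
// per-patch arrays.  The #if 0 blocks are debugging checks for out-of-range
// atoms and grid indices.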
template <int order>
__global__ void cuda_pme_charges_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float * __restrict__ a_data, int n_atoms, \
  int K1, int K2, int K3
) {
  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
#if 0
if ( THREAD == 0 && (
  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
  continue;
}
#endif
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=32 ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
#if 0
if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
#if 0
if ( gtk < 0 || gtk >= K3+order-1 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
#if 0
if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
} else
#endif
       if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}

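// Force-gathering kernel.  afn_g supplies three pointers per blockIdx.y
// slice: the atom data, the force output buffer, and a third pointer whose
// distance (in floats) from the force buffer encodes the atom count.  For
// each atom the warp evaluates the B-spline weights and their derivatives,
// reads the potential from the grid, accumulates the gradient contributions,
// and reduces the 32 per-thread partial forces to one (x,y,z) force per atom.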
template <int order>
__global__ void cuda_pme_forces_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, \
  float * const * __restrict__ afn_g, \
  /* float * __restrict__ a_data,
  float * __restrict__ f_data, int n_atoms, */
  int K1, int K2, int K3
) {
  __shared__ float *afn_s[3];
  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
  BSPLINE_DEFS
  __shared__ volatile union {
    float a2d[atoms_per_warp][3];
    float a1d[atoms_per_warp*3];
  } force_output[warps_per_block];
  __shared__ float2 bspline_2factors[warps_per_block][3][order];
  __shared__ volatile union {
    float a2d[32][3];
    float a1d[32*3];
  } force_reduction[warps_per_block];
  int atoms_this_warp = atoms_per_warp;
  {
    const int n_atoms = afn_s[2] - afn_s[1];
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
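    // One Horner pass per thread produces both the B-spline weight f and df,
    // the (negated) derivative of the same polynomial, packed as a float2.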
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float df = 0.f;
      float c = bspline_coeffs.a2d[0][THREAD % order];
      float f = c;
      for ( int i=1; i<order; ++i ) {
        df = af * df - (order-i) * c;
        c = bspline_coeffs.a2d[i][THREAD % order];
        f = af * f + c;
      }
      bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
    }
    __threadfence_block();  // bspline_2factors not declared volatile
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors (FX2,FY2,FZ2)
    float force_x = 0.f;
    float force_y = 0.f;
    float force_z = 0.f;

    for ( int i=THREAD; i < order*order*order; i+=32 ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;

      const float2 fx2 = FX2[ti];
      const float2 fy2 = FY2[tj];
      const float2 fz2 = FZ2[tk];

      const float fx = fx2.x;
      const float fy = fy2.x;
      const float fz = fz2.x;
      const float dfx = fx2.y;
      const float dfy = fy2.y;
      const float dfz = fz2.y;

      float dx = K1 * aq * dfx * fy * fz;
      float dy = K2 * aq * fx * dfy * fz;
      float dz = K3 * aq * fx * fy * dfz;

      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;

      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
      float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
      force_x += dx * pot;
      force_y += dy * pot;
      force_z += dz * pot;
    }
    force_reduction[WARP].a2d[THREAD][0] = force_x;
    force_reduction[WARP].a2d[THREAD][1] = force_y;
    force_reduction[WARP].a2d[THREAD][2] = force_z;
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
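    // Reduce the 32 x 3 partial sums in shared memory: 96 floats -> 24 -> 6,
    // then the final 3-way add writes one (x,y,z) force into force_output.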
    if ( THREAD < 24 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
                                         + force_reduction[WARP].a1d[THREAD + 48]
                                         + force_reduction[WARP].a1d[THREAD + 72];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 6 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
                                         + force_reduction[WARP].a1d[THREAD + 12]
                                         + force_reduction[WARP].a1d[THREAD + 18];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 3 ) {
      force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
                                             + force_reduction[WARP].a1d[THREAD + 3];
    }
  } // i_atom
  WARP_SYNC(WARP_FULL_MASK);  // done writing force_output
  if ( THREAD < atoms_this_warp*3 ) {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
  }
}

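// Host-side launchers (see ComputePmeCUDAKernel.h for the *_PROTOTYPE
// macros).  Each dispatches on the runtime interpolation order to the
// matching template instantiation, launching blocks of 32*warps_per_block
// threads on the given stream.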
CUDA_PME_CHARGES_PROTOTYPE {
  int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( ! nblocks ) return;

#define CALL(ORDER) if ( order == ORDER ) \
                      cuda_pme_charges_dev<ORDER><<<nblocks,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

CUDA_PME_CHARGES_BATCHED_PROTOTYPE {
  int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocksX) || (! numPatches) ) return;
  dim3 gridSize;
  gridSize.x = nblocksX;
  gridSize.y = numPatches;
  gridSize.z = 1;
#define CALL(ORDER) if ( order == ORDER ) \
                      cuda_pme_charges_batched_dev<ORDER><<<gridSize,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}


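// cudaFuncSetSharedMemConfig was introduced in CUDA 4.2; on older toolkits
// replace it with a no-op so the forces launcher below still compiles.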
#ifdef CUDA_VERSION
#if CUDA_VERSION < 4020
void dummy() { }
#define cudaFuncSetSharedMemConfig(X,Y) dummy()
#endif
#endif

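// Forces launcher: dimy counts the afn pointer triples (gridDim.y), and maxn
// is presumably the largest atom count over those triples (it sets gridDim.x).
// The shared-memory bank size is set to eight bytes to suit the float2
// bspline_2factors table before each launch.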
CUDA_PME_FORCES_PROTOTYPE {
  int nblocks = (maxn + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocks) || (! dimy) ) return;

#define CALL(ORDER) if ( order == ORDER ) \
                      cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
                      cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

#endif // NAMD_CUDA

