CudaPmeSolverUtilKernel.cu

00001 #ifdef WIN32
00002 #define _USE_MATH_DEFINES
00003 #endif
00004 #include <math.h>
00005 #include <stdio.h>
00006 #ifdef NAMD_CUDA
00007 #include <cuda.h>
00008 #endif
00009 #include "CudaUtils.h"
00010 #include "CudaPmeSolverUtilKernel.h"
00011 
00012 #ifdef NAMD_CUDA
00013 // CCELEC is 1/ (4 pi eps ) in AKMA units, conversion from SI
00014 // units: CCELEC = e*e*Na / (4*pi*eps*1Kcal*1A)
00015 //
00016 //      parameter :: CCELEC=332.0636D0 ! old value of dubious origin
00017 //      parameter :: CCELEC=331.843D0  ! value from 1986-1987 CRC Handbook
00018 //                                   ! of Chemistry and Physics
00019 //  real(chm_real), parameter ::  &
00020 //       CCELEC_amber    = 332.0522173D0, &
00021 //       CCELEC_charmm   = 332.0716D0   , &
00022 //       CCELEC_discover = 332.054D0    , &
00023 //       CCELEC_namd     = 332.0636D0   
00024 //const double ccelec = 332.0636;
00025 //const double half_ccelec = 0.5*ccelec;
00026 //const float ccelec_float = 332.0636f;
00027 
00028 /*
00029 // Structure into which virials are stored
00030 // NOTE: This structure is only used for computing addresses
00031 struct Virial_t {
00032   double sforce_dp[27][3];
00033   long long int sforce_fp[27][3];
00034   double virmat[9];
00035   // Energies start here ...
00036 };
00037 */
00038 
00039 // Local structure for the scalar_sum kernel's energy and virial reductions
00040 struct RecipVirial_t {
00041   double energy;
00042   double virial[6];
00043 };
00044 
00045 
00046 __forceinline__ __device__ double expfunc(const double x) {
00047   return exp(x);
00048 }
00049 
00050 __forceinline__ __device__ float expfunc(const float x) {
00051   return __expf(x);
00052 }
00053 
00054 //
00055 // Performs scalar sum on data(nfft1, nfft2, nfft3)
00056 // T = float or double
00057 // T2 = float2 or double2
00058 //
00059 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
00060 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
00061           const int size1, const int size2, const int size3,
00062           const T recip1x, const T recip1y, const T recip1z,
00063           const T recip2x, const T recip2y, const T recip2z,
00064           const T recip3x, const T recip3y, const T recip3z,
00065           const T* prefac1, const T* prefac2, const T* prefac3,
00066           const T pi_ewald, const T piv_inv,
00067           const int k2_00, const int k3_00,
00068           T2* data,
00069           double* __restrict__ energy_recip,
00070           double* __restrict__ virial) {
00071   // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
00072   extern __shared__ T sh_prefac[];
00073 
00074   // Create pointers to shared memory
00075   T* sh_prefac1 = (T *)&sh_prefac[0];
00076   T* sh_prefac2 = (T *)&sh_prefac[nfft1];
00077   T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
00078 
00079   // Calculate start position (k1, k2, k3) for each thread
00080   unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
00081   int k3 = tid/(size1*size2);
00082   tid -= k3*size1*size2;
00083   int k2 = tid/size1;
00084   int k1 = tid - k2*size1;
00085 
00086   // Starting position in data
00087   int pos = k1 + (k2 + k3*size2)*size1;
00088 
00089   // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
00090   k2 += k2_00;
00091   k3 += k3_00;
00092 
00093   // Calculate limits w.r.t. global coordinates
00094   const int lim2 = size2 + k2_00;
00095   const int lim3 = size3 + k3_00;
00096 
00097   // Calculate increments (k1_inc, k2_inc, k3_inc)
00098   int tot_inc = blockDim.x*gridDim.x;
00099   const int k3_inc = tot_inc/(size1*size2);
00100   tot_inc -= k3_inc*size1*size2;
00101   const int k2_inc = tot_inc/size1;
00102   const int k1_inc = tot_inc - k2_inc*size1;
00103 
00104   // Set data[0] = 0 for the global (0,0,0)
00105   if (k1 == 0 && k2 == 0 && k3 == 0) {
00106     T2 zero;
00107     zero.x = (T)0;
00108     zero.y = (T)0;
00109     data[0] = zero;
00110     // Increment position
00111     k1 += k1_inc;
00112     pos += k1_inc;
00113     if (k1 >= size1) {
00114       k1 -= size1;
00115       k2++;
00116     }
00117     k2 += k2_inc;
00118     pos += k2_inc*size1;
00119     if (k2 >= lim2) {
00120       k2 -= size2;
00121       k3++;
00122     }
00123     k3 += k3_inc;
00124     pos += k3_inc*size1*size2;
00125   }
00126 
00127   // Load prefac data into shared memory
00128   {
00129     int t = threadIdx.x;
00130     while (t < nfft1) {
00131       sh_prefac1[t] = prefac1[t];
00132       t += blockDim.x;
00133     }
00134     t = threadIdx.x;
00135     while (t < nfft2) {
00136       sh_prefac2[t] = prefac2[t];
00137       t += blockDim.x;
00138     }
00139     t = threadIdx.x;
00140     while (t < nfft3) {
00141       sh_prefac3[t] = prefac3[t];
00142       t += blockDim.x;
00143     }
00144   }
00145   BLOCK_SYNC;
00146 
00147   double energy = 0.0;
00148   double virial0 = 0.0;
00149   double virial1 = 0.0;
00150   double virial2 = 0.0;
00151   double virial3 = 0.0;
00152   double virial4 = 0.0;
00153   double virial5 = 0.0;
00154 
00155   // If nfft1 is odd, set nfft1_half to an impossible value so that
00156   // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )" 
00157   // is never satisfied
00158   const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
00159   const int nfft2_half = nfft2/2;
00160   const int nfft3_half = nfft3/2;
00161 
00162   while (k3 < lim3) {
00163 
00164     T2 q = data[pos];
00165 
00166     int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
00167     int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
00168 
00169     T m1, m2, m3;
00170     if (doOrtho) {
00171       m1 = recip1x*k1;
00172       m2 = recip2y*k2s;
00173       m3 = recip3z*k3s;
00174     } else {
00175       m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
00176       m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
00177       m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
00178     }
00179 
00180     T msq = m1*m1 + m2*m2 + m3*m3;
00181     T msq_inv = ((T)1)/msq;
00182 
00183     T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
00184     T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
00185     if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
00186     T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
00187     T C = xp3*msq_inv;
00188     T theta = theta3*C;
00189 
00190     if (calc_energy_virial) {
00191       T fac = q2*C;
00192       T vir = ((T)2)*(pi_ewald + msq_inv);
00193       energy += fac;
00194       virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
00195       virial1 += (double)(fac*vir*m1*m2);
00196       virial2 += (double)(fac*vir*m1*m3);
00197       virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
00198       virial4 += (double)(fac*vir*m2*m3);
00199       virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
00200     }
00201 
00202     q.x *= theta;
00203     q.y *= theta;
00204     data[pos] = q;
00205     
00206     // Increment position
00207     k1 += k1_inc;
00208     pos += k1_inc;
00209     if (k1 >= size1) {
00210       k1 -= size1;
00211       k2++;
00212     }
00213     k2 += k2_inc;
00214     pos += k2_inc*size1;
00215     if (k2 >= lim2) {
00216       k2 -= size2;
00217       k3++;
00218     }
00219     k3 += k3_inc;
00220     pos += k3_inc*size2*size1;
00221   }
00222 
00223   // Reduce energy and virial
00224   if (calc_energy_virial) {
00225     const int tid = threadIdx.x & (warpSize-1);
00226     const int base = (threadIdx.x/warpSize);
00227     volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
00228     // Reduce within warps
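    // NOTE: each double is shuffled as two 32-bit halves via
    // __double2hiint/__double2loint and reassembled with __hiloint2double,
    // presumably because the WARP_SHUFFLE wrapper operates on 32-bit words.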
00229     for (int d=warpSize/2;d >= 1;d /= 2) {
00230       energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
00231          WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
00232       virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
00233           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
00234       virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
00235           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
00236       virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
00237           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
00238       virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
00239           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
00240       virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
00241           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
00242       virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
00243           WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
00244     }
00245     // Reduce between warps
00246     // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
00247     BLOCK_SYNC;
00248     if (tid == 0) {
00249       sh_ev[base].energy = energy;
00250       sh_ev[base].virial[0] = virial0;
00251       sh_ev[base].virial[1] = virial1;
00252       sh_ev[base].virial[2] = virial2;
00253       sh_ev[base].virial[3] = virial3;
00254       sh_ev[base].virial[4] = virial4;
00255       sh_ev[base].virial[5] = virial5;
00256     }
00257     BLOCK_SYNC;
00258     if (base == 0) {
00259       energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
00260       virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
00261       virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
00262       virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
00263       virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
00264       virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
00265       virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
00266       for (int d=warpSize/2;d >= 1;d /= 2) {
00267   energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
00268            WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
00269   virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
00270             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
00271   virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
00272             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
00273   virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
00274             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
00275   virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
00276             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
00277   virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
00278             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
00279   virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
00280             WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
00281       }
00282     }
00283 
00284     if (threadIdx.x == 0) {
00285       atomicAdd(energy_recip, energy*0.5);
00286       virial0 *= -0.5;
00287       virial1 *= -0.5;
00288       virial2 *= -0.5;
00289       virial3 *= -0.5;
00290       virial4 *= -0.5;
00291       virial5 *= -0.5;
00292       atomicAdd(&virial[0], virial0);
00293       atomicAdd(&virial[1], virial1);
00294       atomicAdd(&virial[2], virial2);
00295       atomicAdd(&virial[3], virial3);
00296       atomicAdd(&virial[4], virial4);
00297       atomicAdd(&virial[5], virial5);
00298     }
00299 
00300   }
00301 
00302 }
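
// A sketch of the quantity accumulated above, assuming the standard smooth-PME
// conventions (with pi_ewald = pi^2/kappa^2 and piv_inv = 1/(pi*V), as set up
// by the scalar_sum() host wrapper near the end of this file):
//
//   E_recip = \frac{1}{2\pi V} \sum_{m \neq 0}
//             \frac{\exp(-\pi^2 |m|^2 / \kappa^2)}{|m|^2} B(m) |S(m)|^2
//
// where B(m) = prefac1[m1]*prefac2[m2]*prefac3[m3] is the B-spline (Euler
// spline) prefactor and S(m) is the structure factor stored in data[]. The
// extra factor 0.5 applied when k1 == 0 or k1 == nfft1/2 compensates for the
// half-complex storage produced by the real-to-complex FFT.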
00303 
00304 template <typename T>
00305 __forceinline__ __device__ void write_grid(const float val, const int ind,
00306              T* data) {
00307   atomicAdd(&data[ind], (T)val);
00308 }
00309 
00310 //
00311 // General version for any order
00312 //
00313 template <typename T, int order>
00314 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
00315 
00316   theta[order-1] = ((T)0);
00317   theta[1] = w;
00318   theta[0] = ((T)1) - w;
00319 
00320 #pragma unroll
00321   for (int k=3;k <= order-1;k++) {
00322     T div = ((T)1) / (T)(k-1);
00323     theta[k-1] = div*w*theta[k-2];
00324 #pragma unroll
00325     for (int j=1;j <= k-2;j++) {
00326       theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
00327     }
00328     theta[0] = div*(((T)1) - w)*theta[0];
00329   }
00330       
00331   //--- one more recursion
00332   T div = ((T)1) / (T)(order-1);
00333   theta[order-1] = div*w*theta[order-2];
00334 #pragma unroll
00335   for (int j=1;j <= order-2;j++) {
00336     theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
00337   }
00338     
00339   theta[0] = div*(((T)1) - w)*theta[0];
00340 }
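
// A minimal host-side sketch of the same recursion, given only as an
// illustrative reference for calc_one_theta() (calc_one_theta_host is not part
// of this file's API): the split-out "one more recursion" above is simply the
// final k == order iteration of this loop.
static inline void calc_one_theta_host(const double w, const int order, double* theta) {
  theta[order-1] = 0.0;
  theta[1] = w;
  theta[0] = 1.0 - w;
  for (int k=3;k <= order;k++) {
    double div = 1.0/(double)(k-1);
    theta[k-1] = div*w*theta[k-2];
    for (int j=1;j <= k-2;j++) {
      theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
    }
    theta[0] = div*(1.0 - w)*theta[0];
  }
}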
00341 
00342 //
00343 // Calculate theta and dtheta for general order bspline
00344 //
00345 template <typename T, typename T3, int order>
00346 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
00347 
00348   theta[order-1].x = ((T)0);
00349   theta[order-1].y = ((T)0);
00350   theta[order-1].z = ((T)0);
00351   theta[1].x = wx;
00352   theta[1].y = wy;
00353   theta[1].z = wz;
00354   theta[0].x = ((T)1) - wx;
00355   theta[0].y = ((T)1) - wy;
00356   theta[0].z = ((T)1) - wz;
00357 
00358 #pragma unroll
00359   for (int k=3;k <= order-1;k++) {
00360     T div = ((T)1) / (T)(k-1);
00361     theta[k-1].x = div*wx*theta[k-2].x;
00362     theta[k-1].y = div*wy*theta[k-2].y;
00363     theta[k-1].z = div*wz*theta[k-2].z;
00364 #pragma unroll
00365     for (int j=1;j <= k-2;j++) {
00366       theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
00367       theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
00368       theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
00369     }
00370     theta[0].x = div*(((T)1) - wx)*theta[0].x;
00371     theta[0].y = div*(((T)1) - wy)*theta[0].y;
00372     theta[0].z = div*(((T)1) - wz)*theta[0].z;
00373   }
00374 
00375   //--- perform standard b-spline differentiation
00376   dtheta[0].x = -theta[0].x;
00377   dtheta[0].y = -theta[0].y;
00378   dtheta[0].z = -theta[0].z;
00379 #pragma unroll
00380   for (int j=2;j <= order;j++) {
00381     dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
00382     dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
00383     dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
00384   }
00385       
00386   //--- one more recursion
00387   T div = ((T)1) / (T)(order-1);
00388   theta[order-1].x = div*wx*theta[order-2].x;
00389   theta[order-1].y = div*wy*theta[order-2].y;
00390   theta[order-1].z = div*wz*theta[order-2].z;
00391 #pragma unroll
00392   for (int j=1;j <= order-2;j++) {
00393     theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
00394     theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
00395     theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
00396   }
00397     
00398   theta[0].x = div*(((T)1) - wx)*theta[0].x;
00399   theta[0].y = div*(((T)1) - wy)*theta[0].y;
00400   theta[0].z = div*(((T)1) - wz)*theta[0].z;
00401 }
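
// Note on the differentiation above: it uses the standard B-spline identity
//   d/dw M_n(w) = M_{n-1}(w) - M_{n-1}(w-1),
// which is why dtheta is formed from the order-1 spline values before the
// final recursion step promotes theta from order-1 to order.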
00402 
00403 //
00404 // Spreads the charge on the grid. Calculates theta on the fly
00405 // blockDim.x                   = Number of atoms each block loads
00406 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
00407 //
00408 template <typename AT, int order, bool periodicY, bool periodicZ>
00409 __global__ void
00410 spread_charge_kernel(const float4 *xyzq, const int ncoord,
00411           const int nfftx, const int nffty, const int nfftz,
00412           const int xsize, const int ysize, const int zsize,
00413           const int xdim, const int y00, const int z00,
00414           AT* data) {
00415 
00416   // Shared memory use:
00417   // order = 4: 1920 bytes
00418   // order = 6: 2688 bytes
00419   // order = 8: 3456 bytes
00420   __shared__ int sh_ix[32];
00421   __shared__ int sh_iy[32];
00422   __shared__ int sh_iz[32];
00423   __shared__ float sh_thetax[order*32];
00424   __shared__ float sh_thetay[order*32];
00425   __shared__ float sh_thetaz[order*32];
00426 
00427   // Process atoms pos to pos_end-1 (blockDim.x)
00428   const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
00429   const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
00430 
00431   if (pos < pos_end && threadIdx.y == 0) {
00432 
00433     float4 xyzqi = xyzq[pos];
00434     float x = xyzqi.x;
00435     float y = xyzqi.y;
00436     float z = xyzqi.z;
00437     float q = xyzqi.w;
00438 
00439     float frx = ((float)nfftx)*x;
00440     float fry = ((float)nffty)*y;
00441     float frz = ((float)nfftz)*z;
00442 
00443     int frxi = (int)frx;
00444     int fryi = (int)fry;
00445     int frzi = (int)frz;
00446 
00447     float wx = frx - (float)frxi;
00448     float wy = fry - (float)fryi;
00449     float wz = frz - (float)frzi;
00450 
00451     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
00452     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
00453 
00454     sh_ix[threadIdx.x] = frxi;
00455     sh_iy[threadIdx.x] = fryi - y00;
00456     sh_iz[threadIdx.x] = frzi - z00;
00457 
00458     float theta[order];
00459 
00460     calc_one_theta<float, order>(wx, theta);
00461 #pragma unroll
00462     for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
00463 
00464     calc_one_theta<float, order>(wy, theta);
00465 #pragma unroll
00466     for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
00467 
00468     calc_one_theta<float, order>(wz, theta);
00469 #pragma unroll
00470     for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
00471 
00472   }
00473 
00474   BLOCK_SYNC;
00475 
00476   // Grid point offset: values of (x0, y0, z0) are in range 0..order-1
00477   // NOTE: Only tid=0...order*order*order-1 do any computation
00478   const int order3 = ((order*order*order-1)/warpSize + 1)*warpSize;
00479   const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;   // 0...order3-1
00480   const int x0 = tid % order;
00481   const int y0 = (tid / order) % order;
00482   const int z0 = tid / (order*order);
00483 
00484   // Loop over atoms pos..pos_end-1
00485   int iadd = blockDim.x*blockDim.y/order3;
00486   int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
00487   int iend = pos_end - blockIdx.x*blockDim.x;
00488   for (;i < iend;i += iadd) {
00489     int x = sh_ix[i] + x0;
00490     int y = sh_iy[i] + y0;
00491     int z = sh_iz[i] + z0;
00492       
00493     if (x >= nfftx) x -= nfftx;
00494 
00495     if (periodicY  && (y >= nffty)) y -= nffty;
00496     if (!periodicY && (y < 0 || y >= ysize)) continue;
00497 
00498     if (periodicZ  && (z >= nfftz)) z -= nfftz;
00499     if (!periodicZ && (z < 0 || z >= zsize)) continue;
00500       
00501     // Get position on the grid
00502     int ind = x + xdim*(y + ysize*(z));
00503     
00504     // Here we unroll the order x order x order loop (e.g. 6x6x6 = 216 points for order 6).
00505     // NOTE: order3 rounds the thread count up to a full warp multiple (7*32 = 224 for order 6)
00506     // Calculate interpolated charge value and store it to global memory
00507 
00508     if (tid < order*order*order)
00509       write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
00510   }
00511 
00512 }
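
// Launch-geometry sketch (the numbers follow from the spread_charge() host
// wrapper later in this file; repeated here only to make the indexing above
// concrete): blockDim.x = 32 atoms are loaded per block and blockDim.y is
// chosen so that blockDim.x*blockDim.y >= order3. For example:
//   order = 4: order3 = ((64-1)/32 + 1)*32  = 64,  nthread = (32, 4, 1)  -> 128/64  = 2 atoms per pass
//   order = 6: order3 = ((216-1)/32 + 1)*32 = 224, nthread = (32, 7, 1)  -> 224/224 = 1 atom per pass
//   order = 8: order3 = ((512-1)/32 + 1)*32 = 512, nthread = (32, 16, 1) -> 512/512 = 1 atom per pass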
00513 
00514 //-----------------------------------------------------------------------------------------
00515 // Generic version cannot be used; only the float and float3 specializations below store forces
00516 template <typename T> __forceinline__ __device__
00517 void gather_force_store(const float fx, const float fy, const float fz,
00518       const int stride, const int pos,
00519       T* force) {
00520 }
00521 
00522 // Template specialization for "float"
00523 template <> __forceinline__ __device__
00524 void gather_force_store<float>(const float fx, const float fy, const float fz, 
00525              const int stride, const int pos, 
00526              float* force) {
00527   // Store into non-strided float XYZ array
00528   force[pos]          = fx;
00529   force[pos+stride]   = fy;
00530   force[pos+stride*2] = fz;
00531 }
00532 
00533 // Template specialization for "float3"
00534 template <> __forceinline__ __device__
00535 void gather_force_store<float3>(const float fx, const float fy, const float fz, 
00536         const int stride, const int pos, 
00537         float3* force) {
00538   // Store into non-strided "float3" array
00539   force[pos].x = fx;
00540   force[pos].y = fy;
00541   force[pos].z = fz;
00542 }
00543 //-----------------------------------------------------------------------------------------
00544 
00545 // Per-atom data structure for the gather_force kernels
00546 template <typename T, int order>
00547 struct gather_t {
00548   int ix;
00549   int iy;
00550   int iz;
00551   T charge;
00552   T thetax[order];
00553   T thetay[order];
00554   T thetaz[order];
00555   T dthetax[order];
00556   T dthetay[order];
00557   T dthetaz[order];
00558   float f1;
00559   float f2;
00560   float f3;
00561 };
00562 
00563 //
00564 // Gathers forces from the grid
00565 // blockDim.x            = Number of atoms each block loads
00566 // blockDim.x*blockDim.y = Total number of threads per block
00567 //
00568 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
00569 __global__ void gather_force(const float4 *xyzq, const int ncoord,
00570               const int nfftx, const int nffty, const int nfftz,
00571               const int xsize, const int ysize, const int zsize,
00572               const int xdim, const int y00, const int z00,
00573               // const float recip1, const float recip2, const float recip3,
00574               const float* data,      // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
00575               const cudaTextureObject_t gridTexObj,
00576               const int stride,
00577               FT *force) {
00578 
00579   const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
00580 
00581   // Shared memory
00582   __shared__ gather_t<CT, order> shmem[32];
00583 
00584   const int pos = blockIdx.x*blockDim.x + threadIdx.x;
00585   const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
00586 
00587   // Load atom data into shared memory
00588   if (pos < pos_end && threadIdx.y == 0) {
00589 
00590     float4 xyzqi = xyzq[pos];
00591     float x = xyzqi.x;
00592     float y = xyzqi.y;
00593     float z = xyzqi.z;
00594     float q = xyzqi.w;
00595 
00596     float frx = ((float)nfftx)*x;
00597     float fry = ((float)nffty)*y;
00598     float frz = ((float)nfftz)*z;
00599 
00600     int frxi = (int)frx;
00601     int fryi = (int)fry;
00602     int frzi = (int)frz;
00603 
00604     float wx = frx - (float)frxi;
00605     float wy = fry - (float)fryi;
00606     float wz = frz - (float)frzi;
00607 
00608     if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
00609     if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
00610 
00611     shmem[threadIdx.x].ix = frxi;
00612     shmem[threadIdx.x].iy = fryi - y00;
00613     shmem[threadIdx.x].iz = frzi - z00;
00614     shmem[threadIdx.x].charge = q;
00615 
00616     float3 theta_tmp[order];
00617     float3 dtheta_tmp[order];
00618     calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
00619     
00620 #pragma unroll
00621     for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
00622 
00623 #pragma unroll
00624     for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
00625 
00626 #pragma unroll
00627     for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
00628 
00629 #pragma unroll
00630     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
00631 
00632 #pragma unroll
00633     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
00634 
00635 #pragma unroll
00636     for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
00637 
00638   }
00639   BLOCK_SYNC;
00640 
00641   // We divide the order x order x order cube into 8 sub-cubes.
00642   // Each sub-cube is handled by a single thread.
00643   // The size of the sub-cubes is:
00644   // order=4 : 2x2x2
00645   // order=6 : 3x3x3
00646   // order=8 : 4x4x4
00647   const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
00648   // Calculate the starting index on the sub-cube for this thread
00649   // tid = 0...63
00650   const int t = (tid % 8);         // sub-cube index (0...7)
00651   // t = (tx0 + ty0*2 + tz0*4)/nsc
00652   // (tx0, ty0, tz0) gives the starting index of the nsc x nsc x nsc sub-cube
00653   const int tz0 = (t / 4)*nsc;
00654   const int ty0 = ((t / 2) % 2)*nsc;
00655   const int tx0 = (t % 2)*nsc;
00656 
00657   //
00658   // Calculate forces for 32 atoms. We have 32*2 = 64 threads
00659   // Loop is iterated 4 times:
00660   //                         (iterations)
00661   // Threads 0...7   = atoms 0, 8,  16, 24
00662   // Threads 8...15  = atoms 1, 9,  17, 25
00663   // Threads 16...23 = atoms 2, 10, 18, 26
00664   //                ...
00665   // Threads 56...63 = atoms 7, 15, 23, 31
00666   //
00667 
00668   int base = tid/8;
00669   const int base_end = pos_end - blockIdx.x*blockDim.x;
00670 
00671   // Make sure to mask out any threads that are not running the loop.
00672   // This will happen if the number of atoms is not a multiple of 32.
00673   int warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
00674 
00675   while (base < base_end) {
00676 
00677     float f1 = 0.0f;
00678     float f2 = 0.0f;
00679     float f3 = 0.0f;
00680     int ix0 = shmem[base].ix;
00681     int iy0 = shmem[base].iy;
00682     int iz0 = shmem[base].iz;
00683 
00684     // Each thread calculates a nsc x nsc x nsc sub-cube
00685 #pragma unroll
00686     for (int i=0;i < nsc*nsc*nsc;i++) {
00687       int tz = tz0 + (i/(nsc*nsc));
00688       int ty = ty0 + ((i/nsc) % nsc);
00689       int tx = tx0 + (i % nsc);
00690 
00691       int ix = ix0 + tx;
00692       int iy = iy0 + ty;
00693       int iz = iz0 + tz;
00694       if (ix >= nfftx) ix -= nfftx;
00695 
00696       if (periodicY  && (iy >= nffty)) iy -= nffty;
00697       if (!periodicY && (iy < 0 || iy >= ysize)) continue;
00698 
00699       if (periodicZ  && (iz >= nfftz)) iz -= nfftz;
00700       if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
00701 
00702       int ind = ix + (iy + iz*ysize)*xdim;
00703 
00704 #if __CUDA_ARCH__ >= 350
00705       float q0 = __ldg(&data[ind]);
00706 #else
00707       float q0 = tex1Dfetch<float>(gridTexObj, ind);
00708 #endif
00709       float thx0 = shmem[base].thetax[tx];
00710       float thy0 = shmem[base].thetay[ty];
00711       float thz0 = shmem[base].thetaz[tz];
00712       float dthx0 = shmem[base].dthetax[tx];
00713       float dthy0 = shmem[base].dthetay[ty];
00714       float dthz0 = shmem[base].dthetaz[tz];
00715       f1 += dthx0 * thy0 * thz0 * q0;
00716       f2 += thx0 * dthy0 * thz0 * q0;
00717       f3 += thx0 * thy0 * dthz0 * q0;
00718     }
00719 
00720     //-------------------------
00721 
00722     // Reduce
00723     const int i = threadIdx.x & 7;
00724 
00725     f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
00726     f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
00727     f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
00728 
00729     f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
00730     f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
00731     f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
00732 
00733     f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
00734     f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
00735     f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
00736 
00737     if (i == 0) {
00738       shmem[base].f1 = f1;
00739       shmem[base].f2 = f2;
00740       shmem[base].f3 = f3;
00741     }
00742 
00743     base += 8;
00744     warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
00745   }
00746 
00747   // Write forces
00748   BLOCK_SYNC;
00749   if (pos < pos_end && threadIdx.y == 0) {
00750     float f1 = shmem[threadIdx.x].f1;
00751     float f2 = shmem[threadIdx.x].f2;
00752     float f3 = shmem[threadIdx.x].f3;
00753     float q = -shmem[threadIdx.x].charge; //*ccelec_float;
00754     // float fx = q*recip1*f1*nfftx;
00755     // float fy = q*recip2*f2*nffty;
00756     // float fz = q*recip3*f3*nfftz;
00757     float fx = q*f1*nfftx;
00758     float fy = q*f2*nffty;
00759     float fz = q*f3*nfftz;
00760     gather_force_store<FT>(fx, fy, fz, stride, pos, force);
00761   }
00762 
00763 }
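
// Force sketch (the usual smooth-PME gradient, written out for reference): for
// atom i with charge q_i the fractional-space sums accumulated above are
//   f1 = sum_grid dtheta_x * theta_y * theta_z * phi   (and cyclically for f2, f3)
// and the stored force components are F_x = -q_i * nfftx * f1, etc. The
// conversion from fractional to Cartesian coordinates via the reciprocal
// vectors (the commented-out recip1/2/3 factors) is assumed to be applied by
// the caller.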
00764 
00765 const int TILEDIM = 32;
00766 const int TILEROWS = 8;
00767 
00768 template <typename T>
00769 __device__ __forceinline__
00770 void transpose_xyz_yzx_device(
00771   const int x_in, const int y_in, const int z_in,
00772   const int x_out, const int y_out,
00773   const int nx, const int ny, const int nz,
00774   const int xsize_in, const int ysize_in,
00775   const int ysize_out, const int zsize_out,
00776   const T* data_in, T* data_out) {
00777 
00778   // Shared memory
00779   __shared__ T tile[TILEDIM][TILEDIM+1];
00780 
00781   // Read (x,y) data_in into tile (shared memory)
00782   for (int j=0;j < TILEDIM;j += TILEROWS)
00783     if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
00784       tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
00785 
00786   BLOCK_SYNC;
00787 
00788   // Write (y,x) tile into data_out
00789   const int z_out = z_in;
00790   for (int j=0;j < TILEDIM;j += TILEROWS)
00791     if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
00792       data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
00793 }
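
// Index-mapping sketch for the transpose above: an element read from
//   data_in [x + (y + z*ysize_in )*xsize_in ]
// is written to
//   data_out[y + (z + x*zsize_out)*ysize_out]
// i.e. data_out(y, z, x) = data_in(x, y, z). The TILEDIM x (TILEDIM+1) shared
// tile (padded by one column to avoid shared-memory bank conflicts) keeps both
// the global reads and writes coalesced along threadIdx.x.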
00794 
00795 //
00796 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
00797 //
00798 template <typename T>
00799 __global__ void transpose_xyz_yzx_kernel(
00800   const int nx, const int ny, const int nz,
00801   const int xsize_in, const int ysize_in,
00802   const int ysize_out, const int zsize_out,
00803   const T* data_in, T* data_out) {
00804 
00805   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00806   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
00807   int z_in = blockIdx.z           + threadIdx.z;
00808 
00809   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00810   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
00811 
00812   transpose_xyz_yzx_device<T>(
00813     x_in, y_in, z_in,
00814     x_out, y_out,
00815     nx, ny, nz,
00816     xsize_in, ysize_in,
00817     ysize_out, zsize_out,
00818     data_in, data_out);
00819 
00820 /*
00821   // Shared memory
00822   __shared__ T tile[TILEDIM][TILEDIM+1];
00823 
00824   int x = blockIdx.x * TILEDIM + threadIdx.x;
00825   int y = blockIdx.y * TILEDIM + threadIdx.y;
00826   int z = blockIdx.z           + threadIdx.z;
00827 
00828   // Read (x,y) data_in into tile (shared memory)
00829   for (int j=0;j < TILEDIM;j += TILEROWS)
00830     if ((x < nx) && (y + j < ny) && (z < nz))
00831       tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
00832 
00833   BLOCK_SYNC;
00834 
00835   // Write (y,x) tile into data_out
00836   x = blockIdx.x * TILEDIM + threadIdx.y;
00837   y = blockIdx.y * TILEDIM + threadIdx.x;
00838   for (int j=0;j < TILEDIM;j += TILEROWS)
00839     if ((x + j < nx) && (y < ny) && (z < nz))
00840       data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
00841 */
00842 }
00843 
00844 //
00845 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
00846 // Batch index bi is encoded in blockIdx.z, where 
00847 // blockIdx.z = 0...nz-1 are for batch 1
00848 // blockIdx.z = nz...2*nz-1 are for batch 2
00849 // ...
00850 // gridDim.z = nz*numBatches
00851 //
00852 template <typename T>
00853 __global__ void batchTranspose_xyz_yzx_kernel(
00854   const TransposeBatch<T>* batches,
00855   const int ny, const int nz, 
00856   const int xsize_in, const int ysize_in) {
00857 
00858   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00859   int y_in = blockIdx.y * TILEDIM + threadIdx.y;
00860   int z_in = (blockIdx.z % nz)    + threadIdx.z;
00861 
00862   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00863   int y_out = blockIdx.y * TILEDIM + threadIdx.x;
00864 
00865   int bi = blockIdx.z/nz;
00866 
00867   TransposeBatch<T> batch = batches[bi];
00868   int nx        = batch.nx;
00869   int ysize_out = batch.ysize_out;
00870   int zsize_out = batch.zsize_out;
00871   T* data_in    = batch.data_in;
00872   T* data_out   = batch.data_out;
00873 
00874   transpose_xyz_yzx_device<T>(
00875     x_in, y_in, z_in,
00876     x_out, y_out,
00877     nx, ny, nz,
00878     xsize_in, ysize_in,
00879     ysize_out, zsize_out,
00880     data_in, data_out);
00881 
00882 }
00883 
00884 /*
00885 //
00886 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
00887 //
00888 template <typename T>
00889 __forceinline__ __device__
00890 void transpose_xyz_yzx_dev(
00891   const int blockz,
00892   const int nx, const int ny, const int nz,
00893   const int xsize_in, const int ysize_in,
00894   const int xsize_out, const int ysize_out,
00895   const T* data_in, T* data_out) {
00896 
00897   // Shared memory
00898   __shared__ T tile[TILEDIM][TILEDIM+1];
00899 
00900   int x = blockIdx.x * TILEDIM + threadIdx.x;
00901   int y = blockIdx.y * TILEDIM + threadIdx.y;
00902   // int z = blockIdx.z           + threadIdx.z;
00903   int z = blockz               + threadIdx.z;
00904 
00905   // Read (x,y) data_in into tile (shared memory)
00906   for (int j=0;j < TILEDIM;j += TILEROWS)
00907     if ((x < nx) && (y + j < ny) && (z < nz))
00908       tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
00909 
00910   BLOCK_SYNC;
00911 
00912   // Write (y,x) tile into data_out
00913   x = blockIdx.x * TILEDIM + threadIdx.y;
00914   y = blockIdx.y * TILEDIM + threadIdx.x;
00915   for (int j=0;j < TILEDIM;j += TILEROWS)
00916     if ((x + j < nx) && (y < ny) && (z < nz))
00917       data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
00918 
00919 }
00920 
00921 //
00922 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
00923 // (nx, ny, nz)                     = size of the transposed volume
00924 // (xsize_in, ysize_in, zsize_in)   = size of the input data
00925 // into nblock memory blocks
00926 //
00927 template <typename T>
00928 __global__ void transpose_xyz_yzx_kernel(
00929   const int nblock,
00930   const int nx, const int ny, const int nz,
00931   const int xsize_in, const int ysize_in,
00932   const int xsize_out, const int ysize_out,
00933   const T* data_in, T* data_out) {
00934 
00935   const int iblock = blockIdx.z/nz;
00936 
00937   if (iblock < nblock) {
00938     transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
00939       xsize_in, ysize_in, xsize_out, ysize_out,
00940       data_in, data_out);
00941   }
00942 
00943 }
00944 */
00945 
00946 template <typename T>
00947 __device__ __forceinline__
00948 void transpose_xyz_zxy_device(
00949   const int x_in, const int y_in, const int z_in,
00950   const int x_out, const int z_out,
00951   const int nx, const int ny, const int nz,
00952   const int xsize_in, const int ysize_in,
00953   const int zsize_out, const int xsize_out,
00954   const T* data_in, T* data_out) {
00955 
00956   // Shared memory
00957   __shared__ T tile[TILEDIM][TILEDIM+1];
00958 
00959   // Read (x,z) data_in into tile (shared memory)
00960   for (int k=0;k < TILEDIM;k += TILEROWS)
00961     if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
00962       tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
00963 
00964   BLOCK_SYNC;
00965 
00966   // Write (z,x) tile into data_out
00967   const int y_out = y_in;
00968   for (int k=0;k < TILEDIM;k += TILEROWS)
00969     if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
00970       data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
00971 }
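
// Index-mapping sketch: data_out(z, x, y) = data_in(x, y, z), i.e. an element at
//   data_in [x + (y + z*ysize_in )*xsize_in ]
// ends up at
//   data_out[z + (x + y*xsize_out)*zsize_out]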
00972 
00973 //
00974 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
00975 //
00976 template <typename T>
00977 __global__ void transpose_xyz_zxy_kernel(
00978   const int nx, const int ny, const int nz,
00979   const int xsize_in, const int ysize_in,
00980   const int zsize_out, const int xsize_out,
00981   const T* data_in, T* data_out) {
00982 
00983   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
00984   int y_in = blockIdx.z           + threadIdx.z;
00985   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
00986 
00987   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
00988   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
00989 
00990   transpose_xyz_zxy_device<T>(
00991     x_in, y_in, z_in, x_out, z_out,
00992     nx, ny, nz,
00993     xsize_in, ysize_in,
00994     zsize_out, xsize_out,
00995     data_in, data_out);
00996 
00997 }
00998 
00999 //
01000 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
01001 // Batch index bi is encoded in blockIdx.z, where 
01002 // blockIdx.z = 0...ny-1 are for batch 1
01003 // blockIdx.z = ny...2*ny-1 are for batch 2
01004 // ...
01005 // gridDim.z = ny*numBatches
01006 //
01007 template <typename T>
01008 __global__ void batchTranspose_xyz_zxy_kernel(
01009   const TransposeBatch<T>* batches,
01010   const int ny, const int nz, 
01011   const int xsize_in, const int ysize_in) {
01012 
01013   int x_in = blockIdx.x * TILEDIM + threadIdx.x;
01014   int y_in = (blockIdx.z % ny)    + threadIdx.z;
01015   int z_in = blockIdx.y * TILEDIM + threadIdx.y;
01016 
01017   int x_out = blockIdx.x * TILEDIM + threadIdx.y;
01018   int z_out = blockIdx.y * TILEDIM + threadIdx.x;
01019 
01020   int bi = blockIdx.z/ny;
01021 
01022   TransposeBatch<T> batch = batches[bi];
01023   int nx        = batch.nx;
01024   int zsize_out = batch.zsize_out;
01025   int xsize_out = batch.xsize_out;
01026   T* data_in    = batch.data_in;
01027   T* data_out   = batch.data_out;
01028 
01029   transpose_xyz_zxy_device<T>(
01030     x_in, y_in, z_in, x_out, z_out,
01031     nx, ny, nz,
01032     xsize_in, ysize_in,
01033     zsize_out, xsize_out,
01034     data_in, data_out);
01035 
01036 }
01037 
01038 //#######################################################################################
01039 //#######################################################################################
01040 //#######################################################################################
01041 
01042 void spread_charge(const float4 *atoms, const int numAtoms,
01043   const int nfftx, const int nffty, const int nfftz,
01044   const int xsize, const int ysize, const int zsize,
01045   const int xdim, const int y00, const int z00, 
01046   const bool periodicY, const bool periodicZ,
01047   float* data, const int order, cudaStream_t stream) {
01048 
01049   dim3 nthread, nblock;
01050 
01051   switch(order) {
01052   case 4:
01053     nthread.x = 32;
01054     nthread.y = 4;
01055     nthread.z = 1;
01056     nblock.x = (numAtoms - 1)/nthread.x + 1;
01057     nblock.y = 1;
01058     nblock.z = 1;
01059     if (periodicY && periodicZ)
01060       spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
01061         (atoms, numAtoms,
01062          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01063     else if (periodicY)
01064       spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
01065         (atoms, numAtoms,
01066          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01067     else if (periodicZ)
01068       spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
01069         (atoms, numAtoms,
01070          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01071     else
01072       spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
01073         (atoms, numAtoms,
01074          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01075     break;
01076 
01077   case 6:
01078     nthread.x = 32;
01079     nthread.y = 7;
01080     nthread.z = 1;
01081     nblock.x = (numAtoms - 1)/nthread.x + 1;
01082     nblock.y = 1;
01083     nblock.z = 1;
01084     if (periodicY && periodicZ)
01085       spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
01086         (atoms, numAtoms,
01087          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01088     else if (periodicY)
01089       spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
01090         (atoms, numAtoms,
01091          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01092     else if (periodicZ)
01093       spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
01094         (atoms, numAtoms,
01095          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01096     else
01097       spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
01098         (atoms, numAtoms,
01099          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01100     break;
01101 
01102   case 8:
01103     nthread.x = 32;
01104     nthread.y = 16;
01105     nthread.z = 1;
01106     nblock.x = (numAtoms - 1)/nthread.x + 1;
01107     nblock.y = 1;
01108     nblock.z = 1;
01109     if (periodicY && periodicZ)
01110       spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
01111         (atoms, numAtoms,
01112          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01113     else if (periodicY)
01114       spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
01115         (atoms, numAtoms,
01116          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01117     else if (periodicZ)
01118       spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
01119         (atoms, numAtoms,
01120          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01121     else
01122       spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
01123         (atoms, numAtoms,
01124          nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
01125     break;
01126 
01127   default:
01128     char str[128];
01129     sprintf(str, "spread_charge, order %d not implemented",order);
01130     cudaNAMD_bug(str);
01131   }
01132   cudaCheck(cudaGetLastError());
01133 
01134 }
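
// Illustrative call (the grid dimensions and the d_atoms/d_grid device pointers
// below are made-up example values, not taken from this file): spread an
// order-4 charge grid on a fully periodic 64^3 FFT mesh held in one pencil:
//
//   spread_charge(d_atoms, numAtoms,
//                 64, 64, 64,          // nfftx, nffty, nfftz
//                 64, 64, 64,          // xsize, ysize, zsize
//                 64, 0, 0,            // xdim, y00, z00
//                 true, true,          // periodicY, periodicZ
//                 d_grid, 4, stream);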
01135 
01136 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
01137   const int size1, const int size2, const int size3, const double kappa,
01138   const float recip1x, const float recip1y, const float recip1z,
01139   const float recip2x, const float recip2y, const float recip2z,
01140   const float recip3x, const float recip3y, const float recip3z,
01141   const double volume,
01142   const float* prefac1, const float* prefac2, const float* prefac3,
01143   const int k2_00, const int k3_00,
01144   const bool doEnergyVirial, double* energy, double* virial, float2* data,
01145   cudaStream_t stream) {
01146 
01147   int nthread = 1024;
01148   int nblock = 64;
01149 
01150   int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
01151   if (doEnergyVirial) {
01152     const int warpSize = 32;      
01153     shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t)));
01154   }
01155 
01156   float piv_inv = (float)(1.0/(M_PI*volume));
01157   float fac = (float)(M_PI*M_PI/(kappa*kappa));
01158 
01159   if (doEnergyVirial) {
01160     if (orderXYZ) {
01161       scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
01162       (nfft1, nfft2, nfft3, size1, size2, size3,
01163         recip1x, recip1y, recip1z,
01164         recip2x, recip2y, recip2z,
01165         recip3x, recip3y, recip3z,
01166         prefac1, prefac2, prefac3,
01167         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01168     } else {
01169       scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
01170       (nfft1, nfft2, nfft3, size1, size2, size3,
01171         recip1x, recip1y, recip1z,
01172         recip2x, recip2y, recip2z,
01173         recip3x, recip3y, recip3z,
01174         prefac1, prefac2, prefac3,
01175         fac, piv_inv, k2_00, k3_00, data, energy, virial);
01176     }
01177   } else {
01178     if (orderXYZ) {
01179       scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
01180       (nfft1, nfft2, nfft3, size1, size2, size3,
01181         recip1x, recip1y, recip1z,
01182         recip2x, recip2y, recip2z,
01183         recip3x, recip3y, recip3z,
01184         prefac1, prefac2, prefac3,
01185         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01186     } else {
01187       scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
01188       (nfft1, nfft2, nfft3, size1, size2, size3,
01189         recip1x, recip1y, recip1z,
01190         recip2x, recip2y, recip2z,
01191         recip3x, recip3y, recip3z,
01192         prefac1, prefac2, prefac3,
01193         fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
01194     }
01195   }
01196   cudaCheck(cudaGetLastError());
01197 
01198 }
01199 
01200 void gather_force(const float4 *atoms, const int numAtoms,
01201   // const float recip11, const float recip22, const float recip33,
01202   const int nfftx, const int nffty, const int nfftz,
01203   const int xsize, const int ysize, const int zsize,
01204   const int xdim, const int y00, const int z00, 
01205   const bool periodicY, const bool periodicZ,
01206   const float* data, const int order, float3* force, 
01207   const cudaTextureObject_t gridTexObj,
01208   cudaStream_t stream) {
01209 
01210   dim3 nthread(32, 2, 1);
01211   dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
01212   // dim3 nblock(npatch, 1, 1);
01213 
01214   switch(order) {
01215     case 4:
01216     if (periodicY && periodicZ)
01217       gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
01218       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01219         // recip11, recip22, recip33,
01220         data,
01221         gridTexObj,
01222         1, force);
01223     else if (periodicY)
01224       gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
01225       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01226         // recip11, recip22, recip33,
01227         data,
01228         gridTexObj,
01229         1, force);
01230     else if (periodicZ)
01231       gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
01232       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01233         // recip11, recip22, recip33,
01234         data,
01235         gridTexObj,
01236         1, force);
01237     else
01238       gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
01239       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01240         // recip11, recip22, recip33,
01241         data,
01242         gridTexObj,
01243         1, force);
01244     break;
01245 
01246     case 6:
01247     if (periodicY && periodicZ)
01248       gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
01249       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01250         // recip11, recip22, recip33,
01251         data,
01252         gridTexObj,
01253         1, force);
01254     else if (periodicY)
01255       gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
01256       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01257         // recip11, recip22, recip33,
01258         data,
01259         gridTexObj,
01260         1, force);
01261     else if (periodicZ)
01262       gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
01263       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01264         // recip11, recip22, recip33,
01265         data,
01266         gridTexObj,
01267         1, force);
01268     else
01269       gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
01270       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01271         // recip11, recip22, recip33,
01272         data,
01273         gridTexObj,
01274         1, force);
01275     break;
01276  
01277     case 8:
01278     if (periodicY && periodicZ)
01279       gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
01280       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01281         // recip11, recip22, recip33,
01282         data,
01283         gridTexObj,
01284         1, force);
01285     else if (periodicY)
01286       gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
01287       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01288         // recip11, recip22, recip33,
01289         data,
01290         gridTexObj,
01291         1, force);
01292     else if (periodicZ)
01293       gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
01294       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01295         // recip11, recip22, recip33,
01296         data,
01297         gridTexObj,
01298         1, force);
01299     else
01300       gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
01301       (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
01302         // recip11, recip22, recip33,
01303         data,
01304         gridTexObj,
01305         1, force);
01306     break;
01307 
01308     default:
01309     char str[128];
01310     sprintf(str, "gather_force, order %d not implemented",order);
01311     cudaNAMD_bug(str);
01312   }
01313   cudaCheck(cudaGetLastError());
01314 
01315 }
01316 
01317 //
01318 // Transpose
01319 //
01320 void transpose_xyz_yzx(
01321   const int nx, const int ny, const int nz,
01322   const int xsize_in, const int ysize_in,
01323   const int ysize_out, const int zsize_out,
01324   const float2* data_in, float2* data_out, cudaStream_t stream) {
01325 
01326   dim3 numthread(TILEDIM, TILEROWS, 1);
01327   dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
01328 
01329   transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01330   (nx, ny, nz, xsize_in, ysize_in,
01331     ysize_out, zsize_out,
01332     data_in, data_out);
01333 
01334   cudaCheck(cudaGetLastError());
01335 }
01336 
01337 //
01338 // Batched transpose
01339 //
01340 void batchTranspose_xyz_yzx(
01341   const int numBatches, TransposeBatch<float2>* batches, 
01342   const int max_nx, const int ny, const int nz,
01343   const int xsize_in, const int ysize_in, cudaStream_t stream) {
01344 
01345   dim3 numthread(TILEDIM, TILEROWS, 1);
01346   dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
01347 
01348   batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
01349   (batches, ny, nz, xsize_in, ysize_in);
01350 
01351   cudaCheck(cudaGetLastError());
01352 }
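
// Sketch of filling one batch entry before calling batchTranspose_xyz_yzx()
// (this assumes TransposeBatch<float2> is a plain struct exposing the fields
// nx, ysize_out, zsize_out, data_in and data_out read by the kernel above;
// d_in and d_out are placeholder device pointers):
//
//   TransposeBatch<float2> b;
//   b.nx        = nx;
//   b.ysize_out = ysize_out;
//   b.zsize_out = zsize_out;
//   b.data_in   = d_in;
//   b.data_out  = d_out;
//   // ...copy an array of such entries to device memory, then:
//   batchTranspose_xyz_yzx(numBatches, d_batches, max_nx, ny, nz,
//                          xsize_in, ysize_in, stream);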
01353 
01354 //
01355 // Transpose
01356 //
01357 void transpose_xyz_zxy(
01358   const int nx, const int ny, const int nz,
01359   const int xsize_in, const int ysize_in,
01360   const int zsize_out, const int xsize_out,
01361   const float2* data_in, float2* data_out, cudaStream_t stream) {
01362 
01363   dim3 numthread(TILEDIM, TILEROWS, 1);
01364   dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
01365 
01366   transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01367   (nx, ny, nz, xsize_in, ysize_in,
01368     zsize_out, xsize_out,
01369     data_in, data_out);
01370 
01371   cudaCheck(cudaGetLastError());
01372 }
01373 
01374 //
01375 // Batched transpose
01376 //
01377 void batchTranspose_xyz_zxy(
01378   const int numBatches, TransposeBatch<float2>* batches, 
01379   const int max_nx, const int ny, const int nz,
01380   const int xsize_in, const int ysize_in,
01381   cudaStream_t stream) {
01382 
01383   dim3 numthread(TILEDIM, TILEROWS, 1);
01384   dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
01385 
01386   batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
01387   (batches, ny, nz, xsize_in, ysize_in);
01388 
01389   cudaCheck(cudaGetLastError());
01390 }
01391 
01392 #endif // NAMD_CUDA
