ComputePmeCUDAKernel.cu File Reference

#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"
#include <cuda.h>

Go to the source code of this file.

Defines

#define warps_per_block   8
#define atoms_per_warp   4
#define atoms_per_block   (atoms_per_warp * warps_per_block)
#define BSPLINE_DEFS
#define WARP   (threadIdx.x>>5)
#define THREAD   (threadIdx.x&31)
#define FX   bspline_factors[threadIdx.x>>5][0]
#define FY   bspline_factors[threadIdx.x>>5][1]
#define FZ   bspline_factors[threadIdx.x>>5][2]
#define DFX   bspline_dfactors[threadIdx.x>>5][0]
#define DFY   bspline_dfactors[threadIdx.x>>5][1]
#define DFZ   bspline_dfactors[threadIdx.x>>5][2]
#define FX2   bspline_2factors[threadIdx.x>>5][0]
#define FY2   bspline_2factors[threadIdx.x>>5][1]
#define FZ2   bspline_2factors[threadIdx.x>>5][2]
#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]
#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]
#define CALL(ORDER)
#define CALL(ORDER)
#define CALL(ORDER)

Functions

void NAMD_die (const char *)
void cuda_init_bspline_coeffs (float **c, float **dc, int order)
template<int order>
__global__ void cuda_pme_charges_batched_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float **__restrict__ a_data_ptr, int *n_atoms_ptr, int *K1_ptr, int *K2_ptr, int *K3_ptr)
template<int order>
__global__ void cuda_pme_charges_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float *__restrict__ a_data, int n_atoms, int K1, int K2, int K3)
template<int order>
__global__ void cuda_pme_forces_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, float *const *__restrict__ afn_g, int K1, int K2, int K3)

Variables

static const float order_4_coeffs [4][4]
static const float order_6_coeffs [6][6]
static const float order_8_coeffs [8][8]


Define Documentation

#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]

Definition at line 119 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]

Definition at line 120 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]

Definition at line 121 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]

Definition at line 118 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]

Definition at line 114 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define atoms_per_block   (atoms_per_warp * warps_per_block)

Definition at line 22 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define atoms_per_warp   4

Definition at line 21 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]

Definition at line 115 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]

Definition at line 116 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]

Definition at line 117 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define BSPLINE_DEFS

Value:

__shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  BLOCK_SYNC;

Definition at line 86 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define CALL(ORDER)

Value:

if ( order == ORDER ) \
                      cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
                      cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)

#define CALL(ORDER)

Value:

if ( order == ORDER ) \
                      cuda_pme_charges_batched_dev<ORDER><<<gridSize,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)

#define CALL(ORDER)

Value:

if ( order == ORDER ) \
                      cuda_pme_charges_dev<ORDER><<<nblocks,32*warps_per_block,0,stream>>>( \
                        coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)
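
The CALL(ORDER) macro is redefined before each host-side launch site so that one dispatch pattern can instantiate the kernel template for every supported interpolation order. Below is a minimal sketch of that dispatch for the charge-spreading variant, assuming the orders accepted by cuda_init_bspline_coeffs (4, 6, 8); the wrapper function and its parameter list are illustrative, not the exact NAMD call site.

  // Hypothetical wrapper showing the dispatch pattern; every identifier the
  // macro expands to (order, nblocks, stream, coeffs, q_arr, ...) must be in
  // scope under these exact names.
  void pme_charges_dispatch(int order, int nblocks, cudaStream_t stream,
                            const float *coeffs, float * const *q_arr,
                            int *f_arr, int *fz_arr, float *a_data,
                            int n_atoms, int K1, int K2, int K3) {
    CALL(4);
    else CALL(6);
    else CALL(8);
    else NAMD_die("unsupported PMEInterpOrder");
  }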

#define DFX   bspline_dfactors[threadIdx.x>>5][0]

Definition at line 106 of file ComputePmeCUDAKernel.cu.

#define DFY   bspline_dfactors[threadIdx.x>>5][1]

Definition at line 107 of file ComputePmeCUDAKernel.cu.

#define DFZ   bspline_dfactors[threadIdx.x>>5][2]

Definition at line 108 of file ComputePmeCUDAKernel.cu.

#define FX   bspline_factors[threadIdx.x>>5][0]

Definition at line 103 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FX2   bspline_2factors[threadIdx.x>>5][0]

Definition at line 110 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FY   bspline_factors[threadIdx.x>>5][1]

Definition at line 104 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FY2   bspline_2factors[threadIdx.x>>5][1]

Definition at line 111 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FZ   bspline_factors[threadIdx.x>>5][2]

Definition at line 105 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FZ2   bspline_2factors[threadIdx.x>>5][2]

Definition at line 112 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define THREAD   (threadIdx.x&31)

Definition at line 102 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define WARP   (threadIdx.x>>5)

Definition at line 101 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().

#define warps_per_block   8

Definition at line 20 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().


Function Documentation

void cuda_init_bspline_coeffs(float **c, float **dc, int order)

Definition at line 43 of file ComputePmeCUDAKernel.cu.

References j, NAMD_die(), order_4_coeffs, order_6_coeffs, and order_8_coeffs.

Referenced by ComputePmeMgr::initialize_computes().

00043                                                                 {
00044   float *coeffs = new float[order*order];
00045   float *dcoeffs = new float[order*order];
00046   double divisor;
00047   static const float *scoeffs;
00048   switch ( order ) {
00049   case 4:
00050     scoeffs = &order_4_coeffs[0][0];
00051     divisor = 6;
00052     break;
00053   case 6:
00054     scoeffs = &order_6_coeffs[0][0];
00055     divisor = 120;
00056     break;
00057   case 8:
00058     scoeffs = &order_8_coeffs[0][0];
00059     divisor = 5040;
00060     break;
00061   default:
00062     NAMD_die("unsupported PMEInterpOrder");
00063   }
00064   double sum = 0;
00065   for ( int i=0, p=order-1; i<order; ++i,--p ) {
00066     for ( int j=0; j<order; ++j ) {
00067       double c = scoeffs[i*order+(order-1-j)];  // reverse order
00068       sum += c;
00069       c /= divisor;
00070       coeffs[i*order+j] = c;
00071       dcoeffs[i*order+j] = (double)p * c;
00072       // printf("%d %d %f %f\n", i, j, c, (double)p*c);
00073     }
00074   }
00075   // printf("checksum: %f %f\n", sum, divisor);
00076   if ( sum != divisor )
00077     NAMD_die("cuda_init_bspline_coeffs static data checksum error");
00078   cudaMalloc((void**) c, order*order*sizeof(float));
00079   cudaMalloc((void**) dc, order*order*sizeof(float));
00080   cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
00081   cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
00082   delete [] coeffs;
00083   delete [] dcoeffs;
00084 }
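
A minimal host-side usage sketch with illustrative variable names; in NAMD the call is made from ComputePmeMgr::initialize_computes(). Both output pointers are set by cudaMalloc inside the function, so the caller owns the device memory.

  // Allocate and upload the order*order coefficient and derivative tables.
  float *d_bspline_coeffs = 0, *d_bspline_dcoeffs = 0;
  cuda_init_bspline_coeffs(&d_bspline_coeffs, &d_bspline_dcoeffs, 4);  // order 4, 6, or 8
  // ... use the device tables as the coeffs arguments of the kernels above ...
  cudaFree(d_bspline_coeffs);
  cudaFree(d_bspline_dcoeffs);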

template<int order>
__global__ void cuda_pme_charges_batched_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float **__restrict__ a_data_ptr, int *n_atoms_ptr, int *K1_ptr, int *K2_ptr, int *K3_ptr)

Definition at line 126 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, f, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, and warps_per_block.

00131   {
00132 
00133   int patchIndex = blockIdx.y;
00134   int K1 = K1_ptr[patchIndex];
00135   int K2 = K2_ptr[patchIndex];
00136   int K3 = K3_ptr[patchIndex];
00137   int n_atoms = n_atoms_ptr[patchIndex];
00138 
00139   BSPLINE_DEFS
00140   __shared__ volatile float bspline_factors[warps_per_block][3][order];
00141   int atoms_this_warp = atoms_per_warp;
00142   {
00143     int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
00144     if ( aidx + atoms_per_warp > n_atoms ) {
00145       atoms_this_warp = n_atoms - aidx;  // may be negative
00146       if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
00147     }
00148     if ( THREAD < atoms_this_warp*7 ) {
00149       float* a_data = a_data_ptr[patchIndex];
00150       atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
00151     }
00152   }
00153   for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
00154     WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
00155     float aq=AQ;
00156     int ai=(int)AI;
00157     int aj=(int)AJ;
00158     int ak=(int)AK;
00159 
00160     if ( THREAD < 3 * order ) {
00161       const float af = ATOM[THREAD/order];  // x, y, or z
00162       float f = bspline_coeffs.a2d[0][THREAD % order];
00163       for ( int i=1; i<order; ++i ) {
00164         f = af * f + bspline_coeffs.a2d[i][THREAD % order];
00165       }
00166       bspline_factors[WARP][THREAD/order][THREAD % order] = f;
00167     }
00168     WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
00169     for ( int i=THREAD; i < order*order; i+=32 ) {
00170       int ti = i/order;
00171       int tj = i%order;
00172       int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
00173       int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
00174       f_arr[gti * K2 + gtj] = 1;
00175     }
00176     if ( THREAD < order ) {
00177       int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
00178       gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
00179 
00180       fz_arr[gtk] = 1;
00181     }
00182 
00183     for ( int i=THREAD; i < order*order*order; i+=32 ) {
00184       int ti = i/(order*order);
00185       int tj = (i/order)%order;
00186       int tk = i%order;
00187       float val = aq * FX[ti] * FY[tj] * FZ[tk];
00188       
00189       int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
00190       int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
00191       int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
00192       gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
00193       float *dest = q_arr[gti * K2 + gtj];  // could load as constant
00194       if ( dest ) atomicAdd(dest+gtk,val);
00195     }
00196   } // i_atom
00197 }
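
The launch geometry implied by the kernel and by the batched CALL macro: blockIdx.y selects the patch, and blockIdx.x tiles each patch's atoms in groups of atoms_per_block. A minimal sketch under those assumptions; the wrapper name, numPatches, and maxAtomsPerPatch are illustrative.

  // Hypothetical launch helper for order 4.
  static void spread_charges_batched_order4(const float *coeffs, float * const *q_arr,
                                            int *f_arr, int *fz_arr,
                                            float **a_data_ptr, int *n_atoms_ptr,
                                            int *K1_ptr, int *K2_ptr, int *K3_ptr,
                                            int numPatches, int maxAtomsPerPatch,
                                            cudaStream_t stream) {
    // One grid y-slice per patch; x-blocks cover the largest per-patch atom count.
    int nblocksX = (maxAtomsPerPatch + atoms_per_block - 1) / atoms_per_block;
    dim3 gridSize(nblocksX, numPatches, 1);
    cuda_pme_charges_batched_dev<4><<<gridSize, 32*warps_per_block, 0, stream>>>(
        coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr);
  }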

template<int order>
__global__ void cuda_pme_charges_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float *__restrict__ a_data, int n_atoms, int K1, int K2, int K3)

Definition at line 201 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, AX, AY, AZ, BSPLINE_DEFS, f, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, and warps_per_block.

00206   {
00207   BSPLINE_DEFS
00208   __shared__ volatile float bspline_factors[warps_per_block][3][order];
00209   int atoms_this_warp = atoms_per_warp;
00210   {
00211     int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
00212     if ( aidx + atoms_per_warp > n_atoms ) {
00213       atoms_this_warp = n_atoms - aidx;  // may be negative
00214       if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
00215     }
00216     if ( THREAD < atoms_this_warp*7 ) {
00217       atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
00218     }
00219   }
00220   for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
00221     WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
00222     float aq=AQ;
00223     int ai=(int)AI;
00224     int aj=(int)AJ;
00225     int ak=(int)AK;
00226 #if 0
00227 if ( THREAD == 0 && (
00228   AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
00229   AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
00230   AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
00231 ) {
00232   int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
00233   printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
00234   continue;
00235 }
00236 #endif
00237     if ( THREAD < 3 * order ) {
00238       const float af = ATOM[THREAD/order];  // x, y, or z
00239       float f = bspline_coeffs.a2d[0][THREAD % order];
00240       for ( int i=1; i<order; ++i ) {
00241         f = af * f + bspline_coeffs.a2d[i][THREAD % order];
00242       }
00243       bspline_factors[WARP][THREAD/order][THREAD % order] = f;
00244     }
00245     WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
00246     for ( int i=THREAD; i < order*order; i+=32 ) {
00247       int ti = i/order;
00248       int tj = i%order;
00249       int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
00250       int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
00251 #if 0
00252 if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
00253   int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
00254   printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
00255 } else
00256 #endif
00257       f_arr[gti * K2 + gtj] = 1;
00258     }
00259     if ( THREAD < order ) {
00260       int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
00261       gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
00262 #if 0
00263 if ( gtk < 0 || gtk >= K3+order-1 ) {
00264   int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
00265   printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
00266 } else
00267 #endif
00268       fz_arr[gtk] = 1;
00269     }
00270     for ( int i=THREAD; i < order*order*order; i+=32 ) {
00271       int ti = i/(order*order);
00272       int tj = (i/order)%order;
00273       int tk = i%order;
00274       float val = aq * FX[ti] * FY[tj] * FZ[tk];
00275       int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
00276       int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
00277       int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
00278       gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
00279       float *dest = q_arr[gti * K2 + gtj];  // could load as constant
00280 #if 0
00281 if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
00282   int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
00283   printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
00284 } else
00285 #endif
00286        if ( dest ) atomicAdd(dest+gtk,val);
00287     }
00288   } // i_atom
00289 }
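
A minimal launch sketch mirroring the corresponding CALL macro; nblocks must cover n_atoms in groups of atoms_per_block, matching the aidx computation at the top of the kernel. The wrapper name is illustrative.

  // Hypothetical host helper for order 4.
  static void spread_charges_order4(const float *coeffs, float * const *q_arr,
                                    int *f_arr, int *fz_arr, float *a_data,
                                    int n_atoms, int K1, int K2, int K3,
                                    cudaStream_t stream) {
    // ceil(n_atoms / atoms_per_block) blocks of warps_per_block warps each.
    int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
    cuda_pme_charges_dev<4><<<nblocks, 32*warps_per_block, 0, stream>>>(
        coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
  }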

template<int order>
__global__ void cuda_pme_forces_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, float *const *__restrict__ afn_g, int K1, int K2, int K3)

Definition at line 292 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, f, FX2, FY2, FZ2, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, warps_per_block, float2::x, and float2::y.

00299   {
00300   __shared__ float *afn_s[3];
00301   if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
00302   BSPLINE_DEFS
00303   __shared__ volatile union {
00304     float a2d[atoms_per_warp][3];
00305     float a1d[atoms_per_warp*3];
00306   } force_output[warps_per_block];
00307   __shared__ float2 bspline_2factors[warps_per_block][3][order];
00308   __shared__ volatile union {
00309     float a2d[32][3];
00310     float a1d[32*3];
00311   } force_reduction[warps_per_block];
00312   int atoms_this_warp = atoms_per_warp;
00313   {
00314     const int n_atoms = afn_s[2] - afn_s[1];
00315     int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
00316     if ( aidx + atoms_per_warp > n_atoms ) {
00317       atoms_this_warp = n_atoms - aidx;  // may be negative
00318       if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
00319     }
00320     if ( THREAD < atoms_this_warp*7 ) {
00321       atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
00322     }
00323     WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
00324   }
00325   for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
00326     float aq=AQ;
00327     int ai=(int)AI;
00328     int aj=(int)AJ;
00329     int ak=(int)AK;
00330     if ( THREAD < 3 * order ) {
00331       const float af = ATOM[THREAD/order];  // x, y, or z
00332       float df = 0.f;
00333       float c = bspline_coeffs.a2d[0][THREAD % order];
00334       float f = c;
00335       for ( int i=1; i<order; ++i ) {
00336         df = af * df - (order-i) * c;
00337         c = bspline_coeffs.a2d[i][THREAD % order];
00338         f = af * f + c;
00339       }
00340       bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
00341     }
00342     __threadfence_block();  // bspline_2factors not declared volatile
00343     WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors (FX2,FY2,FZ2)
00344     float force_x = 0.f;
00345     float force_y = 0.f;
00346     float force_z = 0.f;
00347 
00348     for ( int i=THREAD; i < order*order*order; i+=32 ) {
00349       int ti = i/(order*order);
00350       int tj = (i/order)%order;
00351       int tk = i%order;
00352 
00353       const float2 fx2 = FX2[ti];
00354       const float2 fy2 = FY2[tj];
00355       const float2 fz2 = FZ2[tk];
00356 
00357       const float fx = fx2.x;
00358       const float fy = fy2.x;
00359       const float fz = fz2.x;
00360       const float dfx = fx2.y;
00361       const float dfy = fy2.y;
00362       const float dfz = fz2.y;
00363 
00364       float dx = K1 * aq * dfx * fy * fz;
00365       float dy = K2 * aq * fx * dfy * fz;
00366       float dz = K3 * aq * fx * fy * dfz;
00367     
00368       int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
00369       int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
00370       int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
00371       
00372       gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
00373       const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
00374       float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
00375       force_x += dx * pot;
00376       force_y += dy * pot;
00377       force_z += dz * pot;
00378     }
00379     force_reduction[WARP].a2d[THREAD][0] = force_x;
00380     force_reduction[WARP].a2d[THREAD][1] = force_y;
00381     force_reduction[WARP].a2d[THREAD][2] = force_z;
00382     WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
00383     if ( THREAD < 24 ) {
00384       force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
00385                                          + force_reduction[WARP].a1d[THREAD + 48]
00386                                          + force_reduction[WARP].a1d[THREAD + 72];
00387     }
00388     WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
00389     if ( THREAD < 6 ) {
00390       force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
00391                                          + force_reduction[WARP].a1d[THREAD + 12]
00392                                          + force_reduction[WARP].a1d[THREAD + 18];
00393     }
00394     WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
00395     if ( THREAD < 3 ) {
00396       force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
00397                                              + force_reduction[WARP].a1d[THREAD + 3];
00398     }
00399   } // i_atom
00400   WARP_SYNC(WARP_FULL_MASK);  // done writing force_output
00401   if ( THREAD < atoms_this_warp*3 ) {
00402     int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
00403     afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
00404   }
00405 }
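
A minimal launch sketch following the forces variant of the CALL macro: the bank-size hint matches the float2 bspline_2factors shared array, and the grid's y dimension indexes the per-patch pointer triples read from afn_g. The wrapper name is illustrative; nblocks and dimy are assumed to be precomputed by the caller (blocks covering the largest per-patch atom count, and the number of patches, respectively).

  // Hypothetical launch helper for order 4.
  static void gather_forces_order4(const float *coeffs, float * const *q_arr,
                                   float * const *afn, int nblocks, int dimy,
                                   int K1, int K2, int K3, cudaStream_t stream) {
    // Eight-byte banks suit the float2 bspline_2factors shared array.
    cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<4>, cudaSharedMemBankSizeEightByte);
    // blockIdx.y selects one pointer triple from afn: atom data, force output,
    // and a sentinel whose offset from the force pointer encodes the atom count.
    cuda_pme_forces_dev<4><<<dim3(nblocks, dimy), 32*warps_per_block, 0, stream>>>(
        coeffs, q_arr, afn, K1, K2, K3);
  }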

void NAMD_die(const char *err_msg)

Definition at line 81 of file common.C.

00083 {
00084    if ( ! err_msg ) err_msg = "(unknown error)";
00085    char *new_err_msg = new char[strlen(err_msg) + 40];
00086    sprintf(new_err_msg,"FATAL ERROR: %s\n",err_msg);
00087    CkPrintf(new_err_msg);
00088    fflush(stdout);
00089    if ( CmiNumPartitions() > 1 ) {
00090      sprintf(new_err_msg,"REPLICA %d FATAL ERROR: %s\n", CmiMyPartition(), err_msg);
00091    }
00092    CmiAbort(new_err_msg);
00093    delete [] new_err_msg;
00094 }


Variable Documentation

const float order_4_coeffs[4][4] [static]

Initial value:

{{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}}

Definition at line 25 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

const float order_6_coeffs[6][6] [static]

Initial value:

{{1, -5, 10, -10, 5, -1},
 {0, 5, -20, 30, -20, 5},
 {0, 10, -20, 0, 20, -10},
 {0, 10, 20, -60, 20, 10},
 {0, 5, 50, 0, -50, -5},
 {0, 1, 26, 66, 26, 1}}

Definition at line 29 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

const float order_8_coeffs[8][8] [static]

Initial value:

{{1, -7, 21, -35, 35, -21, 7, -1},
 {0, 7, -42, 105, -140, 105, -42, 7},
 {0, 21, -84, 105, 0, -105, 84, -21},
 {0, 35, 0, -315, 560, -315, 0, 35},
 {0, 35, 280, -665, 0, 665, -280, -35},
 {0, 21, 504, 315, -1680, 315, 504, 21},
 {0, 7, 392, 1715, 0, -1715, -392, -7},
 {0, 1, 120, 1191, 2416, 1191, 120, 1}}

Definition at line 35 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().
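
The entries of these tables are the integer numerators of the B-spline weight polynomials, scaled by (order-1)! (6, 120, and 5040 respectively); that factor is exactly the divisor and checksum used by cuda_init_bspline_coeffs. Restating what the code does (not an independent derivation): after the column reversal and normalization performed there, the Horner loops in the kernels evaluate, for a fractional coordinate w and B-spline index m,

  f_m(w) = \sum_{i=0}^{n-1} c_{i,m}\, w^{\,n-1-i}, \qquad c_{i,m} = \frac{S_n[i][\,n-1-m\,]}{(n-1)!}

where n is the interpolation order, S_n is the static table above, and the companion dc table filled by cuda_init_bspline_coeffs stores the derivative coefficients (n-1-i)\,c_{i,m}.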


Generated on Thu Sep 21 01:17:15 2017 for NAMD by  doxygen 1.4.7