NAMD
ComputePmeCUDAKernel.cu File Reference
#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"
#include <cuda.h>


Macros

#define warps_per_block   8
 
#define atoms_per_warp   4
 
#define atoms_per_block   (atoms_per_warp * warps_per_block)
 
#define BSPLINE_DEFS
 
#define WARP   (threadIdx.x>>5)
 
#define THREAD   (threadIdx.x&31)
 
#define FX   bspline_factors[threadIdx.x>>5][0]
 
#define FY   bspline_factors[threadIdx.x>>5][1]
 
#define FZ   bspline_factors[threadIdx.x>>5][2]
 
#define DFX   bspline_dfactors[threadIdx.x>>5][0]
 
#define DFY   bspline_dfactors[threadIdx.x>>5][1]
 
#define DFZ   bspline_dfactors[threadIdx.x>>5][2]
 
#define FX2   bspline_2factors[threadIdx.x>>5][0]
 
#define FY2   bspline_2factors[threadIdx.x>>5][1]
 
#define FZ2   bspline_2factors[threadIdx.x>>5][2]
 
#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]
 
#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]
 
#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]
 
#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]
 
#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]
 
#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]
 
#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]
 
#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]
 
#define CALL(ORDER)
 
#define CALL(ORDER)
 
#define cudaFuncSetSharedMemConfig(X, Y)   dummy()
 
#define CALL(ORDER)
 

Functions

void NAMD_die (const char *)
 
void cuda_init_bspline_coeffs (float **c, float **dc, int order)
 
template<int order>
__global__ void cuda_pme_charges_batched_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float **__restrict__ a_data_ptr, int *n_atoms_ptr, int *K1_ptr, int *K2_ptr, int *K3_ptr)
 
template<int order>
__global__ void cuda_pme_charges_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float *__restrict__ a_data, int n_atoms, int K1, int K2, int K3)
 
template<int order>
__global__ void cuda_pme_forces_dev (const float *__restrict__ coeffs, float *const *__restrict__ q_arr, float *const *__restrict__ afn_g, int K1, int K2, int K3)
 
 if (!nblocks) return
 
 CALL (4)
 
else CALL (6)
 
else CALL (8)
 
else NAMD_die ("unsupported PMEInterpOrder")
 
 if ((!nblocksX)||(!numPatches)) return
 
void dummy ()
 
 if ((!nblocks)||(!dimy)) return
 

Variables

static const float order_4_coeffs [4][4]
 
static const float order_6_coeffs [6][6]
 
static const float order_8_coeffs [8][8]
 
CUDA_PME_CHARGES_PROTOTYPE int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block
 
 CUDA_PME_CHARGES_BATCHED_PROTOTYPE
 
dim3 gridSize
 
gridSize x = nblocksX
 
gridSize y = numPatches
 
gridSize z = 1
 
 CUDA_PME_FORCES_PROTOTYPE
 

Macro Definition Documentation

#define AI   atoms[threadIdx.x>>5].a2d[i_atom][4]

#define AJ   atoms[threadIdx.x>>5].a2d[i_atom][5]

#define AK   atoms[threadIdx.x>>5].a2d[i_atom][6]

#define AQ   atoms[threadIdx.x>>5].a2d[i_atom][3]

#define ATOM   atoms[threadIdx.x>>5].a2d[i_atom]

#define atoms_per_block   (atoms_per_warp * warps_per_block)

#define atoms_per_warp   4

#define AX   atoms[threadIdx.x>>5].a2d[i_atom][0]

Definition at line 119 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AY   atoms[threadIdx.x>>5].a2d[i_atom][1]

Definition at line 120 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define AZ   atoms[threadIdx.x>>5].a2d[i_atom][2]

Definition at line 121 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_dev().

#define BSPLINE_DEFS
Value:
__shared__ union { \
float a2d[order][order]; \
float a1d[order*order]; \
} bspline_coeffs; \
__shared__ volatile union { \
float a2d[atoms_per_warp][7]; \
float a1d[atoms_per_warp*7]; \
} atoms[warps_per_block]; \
if ( threadIdx.x < order*order ) { \
bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
} \
BLOCK_SYNC;

Definition at line 90 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), cuda_pme_charges_dev(), and cuda_pme_forces_dev().
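
The a2d/a1d unions above let each kernel fill a shared buffer with one coalesced linear copy while the interpolation math reads it with 2-D indexing. A minimal standalone sketch of the same aliasing trick (the Coeffs and fill_and_read names are illustrative, not part of this file; blockDim.x >= 16 is assumed):

union Coeffs {
  float a2d[4][4];   // 2-D view used by the math
  float a1d[16];     // linear view used for the coalesced fill
};

__global__ void fill_and_read(const float *coeffs) {
  __shared__ Coeffs c;
  if (threadIdx.x < 16) c.a1d[threadIdx.x] = coeffs[threadIdx.x]; // linear fill
  __syncthreads();
  // consumers read c.a2d[i][j], which aliases c.a1d[i*4+j]
}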

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cuda_pme_charges_dev<ORDER><<<nblocks,WARPSIZE*warps_per_block,0,stream>>>( \
coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)

Definition at line 453 of file ComputePmeCUDAKernel.cu.
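
This first CALL definition drives the per-order dispatch in the host wrapper behind CUDA_PME_CHARGES_PROTOTYPE (see the CALL(4) / else CALL(6) / else CALL(8) entries under Function Documentation). Expanding that chain by hand gives the following sketch, using only names from the macro value above:

if ( order == 4 )
  cuda_pme_charges_dev<4><<<nblocks, WARPSIZE*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else if ( order == 6 )
  cuda_pme_charges_dev<6><<<nblocks, WARPSIZE*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else if ( order == 8 )
  cuda_pme_charges_dev<8><<<nblocks, WARPSIZE*warps_per_block, 0, stream>>>(
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3);
else
  NAMD_die("unsupported PMEInterpOrder");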

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cuda_pme_charges_batched_dev<ORDER><<<gridSize,WARPSIZE*warps_per_block,0,stream>>>( \
coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)

Definition at line 453 of file ComputePmeCUDAKernel.cu.

#define CALL (   ORDER)
Value:
if ( order == ORDER ) \
cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),WARPSIZE*warps_per_block,0,stream>>>( \
coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)

Definition at line 453 of file ComputePmeCUDAKernel.cu.

#define cudaFuncSetSharedMemConfig( X, Y )   dummy()

Definition at line 445 of file ComputePmeCUDAKernel.cu.

#define DFX   bspline_dfactors[threadIdx.x>>5][0]

Definition at line 110 of file ComputePmeCUDAKernel.cu.

#define DFY   bspline_dfactors[threadIdx.x>>5][1]

Definition at line 111 of file ComputePmeCUDAKernel.cu.

#define DFZ   bspline_dfactors[threadIdx.x>>5][2]

Definition at line 112 of file ComputePmeCUDAKernel.cu.

#define FX   bspline_factors[threadIdx.x>>5][0]

Definition at line 107 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FX2   bspline_2factors[threadIdx.x>>5][0]

Definition at line 114 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FY   bspline_factors[threadIdx.x>>5][1]

Definition at line 108 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FY2   bspline_2factors[threadIdx.x>>5][1]

Definition at line 115 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define FZ   bspline_factors[threadIdx.x>>5][2]

Definition at line 109 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_charges_batched_dev(), and cuda_pme_charges_dev().

#define FZ2   bspline_2factors[threadIdx.x>>5][2]

Definition at line 116 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_pme_forces_dev().

#define THREAD   (threadIdx.x&31)

#define WARP   (threadIdx.x>>5)

#define warps_per_block   8
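
Taken together, THREAD, WARP, warps_per_block, and atoms_per_warp give each warp of WARPSIZE (32) threads a fixed slice of four atoms, so a block of 8 warps covers atoms_per_block = 32 atoms. A minimal sketch of the resulting indexing (the indexing_demo kernel is hypothetical; only the arithmetic is taken from this file):

#include <cstdio>

#define WARPSIZE 32                  // stand-in for CudaUtils.h
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)

__global__ void indexing_demo(int n_atoms) {
  int warp = threadIdx.x >> 5;       // the WARP macro
  int lane = threadIdx.x & 31;       // the THREAD macro
  // First atom owned by this warp, as in the charge and force kernels:
  int aidx = blockIdx.x * atoms_per_block + warp * atoms_per_warp;
  if (lane == 0 && aidx < n_atoms)
    printf("block %d warp %d: atoms [%d, %d)\n", blockIdx.x, warp,
           aidx, min(aidx + atoms_per_warp, n_atoms));
}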

Function Documentation

CALL ( 4 )
else CALL ( 6 )
else CALL ( 8 )
void cuda_init_bspline_coeffs ( float **  c,
float **  dc,
int  order 
)

Definition at line 47 of file ComputePmeCUDAKernel.cu.

References NAMD_die(), order, order_4_coeffs, order_6_coeffs, and order_8_coeffs.

Referenced by ComputePmeMgr::initialize_computes().

47  {
48  float *coeffs = new float[order*order];
49  float *dcoeffs = new float[order*order];
50  double divisor;
51  static const float *scoeffs;
52  switch ( order ) {
53  case 4:
54  scoeffs = &order_4_coeffs[0][0];
55  divisor = 6;
56  break;
57  case 6:
58  scoeffs = &order_6_coeffs[0][0];
59  divisor = 120;
60  break;
61  case 8:
62  scoeffs = &order_8_coeffs[0][0];
63  divisor = 5040;
64  break;
65  default:
66  NAMD_die("unsupported PMEInterpOrder");
67  }
68  double sum = 0;
69  for ( int i=0, p=order-1; i<order; ++i,--p ) {
70  for ( int j=0; j<order; ++j ) {
71  double c = scoeffs[i*order+(order-1-j)]; // reverse order
72  sum += c;
73  c /= divisor;
74  coeffs[i*order+j] = c;
75  dcoeffs[i*order+j] = (double)p * c;
76  // printf("%d %d %f %f\n", i, j, c, (double)p*c);
77  }
78  }
79  // printf("checksum: %f %f\n", sum, divisor);
80  if ( sum != divisor )
81  NAMD_die("cuda_init_bspline_coeffs static data checksum error");
82  cudaMalloc((void**) c, order*order*sizeof(float));
83  cudaMalloc((void**) dc, order*order*sizeof(float));
84  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
85  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
86  delete [] coeffs;
87  delete [] dcoeffs;
88 }
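
A hedged host-side usage sketch: the function allocates the device coefficient matrices itself, so callers pass the addresses of null device pointers and later hand the buffers to the kernels (error checking omitted; variable names are illustrative):

float *d_coeffs = 0, *d_dcoeffs = 0;
cuda_init_bspline_coeffs(&d_coeffs, &d_dcoeffs, 4);  // order-4 interpolation
// d_coeffs now holds the 4x4 row-major spline matrix on the device,
// d_dcoeffs the matching derivative matrix; pass d_coeffs as `coeffs`
// to cuda_pme_charges_dev<4> and cuda_pme_forces_dev<4>.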
template<int order>
__global__ void cuda_pme_charges_batched_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
int *__restrict__  f_arr,
int *__restrict__  fz_arr,
float **__restrict__  a_data_ptr,
int *  n_atoms_ptr,
int *  K1_ptr,
int *  K2_ptr,
int *  K3_ptr 
)

Definition at line 130 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, warps_per_block, and WARPSIZE.

135  {
136 
137  int patchIndex = blockIdx.y;
138  int K1 = K1_ptr[patchIndex];
139  int K2 = K2_ptr[patchIndex];
140  int K3 = K3_ptr[patchIndex];
141  int n_atoms = n_atoms_ptr[patchIndex];
142 
143  BSPLINE_DEFS
144  __shared__ volatile float bspline_factors[warps_per_block][3][order];
145  int atoms_this_warp = atoms_per_warp;
146  {
147  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
148  if ( aidx + atoms_per_warp > n_atoms ) {
149  atoms_this_warp = n_atoms - aidx; // may be negative
150  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
151  }
152  if ( THREAD < atoms_this_warp*7 ) {
153  float* a_data = a_data_ptr[patchIndex];
154  atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
155  }
156  }
157  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
158  WARP_SYNC(WARP_FULL_MASK); // done writing atoms, reading bspline_factors (FX,FY,FZ)
159  float aq=AQ;
160  int ai=(int)AI;
161  int aj=(int)AJ;
162  int ak=(int)AK;
163 
164  if ( THREAD < 3 * order ) {
165  const float af = ATOM[THREAD/order]; // x, y, or z
166  float f = bspline_coeffs.a2d[0][THREAD % order];
167  for ( int i=1; i<order; ++i ) {
168  f = af * f + bspline_coeffs.a2d[i][THREAD % order];
169  }
170  bspline_factors[WARP][THREAD/order][THREAD % order] = f;
171  }
172  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_factors
173  for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
174  int ti = i/order;
175  int tj = i%order;
176  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
177  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
178  f_arr[gti * K2 + gtj] = 1;
179  }
180  if ( THREAD < order ) {
181  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
182  gtk += THREAD; // padded to increase coalescing (maybe, but reduces re-use)
183 
184  fz_arr[gtk] = 1;
185  }
186 
187  for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
188  int ti = i/(order*order);
189  int tj = (i/order)%order;
190  int tk = i%order;
191  float val = aq * FX[ti] * FY[tj] * FZ[tk];
192 
193  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
194  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
195  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
196  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
197  float *dest = q_arr[gti * K2 + gtj]; // could load as constant
198  if ( dest ) atomicAdd(dest+gtk,val);
199  }
200  } // i_atom
201 }
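
The factor computation at lines 164-171 above (and its twin at lines 241-248 in cuda_pme_charges_dev) is Horner evaluation of the B-spline basis polynomials: lane t < 3*order evaluates weight t % order of dimension t / order at the atom's fractional coordinate af. An equivalent scalar sketch (bspline_weight is hypothetical; the recurrence is the one in the kernel):

template<int order>
__host__ __device__ float bspline_weight(const float (&c)[order][order],
                                         float af, int k) {
  // weight_k(af) = sum_i c[i][k] * af^(order-1-i), evaluated by Horner
  float f = c[0][k];
  for (int i = 1; i < order; ++i)
    f = af * f + c[i][k];
  return f;
}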
template<int order>
__global__ void cuda_pme_charges_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
int *__restrict__  f_arr,
int *__restrict__  fz_arr,
float *__restrict__  a_data,
int  n_atoms,
int  K1,
int  K2,
int  K3 
)

Definition at line 205 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, AX, AY, AZ, BSPLINE_DEFS, FX, FY, FZ, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, warps_per_block, and WARPSIZE.

210  {
211  BSPLINE_DEFS
212  __shared__ volatile float bspline_factors[warps_per_block][3][order];
213  int atoms_this_warp = atoms_per_warp;
214  {
215  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
216  if ( aidx + atoms_per_warp > n_atoms ) {
217  atoms_this_warp = n_atoms - aidx; // may be negative
218  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
219  }
220  if ( THREAD < atoms_this_warp*7 ) {
221  atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
222  }
223  }
224  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
225  WARP_SYNC(WARP_FULL_MASK); // done writing atoms, reading bspline_factors (FX,FY,FZ)
226  float aq=AQ;
227  int ai=(int)AI;
228  int aj=(int)AJ;
229  int ak=(int)AK;
230 #if 0
231 if ( THREAD == 0 && (
232  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
233  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
234  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
235 ) {
236  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
237  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
238  continue;
239 }
240 #endif
241  if ( THREAD < 3 * order ) {
242  const float af = ATOM[THREAD/order]; // x, y, or z
243  float f = bspline_coeffs.a2d[0][THREAD % order];
244  for ( int i=1; i<order; ++i ) {
245  f = af * f + bspline_coeffs.a2d[i][THREAD % order];
246  }
247  bspline_factors[WARP][THREAD/order][THREAD % order] = f;
248  }
249  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_factors
250  for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
251  int ti = i/order;
252  int tj = i%order;
253  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
254  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
255 #if 0
256 if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
257  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
258  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
259 } else
260 #endif
261  f_arr[gti * K2 + gtj] = 1;
262  }
263  if ( THREAD < order ) {
264  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
265  gtk += THREAD; // padded to increase coalescing (maybe, but reduces re-use)
266 #if 0
267 if ( gtk < 0 || gtk >= K3+order-1 ) {
268  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
269  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
270 } else
271 #endif
272  fz_arr[gtk] = 1;
273  }
274  for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
275  int ti = i/(order*order);
276  int tj = (i/order)%order;
277  int tk = i%order;
278  float val = aq * FX[ti] * FY[tj] * FZ[tk];
279  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
280  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
281  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
282  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
283  float *dest = q_arr[gti * K2 + gtj]; // could load as constant
284 #if 0
285 if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
286  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
287  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
288 } else
289 #endif
290  if ( dest ) atomicAdd(dest+gtk,val);
291  }
292  } // i_atom
293 }
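
Throughout both charge kernels, the gti/gtj/gtk arithmetic implements periodic wrap-around of the order-point stencil onto the charge grid: since 0 <= ai < K1 and 0 <= ti < order <= K1, one conditional subtraction suffices to fold ti + ai back into [0, K1), and likewise for K2. The z index wraps first and then adds tk, writing into the K3+order-1 padded rows noted in the comments. A one-dimension sketch of the wrap (function name is illustrative):

__device__ int wrap_index(int ti, int ai, int K) {
  int g = ti + ai;        // 0 <= g < 2*K given 0 <= ai < K and ti < order <= K
  if (g >= K) g -= K;     // single fold back into [0, K)
  return g;
}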
template<int order>
__global__ void cuda_pme_forces_dev ( const float *__restrict__  coeffs,
float *const *__restrict__  q_arr,
float *const *__restrict__  afn_g,
int  K1,
int  K2,
int  K3 
)

Definition at line 296 of file ComputePmeCUDAKernel.cu.

References AI, AJ, AK, AQ, ATOM, atoms, atoms_per_block, atoms_per_warp, BSPLINE_DEFS, FX2, FY2, FZ2, order, THREAD, WARP, WARP_FULL_MASK, WARP_SYNC, warps_per_block, WARPSIZE, float2::x, and float2::y.

303  {
304  __shared__ float *afn_s[3];
305  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
306  BSPLINE_DEFS
307  __shared__ volatile union {
308  float a2d[atoms_per_warp][3];
309  float a1d[atoms_per_warp*3];
310  } force_output[warps_per_block];
311  __shared__ float2 bspline_2factors[warps_per_block][3][order];
312  __shared__ volatile union {
313  float a2d[WARPSIZE][3];
314  float a1d[WARPSIZE*3];
315  } force_reduction[warps_per_block];
316  int atoms_this_warp = atoms_per_warp;
317  {
318  const int n_atoms = afn_s[2] - afn_s[1];
319  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
320  if ( aidx + atoms_per_warp > n_atoms ) {
321  atoms_this_warp = n_atoms - aidx; // may be negative
322  if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
323  }
324  if ( THREAD < atoms_this_warp*7 ) {
325  atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
326  }
327  WARP_SYNC(WARP_FULL_MASK); // done writing atoms
328  }
329  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
330  float aq=AQ;
331  int ai=(int)AI;
332  int aj=(int)AJ;
333  int ak=(int)AK;
334  if ( THREAD < 3 * order ) {
335  const float af = ATOM[THREAD/order]; // x, y, or z
336  float df = 0.f;
337  float c = bspline_coeffs.a2d[0][THREAD % order];
338  float f = c;
339  for ( int i=1; i<order; ++i ) {
340  df = af * df - (order-i) * c;
341  c = bspline_coeffs.a2d[i][THREAD % order];
342  f = af * f + c;
343  }
344  bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
345  }
346  __threadfence_block(); // bspline_2factors not declared volatile
347  WARP_SYNC(WARP_FULL_MASK); // done writing bspline_2factors (FX2,FY2,FZ2)
348  float force_x = 0.f;
349  float force_y = 0.f;
350  float force_z = 0.f;
351 
352  for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
353  int ti = i/(order*order);
354  int tj = (i/order)%order;
355  int tk = i%order;
356 
357  const float2 fx2 = FX2[ti];
358  const float2 fy2 = FY2[tj];
359  const float2 fz2 = FZ2[tk];
360 
361  const float fx = fx2.x;
362  const float fy = fy2.x;
363  const float fz = fz2.x;
364  const float dfx = fx2.y;
365  const float dfy = fy2.y;
366  const float dfz = fz2.y;
367 
368  float dx = K1 * aq * dfx * fy * fz;
369  float dy = K2 * aq * fx * dfy * fz;
370  float dz = K3 * aq * fx * fy * dfz;
371 
372  int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
373  int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
374  int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
375 
376  gtk += tk; // padded to increase coalescing (maybe, but reduces re-use)
377  const float * __restrict__ src = q_arr[gti * K2 + gtj]; // could load as constant
378  float pot = src ? src[gtk] : __int_as_float(0x7fffffff); // CUDA NaN
379  force_x += dx * pot;
380  force_y += dy * pot;
381  force_z += dz * pot;
382  }
383  force_reduction[WARP].a2d[THREAD][0] = force_x;
384  force_reduction[WARP].a2d[THREAD][1] = force_y;
385  force_reduction[WARP].a2d[THREAD][2] = force_z;
386  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
387  if ( THREAD < 24 ) {
388  force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
389  + force_reduction[WARP].a1d[THREAD + 48]
390  + force_reduction[WARP].a1d[THREAD + 72];
391  }
392  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
393  if ( THREAD < 6 ) {
394  force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
395  + force_reduction[WARP].a1d[THREAD + 12]
396  + force_reduction[WARP].a1d[THREAD + 18];
397  }
398  WARP_SYNC(WARP_FULL_MASK); // done writing force_reduction
399  if ( THREAD < 3 ) {
400  force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
401  + force_reduction[WARP].a1d[THREAD + 3];
402  }
403  } // i_atom
404  WARP_SYNC(WARP_FULL_MASK); // done writing force_output
405  if ( THREAD < atoms_this_warp*3 ) {
406  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
407  afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
408  }
409 }
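
The three guarded adds at lines 387-402 fold the per-warp force_reduction buffer, 96 floats laid out lane-major as a1d[lane*3 + component], down to one (x, y, z) triple per atom: 96 -> 24 -> 6 -> 3. Every stride (24, 48, 72, 6, 12, 18, 3) is a multiple of 3, so components never mix. A host-side sketch of the same fold (fold_forces, buf, and out are illustrative names):

void fold_forces(float buf[96], float out[3]) {
  for (int t = 0; t < 24; ++t)                           // 96 -> 24
    buf[t] += buf[t + 24] + buf[t + 48] + buf[t + 72];
  for (int t = 0; t < 6; ++t)                            // 24 -> 6
    buf[t] += buf[t + 6] + buf[t + 12] + buf[t + 18];
  for (int t = 0; t < 3; ++t)                            // 6 -> 3
    out[t] = buf[t] + buf[t + 3];                        // out = {x, y, z}
}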
void dummy ( )
if ( !nblocks )
if ( (!nblocksX)||(!numPatches) )
if ( (!nblocks)||(!dimy) )
void NAMD_die ( const char *  )

Definition at line 85 of file common.C.

86 {
87  if ( ! err_msg ) err_msg = "(unknown error)";
88  CkPrintf("FATAL ERROR: %s\n", err_msg);
89  fflush(stdout);
90  char repstr[24] = "";
91  if (CmiNumPartitions() > 1) {
92  sprintf(repstr,"REPLICA %d ", CmiMyPartition());
93  // CkAbort ensures that all replicas die
94  CkAbort("%sFATAL ERROR: %s\n", repstr, err_msg);
95  }
96  CkError("%sFATAL ERROR: %s\n", repstr, err_msg);
97 #if CHARM_VERSION < 61000
98  CkExit();
99 #else
100  CkExit(1);
101 #endif
102 }
else NAMD_die ( "unsupported PMEInterpOrder"  )

Variable Documentation

CUDA_PME_CHARGES_BATCHED_PROTOTYPE
Initial value:
{
int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block

Definition at line 425 of file ComputePmeCUDAKernel.cu.

CUDA_PME_FORCES_PROTOTYPE
Initial value:
{

Definition at line 449 of file ComputePmeCUDAKernel.cu.

dim3 gridSize

Definition at line 428 of file ComputePmeCUDAKernel.cu.

CUDA_PME_CHARGES_PROTOTYPE int nblocks

Definition at line 412 of file ComputePmeCUDAKernel.cu.

static const float order_4_coeffs[4][4]

Initial value:
{{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}}

Definition at line 29 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

static const float order_6_coeffs[6][6]

Initial value:
{{1, -5, 10, -10, 5, -1}, {0, 5, -20, 30, -20, 5},
{0, 10, -20, 0, 20, -10}, {0, 10, 20, -60, 20, 10},
{0, 5, 50, 0, -50, -5}, {0, 1, 26, 66, 26, 1}}

Definition at line 33 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().

static const float order_8_coeffs[8][8]

Initial value:
{{1, -7, 21, -35, 35, -21, 7, -1}, {0, 7, -42, 105, -140, 105, -42, 7},
{0, 21, -84, 105, 0, -105, 84, -21}, {0, 35, 0, -315, 560, -315, 0, 35},
{0, 35, 280, -665, 0, 665, -280, -35}, {0, 21, 504, 315, -1680, 315, 504, 21},
{0, 7, 392, 1715, 0, -1715, -392, -7}, {0, 1, 120, 1191, 2416, 1191, 120, 1}}

Definition at line 39 of file ComputePmeCUDAKernel.cu.

Referenced by cuda_init_bspline_coeffs().
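
These three tables are the unnormalized B-spline polynomial coefficients; cuda_init_bspline_coeffs() divides them by (order-1)! (6, 120, and 5040 for orders 4, 6, and 8) and verifies that the raw entries sum to exactly that divisor. A standalone check of the order-4 invariant (sketch; the table is copied from above):

#include <cstdio>

static const float order_4_coeffs[4][4] =
  {{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}};

int main() {
  double sum = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      sum += order_4_coeffs[i][j];
  printf("order-4 checksum: %g (expected 3! = 6)\n", sum);  // prints 6
  return 0;
}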

gridSize x = nblocksX

Definition at line 429 of file ComputePmeCUDAKernel.cu.

Referenced by AVector::AVector(), buildSortKeys(), calc_theta_dtheta(), PmeRealSpaceCompute::calcGridCoord(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), ComputeQMMgr::calcUSR(), eABF1D::calpmf(), GridforceFullSubGrid::compute_b(), compute_b_spline(), GridforceFullBaseGrid::compute_VdV(), BondElem::computeForce(), GromacsPairElem::computeForce(), ComputeSphericalBC::doForce(), ComputeCylindricalBC::doForce(), HomePatch::doGroupSizeCheck(), ComputeFmmSerial::doWork(), ComputeMsmSerial::doWork(), ComputeFullDirect::doWork(), ComputeExt::doWork(), ComputeMsm::doWork(), ComputeGBISser::doWork(), ComputeGlobal::doWork(), ComputeEwald::doWork(), ComputeQM::doWork(), dumpbench(), PmeAtomFiler::fileAtoms(), gather_force(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), PmePencilXY::initBlockSizes(), PmePencilX::initBlockSizes(), ComputePmeMgr::initialize(), CudaPmePencilXY::initializeDevice(), CudaPmePencilX::initializeDevice(), minHeap::insert(), maxHeap::insert(), MatrixFitRMS(), Controller::minimize(), Node::Node(), obj_transabout(), obj_transvec(), obj_transvecinv(), pe_sortop_coord_x::operator()(), ComputeQM::processFullQM(), ComputeQMMgr::procQMRes(), ComputePmeMgr::procUntrans(), HashPool< T >::push_back(), Parameters::read_energy_type_cubspline(), eABF1D::readfile(), recursive_bisect_coord(), ComputePmeMgr::recvArrays(), ComputeExtMgr::recvCoord(), ComputePmeCUDAMgr::recvPencils(), ComputeGlobal::recvResults(), sampleTableTex(), Matrix4::scale(), scale_coordinates(), ComputeNonbondedUtil::select(), ComputePmeMgr::sendTransSubset(), ComputePmeMgr::sendUntransSubset(), AVector::Set(), PDB::set_all_positions(), settle1(), spread_charge_kernel(), Tcl_loadCoords(), ProxyCombinedResultMsg::toRaw(), Matrix4Symmetry::translate(), Matrix4TMD::translate(), Matrix4::translate(), ParallelIOMgr::updateMolInfo(), void(), writeComplexToDisk(), writeHostComplexToDisk(), and writeResult().

gridSize z = 1

Definition at line 431 of file ComputePmeCUDAKernel.cu.

Referenced by AVector::AVector(), calc_theta_dtheta(), PmeRealSpaceCompute::calcGridCoord(), ComputeQMMgr::calcMOPAC(), ComputeQMMgr::calcORCA(), ComputeQMMgr::calcUSR(), GridforceFullSubGrid::compute_b(), compute_b_spline(), GridforceFullBaseGrid::compute_VdV(), HomePatch::doGroupSizeCheck(), dumpbench(), PmeAtomFiler::fileAtoms(), gather_force(), GBIS_P2_Kernel(), GBIS_P3_Kernel(), Molecule::get_go_force2(), Molecule::get_gro_force2(), HomePatch::hardWallDrude(), PmePencilZ::initBlockSizes(), ComputePmeCUDADevice::initialize(), ComputePmeMgr::initialize(), CudaPmePencilZ::initializeDevice(), ComputePmeCUDADevice::initializePatches(), Node::Node(), obj_transabout(), obj_transvec(), obj_transvecinv(), ComputeQMMgr::procQMRes(), HomePatch::rattle1old(), ComputePmeMgr::recvArrays(), ComputePmeCUDADevice::recvAtoms(), ComputePmeCUDADevice::recvAtomsFromNeighbor(), ComputeExtMgr::recvCoord(), ComputePmeCUDADevice::recvForcesFromNeighbor(), ComputePmeCUDAMgr::recvPencils(), Matrix4::scale(), scale_coordinates(), ComputePmeCUDADevice::sendAtomsToNeighbors(), ComputePmeCUDADevice::sendForcesToNeighbors(), AVector::Set(), PDB::set_all_positions(), spread_charge_kernel(), Sequencer::submitHalfstep(), Sequencer::submitReductions(), Random::sum_of_squared_gaussians(), Tcl_loadCoords(), ProxyCombinedResultMsg::toRaw(), Matrix4Symmetry::translate(), Matrix4TMD::translate(), Matrix4::translate(), ParallelIOMgr::updateMolInfo(), void(), and writeResult().