NAMD
ComputeNonbondedCUDAKernel.cu

#include "CudaUtils.h"
#include "ComputeNonbondedCUDAKernel.h"
#include <stdio.h>

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#ifdef WIN32
#define __thread __declspec(thread)
#endif

texture<unsigned int, 1, cudaReadModeElementType> tex_exclusions;
static __thread int exclusions_size;
static __thread unsigned int *exclusions;

__constant__ unsigned int const_exclusions[MAX_CONST_EXCLUSIONS];
static __thread unsigned int *overflow_exclusions;

#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))

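// Illustrative note (not part of the original kernel source): SET_EXCL treats
// EXCL as a flat bitset of 32-bit words. For a bit index i = BASE+DIFF, i>>5
// selects the word (i/32) and i&31 selects the bit within that word (i%32).
// A minimal sketch, assuming a hypothetical 64-bit-wide mask:
#if 0
  unsigned int excl[2] = {0, 0};                   // room for 64 exclusion bits
  SET_EXCL(excl, 30, 5);                           // bit 35 -> excl[1] |= (1 << 3)
  bool set = excl[35 >> 5] & (1u << (35 & 31));    // reads back as true
#endif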
void cuda_bind_exclusions(const unsigned int *t, int n) {
  exclusions_size = n;
  static __thread int exclusions_alloc;
  if ( exclusions && exclusions_alloc < exclusions_size ) {
    cudaFree(exclusions);
    cuda_errcheck("freeing exclusions");
    cudaFree(overflow_exclusions);
    cuda_errcheck("freeing overflow_exclusions");
    exclusions = 0;
  }
  if ( ! exclusions ) {
    exclusions_alloc = exclusions_size;
    cudaMalloc((void**) &exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc exclusions");
    cudaMalloc((void**) &overflow_exclusions, n*sizeof(unsigned int));
    cuda_errcheck("malloc overflow_exclusions");
  }
  cudaMemcpy(exclusions, t, n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy exclusions");
  tex_exclusions.normalized = false;
  tex_exclusions.addressMode[0] = cudaAddressModeClamp;
  tex_exclusions.filterMode = cudaFilterModePoint;
  cudaBindTexture(NULL, tex_exclusions, exclusions, n*sizeof(unsigned int));
  cuda_errcheck("binding exclusions to texture");

  cudaMemcpy(overflow_exclusions, t,
             n*sizeof(unsigned int), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to overflow_exclusions");
  int nconst = ( n < MAX_CONST_EXCLUSIONS ? n : MAX_CONST_EXCLUSIONS );
  cudaMemcpyToSymbol(const_exclusions, t, nconst*sizeof(unsigned int), 0);
  cuda_errcheck("memcpy to const_exclusions");
}

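// Illustrative note (not part of the original file): the same exclusion bitset is
// staged three times above -- in constant memory (first MAX_CONST_EXCLUSIONS
// words), in a texture, and in a plain global array (overflow_exclusions) covering
// indices past the constant window. A sketch of the lookup a kernel might perform,
// assuming a hypothetical word index i:
#if 0
  unsigned int word = (i < MAX_CONST_EXCLUSIONS) ? const_exclusions[i]
                                                 : overflow_exclusions[i];
#endif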
texture<float2, 1, cudaReadModeElementType> lj_table;
static __thread int lj_table_size;

void cuda_bind_lj_table(const float2 *t, int _lj_table_size) {
  static __thread float2 *ct;
  static __thread int lj_table_alloc;
  lj_table_size = _lj_table_size;
  if ( ct && lj_table_alloc < lj_table_size ) {
    cudaFree(ct);
    cuda_errcheck("freeing lj table");
    ct = 0;
  }
  if ( ! ct ) {
    lj_table_alloc = lj_table_size;
    cudaMalloc((void**) &ct, lj_table_size*lj_table_size*sizeof(float2));
    cuda_errcheck("allocating lj table");
  }
  cudaMemcpy(ct, t, lj_table_size*lj_table_size*sizeof(float2),
             cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to lj table");

  lj_table.normalized = false;
  lj_table.addressMode[0] = cudaAddressModeClamp;
  lj_table.filterMode = cudaFilterModePoint;

  cudaBindTexture((size_t*)0, lj_table, ct,
                  lj_table_size*lj_table_size*sizeof(float2));
  cuda_errcheck("binding lj table to texture");
}

texture<float4, 1, cudaReadModeElementType> force_table;
texture<float4, 1, cudaReadModeElementType> energy_table;

void cuda_bind_force_table(const float4 *t, const float4 *et) {
  static __thread cudaArray *ct;
  static __thread cudaArray *ect;
  if ( ! ct ) {
    cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
    cuda_errcheck("allocating force table");
  }
  if ( ! ect ) {
    cudaMallocArray(&ect, &energy_table.channelDesc, FORCE_TABLE_SIZE, 1);
    cuda_errcheck("allocating energy table");
  }
  cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  // cudaMemcpy(ct, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to force table");
  cudaMemcpyToArray(ect, 0, 0, et, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to energy table");

  force_table.normalized = true;
  force_table.addressMode[0] = cudaAddressModeClamp;
  force_table.addressMode[1] = cudaAddressModeClamp;
  force_table.filterMode = cudaFilterModeLinear;

  energy_table.normalized = true;
  energy_table.addressMode[0] = cudaAddressModeClamp;
  energy_table.addressMode[1] = cudaAddressModeClamp;
  energy_table.filterMode = cudaFilterModeLinear;

  cudaBindTextureToArray(force_table, ct);
  cuda_errcheck("binding force table to texture");

  cudaBindTextureToArray(energy_table, ect);
  cuda_errcheck("binding energy table to texture");
}

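// Illustrative note (not part of the original file): because force_table and
// energy_table are bound with normalized coordinates and linear filtering, a
// kernel samples them with a coordinate in [0,1) and gets back hardware-
// interpolated float4 entries. A minimal device-side sketch, assuming a
// hypothetical normalized distance value rnorm:
#if 0
__device__ float4 sample_force(float rnorm) {
  // tex1D with linear filtering blends the two table entries neighboring
  // the sample point rnorm*FORCE_TABLE_SIZE.
  return tex1D(force_table, rnorm);
}
#endif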
static __thread int num_patches;
static __thread int num_virials;
static __thread int num_atoms;
// Device arrays: each is declared as its size followed by the array pointer
static __thread int patch_pairs_size;
static __thread patch_pair* patch_pairs;
static __thread int atom_params_size;
static __thread atom_param* atom_params;
static __thread int vdw_types_size;
static __thread int* vdw_types;
static __thread int atoms_size;
static __thread atom* atoms;

static __thread int tmpforces_size;
static __thread float4* tmpforces;
static __thread int slow_tmpforces_size;
static __thread float4* slow_tmpforces;

static __thread int tmpvirials_size;
static __thread float* tmpvirials;
static __thread int slow_tmpvirials_size;
static __thread float* slow_tmpvirials;

static __thread int global_counters_size;
static __thread unsigned int* global_counters;
static __thread int plist_size;
static __thread unsigned int* plist;
static __thread int exclmasks_size;
static __thread exclmask* exclmasks;
// Device pointers to the page-locked host arrays (provided by the ComputeNonbondedCUDA class)
static __thread float4* forces;
static __thread float4* slow_forces;
static __thread int* force_ready_queue;
static __thread float* virials;
static __thread float* slow_virials;
static __thread int* block_order;

// GBIS arrays
static __thread int intRad0D_size;
static __thread float *intRad0D;

static __thread int intRadSD_size;
static __thread float *intRadSD;

static __thread GBReal *psiSumD;  // host-mapped memory

static __thread int tmp_psiSumD_size;
static __thread GBReal *tmp_psiSumD;

static __thread int bornRadD_size;
static __thread float *bornRadD;

static __thread GBReal *dEdaSumD;  // host-mapped memory

static __thread int tmp_dEdaSumD_size;
static __thread GBReal *tmp_dEdaSumD;

static __thread int dHdrPrefixD_size;
static __thread float *dHdrPrefixD;

static __thread int GBIS_P1_counters_size;
static __thread unsigned int *GBIS_P1_counters;

static __thread int GBIS_P2_counters_size;
static __thread unsigned int *GBIS_P2_counters;

static __thread int GBIS_P3_counters_size;
static __thread unsigned int *GBIS_P3_counters;

static __thread float *energy_gbis;  // host-mapped memory

static __thread int tmp_energy_gbis_size;
static __thread float *tmp_energy_gbis;

__thread int max_grid_size;

__thread cudaStream_t stream;
__thread cudaStream_t stream2;

void cuda_init() {
  patch_pairs_size = 0;
  patch_pairs = NULL;

  atom_params_size = 0;
  atom_params = NULL;

  vdw_types_size = 0;
  vdw_types = NULL;

  atoms_size = 0;
  atoms = NULL;

  tmpforces_size = 0;
  tmpforces = NULL;

  slow_tmpforces_size = 0;
  slow_tmpforces = NULL;

  tmpvirials_size = 0;
  tmpvirials = NULL;

  slow_tmpvirials_size = 0;
  slow_tmpvirials = NULL;

  global_counters_size = 0;
  global_counters = NULL;

  plist_size = 0;
  plist = NULL;

  exclmasks_size = 0;
  exclmasks = NULL;

  forces = NULL;
  slow_forces = NULL;

  force_ready_queue = NULL;

  exclusions_size = 0;
  exclusions = NULL;

  // --------------------
  // For GBIS
  // --------------------
  intRad0D_size = 0;
  intRad0D = NULL;

  intRadSD_size = 0;
  intRadSD = NULL;

  psiSumD = NULL;  // host-mapped memory

  tmp_psiSumD_size = 0;
  tmp_psiSumD = NULL;

  bornRadD_size = 0;
  bornRadD = NULL;

  dEdaSumD = NULL;  // host-mapped memory

  tmp_dEdaSumD_size = 0;
  tmp_dEdaSumD = NULL;

  dHdrPrefixD_size = 0;
  dHdrPrefixD = NULL;

  GBIS_P1_counters_size = 0;
  GBIS_P1_counters = NULL;

  GBIS_P2_counters_size = 0;
  GBIS_P2_counters = NULL;

  GBIS_P3_counters_size = 0;
  GBIS_P3_counters = NULL;

  energy_gbis = NULL;  // host-mapped memory

  tmp_energy_gbis_size = 0;
  tmp_energy_gbis = NULL;

  int dev;
  cudaGetDevice(&dev);
  cuda_errcheck("cudaGetDevice");
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  cuda_errcheck("cudaGetDeviceProperties");
  max_grid_size = deviceProp.maxGridSize[1];
}

void cuda_bind_patch_pairs(patch_pair *h_patch_pairs, int npatch_pairs,
                           int npatches, int natoms, int plist_len,
                           int nexclmask) {
  num_patches = npatches;
  num_virials = npatches;
  num_atoms = natoms;
  reallocate_device<patch_pair>(&patch_pairs, &patch_pairs_size, npatch_pairs, 1.2f);
  reallocate_device<atom>(&atoms, &atoms_size, num_atoms, 1.2f);
  reallocate_device<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
  reallocate_device<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
  reallocate_device<unsigned int>(&global_counters, &global_counters_size, num_patches+2, 1.2f);
  reallocate_device<float4>(&tmpforces, &tmpforces_size, num_atoms, 1.2f);
  reallocate_device<float4>(&slow_tmpforces, &slow_tmpforces_size, num_atoms, 1.2f);
  reallocate_device<unsigned int>(&plist, &plist_size, plist_len, 1.2f);
  reallocate_device<exclmask>(&exclmasks, &exclmasks_size, nexclmask, 1.2f);
  reallocate_device<float>(&tmpvirials, &tmpvirials_size, num_patches*16, 1.2f);
  reallocate_device<float>(&slow_tmpvirials, &slow_tmpvirials_size, num_patches*16, 1.2f);

  // For GBIS
  reallocate_device<unsigned int>(&GBIS_P1_counters, &GBIS_P1_counters_size, num_patches, 1.2f);
  reallocate_device<unsigned int>(&GBIS_P2_counters, &GBIS_P2_counters_size, num_patches, 1.2f);
  reallocate_device<unsigned int>(&GBIS_P3_counters, &GBIS_P3_counters_size, num_patches, 1.2f);
  reallocate_device<float>(&intRad0D, &intRad0D_size, num_atoms, 1.2f);
  reallocate_device<float>(&intRadSD, &intRadSD_size, num_atoms, 1.2f);
  reallocate_device<GBReal>(&tmp_psiSumD, &tmp_psiSumD_size, num_atoms, 1.2f);
  reallocate_device<float>(&bornRadD, &bornRadD_size, num_atoms, 1.2f);
  reallocate_device<GBReal>(&tmp_dEdaSumD, &tmp_dEdaSumD_size, num_atoms, 1.2f);
  reallocate_device<float>(&dHdrPrefixD, &dHdrPrefixD_size, num_atoms, 1.2f);
  reallocate_device<float>(&tmp_energy_gbis, &tmp_energy_gbis_size, num_patches, 1.2f);

  cudaMemcpy(patch_pairs, h_patch_pairs, npatch_pairs*sizeof(patch_pair), cudaMemcpyHostToDevice);
  cuda_errcheck("memcpy to patch_pairs");

  cudaMemset(global_counters, 0, (num_patches+2)*sizeof(unsigned int));
  cuda_errcheck("memset global_counters");

  cudaMemset(GBIS_P1_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P1_counters");

  cudaMemset(GBIS_P2_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P2_counters");

  cudaMemset(GBIS_P3_counters, 0, num_patches*sizeof(unsigned int));
  cuda_errcheck("memset GBIS_P3_counters");

  cudaMemset(tmpforces, 0, num_atoms*sizeof(float4));
  cuda_errcheck("memset tmpforces");

  cudaMemset(tmpvirials, 0, num_patches*sizeof(float)*16);
  cuda_errcheck("memset tmpvirials");

  cudaMemset(slow_tmpforces, 0, num_atoms*sizeof(float4));
  cuda_errcheck("memset slow_tmpforces");

  cudaMemset(slow_tmpvirials, 0, num_patches*sizeof(float)*16);
  cuda_errcheck("memset slow_tmpvirials");
}

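// Illustrative note (not part of the original file): reallocate_device<T> is a
// grow-only device-buffer helper declared in CudaUtils.h. A minimal sketch of the
// semantics relied on above, under an assumed simplified signature (the real
// helper may differ in detail):
#if 0
template <class T>
void reallocate_device(T **pp, int *curlen, int newlen, float fac) {
  if ( *pp != NULL && *curlen >= newlen ) return;   // current buffer is big enough
  if ( *pp != NULL ) cudaFree(*pp);                 // grow: drop the old buffer
  *curlen = (int) (fac * newlen);                   // over-allocate by the factor
  cudaMalloc((void **) pp, (*curlen) * sizeof(T));  // contents are NOT preserved
}
#endif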
void cuda_bind_atom_params(const atom_param *t) {
  cudaMemcpyAsync(atom_params, t, num_atoms * sizeof(atom_param),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atom_params");
}

void cuda_bind_vdw_types(const int *t) {
  cudaMemcpyAsync(vdw_types, t, num_atoms * sizeof(int),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to vdw_types");
}

void cuda_bind_atoms(const atom *a) {
  cuda_errcheck("before memcpy to atoms");
  cudaMemcpyAsync(atoms, a, num_atoms * sizeof(atom),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to atoms");
}

void cuda_bind_forces(float4 *f, float4 *f_slow) {
  cudaHostGetDevicePointer((void **)&forces, f, 0);
  cuda_errcheck("cudaHostGetDevicePointer forces");
  cudaHostGetDevicePointer((void **)&slow_forces, f_slow, 0);
  cuda_errcheck("cudaHostGetDevicePointer slow_forces");
}

void cuda_bind_virials(float *v, int *queue, int *blockorder) {
  cudaHostGetDevicePointer((void **)&virials, v, 0);
  cuda_errcheck("cudaHostGetDevicePointer virials");
  slow_virials = virials + num_virials*16;
  cudaHostGetDevicePointer((void **)&force_ready_queue, queue, 0);
  cuda_errcheck("cudaHostGetDevicePointer force_ready_queue");
  cudaHostGetDevicePointer((void **)&block_order, blockorder, 0);
  cuda_errcheck("cudaHostGetDevicePointer block_order");
}

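// Illustrative note (not part of the original file): cudaHostGetDevicePointer only
// works on page-locked host buffers allocated with the cudaHostAllocMapped flag
// (zero-copy memory); the ComputeNonbondedCUDA class is expected to provide such
// buffers. A minimal host-side sketch with a hypothetical atom count natoms:
#if 0
  float4 *host_forces, *host_slow_forces;
  cudaHostAlloc((void **)&host_forces,      natoms*sizeof(float4), cudaHostAllocMapped);
  cudaHostAlloc((void **)&host_slow_forces, natoms*sizeof(float4), cudaHostAllocMapped);
  cuda_bind_forces(host_forces, host_slow_forces);  // kernels then write straight into host memory
#endif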
// GBIS bindings
void cuda_bind_GBIS_energy(float *e) {
  cudaHostGetDevicePointer((void **)&energy_gbis, e, 0);
  cuda_errcheck("cudaHostGetDevicePointer energy_gbis");
}

void cuda_bind_GBIS_intRad(float *intRad0H, float *intRadSH) {
  cudaMemcpyAsync(intRad0D, intRad0H, num_atoms * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(intRadSD, intRadSH, num_atoms * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to intRad");
}

void cuda_bind_GBIS_psiSum(GBReal *psiSumH) {
  cudaHostGetDevicePointer((void **)&psiSumD, psiSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer psiSum");
}

void cuda_bind_GBIS_bornRad(float *bornRadH) {
  cudaMemcpyAsync(bornRadD, bornRadH, num_atoms * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to bornRad");
}

void cuda_bind_GBIS_dEdaSum(GBReal *dEdaSumH) {
  cudaHostGetDevicePointer((void **)&dEdaSumD, dEdaSumH, 0);
  cuda_errcheck("cudaHostGetDevicePointer dEdaSum");
}

void cuda_bind_GBIS_dHdrPrefix(float *dHdrPrefixH) {
  cudaMemcpyAsync(dHdrPrefixD, dHdrPrefixH, num_atoms * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cuda_errcheck("memcpy to dHdrPrefix");
}
// end GBIS methods

#if 0
void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
  // printf("load forces %d %d %d\n",begin,count,num_atoms);
  cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
                  cudaMemcpyDeviceToHost, stream);
  if ( f_slow ) {
    cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
                    cudaMemcpyDeviceToHost, stream);
  }
  cuda_errcheck("memcpy from forces");
}

void cuda_load_virials(float *v, int doSlow) {
  int count = force_lists_size;
  if ( doSlow ) count *= 2;
  cudaMemcpyAsync(v, virials, count * 16*sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cuda_errcheck("memcpy from virials");
}
#endif

#if 0
__host__ __device__ static int3 patch_coords_from_id(
    dim3 PATCH_GRID, int id) {

  return make_int3( id % PATCH_GRID.x,
                    ( id / PATCH_GRID.x ) % PATCH_GRID.y,
                    id / ( PATCH_GRID.x * PATCH_GRID.y ) );
}

__host__ __device__ static int patch_id_from_coords(
    dim3 PATCH_GRID, int3 coords) {

  // handles periodic boundaries
  int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
  int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
  int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;

  return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
}

__host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {

  // int3 coords = patch_coords_from_id(make_uint3(3,3,3), 13 + neighbor);
  int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
  return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);

}
#endif

#define BLOCK_SIZE 128

#define MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef MAKE_PAIRLIST
#define DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_SLOW
#define DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"
#undef DO_ENERGY
#include "ComputeNonbondedCUDAKernelBase.h"

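// Illustrative note (not part of the original file): the repeated inclusion above
// is a textual template trick -- each inclusion of the kernel body header
// instantiates one __global__ kernel whose name and body are selected by the
// MAKE_PAIRLIST / DO_SLOW / DO_ENERGY macros, so eight inclusions yield the eight
// dev_nonbonded* variants dispatched in cuda_nonbonded_forces() below. A minimal
// sketch of the pattern, using a hypothetical header "kernel_variant.h":
#if 0
// ---- kernel_variant.h ----
#ifdef DO_ENERGY
#define KERNEL_NAME dev_example_energy
#else
#define KERNEL_NAME dev_example
#endif
__global__ void KERNEL_NAME(float4 *out) {
  // shared body; the energy accumulation is compiled in only when DO_ENERGY is set
}
#undef KERNEL_NAME
// ---- including file ----
#define DO_ENERGY
#include "kernel_variant.h"   // instantiates dev_example_energy
#undef DO_ENERGY
#include "kernel_variant.h"   // instantiates dev_example
#endif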
void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc,
                           float cutoff2, float plcutoff2,
                           int cbegin, int ccount, int ctotal,
                           int doSlow, int doEnergy, int usePairlists, int savePairlists,
                           int doStreaming, int saveOrder, cudaStream_t &strm) {

  if ( ccount ) {
    if ( usePairlists ) {
      if ( ! savePairlists ) plcutoff2 = 0.;
    } else {
      plcutoff2 = cutoff2;
    }

    cudaMemsetAsync(tmpforces, 0, num_atoms*sizeof(float4), strm);
    cudaMemsetAsync(tmpvirials, 0, num_patches*sizeof(float)*16, strm);
    if ( doSlow ) {
      cudaMemsetAsync(slow_tmpforces, 0, num_atoms*sizeof(float4), strm);
      cudaMemsetAsync(slow_tmpvirials, 0, num_patches*sizeof(float)*16, strm);
    }

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if ( grid_dim > ccount - cstart ) grid_dim = ccount - cstart;

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
#define CALL(X) X<<< grid_dim, nthread3, 0, strm >>> ( \
  patch_pairs, atoms, atom_params, vdw_types, plist, \
  tmpforces, (doSlow?slow_tmpforces:NULL), \
  forces, (doSlow?slow_forces:NULL), \
  tmpvirials, (doSlow?slow_tmpvirials:NULL), \
  virials, (doSlow?slow_virials:NULL), \
  global_counters, (doStreaming?force_ready_queue:NULL), \
  overflow_exclusions, num_patches, \
  cbegin+cstart, ctotal, (saveOrder?block_order:NULL), \
  exclmasks, lj_table_size, \
  lata, latb, latc, cutoff2, plcutoff2, doSlow)

//end definition

      if ( doEnergy ) {
        if ( doSlow ) {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_energy_pairlist);
          else CALL(dev_nonbonded_slow_energy);
        } else {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_energy_pairlist);
          else CALL(dev_nonbonded_energy);
        }
      } else {
        if ( doSlow ) {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_slow_pairlist);
          else CALL(dev_nonbonded_slow);
        } else {
          if ( plcutoff2 != 0. ) CALL(dev_nonbonded_pairlist);
          else CALL(dev_nonbonded);
        }
      }

      cuda_errcheck("dev_nonbonded");
    }
  }

}

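// Illustrative note (not part of the original file): a host-side call sketch for
// cuda_nonbonded_forces(), with hypothetical lattice vectors, cutoffs, and patch-pair
// count; the actual driver is the ComputeNonbondedCUDA class, which also performs the
// cuda_bind_* setup calls above.
#if 0
  float3 lata = make_float3(60.f, 0.f, 0.f);
  float3 latb = make_float3(0.f, 60.f, 0.f);
  float3 latc = make_float3(0.f, 0.f, 60.f);
  float cutoff2   = 12.f * 12.f;     // nonbonded cutoff, squared
  float plcutoff2 = 13.5f * 13.5f;   // pairlist cutoff, squared
  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
                        0 /*cbegin*/, npatch_pairs /*ccount*/, npatch_pairs /*ctotal*/,
                        1 /*doSlow*/, 1 /*doEnergy*/, 1 /*usePairlists*/, 1 /*savePairlists*/,
                        0 /*doStreaming*/, 0 /*saveOrder*/, stream);
#endif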
// import GBIS kernel definitions
#include "ComputeGBISCUDAKernel.h"

// GBIS P1
void cuda_GBIS_P1(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float rho_0,
  float3 lata,
  float3 latb,
  float3 latc,
  cudaStream_t &strm
) {

  if ( ccount ) {
    cudaMemsetAsync(tmp_psiSumD, 0, num_atoms*sizeof(GBReal), strm);

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if (grid_dim > ccount - cstart) {
        grid_dim = ccount - cstart;
      }

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
      GBIS_P1_Kernel<<<grid_dim, nthread3, 0, strm>>>(
        patch_pairs+cbegin+cstart,
        atoms,
        intRad0D,
        intRadSD,
        tmp_psiSumD,
        psiSumD,
        a_cut,
        rho_0,
        lata,
        latb,
        latc,
        GBIS_P1_counters
        );
      cuda_errcheck("dev_GBIS_P1");
    } // end for
  }
} // end GBIS P1

// GBIS P2
void cuda_GBIS_P2(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float r_cut,
  float scaling,
  float kappa,
  float smoothDist,
  float epsilon_p,
  float epsilon_s,
  float3 lata,
  float3 latb,
  float3 latc,
  int doEnergy,
  int doFullElec,
  cudaStream_t &strm
) {

  if ( ccount ) {
    cudaMemsetAsync(tmp_dEdaSumD, 0, num_atoms*sizeof(GBReal), strm);
    cudaMemsetAsync(tmp_energy_gbis, 0, num_patches*sizeof(float), strm);

    int grid_dim = max_grid_size;  // maximum allowed
    for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
      if (grid_dim > ccount - cstart)
        grid_dim = ccount - cstart;

      dim3 nthread3(WARPSIZE, NUM_WARP, 1);
      GBIS_P2_Kernel<<<grid_dim, nthread3, 0, strm>>>(
        patch_pairs+cbegin+cstart,
        atoms,
        bornRadD,
        tmp_dEdaSumD,
        dEdaSumD,
        a_cut,
        r_cut,
        scaling,
        kappa,
        smoothDist,
        epsilon_p,
        epsilon_s,
        lata,
        latb,
        latc,
        doEnergy,
        doFullElec,
        tmpforces,
        forces,
        tmp_energy_gbis,
        energy_gbis,
        GBIS_P2_counters
        );
      cuda_errcheck("dev_GBIS_P2");
    } // end for
  }
} // end P2

// GBIS P3
void cuda_GBIS_P3(
  int cbegin,
  int ccount,
  int pbegin,
  int pcount,
  float a_cut,
  float rho_0,
  float scaling,
  float3 lata,
  float3 latb,
  float3 latc,
  cudaStream_t &strm
) {
  int grid_dim = max_grid_size;  // maximum allowed
  for ( int cstart = 0; cstart < ccount; cstart += grid_dim ) {
    if (grid_dim > ccount - cstart)
      grid_dim = ccount - cstart;

    dim3 nthread3(WARPSIZE, NUM_WARP, 1);
    GBIS_P3_Kernel<<<grid_dim, nthread3, 0, strm>>>(
      patch_pairs+cbegin+cstart,
      atoms,
      intRad0D,
      intRadSD,
      dHdrPrefixD,
      a_cut,
      rho_0,
      scaling,
      lata,
      latb,
      latc,
      slow_tmpforces,
      slow_forces,
      GBIS_P3_counters
      );
    cuda_errcheck("dev_GBIS_P3");
  }
}

#if 0
int cuda_stream_finished() {
  return ( cudaStreamQuery(stream) == cudaSuccess );
}
#endif


#else  // NAMD_CUDA

// for make depends
#include "ComputeNonbondedCUDAKernelBase.h"
#include "ComputeGBISCUDAKernel.h"

#endif  // NAMD_CUDA