NAMD
ComputeGBISCUDAKernel.h
1 #include <stdio.h>
2 #define GBIS_CUDA
3 #include "ComputeGBIS.inl"
4 
5 #undef KEPLER_SHUFFLE
6 #ifdef __CUDA_ARCH__
7 #define KEPLER_SHUFFLE
8 #if __CUDA_ARCH__ < 300
9 #undef KEPLER_SHUFFLE
10 #endif
11 #endif
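      // KEPLER_SHUFFLE is defined only when compiling device code for sm_30 or newer;
      // it selects warp-shuffle exchange of j-atom data through registers instead of
      // staging the j atoms in shared memory.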
12 
13 //1111111111111111111111111111111111111111111111111111111111
14 //
15 // GBIS Phase 1 CUDA Kernel
16 //
17 //1111111111111111111111111111111111111111111111111111111111
18 __global__ static void GBIS_P1_Kernel (
19  const patch_pair *patch_pairs, // atoms pointers and such
20  const atom *atoms, // position & charge
21  const float *intRad0, // read in intrinsic radius
22  const float *intRadS, // read in intrinsic radius
23  GBReal *tmp_psiSum, // temporary device memory
24  GBReal *psiSum, // host-mapped memory
25  const float a_cut, // P1 interaction cutoff
26  const float rho_0, // H(i,j) parameter
27  float3 lata,
28  float3 latb,
29  float3 latc,
30  unsigned int *P1_counters
31 ) {
32 
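      // One thread block handles one patch pair: each of the NUM_WARP warps takes a
      // WARPSIZE-atom chunk of patch 1 and sweeps all of patch 2 in WARPSIZE-atom tiles,
      // accumulating psi sums for both the i and j atoms of every pair.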
33  // shared memory
34  __shared__ GBReal sh_psiSumJ_2d[NUM_WARP][WARPSIZE];
35  __shared__ patch_pair sh_patch_pair;
36 #ifndef KEPLER_SHUFFLE
37  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
38  __shared__ float sh_intRad0j_2d[NUM_WARP][WARPSIZE];
39 #endif
40 
41  volatile GBReal* sh_psiSumJ = sh_psiSumJ_2d[threadIdx.y];
42 #ifndef KEPLER_SHUFFLE
43  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
44  volatile float* sh_intRad0j = sh_intRad0j_2d[threadIdx.y];
45 #endif
46 
47  // Load data into shared memory
48  {
49  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
50  if (t < PATCH_PAIR_SIZE) {
51  int* src = (int *)&patch_pairs[blockIdx.x];
52  int* dst = (int *)&sh_patch_pair;
53  dst[t] = src[t];
54  }
55  BLOCK_SYNC;
56 
57  // convert scaled offset with current lattice and write into shared memory
58  if (t == 0) {
59  float offx = sh_patch_pair.offset.x * lata.x
60  + sh_patch_pair.offset.y * latb.x
61  + sh_patch_pair.offset.z * latc.x;
62  float offy = sh_patch_pair.offset.x * lata.y
63  + sh_patch_pair.offset.y * latb.y
64  + sh_patch_pair.offset.z * latc.y;
65  float offz = sh_patch_pair.offset.x * lata.z
66  + sh_patch_pair.offset.y * latb.z
67  + sh_patch_pair.offset.z * latc.z;
68  sh_patch_pair.offset.x = offx;
69  sh_patch_pair.offset.y = offy;
70  sh_patch_pair.offset.z = offz;
71  }
72  BLOCK_SYNC;
73  }
74 
75  //iterate over chunks of atoms within Patch 1
76  for (int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE) {
77 
78  int nloopi = sh_patch_pair.patch1_size - blocki;
79  nloopi = min(nloopi, WARPSIZE);
80 
81  //this thread accumulates only the psi sum for atomi; iterating over js
82  atom atomi;
83  float intRadSi;
84  int i;
85 
86  // load BLOCK of Patch i atoms
87  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
88  i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
89  float4 tmpa = ((float4*)atoms)[i];
90  atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
91  atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
92  atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
93  atomi.charge = intRad0[i]; // overwrite charge with radius
94  intRadSi = intRadS[i];
95  } // load patch 1
96 
97  //init intermediate variables
98  GBReal psiSumI = 0.f; // each thread accumulating single psi
99 
100  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
101  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
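      // For a patch paired with itself (no periodic offset), the j loop below starts at
      // blocki, so only the upper-triangular tiles are visited and each tile pair is
      // processed once.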
102  int blockj = (diag_patch_pair) ? blocki : 0;
103 
104  //iterate over chunks of atoms within Patch 2
105  for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {
106 
107  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
108 
109 #ifdef KEPLER_SHUFFLE
110  float xj;
111  float yj;
112  float zj;
113  float chargej;
114  float intRad0j_val;
115 #endif
116 
117  //load a WARPSIZE chunk of j atoms: coordinates and radii (registers on Kepler+, shared memory otherwise)
118  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
119  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
120  float4 tmpa = ((float4*)atoms)[j];
121 #ifdef KEPLER_SHUFFLE
122  xj = tmpa.x;
123  yj = tmpa.y;
124  zj = tmpa.z;
125  chargej = intRadS[j];
126  intRad0j_val = intRad0[j];
127 #else
128  sh_jpq[threadIdx.x].position.x = tmpa.x;
129  sh_jpq[threadIdx.x].position.y = tmpa.y;
130  sh_jpq[threadIdx.x].position.z = tmpa.z;
131  sh_jpq[threadIdx.x].charge = intRadS[j];
132  sh_intRad0j[threadIdx.x] = intRad0[j];
133 #endif
134  }
135 
136  const bool diag_tile = diag_patch_pair && (blocki == blockj);
137  const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
138 
139  sh_psiSumJ[threadIdx.x] = 0.f;
140 
141  //each thread loop over shared atoms
142  int t = diag_tile ? 1 : 0;
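      // On a diagonal tile (same chunk of the same patch) start at t = 1 and use
      // modval = 2*WARPSIZE so j = threadIdx.x + t never wraps; with the j < nloopj
      // test below, each lane then only handles j > threadIdx.x. Every unordered pair
      // is evaluated exactly once (both hij and hji are accumulated) and the i == j
      // self term is skipped. The pre-rotation below keeps the shuffle registers in
      // step with the t = 1 start.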
143 #ifdef KEPLER_SHUFFLE
144  if (diag_tile) {
145  xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
146  yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
147  zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
148  chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
149  intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
150  }
151 #endif
152 
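      // Each of the WARPSIZE iterations pairs this lane's atom i with one j atom of the
      // tile. With KEPLER_SHUFFLE the j data circulates through the warp (every lane
      // reads from lane threadIdx.x+1 each pass); otherwise each lane indexes the
      // shared-memory copy directly.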
153  for (; t < WARPSIZE; ++t ) {
154  int j = (t + threadIdx.x) % modval;
155 #ifndef KEPLER_SHUFFLE
156  float xj = sh_jpq[j].position.x;
157  float yj = sh_jpq[j].position.y;
158  float zj = sh_jpq[j].position.z;
159  float chargej = sh_jpq[j].charge;
160  float intRad0j_val = sh_intRad0j[j];
161 #endif
162  if (j < nloopj && threadIdx.x < nloopi)
163  {
164  float dx = atomi.position.x - xj;
165  float dy = atomi.position.y - yj;
166  float dz = atomi.position.z - zj;
167  float r2 = dx*dx + dy*dy + dz*dz;
168 
169  // within cutoff and not the same atom
170  if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
171  // calculate H(i,j) and H(j,i) for this pair
172  float r_i = 1.f / sqrt(r2);
173  float r = r2 * r_i;
174  float hij;
175  int dij;
176  CalcH(r,r2,r_i,a_cut,atomi.charge,chargej,hij,dij);
177  psiSumI += hij;
178  float hji;
179  int dji;
180  CalcH(r,r2,r_i,a_cut,intRad0j_val,intRadSi,hji,dji);
181  sh_psiSumJ[j] += hji;
182  } // cutoff
183  } // if (j < nloopj)
184 #ifdef KEPLER_SHUFFLE
185  xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
186  yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
187  zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
188  chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
189  intRad0j_val = WARP_SHUFFLE(WARP_FULL_MASK, intRad0j_val, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
190 #endif
191  } // for t
192 
193  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
194  int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
195  atomicAdd(&tmp_psiSum[i_out], sh_psiSumJ[threadIdx.x]);
196  }
197 
198  } // for block j
199  //psiSumI now contains contributions from all j in 2nd patch
200 
201  // write psiSum to global memory buffer; to be accumulated later
202  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
203  int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
204  atomicAdd(&tmp_psiSum[i_out], psiSumI);
205  }
206  } // for block i
207 
208  { // start of psiSum reduction
209  // make sure psiSums are visible in global memory
210  __threadfence();
211  BLOCK_SYNC;
212 
213  // Mark patch pair (patch1_ind, patch2_ind) as "done"
214  int patch1_ind = sh_patch_pair.patch1_ind;
215  int patch2_ind = sh_patch_pair.patch2_ind;
216 
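      // atomicInc wraps to 0 once the counter reaches patch_num_pairs-1, so the thread
      // block performing the final increment for a patch sees old+1 == num_pairs. Only
      // that block copies the patch's results from the temporary device buffer to the
      // host-mapped buffer, and the counter is left at 0 for the next launch.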
217  if (threadIdx.x == 0 && threadIdx.y == 0) {
218  sh_patch_pair.patch_done[0] = false;
219  sh_patch_pair.patch_done[1] = false;
220 
221  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
222  int patch1_old = atomicInc(&P1_counters[patch1_ind], patch1_num_pairs-1);
223  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
224  if (patch1_ind != patch2_ind) {
225  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
226  int patch2_old = atomicInc(&P1_counters[patch2_ind], patch2_num_pairs-1);
227  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
228  }
229  }
230  // sync threads so that patch1_done and patch2_done are visible to all threads
231  BLOCK_SYNC;
232 
233  if (sh_patch_pair.patch_done[0]) {
234  const int start = sh_patch_pair.patch1_start;
235  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
236  psiSum[start+i] = tmp_psiSum[start+i];
237  }
238  }
239 
240  if (sh_patch_pair.patch_done[1]) {
241  const int start = sh_patch_pair.patch2_start;
242  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
243  psiSum[start+i] = tmp_psiSum[start+i];
244  }
245  }
246 
247  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
248  // Make sure page-locked host psiSums are up-to-date
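      // __threadfence_system() (available from sm_20 on) makes these writes visible to
      // the host through the mapped, page-locked buffer; sm_1x falls back to __threadfence().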
249 #if __CUDA_ARCH__ < 200
250  __threadfence();
251 #else
252  __threadfence_system();
253 #endif
254  }
255 
256  } // end of psiSum reduction
257 } //GBIS_P1
258 
259 //2222222222222222222222222222222222222222222222222222222222
260 //
261 // GBIS Phase 2 CUDA Kernel
262 //
263 //2222222222222222222222222222222222222222222222222222222222
264 __global__ static void GBIS_P2_Kernel (
265  const patch_pair *patch_pairs,// atoms pointers and such
266  const atom *atoms, // position & charge
267  const float *bornRad, // read in Born radius
268  GBReal *tmp_dEdaSum, // temporary device memory
269  GBReal *dEdaSum, // host-mapped memory
270  const float a_cut, // P1 interaction cutoff
271  const float r_cut, // P2 interaction cutoff
272  const float scaling, // scale nonbonded
273  const float kappa,
274  const float smoothDist, // > 0 enables interaction cutoff smoothing
275  const float epsilon_p, // protein dielectric
276  const float epsilon_s, // solvent dielectric
277  float3 lata,
278  float3 latb,
279  float3 latc,
280  const int doEnergy, // calculate energy too?
281  const int doFullElec, // calc dEdaSum for P3 full electrostatics
282  float4 *tmp_forces, // temporary device memory
283  float4 *forces, // host-mapped memory
284  float *tmp_energy, // temporary device memory
285  float *energy, // host-mapped memory
286  unsigned int *P2_counters
287 ) {
288 
289  // Shared memory
290  __shared__ patch_pair sh_patch_pair;
291 #ifndef KEPLER_SHUFFLE
292  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
293  __shared__ float sh_jBornRad_2d[NUM_WARP][WARPSIZE];
294 #endif
295  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];
296  __shared__ float sh_dEdaSumJ_2d[NUM_WARP][WARPSIZE];
297 
298 #ifndef KEPLER_SHUFFLE
299  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
300  volatile float* sh_jBornRad = sh_jBornRad_2d[threadIdx.y];
301 #endif
302  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
303  volatile float* sh_dEdaSumJ = sh_dEdaSumJ_2d[threadIdx.y];
304 
305  // Load data into shared memory
306  {
307  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
308  if (t < PATCH_PAIR_SIZE) {
309  int* src = (int *)&patch_pairs[blockIdx.x];
310  int* dst = (int *)&sh_patch_pair;
311  dst[t] = src[t];
312  }
313  BLOCK_SYNC;
314 
315  // convert scaled offset with current lattice and write into shared memory
316  if (t == 0) {
317  float offx = sh_patch_pair.offset.x * lata.x
318  + sh_patch_pair.offset.y * latb.x
319  + sh_patch_pair.offset.z * latc.x;
320  float offy = sh_patch_pair.offset.x * lata.y
321  + sh_patch_pair.offset.y * latb.y
322  + sh_patch_pair.offset.z * latc.y;
323  float offz = sh_patch_pair.offset.x * lata.z
324  + sh_patch_pair.offset.y * latb.z
325  + sh_patch_pair.offset.z * latc.z;
326  sh_patch_pair.offset.x = offx;
327  sh_patch_pair.offset.y = offy;
328  sh_patch_pair.offset.z = offz;
329  }
330  BLOCK_SYNC;
331  }
332 
333  float energyT = 0.f; // total energy for this thread; to be reduced
334 
335  //values used in loop
336  float r_cut2 = r_cut*r_cut;
337  float r_cut_2 = 1.f / r_cut2;
338  float r_cut_4 = 4.f*r_cut_2*r_cut_2;
339  float epsilon_s_i = 1.f / epsilon_s;
340  float epsilon_p_i = 1.f / epsilon_p;
341 
342  //iterate over chunks of atoms within Patch 1
343  for ( int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += BLOCK_SIZE ) {
344 
345  int nloopi = sh_patch_pair.patch1_size - blocki;
346  nloopi = min(nloopi, WARPSIZE);
347 
348  //this thread accumulates the force and dEdaSum for atomi; iterating over js
349  atom atomi;
350  float bornRadI;
351  int i;
352 
353  // load BLOCK of Patch i atoms
354  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
355  i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
356  float4 tmpa = ((float4*)atoms)[i];
357  atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
358  atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
359  atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
360  atomi.charge = - tmpa.w * scaling;
361  bornRadI = bornRad[i];
362  } // load patch 1
363 
364  //init intermediate variables
365  GBReal dEdaSumI = 0.f; // each thread accumulates a single dEdaSum
366  float3 forceI;
367  forceI.x = 0.f;
368  forceI.y = 0.f;
369  forceI.z = 0.f;
370 
371  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
372  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
373  int blockj = (diag_patch_pair) ? blocki : 0;
374  //iterate over chunks of atoms within Patch 2
375  for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE) {
376 
377  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
378 
379 #ifdef KEPLER_SHUFFLE
380  float xj;
381  float yj;
382  float zj;
383  float chargej;
384  float bornRadJ;
385 #endif
386 
387  //load a WARPSIZE chunk of j atoms: coordinates, charge, Born radius (registers on Kepler+, shared memory otherwise)
388  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
389  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
390  float4 tmpa = ((float4*)atoms)[j];
391 #ifdef KEPLER_SHUFFLE
392  xj = tmpa.x;
393  yj = tmpa.y;
394  zj = tmpa.z;
395  chargej = tmpa.w;
396  bornRadJ = bornRad[j];
397 #else
398  sh_jpq[threadIdx.x].position.x = tmpa.x;
399  sh_jpq[threadIdx.x].position.y = tmpa.y;
400  sh_jpq[threadIdx.x].position.z = tmpa.z;
401  sh_jpq[threadIdx.x].charge = tmpa.w;
402  sh_jBornRad[threadIdx.x] = bornRad[j];
403 #endif
404  }
405 
406  sh_forceJ[threadIdx.x].x = 0.f;
407  sh_forceJ[threadIdx.x].y = 0.f;
408  sh_forceJ[threadIdx.x].z = 0.f;
409  sh_dEdaSumJ[threadIdx.x] = 0.f;
410 
411  const bool diag_tile = diag_patch_pair && (blocki == blockj);
412  const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
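      // Unlike phases 1 and 3, the j loop starts at t = 0 even on diagonal tiles: the
      // t = 0 pass has j == threadIdx.x (r2 < 0.01f), which contributes the GB
      // self-energy term below.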
413  for (int t=0; t < WARPSIZE; ++t ) {
414  int j = (t + threadIdx.x) % modval;
415 #ifndef KEPLER_SHUFFLE
416  float xj = sh_jpq[j].position.x;
417  float yj = sh_jpq[j].position.y;
418  float zj = sh_jpq[j].position.z;
419  float chargej = sh_jpq[j].charge;
420  float bornRadJ = sh_jBornRad[j];
421 #endif
422  if (j < nloopj && threadIdx.x < nloopi)
423  {
424  float dx = atomi.position.x - xj;
425  float dy = atomi.position.y - yj;
426  float dz = atomi.position.z - zj;
427  float r2 = dx*dx + dy*dy + dz*dz;
428 
429  // within cutoff and not the same atom
430  if (r2 < r_cut2 && r2 > 0.01f) {
431 
432  float r_i = 1.f / sqrt(r2);
433  float r = r2 * r_i;
434  //float bornRadJ = sh_jBornRad[j];
435 
436  //calculate GB energy
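      // Generalized Born pair energy in the code's variables:
      //   fij   = sqrt(r2 + ai*aj*exp(-r2/(4*ai*aj)))
      //   gbEij = qi*qj*(1/epsilon_p - exp(-kappa*fij)/epsilon_s)/fij
      // with kappa providing the salt (Debye-Hueckel) screening.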
437  float qiqj = atomi.charge*chargej;
438  float aiaj = bornRadI*bornRadJ;
439  float aiaj4 = 4*aiaj;
440  float expr2aiaj4 = exp(-r2/aiaj4);
441  float fij = sqrt(r2+aiaj*expr2aiaj4);
442  float f_i = 1/fij;
443  float expkappa = exp(-kappa*fij);
444  float Dij = epsilon_p_i - expkappa*epsilon_s_i;
445  float gbEij = qiqj*Dij*f_i;
446 
447  //calculate energy derivatives
448  float ddrfij = r*f_i*(1.f - 0.25f*expr2aiaj4);
449  float ddrf_i = -ddrfij*f_i*f_i;
450  float ddrDij = kappa*expkappa*ddrfij*epsilon_s_i;
451  float ddrGbEij = qiqj*(ddrDij*f_i+Dij*ddrf_i);
452 
453  //NAMD smoothing function
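      // scale = (1 - r2/r_cut2)^2 takes the pair energy smoothly to zero at r_cut;
      // ddrScale is d(scale)/dr = 4*r*(r2 - r_cut2)/r_cut^4 (r_cut_4 holds 4/r_cut^4).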
454  float scale = 1.f;
455  float ddrScale = 0.f;
456  float forcedEdr;
457  if (smoothDist > 0.f) {
458  scale = r2 * r_cut_2 - 1.f;
459  scale *= scale;
460  ddrScale = r*(r2-r_cut2)*r_cut_4;
461  energyT += gbEij * scale;
462  forcedEdr = -(ddrGbEij)*scale-(gbEij)*ddrScale;
463  } else {
464  energyT += gbEij;
465  forcedEdr = -ddrGbEij;
466  }
467 
468  //add dEda
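      // Accumulate dE/da_i and dE/da_j, the derivatives of gbEij with respect to the
      // Born radii; they are needed only when full electrostatics consumes dEdaSum.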
469  if (doFullElec) {
470  float dEdai = 0.5f*qiqj*f_i*f_i
471  *(kappa*epsilon_s_i*expkappa-Dij*f_i)
472  *(aiaj+0.25f*r2)*expr2aiaj4/bornRadI*scale;//0
473  dEdaSumI += dEdai;
474  float dEdaj = 0.5f*qiqj*f_i*f_i
475  *(kappa*epsilon_s_i*expkappa-Dij*f_i)
476  *(aiaj+0.25f*r2)*expr2aiaj4/bornRadJ*scale;//0
477  sh_dEdaSumJ[j] += dEdaj;
478  }
479 
480  forcedEdr *= r_i;
481  float tmpx = dx*forcedEdr;
482  float tmpy = dy*forcedEdr;
483  float tmpz = dz*forcedEdr;
484  forceI.x += tmpx;
485  forceI.y += tmpy;
486  forceI.z += tmpz;
487  sh_forceJ[j].x -= tmpx;
488  sh_forceJ[j].y -= tmpy;
489  sh_forceJ[j].z -= tmpz;
490  } // within cutoff
491  if (r2 < 0.01f) {
492  // GB Self Energy
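      // j == i: add the GB self term 0.5*qi*qi*Dij/ai (fij reduces to the Born radius).
      // atomi.charge was loaded as -scaling*qi, so one factor is divided by (-scaling)
      // to recover the raw charge, leaving the same overall prefactor as the pair terms.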
493  if (doEnergy) {
494  float fij = bornRadI;//inf
495  float expkappa = exp(-kappa*fij);//0
496  float Dij = epsilon_p_i - expkappa*epsilon_s_i;
497  float gbEij = atomi.charge*(atomi.charge / (-scaling) )*Dij/fij;
498  energyT += 0.5f*gbEij;
499  }
500  } //same atom or within cutoff
501  } // if (j < nloopj)
502 #ifdef KEPLER_SHUFFLE
503  xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
504  yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
505  zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
506  chargej = WARP_SHUFFLE(WARP_FULL_MASK, chargej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
507  bornRadJ = WARP_SHUFFLE(WARP_FULL_MASK, bornRadJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
508 #endif
509  } // for t
510  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
511  int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
512  atomicAdd(&tmp_dEdaSum[i_out], sh_dEdaSumJ[threadIdx.x]);
513  atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
514  atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
515  atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
516  }
517  } // for block j
518  //dEdaSumI and forceI now contain contributions from all j in 2nd patch
519 
520  // write dEdaSum and force to global memory buffers; to be accumulated later
521  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
522  int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
523  atomicAdd(&tmp_dEdaSum[i_out], dEdaSumI);
524  atomicAdd(&tmp_forces[i_out].x, forceI.x);
525  atomicAdd(&tmp_forces[i_out].y, forceI.y);
526  atomicAdd(&tmp_forces[i_out].z, forceI.z);
527  }
528  } // for block i
529 
530  //Energy Reduction
531  if (doEnergy) {
532  // Do not have to sync here because each warp writes to the same
533  // portion of sh_jforce_2d as in the above force computation loop
534  volatile float* sh_energy = (float *)&sh_forceJ_2d[threadIdx.y][0].x;
535  // Reduce within warps
536  sh_energy[threadIdx.x] = (float)energyT;
537  for (int d=1;d < WARPSIZE;d*=2) {
538  int pos = threadIdx.x + d;
539  float val = (pos < WARPSIZE) ? sh_energy[pos] : 0.0f;
540  sh_energy[threadIdx.x] += val;
541  }
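      // Warp-synchronous reduction over the volatile shared array: when the loop ends,
      // sh_energy[0] holds this warp's total; thread (0,0) sums the per-warp totals
      // below after the block-wide sync.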
542  BLOCK_SYNC;
543  // Reduce among warps
544  if (threadIdx.x == 0 && threadIdx.y == 0) {
545  float tot_energy = 0.0f;
546 #pragma unroll
547  for (int i=0;i < NUM_WARP;++i) {
548  tot_energy += ((float *)&sh_forceJ_2d[i][0].x)[0];
549  }
550  int patch1_ind = sh_patch_pair.patch1_ind;
551  atomicAdd(&tmp_energy[patch1_ind], (float)tot_energy);
552  }
553  } //end Energy Reduction
554 
555  { // start of reduction
556  // make sure tmp_forces and tmp_dEdaSum are visible in global memory
557  __threadfence();
558  BLOCK_SYNC;
559 
560  // Mark patch pair (patch1_ind, patch2_ind) as "done"
561  int patch1_ind = sh_patch_pair.patch1_ind;
562  int patch2_ind = sh_patch_pair.patch2_ind;
563 
564  if (threadIdx.x == 0 && threadIdx.y == 0) {
565  sh_patch_pair.patch_done[0] = false;
566  sh_patch_pair.patch_done[1] = false;
567 
568  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
569  int patch1_old = atomicInc(&P2_counters[patch1_ind], patch1_num_pairs-1);
570  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
571  if (patch1_ind != patch2_ind) {
572  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
573  int patch2_old = atomicInc(&P2_counters[patch2_ind], patch2_num_pairs-1);
574  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
575  }
576  }
577  // sync threads so that patch1_done and patch2_done are visible to all threads
578  BLOCK_SYNC;
579 
580  if (sh_patch_pair.patch_done[0]) {
581  const int start = sh_patch_pair.patch1_start;
582  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
583  forces[start+i] = tmp_forces[start+i];
584  dEdaSum[start+i] = tmp_dEdaSum[start+i];
585  }
586  energy[patch1_ind] = tmp_energy[patch1_ind];
587  }
588 
589  if (sh_patch_pair.patch_done[1]) {
590  const int start = sh_patch_pair.patch2_start;
591  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
592  forces[start+i] = tmp_forces[start+i];
593  dEdaSum[start+i] = tmp_dEdaSum[start+i];
594  }
595  energy[patch2_ind] = tmp_energy[patch2_ind];
596  }
597 
598  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
599  // Make sure page-locked host arrays are up-to-date
600 #if __CUDA_ARCH__ < 200
601  __threadfence();
602 #else
603  __threadfence_system();
604 #endif
605  }
606 
607  } // end of reduction
608 } //GBIS_P2
609 
610 //3333333333333333333333333333333333333333333333333333333333
611 //
612 // GBIS Phase 3 CUDA Kernel
613 //
614 //3333333333333333333333333333333333333333333333333333333333
615 __global__ static void GBIS_P3_Kernel (
616  const patch_pair *patch_pairs, // atoms pointers and such
617  const atom *atoms, // position & charge
618  const float *intRad0, // read in intrinsic radius
619  const float *intRadS, // read in intrinsic radius
620  const float *dHdrPrefix, // read in prefix
621  const float a_cut, // P1 interaction cutoff
622  const float rho_0, // H(i,j) parameter
623  const float scaling, // scale nonbonded
624  float3 lata,
625  float3 latb,
626  float3 latc,
627  float4 *tmp_forces, // temporary device memory
628  float4 *forces, // host-mapped memory
629  unsigned int *P3_counters
630 ) {
631 
632  // Shared memory
633  __shared__ patch_pair sh_patch_pair;
634 #ifndef KEPLER_SHUFFLE
635  __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
636  __shared__ float sh_intRadJ0_2d[NUM_WARP][WARPSIZE];
637  __shared__ float sh_jDHdrPrefix_2d[NUM_WARP][WARPSIZE];
638 #endif
639  __shared__ float3 sh_forceJ_2d[NUM_WARP][WARPSIZE];
640 
641 #ifndef KEPLER_SHUFFLE
642  volatile atom* sh_jpq = sh_jpq_2d[threadIdx.y];
643  volatile float* sh_intRadJ0 = sh_intRadJ0_2d[threadIdx.y];
644  volatile float* sh_jDHdrPrefix = sh_jDHdrPrefix_2d[threadIdx.y];
645 #endif
646  volatile float3* sh_forceJ = sh_forceJ_2d[threadIdx.y];
647 
648  // Load data into shared memory
649  {
650  const int t = threadIdx.x + threadIdx.y*WARPSIZE;
651  if (t < PATCH_PAIR_SIZE) {
652  int* src = (int *)&patch_pairs[blockIdx.x];
653  int* dst = (int *)&sh_patch_pair;
654  dst[t] = src[t];
655  }
656  BLOCK_SYNC;
657 
658  // convert scaled offset with current lattice and write into shared memory
659  if (t == 0) {
660  float offx = sh_patch_pair.offset.x * lata.x
661  + sh_patch_pair.offset.y * latb.x
662  + sh_patch_pair.offset.z * latc.x;
663  float offy = sh_patch_pair.offset.x * lata.y
664  + sh_patch_pair.offset.y * latb.y
665  + sh_patch_pair.offset.z * latc.y;
666  float offz = sh_patch_pair.offset.x * lata.z
667  + sh_patch_pair.offset.y * latb.z
668  + sh_patch_pair.offset.z * latc.z;
669  sh_patch_pair.offset.x = offx;
670  sh_patch_pair.offset.y = offy;
671  sh_patch_pair.offset.z = offz;
672  }
673  BLOCK_SYNC;
674  }
675 
676  //iterate over chunks of atoms within Patch 1
677  for ( int blocki = threadIdx.y*WARPSIZE; blocki < sh_patch_pair.patch1_size; blocki += NUM_WARP*WARPSIZE ) {
678 
679  int nloopi = sh_patch_pair.patch1_size - blocki;
680  nloopi = min(nloopi, WARPSIZE);
681 
682  //this thread calculates only the force on atomi; iterating over js
683  atom atomi;
684  float intRadIS;
685  int i;
686  float dHdrPrefixI;
687 
688  // load BLOCK of Patch i atoms
689  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size ) {
690  i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
691  float4 tmpa = ((float4*)atoms)[i];
692  atomi.position.x = tmpa.x + sh_patch_pair.offset.x;
693  atomi.position.y = tmpa.y + sh_patch_pair.offset.y;
694  atomi.position.z = tmpa.z + sh_patch_pair.offset.z;
695  atomi.charge = intRad0[i]; // overwrite charge with radius
696  intRadIS = intRadS[i];
697  dHdrPrefixI = dHdrPrefix[i];
698  } // load patch 1
699 
700  //init intermediate variables
701  float3 forceI;
702  forceI.x = 0.f;
703  forceI.y = 0.f;
704  forceI.z = 0.f;
705 
706  const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
707  (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
708  int blockj = (diag_patch_pair) ? blocki : 0;
709  //iterate over chunks of atoms within Patch 2
710  for (; blockj < sh_patch_pair.patch2_size; blockj += WARPSIZE ) {
711 
712  int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
713 
714 #ifdef KEPLER_SHUFFLE
715  float xj;
716  float yj;
717  float zj;
718  float intRadSJ;
719  float dHdrPrefixJ;
720  float intRadJ0;
721 #endif
722 
723  //load a WARPSIZE chunk of j atoms: coordinates, radii, dHdrPrefix (registers on Kepler+, shared memory otherwise)
724  if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
725  int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
726  float4 tmpa = ((float4*)atoms)[j];
727 #ifdef KEPLER_SHUFFLE
728  xj = tmpa.x;
729  yj = tmpa.y;
730  zj = tmpa.z;
731  intRadSJ = intRadS[j];
732  dHdrPrefixJ = dHdrPrefix[j];
733  intRadJ0 = intRad0[j];
734 #else
735  sh_jpq[threadIdx.x].position.x = tmpa.x;
736  sh_jpq[threadIdx.x].position.y = tmpa.y;
737  sh_jpq[threadIdx.x].position.z = tmpa.z;
738  sh_jpq[threadIdx.x].charge = intRadS[j];
739  sh_jDHdrPrefix[threadIdx.x] = dHdrPrefix[j]; // load dHdrPrefix into shared
740  sh_intRadJ0[threadIdx.x] = intRad0[j];
741 #endif
742  }
743 
744  sh_forceJ[threadIdx.x].x = 0.f;
745  sh_forceJ[threadIdx.x].y = 0.f;
746  sh_forceJ[threadIdx.x].z = 0.f;
747 
748  const bool diag_tile = diag_patch_pair && (blocki == blockj);
749  const int modval = diag_tile ? 2*WARPSIZE : WARPSIZE;
750 #ifdef KEPLER_SHUFFLE
751  if (diag_tile) {
752  xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
753  yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
754  zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
755  intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
756  dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
757  intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
758  }
759 #endif
760  int t = diag_tile ? 1 : 0;
761  for (; t < WARPSIZE; ++t ) {
762  int j = (t + threadIdx.x) % modval;
763 #ifndef KEPLER_SHUFFLE
764  float xj = sh_jpq[j].position.x;
765  float yj = sh_jpq[j].position.y;
766  float zj = sh_jpq[j].position.z;
767  float intRadSJ = sh_jpq[j].charge;
768  float dHdrPrefixJ = sh_jDHdrPrefix[j];
769  float intRadJ0 = sh_intRadJ0[j];
770 #endif
771  if (j < nloopj && threadIdx.x < nloopi)
772  {
773  float dx = atomi.position.x - xj;
774  float dy = atomi.position.y - yj;
775  float dz = atomi.position.z - zj;
776  float r2 = dx*dx + dy*dy + dz*dz;
777 
778  // within cutoff and not the same atom
779  if (r2 < (a_cut+FS_MAX)*(a_cut+FS_MAX) && r2 > 0.01f) {
780 
781  float r_i = 1.f / sqrt(r2);
782  float r = r2 * r_i;
783  float dhij, dhji;
784  int dij, dji;
785  CalcDH(r,r2,r_i,a_cut,atomi.charge,intRadSJ,dhij,dij);
786  CalcDH(r,r2,r_i,a_cut,intRadJ0,intRadIS,dhji,dji);
787 
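      // dHdrPrefix[i] and dHdrPrefix[j] are precomputed per-atom prefactors that
      // multiply dH(i,j)/dr and dH(j,i)/dr (chain rule for dE/dr); dividing by r turns
      // the radial derivative into Cartesian components via dx, dy, dz, and the minus
      // sign gives force = -dE/dr.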
788  float forceAlpha = -r_i*(dHdrPrefixI*dhij+dHdrPrefixJ*dhji);
789  float tmpx = dx * forceAlpha;
790  float tmpy = dy * forceAlpha;
791  float tmpz = dz * forceAlpha;
792  forceI.x += tmpx;
793  forceI.y += tmpy;
794  forceI.z += tmpz;
795  sh_forceJ[j].x -= tmpx;
796  sh_forceJ[j].y -= tmpy;
797  sh_forceJ[j].z -= tmpz;
798  } // cutoff
799  } // if (j < nloopj...)
800 #ifdef KEPLER_SHUFFLE
801  xj = WARP_SHUFFLE(WARP_FULL_MASK, xj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
802  yj = WARP_SHUFFLE(WARP_FULL_MASK, yj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
803  zj = WARP_SHUFFLE(WARP_FULL_MASK, zj, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
804  intRadSJ = WARP_SHUFFLE(WARP_FULL_MASK, intRadSJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
805  dHdrPrefixJ = WARP_SHUFFLE(WARP_FULL_MASK, dHdrPrefixJ, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
806  intRadJ0 = WARP_SHUFFLE(WARP_FULL_MASK, intRadJ0, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
807 #endif
808  } // for t
809  if ( blockj + threadIdx.x < sh_patch_pair.patch2_size) {
810  int i_out = sh_patch_pair.patch2_start + blockj + threadIdx.x;
811  atomicAdd(&tmp_forces[i_out].x, sh_forceJ[threadIdx.x].x);
812  atomicAdd(&tmp_forces[i_out].y, sh_forceJ[threadIdx.x].y);
813  atomicAdd(&tmp_forces[i_out].z, sh_forceJ[threadIdx.x].z);
814  }
815  } // for block j
816 
817  // write force to global memory buffer; to be accumulated later
818  if ( blocki + threadIdx.x < sh_patch_pair.patch1_size) {
819  int i_out = sh_patch_pair.patch1_start + blocki + threadIdx.x;
820  atomicAdd(&tmp_forces[i_out].x, forceI.x);
821  atomicAdd(&tmp_forces[i_out].y, forceI.y);
822  atomicAdd(&tmp_forces[i_out].z, forceI.z);
823  }
824  } // for block i
825 
826  { // start of force sum
827  // make sure forces are visible in global memory
828  __threadfence();
829  BLOCK_SYNC;
830 
831  // Mark patch pair (patch1_ind, patch2_ind) as "done"
832  int patch1_ind = sh_patch_pair.patch1_ind;
833  int patch2_ind = sh_patch_pair.patch2_ind;
834 
835  if (threadIdx.x == 0 && threadIdx.y == 0) {
836  sh_patch_pair.patch_done[0] = false;
837  sh_patch_pair.patch_done[1] = false;
838 
839  unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
840  int patch1_old = atomicInc(&P3_counters[patch1_ind], patch1_num_pairs-1);
841  if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
842  if (patch1_ind != patch2_ind) {
843  unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
844  int patch2_old = atomicInc(&P3_counters[patch2_ind], patch2_num_pairs-1);
845  if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
846  }
847  }
848  // sync threads so that patch1_done and patch2_done are visible to all threads
849  BLOCK_SYNC;
850 
851  if (sh_patch_pair.patch_done[0]) {
852  const int start = sh_patch_pair.patch1_start;
853  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch1_size;i+=NUM_WARP*WARPSIZE) {
854  forces[start+i] = tmp_forces[start+i];
855  }
856  }
857 
858  if (sh_patch_pair.patch_done[1]) {
859  const int start = sh_patch_pair.patch2_start;
860  for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < sh_patch_pair.patch2_size;i+=NUM_WARP*WARPSIZE) {
861  forces[start+i] = tmp_forces[start+i];
862  }
863  }
864 
865  if (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1]) {
866  // Make sure page-locked host arrays are up-to-date
867 #if __CUDA_ARCH__ < 200
868  __threadfence();
869 #else
870  __threadfence_system();
871 #endif
872  }
873 
874  } // end of force sum
875 
876 } //GBIS_P3
877 