NAMD
ComputeBondedCUDAKernel.cu
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #define __thread __declspec(thread)
4 #endif
5 #include <math.h>
6 
7 #if defined(NAMD_HIP) && !defined(NAMD_CUDA)
8 #include <hipcub/hipcub.hpp>
9 #define cub hipcub
10 #endif
11 #ifdef NAMD_CUDA
12 #include <cuda.h>
13 #if __CUDACC_VER_MAJOR__ >= 11
14 #include <cub/cub.cuh>
15 #else
16 #include <namd_cub/cub.cuh>
17 #endif
18 #endif
19 
20 #include "ComputeBondedCUDAKernel.h"
21 
22 #ifdef FUTURE_CUDADEVICE
23 #include "CudaDevice.h"
24 #else
25 #include "DeviceCUDA.h"
26 extern __thread DeviceCUDA *deviceCUDA;
27 #endif
28 
29 __device__ __forceinline__
30 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
31  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
32  const float x = k * (float)tableSize - 0.5f;
33  const float f = floorf(x);
34  const float a = x - f;
35  const unsigned int i = (unsigned int)f;
36  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
37  const int i1 = i0 + 1;
38  const float4 t0 = tex1Dfetch<float4>(tex, i0);
39  const float4 t1 = tex1Dfetch<float4>(tex, i1);
40  return make_float4(
41  a * (t1.x - t0.x) + t0.x,
42  a * (t1.y - t0.y) + t0.y,
43  a * (t1.z - t0.z) + t0.z,
44  a * (t1.w - t0.w) + t0.w);
45 }
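// Interpolation used by sampleTableTex above: the normalized coordinate k in [0,1)
// maps to a fractional table index x = k*FORCE_ENERGY_TABLE_SIZE - 0.5, split into
// i = floor(x) and a = x - i, and the two neighbouring texels are blended per
// component as
//   value(k) = t0 + a*(t1 - t0)
// i.e. plain linear interpolation between adjacent table entries.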
46 
47 #if defined(USE_FP_FORCE) || defined(USE_FP_VIRIAL)
48 #ifndef NAMD_HIP
49 __device__ inline long long int lliroundf(float f)
50 {
51  long long int l;
52  asm("cvt.rni.s64.f32 %0, %1;" : "=l"(l) : "f"(f));
53  return l;
54 }
55 
56 __device__ inline unsigned long long int llitoulli(long long int l)
57 {
58  unsigned long long int u;
59  asm("mov.b64 %0, %1;" : "=l"(u) : "l"(l));
60  return u;
61 }
62 #else
63 __device__ inline long long int lliroundf(float f)
64 {
65  return llrintf(f);
66 }
67 
68 __device__ inline unsigned long long int llitoulli(long long int l)
69 {
70  return l;
71 }
72 #endif
73 #endif
74 
75 template <typename T>
76 __forceinline__ __device__
77 void convertForces(const float x, const float y, const float z,
78  T &Tx, T &Ty, T &Tz) {
79 
80  Tx = (T)(x);
81  Ty = (T)(y);
82  Tz = (T)(z);
83 }
84 
85 #if defined(USE_FP_FORCE)
86 template <>
87 __forceinline__ __device__
88 void convertForces<long long int>(const float x, const float y, const float z,
89  long long int &Tx, long long int &Ty, long long int &Tz) {
90 
91  Tx = lliroundf(x*float_to_force);
92  Ty = lliroundf(y*float_to_force);
93  Tz = lliroundf(z*float_to_force);
94 }
95 #endif
96 
97 template <typename T>
98 __forceinline__ __device__
99 void calcComponentForces(
100  float fij,
101  const float dx, const float dy, const float dz,
102  T &fxij, T &fyij, T &fzij) {
103 
104  fxij = (T)(fij*dx);
105  fyij = (T)(fij*dy);
106  fzij = (T)(fij*dz);
107 
108 }
109 
110 #if defined(USE_FP_FORCE)
111 template <>
112 __forceinline__ __device__
113 void calcComponentForces<long long int>(
114  float fij,
115  const float dx, const float dy, const float dz,
116  long long int &fxij,
117  long long int &fyij,
118  long long int &fzij) {
119 
120  fij *= float_to_force;
121  fxij = lliroundf(fij*dx);
122  fyij = lliroundf(fij*dy);
123  fzij = lliroundf(fij*dz);
124 
125 }
126 #endif
127 
128 template <typename T>
129 __forceinline__ __device__
130 void calcComponentForces(
131  float fij1,
132  const float dx1, const float dy1, const float dz1,
133  float fij2,
134  const float dx2, const float dy2, const float dz2,
135  T &fxij, T &fyij, T &fzij) {
136 
137  fxij = (T)(fij1*dx1 + fij2*dx2);
138  fyij = (T)(fij1*dy1 + fij2*dy2);
139  fzij = (T)(fij1*dz1 + fij2*dz2);
140 }
141 
142 #if defined(USE_FP_FORCE)
143 template <>
144 __forceinline__ __device__
145 void calcComponentForces<long long int>(
146  float fij1,
147  const float dx1,
148  const float dy1,
149  const float dz1,
150  float fij2,
151  const float dx2,
152  const float dy2,
153  const float dz2,
154  long long int &fxij,
155  long long int &fyij,
156  long long int &fzij) {
157 
158  fij1 *= float_to_force;
159  fij2 *= float_to_force;
160  fxij = lliroundf(fij1*dx1 + fij2*dx2);
161  fyij = lliroundf(fij1*dy1 + fij2*dy2);
162  fzij = lliroundf(fij1*dz1 + fij2*dz2);
163 }
164 #endif
165 
166 __forceinline__ __device__
167 int warpAggregatingAtomicInc(int* counter) {
168 #if WARPSIZE == 64
169  WarpMask mask = __ballot(1);
170  int total = __popcll(mask);
171  int prefix = __popcll(mask & cub::LaneMaskLt());
172  int firstLane = __ffsll(mask) - 1;
173 #else
174  WarpMask mask = __activemask();
175  int total = __popc(mask);
176  int prefix = __popc(mask & cub::LaneMaskLt());
177  int firstLane = __ffs(mask) - 1;
178 #endif
179  int start = 0;
180  if (prefix == 0) {
181  start = atomicAdd(counter, total);
182  }
183  start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
184  return start + prefix;
185 }
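// Warp-aggregated increment: rather than one atomicAdd per lane, the first active
// lane reserves `total` slots with a single atomicAdd and broadcasts the start
// offset to the warp; each lane then uses start + prefix, where prefix counts the
// active lanes below it. A typical usage pattern (illustrative only) is
//   int slot = warpAggregatingAtomicInc(counter);
//   list[slot] = value;
// which is how forceList entries are appended below.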
186 
187 template <typename T>
188 __forceinline__ __device__
189 void storeForces(const T fx, const T fy, const T fz,
190  const int ind, const int stride,
191  T* force,
192  T* forceList, int* forceListCounter,
193  int* forceListStarts, int* forceListNexts) {
194 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
195 #if defined(NAMD_HIP)
196  // Try to decrease conflicts between lanes if there are repeating ind
197  const WarpMask mask = __ballot(1); // mask of active lanes in this warp
198  if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
199  const int laneId = threadIdx.x % WARPSIZE;
200  const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
201  const bool isHead = laneId == 0 || ind != prevLaneInd;
202  if (!WARP_ALL(WARP_FULL_MASK, isHead)) {
203  // There are segments of repeating ind
204  typedef cub::WarpReduce<T> WarpReduce;
205  __shared__ typename WarpReduce::TempStorage temp_storage;
206  const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
207  const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
208  const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
209  if (isHead) {
210  atomicAdd(&force[ind ], sumfx);
211  atomicAdd(&force[ind + stride ], sumfy);
212  atomicAdd(&force[ind + stride*2], sumfz);
213  }
214  return;
215  }
216  }
217  // Not all lanes are active (the last block) or there are no repeating ind
218  atomicAdd(&force[ind ], fx);
219  atomicAdd(&force[ind + stride ], fy);
220  atomicAdd(&force[ind + stride*2], fz);
221 #else
222  atomicAdd(&force[ind ], fx);
223  atomicAdd(&force[ind + stride ], fy);
224  atomicAdd(&force[ind + stride*2], fz);
225 #endif
226 #else
227  const int newPos = warpAggregatingAtomicInc(forceListCounter);
228  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
229  forceList[newPos * 3 + 0] = fx;
230  forceList[newPos * 3 + 1] = fy;
231  forceList[newPos * 3 + 2] = fz;
232 #endif
233 }
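// storeForces has two storage modes. With USE_BONDED_FORCE_ATOMIC_STORE the three
// force components are accumulated directly with atomicAdd; on HIP, lanes whose
// `ind` repeats within the warp are first combined with a head-segmented warp
// reduction so that only one atomic per segment of equal indices is issued.
// Without it, each contribution is appended to forceList and linked into a
// per-atom chain through forceListStarts/forceListNexts, so a separate pass can
// gather and sum the contributions for each atom.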
234 
235 #if defined(USE_FP_FORCE)
236 // Template specialization for 64bit integer = "long long int"
237 template <>
238 __forceinline__ __device__
239 void storeForces <long long int> (const long long int fx,
240  const long long int fy,
241  const long long int fz,
242  const int ind, const int stride,
243  long long int* force) {
244  atomicAdd((unsigned long long int *)&force[ind ], llitoulli(fx));
245  atomicAdd((unsigned long long int *)&force[ind + stride ], llitoulli(fy));
246  atomicAdd((unsigned long long int *)&force[ind + stride*2], llitoulli(fz));
247 }
248 #endif
249 
250 //
251 // Calculates bonds
252 //
253 template <typename T, bool doEnergy, bool doVirial>
254 __device__ void bondForce(
255  const int index,
256  const CudaBond* __restrict__ bondList,
257  const CudaBondValue* __restrict__ bondValues,
258  const float4* __restrict__ xyzq,
259  const int stride,
260  const float3 lata, const float3 latb, const float3 latc,
261  T* __restrict__ force, double &energy,
262  T* __restrict__ forceList, int* forceListCounter,
263  int* forceListStarts, int* forceListNexts,
264 #ifdef WRITE_FULL_VIRIALS
265  ComputeBondedCUDAKernel::BondedVirial<double>& virial
266 #else
267  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
268 #endif
269  ) {
270 
271  CudaBond bl = bondList[index];
272  CudaBondValue bondValue = bondValues[bl.itype];
273  if (bondValue.x0 == 0.0f) return;
274 
275  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
276  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
277  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
278 
279  float4 xyzqi = xyzq[bl.i];
280  float4 xyzqj = xyzq[bl.j];
281 
282  float xij = xyzqi.x + shx - xyzqj.x;
283  float yij = xyzqi.y + shy - xyzqj.y;
284  float zij = xyzqi.z + shz - xyzqj.z;
285 
286  float r = sqrtf(xij*xij + yij*yij + zij*zij);
287 
288  float db = r - bondValue.x0;
289  if (bondValue.x1) {
290  // in this case, the bond represents a harmonic wall potential
291  // where x0 is the lower wall and x1 is the upper
292  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
293  }
294  float fij = db * bondValue.k * bl.scale;
295 
296  if (doEnergy) {
297  energy += (double)(fij*db);
298  }
299  fij *= -2.0f/r;
300 
301  T T_fxij, T_fyij, T_fzij;
302  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
303 
304  // Store forces
305  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
306  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
307 
308  // Store virial
309  if (doVirial) {
310 #ifdef WRITE_FULL_VIRIALS
311  float fxij = fij*xij;
312  float fyij = fij*yij;
313  float fzij = fij*zij;
314  virial.xx = (fxij*xij);
315  virial.xy = (fxij*yij);
316  virial.xz = (fxij*zij);
317  virial.yx = (fyij*xij);
318  virial.yy = (fyij*yij);
319  virial.yz = (fyij*zij);
320  virial.zx = (fzij*xij);
321  virial.zy = (fzij*yij);
322  virial.zz = (fzij*zij);
323 #endif
324  }
325 }
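// Bond term above: with s = bl.scale and db = r - x0 (redefined when x1 sets up a
// harmonic wall), the energy is E = k*s*db^2 and the force on atom i is
//   F_i = -dE/dr * (r_ij/r) = -2*k*s*db/r * r_ij,
// which is why fij is rescaled by -2/r before calcComponentForces; atom j receives
// the opposite force.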
326 
327 //
328 // Calculates modified exclusions
329 //
330 
331 #if defined(NAMD_HIP)
332 // JM: If NAMD_HIP, tex1D produces wrong results here: we need the force and energy
333 // tables stored in gmem and must do the linear interpolation manually -
334 // same as in the nonbonded calculation
335 
336 
337 template<class T>
338 __device__ __forceinline__
339 T tableLookup(const T* table, const int tableSize, const float k)
340 {
341  const float x = k * static_cast<float>(tableSize) - 0.5f;
342  const float f = floorf(x);
343  const float a = x - f;
344  const int i = static_cast<int>(f);
345  const int i0 = max(0, min(tableSize - 1, i));
346  const int i1 = max(0, min(tableSize - 1, i + 1));
347  return (1.0f - a) * __ldg(&table[i0]) + a * __ldg(&table[i1]);
348 }
349 #endif
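// tableLookup provides the same fractional-index linear interpolation as
// sampleTableTex for tables kept in global memory (as the comment above requires
// for NAMD_HIP), reading through __ldg and clamping both neighbour indices to
// [0, tableSize-1].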
350 
351 template <typename T, bool doEnergy, bool doVirial, bool doElect>
352 __device__ void modifiedExclusionForce(
353  const int index,
354  const CudaExclusion* __restrict__ exclusionList,
355  const bool doSlow,
356  const float one_scale14, // 1 - scale14
357  const int vdwCoefTableWidth,
358 #if __CUDA_ARCH__ >= 350
359  const float2* __restrict__ vdwCoefTable,
360 #else
361  cudaTextureObject_t vdwCoefTableTex,
362 #endif
363  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
364  const float4* __restrict__ xyzq,
365  const int stride,
366  const float3 lata, const float3 latb, const float3 latc,
367  const float cutoff2,
368  double &energyVdw,
369  T* __restrict__ forceNbond, double &energyNbond,
370  T* __restrict__ forceSlow, double &energySlow,
371  T* __restrict__ forceList, int* forceListCounter,
372  int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
373 #ifdef WRITE_FULL_VIRIALS
374  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
375  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
376 #else
377  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
378  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
379 #endif
380  ) {
381 
382  CudaExclusion bl = exclusionList[index];
383 
384  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
385  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
386  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
387 
388  float4 xyzqi = xyzq[bl.i];
389  float4 xyzqj = xyzq[bl.j];
390 
391  float xij = xyzqi.x + shx - xyzqj.x;
392  float yij = xyzqi.y + shy - xyzqj.y;
393  float zij = xyzqi.z + shz - xyzqj.z;
394 
395  float r2 = xij*xij + yij*yij + zij*zij;
396  if (r2 < cutoff2) {
397 
398  float rinv = rsqrtf(r2);
399 
400  float qq;
401  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
402 
403  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
404 #if __CUDA_ARCH__ >= 350
405  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
406 #else
407  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
408 #endif
409 
410 #if defined(NAMD_CUDA)
411  float4 fi = tex1D<float4>(forceTableTex, rinv);
412 #else // NAMD_HIP
413  float4 fi = sampleTableTex(forceTableTex, rinv);
414 #endif
415  float4 ei;
416  if (doEnergy) {
417 #if defined NAMD_CUDA
418  ei = tex1D<float4>(energyTableTex, rinv);
419 #else
420  ei = sampleTableTex(energyTableTex, rinv);
421 #endif
422  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
423  if (doElect) {
424  energyNbond += qq * ei.x;
425  if (doSlow) energySlow += qq * ei.w;
426  }
427  }
428 
429  float fNbond;
430  if (doElect) {
431  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
432  } else {
433  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
434  }
435  T T_fxij, T_fyij, T_fzij;
436  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
437  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
438  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
439 
440  float fSlow;
441  if (doSlow && doElect) {
442  fSlow = -qq * fi.w;
443  T T_fxij, T_fyij, T_fzij;
444  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
445  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
446  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
447  }
448 
449  // Store virial
450  if (doVirial) {
451 #ifdef WRITE_FULL_VIRIALS
452  float fxij = fNbond*xij;
453  float fyij = fNbond*yij;
454  float fzij = fNbond*zij;
455  virialNbond.xx = (fxij*xij);
456  virialNbond.xy = (fxij*yij);
457  virialNbond.xz = (fxij*zij);
458  virialNbond.yx = (fyij*xij);
459  virialNbond.yy = (fyij*yij);
460  virialNbond.yz = (fyij*zij);
461  virialNbond.zx = (fzij*xij);
462  virialNbond.zy = (fzij*yij);
463  virialNbond.zz = (fzij*zij);
464 #endif
465  }
466 
467  // Store virial
468  if (doVirial && doSlow && doElect) {
469 #ifdef WRITE_FULL_VIRIALS
470  float fxij = fSlow*xij;
471  float fyij = fSlow*yij;
472  float fzij = fSlow*zij;
473  virialSlow.xx = (fxij*xij);
474  virialSlow.xy = (fxij*yij);
475  virialSlow.xz = (fxij*zij);
476  virialSlow.yx = (fyij*xij);
477  virialSlow.yy = (fyij*yij);
478  virialSlow.yz = (fyij*zij);
479  virialSlow.zx = (fzij*xij);
480  virialSlow.zy = (fzij*yij);
481  virialSlow.zz = (fzij*zij);
482 #endif
483  }
484 
485  }
486 }
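// modifiedExclusionForce applies the correction for scaled 1-4 pairs: the
// table-derived forces enter with a negative overall sign, the electrostatic
// charge product is weighted by one_scale14 = 1 - scale14 (see the parameter
// comment), and when doSlow is set the full-electrostatics part is routed
// separately into forceSlow/energySlow with its own virial.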
487 
488 //
489 // Calculates exclusions. Here doSlow = true
490 //
491 template <typename T, bool doEnergy, bool doVirial>
492 __device__ void exclusionForce(
493  const int index,
494  const CudaExclusion* __restrict__ exclusionList,
495  const float r2_delta, const int r2_delta_expc,
496 
497 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
498  const float* __restrict__ r2_table,
499  const float4* __restrict__ exclusionTable,
500 #else
501  cudaTextureObject_t r2_table_tex,
502  cudaTextureObject_t exclusionTableTex,
503 #endif
504 
505  const float4* __restrict__ xyzq,
506  const int stride,
507  const float3 lata, const float3 latb, const float3 latc,
508  const float cutoff2,
509  T* __restrict__ forceSlow, double &energySlow,
510  T* __restrict__ forceList, int* forceListCounter,
511  int* forceListStartsSlow, int* forceListNexts,
512 #ifdef WRITE_FULL_VIRIALS
513  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow
514 #else
515  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow
516 #endif
517  ) {
518 
519  CudaExclusion bl = exclusionList[index];
520 
521  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
522  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
523  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
524 
525  float4 xyzqi = xyzq[bl.i];
526  float4 xyzqj = xyzq[bl.j];
527 
528  float xij = xyzqi.x + shx - xyzqj.x;
529  float yij = xyzqi.y + shy - xyzqj.y;
530  float zij = xyzqi.z + shz - xyzqj.z;
531 
532  float r2 = xij*xij + yij*yij + zij*zij;
533  if (r2 < cutoff2) {
534  r2 += r2_delta;
535 
536  union { float f; int i; } r2i;
537  r2i.f = r2;
538  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
539 
540 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
541  float r2_table_val = __ldg(&r2_table[table_i]);
542 #else
543  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
544 #endif
545 
546  float diffa = r2 - r2_table_val;
547  float qq = xyzqi.w * xyzqj.w;
548 
549 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
550  float4 slow = __ldg(&exclusionTable[table_i]);
551 #else
552  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
553 #endif
554 
555 
556  if (doEnergy) {
557  energySlow += (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
558  }
559 
560  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
561 
562  T T_fxij, T_fyij, T_fzij;
563  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
564  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
565  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
566 
567  // Store virial
568  if (doVirial) {
569 #ifdef WRITE_FULL_VIRIALS
570  float fxij = fSlow*xij;
571  float fyij = fSlow*yij;
572  float fzij = fSlow*zij;
573  virialSlow.xx = (fxij*xij);
574  virialSlow.xy = (fxij*yij);
575  virialSlow.xz = (fxij*zij);
576  virialSlow.yx = (fyij*xij);
577  virialSlow.yy = (fyij*yij);
578  virialSlow.yz = (fyij*zij);
579  virialSlow.zx = (fzij*xij);
580  virialSlow.zy = (fzij*yij);
581  virialSlow.zz = (fzij*zij);
582 #endif
583  }
584  }
585 }
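// Table indexing above: r2 (offset by r2_delta) is reinterpreted as an int and the
// upper bits (r2i.i >> 17), shifted by r2_delta_expc, form a table index that is
// roughly logarithmically spaced in r2. diffa = r2 - r2_table[table_i] is then the
// remainder used to evaluate the cubic polynomial in `slow` for the energy and its
// quadratic counterpart for fSlow.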
586 
587 template <typename T, bool doEnergy, bool doVirial>
588 __device__ void angleForce(const int index,
589  const CudaAngle* __restrict__ angleList,
590  const CudaAngleValue* __restrict__ angleValues,
591  const float4* __restrict__ xyzq,
592  const int stride,
593  const float3 lata, const float3 latb, const float3 latc,
594  T* __restrict__ force, double &energy,
595  T* __restrict__ forceList, int* forceListCounter,
596  int* forceListStarts, int* forceListNexts,
597 #ifdef WRITE_FULL_VIRIALS
598  ComputeBondedCUDAKernel::BondedVirial<double>& virial
599 #else
600  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
601 #endif
602  ) {
603 
604  CudaAngle al = angleList[index];
605 
606  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
607  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
608  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
609 
610  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
611  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
612  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
613 
614  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
615  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
616  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
617 
618  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
619  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
620  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
621 
622  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
623  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
624 
625  float xijr = xij*rij_inv;
626  float yijr = yij*rij_inv;
627  float zijr = zij*rij_inv;
628  float xkjr = xkj*rkj_inv;
629  float ykjr = ykj*rkj_inv;
630  float zkjr = zkj*rkj_inv;
631  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
632 
633  CudaAngleValue angleValue = angleValues[al.itype];
634  angleValue.k *= al.scale;
635 
636  float diff;
637  if (angleValue.normal == 1) {
638  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
639  cos_theta = fminf(0.999f, fmaxf(-0.999f, cos_theta));
640  float theta = acosf(cos_theta);
641  diff = theta - angleValue.theta0;
642  } else {
643  diff = cos_theta - angleValue.theta0;
644  }
645 
646  if (doEnergy) {
647  energy += (double)(angleValue.k * diff * diff);
648  }
649  if (angleValue.normal == 1) {
650  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
651  if (inv_sin_theta > 1.0e6) {
652  diff = (diff < 0.0f) ? 1.0f : -1.0f;
653  } else {
654  diff *= -inv_sin_theta;
655  }
656  }
657  diff *= -2.0f*angleValue.k;
658 
659  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
660  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
661  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
662  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
663  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
664  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
665 
666  T T_dtxi, T_dtyi, T_dtzi;
667  T T_dtxj, T_dtyj, T_dtzj;
668  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
669  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
670  T T_dtxk = -T_dtxi - T_dtxj;
671  T T_dtyk = -T_dtyi - T_dtyj;
672  T T_dtzk = -T_dtzi - T_dtzj;
673  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
674 
675  if (angleValue.k_ub) {
676  float xik = xij - xkj;
677  float yik = yij - ykj;
678  float zik = zij - zkj;
679  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
680  float rik = 1.0f/rik_inv;
681  float diff_ub = rik - angleValue.r_ub;
682  if (doEnergy) {
683  energy += (double)(angleValue.k_ub * diff_ub * diff_ub);
684  }
685  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
686  T T_dxik, T_dyik, T_dzik;
687  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
688  T_dtxi += T_dxik;
689  T_dtyi += T_dyik;
690  T_dtzi += T_dzik;
691  T_dtxj -= T_dxik;
692  T_dtyj -= T_dyik;
693  T_dtzj -= T_dzik;
694  }
695 
696  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
697  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
698 
699  // Store virial
700  if (doVirial) {
701 #ifdef WRITE_FULL_VIRIALS
702  float fxi = ((float)T_dtxi)*force_to_float;
703  float fyi = ((float)T_dtyi)*force_to_float;
704  float fzi = ((float)T_dtzi)*force_to_float;
705  float fxk = ((float)T_dtxj)*force_to_float;
706  float fyk = ((float)T_dtyj)*force_to_float;
707  float fzk = ((float)T_dtzj)*force_to_float;
708  virial.xx = (fxi*xij) + (fxk*xkj);
709  virial.xy = (fxi*yij) + (fxk*ykj);
710  virial.xz = (fxi*zij) + (fxk*zkj);
711  virial.yx = (fyi*xij) + (fyk*xkj);
712  virial.yy = (fyi*yij) + (fyk*ykj);
713  virial.yz = (fyi*zij) + (fyk*zkj);
714  virial.zx = (fzi*xij) + (fzk*xkj);
715  virial.zy = (fzi*yij) + (fzk*ykj);
716  virial.zz = (fzi*zij) + (fzk*zkj);
717 #endif
718  }
719 }
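// Angle term above: E = k*s*diff^2 with diff = theta - theta0 (normal == 1) or
// cos(theta) - theta0 otherwise. The gradients of cos(theta) with respect to atoms
// i and k are built from the unit bond vectors, scaled by the chain-rule energy
// derivative, and the force on the central atom j is minus the sum of the other
// two. The optional Urey-Bradley term adds a harmonic 1-3 bond
// k_ub*(r_ik - r_ub)^2 acting equally and oppositely on atoms i and k.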
720 
721 //
722 // Dihedral computation
723 //
724 // Out: df, e
725 //
726 template <bool doEnergy>
727 __forceinline__ __device__
728 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
729  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
730 
731  df = 0.0f;
732  if (doEnergy) e = 0.0;
733 
734  float phi = atan2f(sin_phi, cos_phi);
735 
736  bool lrep = true;
737  while (lrep) {
738  CudaDihedralValue dihedralValue = dihedralValues[ic];
739  dihedralValue.k *= scale;
740 
741  // The last dihedral term has the low bit of n set to 0
742  lrep = (dihedralValue.n & 1);
743  dihedralValue.n >>= 1;
744  if (dihedralValue.n) {
745  float nf = dihedralValue.n;
746  float x = nf*phi - dihedralValue.delta;
747  if (doEnergy) {
748  float sin_x, cos_x;
749  sincosf(x, &sin_x, &cos_x);
750  e += (double)(dihedralValue.k*(1.0f + cos_x));
751  df += (double)(nf*dihedralValue.k*sin_x);
752  } else {
753  float sin_x = sinf(x);
754  df += (double)(nf*dihedralValue.k*sin_x);
755  }
756  } else {
757  float diff = phi - dihedralValue.delta;
758  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
759  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
760  if (doEnergy) {
761  e += (double)(dihedralValue.k*diff*diff);
762  }
763  df -= 2.0f*dihedralValue.k*diff;
764  }
765  ic++;
766  }
767 
768 }
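// diheComp accumulates, per term of a multi-term dihedral,
//   E = k*(1 + cos(n*phi - delta)),   dE/dphi = -n*k*sin(n*phi - delta)
// for periodic terms (n > 0), and
//   E = k*(phi - delta)^2,            dE/dphi = 2*k*(phi - delta)
// for the harmonic n == 0 case; df returns -dE/dphi summed over the terms, and the
// low bit of the stored n flags whether another term follows.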
769 
770 
771 template <typename T, bool doEnergy, bool doVirial>
772 __device__ void diheForce(const int index,
773  const CudaDihedral* __restrict__ diheList,
774  const CudaDihedralValue* __restrict__ dihedralValues,
775  const float4* __restrict__ xyzq,
776  const int stride,
777  const float3 lata, const float3 latb, const float3 latc,
778  T* __restrict__ force, double &energy,
779  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
780 #ifdef WRITE_FULL_VIRIALS
781  ComputeBondedCUDAKernel::BondedVirial<double>& virial
782 #else
783  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
784 #endif
785  ) {
786 
787  CudaDihedral dl = diheList[index];
788 
789  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
790  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
791  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
792 
793  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
794  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
795  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
796 
797  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
798  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
799  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
800 
801  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
802  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
803  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
804 
805  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
806  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
807  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
808 
809  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
810  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
811  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
812 
813  // A=F^G, B=H^G.
814  float ax = yij*zjk - zij*yjk;
815  float ay = zij*xjk - xij*zjk;
816  float az = xij*yjk - yij*xjk;
817  float bx = ylk*zjk - zlk*yjk;
818  float by = zlk*xjk - xlk*zjk;
819  float bz = xlk*yjk - ylk*xjk;
820 
821  float ra2 = ax*ax + ay*ay + az*az;
822  float rb2 = bx*bx + by*by + bz*bz;
823  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
824 
825  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
826  // nlinear = nlinear + 1
827  // endif
828 
829  float rgr = 1.0f / rg;
830  float ra2r = 1.0f / ra2;
831  float rb2r = 1.0f / rb2;
832  float rabr = sqrtf(ra2r*rb2r);
833 
834  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
835  //
836  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
837  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
838  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
839  //
840  // Energy and derivative contributions.
841 
842  float df;
843  double e;
844  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
845 
846  if (doEnergy) energy += e;
847 
848  //
849  // Compute derivatives wrt Cartesian coordinates.
850  //
851  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
852  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
853 
854  float fg = xij*xjk + yij*yjk + zij*zjk;
855  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
856  ra2r *= df;
857  rb2r *= df;
858  float fga = fg*ra2r*rgr;
859  float hgb = hg*rb2r*rgr;
860  float gaa = ra2r*rg;
861  float gbb = rb2r*rg;
862  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
863 
864  // Store forces
865  T T_fix, T_fiy, T_fiz;
866  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
867  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
868 
869  T dgx, dgy, dgz;
870  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
871  T T_fjx = dgx - T_fix;
872  T T_fjy = dgy - T_fiy;
873  T T_fjz = dgz - T_fiz;
874  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
875 
876  T dhx, dhy, dhz;
877  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
878  T T_fkx = -dhx - dgx;
879  T T_fky = -dhy - dgy;
880  T T_fkz = -dhz - dgz;
881  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
882  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
883 
884  // Store virial
885  if (doVirial) {
886 #ifdef WRITE_FULL_VIRIALS
887  float fix = ((float)T_fix)*force_to_float;
888  float fiy = ((float)T_fiy)*force_to_float;
889  float fiz = ((float)T_fiz)*force_to_float;
890  float fjx = ((float)dgx)*force_to_float;
891  float fjy = ((float)dgy)*force_to_float;
892  float fjz = ((float)dgz)*force_to_float;
893  float flx = ((float)dhx)*force_to_float;
894  float fly = ((float)dhy)*force_to_float;
895  float flz = ((float)dhz)*force_to_float;
896  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
897  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
898  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
899  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
900  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
901  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
902  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
903  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
904  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
905 #endif
906  }
907 
908 }
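// diheForce uses the standard torsion-gradient construction: with F = r_ij,
// G = r_jk, H = r_lk, A = F x G and B = H x G, the force on atom i is proportional
// to A (the -gaa term), the force on atom l is proportional to B (the gbb term),
// and atoms j and k receive dg - f_i and -dh - dg respectively, so the four forces
// sum to zero; df = -dE/dphi from diheComp sets the overall magnitude.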
909 
910 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
911  return make_float3(
912  v1.y*v2.z-v2.y*v1.z,
913  v2.x*v1.z-v1.x*v2.z,
914  v1.x*v2.y-v2.x*v1.y
915  );
916 }
917 
918 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
919  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
920 }
921 
922 //
923 // Calculates crossterms
924 //
925 template <typename T, bool doEnergy, bool doVirial>
926 __device__ void crosstermForce(
927  const int index,
928  const CudaCrossterm* __restrict__ crosstermList,
929  const CudaCrosstermValue* __restrict__ crosstermValues,
930  const float4* __restrict__ xyzq,
931  const int stride,
932  const float3 lata, const float3 latb, const float3 latc,
933  T* __restrict__ force, double &energy,
934  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
935 #ifdef WRITE_FULL_VIRIALS
936  ComputeBondedCUDAKernel::BondedVirial<double>& virial
937 #else
938  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial
939 #endif
940  ) {
941 
942  CudaCrossterm cl = crosstermList[index];
943 
944  // ----------------------------------------------------------------------------
945  // Angle between 1 - 2 - 3 - 4
946  //
947  // 1 - 2
948  float3 sh12 = make_float3(
949  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
950  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
951  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
952 
953  float4 xyzq1 = xyzq[cl.i1];
954  float4 xyzq2 = xyzq[cl.i2];
955 
956  float3 r12 = make_float3(
957  xyzq1.x + sh12.x - xyzq2.x,
958  xyzq1.y + sh12.y - xyzq2.y,
959  xyzq1.z + sh12.z - xyzq2.z);
960 
961  // 2 - 3
962  float3 sh23 = make_float3(
963  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
964  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
965  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
966 
967  float4 xyzq3 = xyzq[cl.i3];
968 
969  float3 r23 = make_float3(
970  xyzq2.x + sh23.x - xyzq3.x,
971  xyzq2.y + sh23.y - xyzq3.y,
972  xyzq2.z + sh23.z - xyzq3.z);
973 
974  // 3 - 4
975  float3 sh34 = make_float3(
976  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
977  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
978  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
979 
980  float4 xyzq4 = xyzq[cl.i4];
981 
982  float3 r34 = make_float3(
983  xyzq3.x + sh34.x - xyzq4.x,
984  xyzq3.y + sh34.y - xyzq4.y,
985  xyzq3.z + sh34.z - xyzq4.z);
986 
987  // Calculate the cross products
988  float3 A = cross(r12, r23);
989  float3 B = cross(r23, r34);
990  float3 C = cross(r23, A);
991 
992  // Calculate the inverse distances
993  float inv_rA = rsqrtf(dot(A, A));
994  float inv_rB = rsqrtf(dot(B, B));
995  float inv_rC = rsqrtf(dot(C, C));
996 
997  // Calculate the sin and cos
998  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
999  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1000 
1001  float phi = -atan2f(sin_phi,cos_phi);
1002 
1003  // ----------------------------------------------------------------------------
1004  // Angle between 5 - 6 - 7 - 8
1005  //
1006 
1007  // 5 - 6
1008  float3 sh56 = make_float3(
1009  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1010  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1011  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1012 
1013  float4 xyzq5 = xyzq[cl.i5];
1014  float4 xyzq6 = xyzq[cl.i6];
1015 
1016  float3 r56 = make_float3(
1017  xyzq5.x + sh56.x - xyzq6.x,
1018  xyzq5.y + sh56.y - xyzq6.y,
1019  xyzq5.z + sh56.z - xyzq6.z);
1020 
1021  // 6 - 7
1022  float3 sh67 = make_float3(
1023  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1024  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1025  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1026 
1027  float4 xyzq7 = xyzq[cl.i7];
1028 
1029  float3 r67 = make_float3(
1030  xyzq6.x + sh67.x - xyzq7.x,
1031  xyzq6.y + sh67.y - xyzq7.y,
1032  xyzq6.z + sh67.z - xyzq7.z);
1033 
1034  // 7 - 8
1035  float3 sh78 = make_float3(
1036  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1037  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1038  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1039 
1040  float4 xyzq8 = xyzq[cl.i8];
1041 
1042  float3 r78 = make_float3(
1043  xyzq7.x + sh78.x - xyzq8.x,
1044  xyzq7.y + sh78.y - xyzq8.y,
1045  xyzq7.z + sh78.z - xyzq8.z);
1046 
1047  // Calculate the cross products
1048  float3 D = cross(r56, r67);
1049  float3 E = cross(r67, r78);
1050  float3 F = cross(r67, D);
1051 
1052  // Calculate the inverse distances
1053  float inv_rD = rsqrtf(dot(D, D));
1054  float inv_rE = rsqrtf(dot(E, E));
1055  float inv_rF = rsqrtf(dot(F, F));
1056 
1057  // Calculate the sin and cos
1058  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1059  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1060 
1061  float psi = -atan2f(sin_psi,cos_psi);
1062 
1063  // ----------------------------------------------------------------------------
1064  // Calculate the energy
1065 
1066  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1067 
1068  // Shift angles
1069  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1070  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1071 
1072  // distance measured in grid points between angle and smallest value
1073  float phi_h = phi * inv_h;
1074  float psi_h = psi * inv_h;
1075 
1076  // find smallest numbered grid point in stencil
1077  int iphi = (int)floor(phi_h);
1078  int ipsi = (int)floor(psi_h);
1079 
1080  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1081 
1082  // Load coefficients
1083  float4 c[4];
1084  c[0] = cp.c[iphi][ipsi][0];
1085  c[1] = cp.c[iphi][ipsi][1];
1086  c[2] = cp.c[iphi][ipsi][2];
1087  c[3] = cp.c[iphi][ipsi][3];
1088 
1089  float dphi = phi_h - (float)iphi;
1090  float dpsi = psi_h - (float)ipsi;
1091 
1092  if (doEnergy) {
1093  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1094  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1095  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1096  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1097  energy += energyf*cl.scale;
1098  }
1099 
1100  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1101  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1102  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1103  dEdphi *= cl.scale*inv_h;
1104 
1105  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1106  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1107  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1108  dEdpsi *= cl.scale*inv_h;
1109 
1110  // float normCross1 = dot(A, A);
1111  float square_r23 = dot(r23, r23);
1112  float norm_r23 = sqrtf(square_r23);
1113  float inv_square_r23 = 1.0f/square_r23;
1114  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1115  float ff2 = -dot(r12, r23)*inv_square_r23;
1116  float ff3 = -dot(r34, r23)*inv_square_r23;
1117  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1118 
1119  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1120  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1121  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1122  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1123  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1124 
1125  T T_f1x, T_f1y, T_f1z;
1126  T T_f2x, T_f2y, T_f2z;
1127  T T_f3x, T_f3y, T_f3z;
1128  T T_f4x, T_f4y, T_f4z;
1129  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1130  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1131  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1132  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1133  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1134  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1135  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1136  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1137 
1138  float square_r67 = dot(r67, r67);
1139  float norm_r67 = sqrtf(square_r67);
1140  float inv_square_r67 = 1.0f/(square_r67);
1141  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1142  ff2 = -dot(r56, r67)*inv_square_r67;
1143  ff3 = -dot(r78, r67)*inv_square_r67;
1144  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1145 
1146  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1147  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1148  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1149  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1150  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1151 
1152  T T_f5x, T_f5y, T_f5z;
1153  T T_f6x, T_f6y, T_f6z;
1154  T T_f7x, T_f7y, T_f7z;
1155  T T_f8x, T_f8y, T_f8z;
1156  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1157  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1158  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1159  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1160  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1161  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1162  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1163  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1164 
1165  // Store virial
1166  if (doVirial) {
1167 #ifdef WRITE_FULL_VIRIALS
1168  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1169  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1170  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1171  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1172  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1173  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1174  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1175  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1176  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1177  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1178  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1179 #endif
1180  }
1181 
1182 }
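// Crossterm (CMAP) evaluation above: the two dihedral angles phi and psi are
// shifted into [0, 2*pi), converted to grid units by inv_h = dim/(2*pi), and the
// 4x4 coefficient block c[][] of the enclosing grid cell is evaluated as a bicubic
// polynomial in (dphi, dpsi) with nested Horner steps, yielding the energy and the
// partial derivatives dEdphi and dEdpsi; each derivative is then distributed to
// the four atoms of its dihedral in the same way as in diheForce.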
1183 
1184 
1185 #ifndef NAMD_CUDA
1186 #define BONDEDFORCESKERNEL_NUM_WARP 4
1187 #else
1188 #define BONDEDFORCESKERNEL_NUM_WARP 4
1189 #endif
1190 
1191 //
1192 // Calculates all forces in a single kernel call
1193 //
1194 template <typename T, bool doEnergy, bool doVirial>
1195 __global__ void bondedForcesKernel(
1196  const int start,
1197 
1198  const int numBonds,
1199  const CudaBond* __restrict__ bonds,
1200  const CudaBondValue* __restrict__ bondValues,
1201 
1202  const int numAngles,
1203  const CudaAngle* __restrict__ angles,
1204  const CudaAngleValue* __restrict__ angleValues,
1205 
1206  const int numDihedrals,
1207  const CudaDihedral* __restrict__ dihedrals,
1208  const CudaDihedralValue* __restrict__ dihedralValues,
1209 
1210  const int numImpropers,
1211  const CudaDihedral* __restrict__ impropers,
1212  const CudaDihedralValue* __restrict__ improperValues,
1213 
1214  const int numExclusions,
1215  const CudaExclusion* __restrict__ exclusions,
1216 
1217  const int numCrossterms,
1218  const CudaCrossterm* __restrict__ crossterms,
1219  const CudaCrosstermValue* __restrict__ crosstermValues,
1220 
1221  const float cutoff2,
1222  const float r2_delta, const int r2_delta_expc,
1223 
1224  const float* __restrict__ r2_table,
1225  const float4* __restrict__ exclusionTable,
1226 
1227 #ifndef NAMD_HIP
1228  cudaTextureObject_t r2_table_tex,
1229  cudaTextureObject_t exclusionTableTex,
1230 #endif
1231 
1232  const float4* __restrict__ xyzq,
1233  const int stride,
1234  const float3 lata, const float3 latb, const float3 latc,
1235  T* __restrict__ force,
1236  T* __restrict__ forceSlow,
1237  T* __restrict__ forceList,
1238  int* forceListCounter,
1239  int* forceListStarts,
1240  int* forceListStartsSlow,
1241  int* forceListNexts,
1242  double* __restrict__ energies_virials) {
1243 
1244  // Thread-block index
1245  int indexTB = start + blockIdx.x;
1246 
1247  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1248  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1249  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1250  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1251  const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
1252  const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
1253 
1254  // Each thread computes a single bonded interaction.
1255  // Each thread block computes a single bonded type.
1256  double energy;
1257  int energyIndex;
1258 
1259  if (doEnergy) {
1260  energy = 0.0;
1261  energyIndex = -1;
1262  }
1263 
1264 #ifdef WRITE_FULL_VIRIALS
1265  ComputeBondedCUDAKernel::BondedVirial<double> virial;
1266  int virialIndex;
1267  if (doVirial) {
1268  virial.xx = 0.0;
1269  virial.xy = 0.0;
1270  virial.xz = 0.0;
1271  virial.yx = 0.0;
1272  virial.yy = 0.0;
1273  virial.yz = 0.0;
1274  virial.zx = 0.0;
1275  virial.zy = 0.0;
1276  virial.zz = 0.0;
1277  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1278  }
1279 #endif
1280 
1281  if (indexTB < numBondsTB) {
1282  int i = threadIdx.x + indexTB*blockDim.x;
1283  if (doEnergy) {
1284  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1285  }
1286  if (i < numBonds) {
1287  bondForce<T, doEnergy, doVirial>
1288  (i, bonds, bondValues, xyzq,
1289  stride, lata, latb, latc,
1290  force, energy,
1291  forceList, forceListCounter, forceListStarts, forceListNexts,
1292  virial);
1293  }
1294  goto reduce;
1295  }
1296  indexTB -= numBondsTB;
1297 
1298  if (indexTB < numAnglesTB) {
1299  int i = threadIdx.x + indexTB*blockDim.x;
1300  if (doEnergy) {
1301  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1302  }
1303  if (i < numAngles) {
1304  angleForce<T, doEnergy, doVirial>
1305  (i, angles, angleValues, xyzq, stride,
1306  lata, latb, latc,
1307  force, energy,
1308  forceList, forceListCounter, forceListStarts, forceListNexts,
1309  virial);
1310  }
1311  goto reduce;
1312  }
1313  indexTB -= numAnglesTB;
1314 
1315  if (indexTB < numDihedralsTB) {
1316  int i = threadIdx.x + indexTB*blockDim.x;
1317  if (doEnergy) {
1318  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1319  }
1320  if (doVirial) {
1321  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1322  }
1323  if (i < numDihedrals) {
1324  diheForce<T, doEnergy, doVirial>
1325  (i, dihedrals, dihedralValues, xyzq, stride,
1326  lata, latb, latc,
1327  force, energy,
1328  forceList, forceListCounter, forceListStarts, forceListNexts,
1329  virial);
1330  }
1331  goto reduce;
1332  }
1333  indexTB -= numDihedralsTB;
1334 
1335  if (indexTB < numImpropersTB) {
1336  int i = threadIdx.x + indexTB*blockDim.x;
1337  if (doEnergy) {
1338  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1339  }
1340  if (i < numImpropers) {
1341  diheForce<T, doEnergy, doVirial>
1342  (i, impropers, improperValues, xyzq, stride,
1343  lata, latb, latc,
1344  force, energy,
1345  forceList, forceListCounter, forceListStarts, forceListNexts,
1346  virial);
1347  }
1348  goto reduce;
1349  }
1350  indexTB -= numImpropersTB;
1351 
1352  if (indexTB < numExclusionsTB) {
1353  int i = threadIdx.x + indexTB*blockDim.x;
1354  if (doEnergy) {
1355  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1356  }
1357  if (doVirial) {
1358  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1359  }
1360  if (i < numExclusions) {
1361  exclusionForce<T, doEnergy, doVirial>
1362  (i, exclusions, r2_delta, r2_delta_expc,
1363 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
1364  r2_table, exclusionTable,
1365 #else
1366  r2_table_tex, exclusionTableTex,
1367 #endif
1368  xyzq, stride, lata, latb, latc, cutoff2,
1369  forceSlow, energy,
1370  forceList, forceListCounter, forceListStartsSlow, forceListNexts,
1371  virial);
1372  }
1373  goto reduce;
1374  }
1375  indexTB -= numExclusionsTB;
1376 
1377  if (indexTB < numCrosstermsTB) {
1378  int i = threadIdx.x + indexTB*blockDim.x;
1379  if (doEnergy) {
1380  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1381  }
1382  if (doVirial) {
1383  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1384  }
1385  if (i < numCrossterms) {
1386  crosstermForce<T, doEnergy, doVirial>
1387  (i, crossterms, crosstermValues,
1388  xyzq, stride, lata, latb, latc,
1389  force, energy,
1390  forceList, forceListCounter, forceListStarts, forceListNexts,
1391  virial);
1392  }
1393  goto reduce;
1394  }
1395  // indexTB -= numCrosstermsTB;
1396 
1397  reduce:
1398 
1399  // Write energies to global memory
1400  if (doEnergy) {
1401  // energyIndex is constant within thread-block
1402  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1403 #pragma unroll
1404  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1405  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1406  }
1407  int laneID = (threadIdx.x & (WARPSIZE - 1));
1408  int warpID = threadIdx.x / WARPSIZE;
1409  if (laneID == 0) {
1410  shEnergy[warpID] = energy;
1411  }
1412  BLOCK_SYNC;
1413  if (warpID == 0) {
1414  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1415 #pragma unroll
1416  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1417  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1418  }
1419  if (laneID == 0) {
1420  const int bin = blockIdx.x % ATOMIC_BINS;
1421  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
1422  }
1423  }
1424  }
1425 
1426  // Write virials to global memory
1427 #ifdef WRITE_FULL_VIRIALS
1428  if (doVirial) {
1429 #pragma unroll
1430  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1431  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1432  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1433  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1434  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1435  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1436  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1437  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1438  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1439  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1440  }
1441  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1442  int laneID = (threadIdx.x & (WARPSIZE - 1));
1443  int warpID = threadIdx.x / WARPSIZE;
1444  if (laneID == 0) {
1445  shVirial[warpID] = virial;
1446  }
1447  BLOCK_SYNC;
1448 
1449  if (warpID == 0) {
1450  virial.xx = 0.0;
1451  virial.xy = 0.0;
1452  virial.xz = 0.0;
1453  virial.yx = 0.0;
1454  virial.yy = 0.0;
1455  virial.yz = 0.0;
1456  virial.zx = 0.0;
1457  virial.zy = 0.0;
1458  virial.zz = 0.0;
1459  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1460 #pragma unroll
1461  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1462  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1463  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1464  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1465  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1466  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1467  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1468  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1469  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1470  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1471  }
1472 
1473  if (laneID == 0) {
1474 #ifdef USE_FP_VIRIAL
1475  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 0], llitoulli(virial.xx*double_to_virial));
1476  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 1], llitoulli(virial.xy*double_to_virial));
1477  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 2], llitoulli(virial.xz*double_to_virial));
1478  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 3], llitoulli(virial.yx*double_to_virial));
1479  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 4], llitoulli(virial.yy*double_to_virial));
1480  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 5], llitoulli(virial.yz*double_to_virial));
1481  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 6], llitoulli(virial.zx*double_to_virial));
1482  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 7], llitoulli(virial.zy*double_to_virial));
1483  atomicAdd((unsigned long long int *)&energies_virials[virialIndex + 8], llitoulli(virial.zz*double_to_virial));
1484 #else
1485  const int bin = blockIdx.x % ATOMIC_BINS;
1486  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
1487  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
1488  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
1489  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
1490  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
1491  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
1492  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
1493  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
1494  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
1495 #endif
1496  }
1497  }
1498  }
1499 #endif
1500 
1501 }
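// bondedForcesKernel maps each thread block to one chunk of one term type: the
// block index (offset by `start`) is compared against the running per-type block
// counts (numBondsTB, numAnglesTB, ...) to pick which list the block works on,
// each thread handles one list entry, and every branch jumps to the common
// `reduce:` label, where energies and virials are reduced within the block by warp
// shuffles plus shared memory and accumulated into energies_virials with
// atomicAdd, using bin = blockIdx.x % ATOMIC_BINS to spread the atomics over
// ATOMIC_BINS copies of the output array.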
1502 
1503 template <typename T, bool doEnergy, bool doVirial, bool doElect>
1504 __global__ void modifiedExclusionForcesKernel(
1505  const int start,
1506 
1507  const int numModifiedExclusions,
1508  const CudaExclusion* __restrict__ modifiedExclusions,
1509 
1510  const bool doSlow,
1511  const float one_scale14, // 1 - scale14
1512  const float cutoff2,
1513  const int vdwCoefTableWidth,
1514  const float2* __restrict__ vdwCoefTable,
1515  cudaTextureObject_t vdwCoefTableTex,
1516  cudaTextureObject_t modifiedExclusionForceTableTex, cudaTextureObject_t modifiedExclusionEnergyTableTex,
1517  const float4* __restrict__ xyzq,
1518  const int stride,
1519  const float3 lata, const float3 latb, const float3 latc,
1520  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
1521  T* __restrict__ forceList,
1522  int* forceListCounter,
1523  int* forceListStartsNbond,
1524  int* forceListStartsSlow,
1525  int* forceListNexts,
1526  double* __restrict__ energies_virials
1527  ) {
1528 
1529  // index
1530  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
1531 
1532  double energyVdw, energyNbond, energySlow;
1533  if (doEnergy) {
1534  energyVdw = 0.0;
1535  if (doElect) {
1536  energyNbond = 0.0;
1537  energySlow = 0.0;
1538  }
1539  }
1540 
1541 #ifdef WRITE_FULL_VIRIALS
1542  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
1543  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
1544  if (doVirial) {
1545  virialNbond.xx = 0.0;
1546  virialNbond.xy = 0.0;
1547  virialNbond.xz = 0.0;
1548  virialNbond.yx = 0.0;
1549  virialNbond.yy = 0.0;
1550  virialNbond.yz = 0.0;
1551  virialNbond.zx = 0.0;
1552  virialNbond.zy = 0.0;
1553  virialNbond.zz = 0.0;
1554  if (doElect) {
1555  virialSlow.xx = 0.0;
1556  virialSlow.xy = 0.0;
1557  virialSlow.xz = 0.0;
1558  virialSlow.yx = 0.0;
1559  virialSlow.yy = 0.0;
1560  virialSlow.yz = 0.0;
1561  virialSlow.zx = 0.0;
1562  virialSlow.zy = 0.0;
1563  virialSlow.zz = 0.0;
1564  }
1565  }
1566 #endif
1567 
1568  if (i < numModifiedExclusions)
1569  {
1570  modifiedExclusionForce<T, doEnergy, doVirial, doElect>
1571  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
1572 #if __CUDA_ARCH__ >= 350
1573  vdwCoefTable,
1574 #else
1575  vdwCoefTableTex,
1576 #endif
1577  modifiedExclusionForceTableTex, modifiedExclusionEnergyTableTex,
1578  xyzq, stride, lata, latb, latc, cutoff2,
1579  energyVdw, forceNbond, energyNbond,
1580  forceSlow, energySlow,
1581  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
1582  virialNbond, virialSlow);
1583  }
1584 
1585  // Write energies to global memory
1586  if (doEnergy) {
1587  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
1588  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1589  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1590  // JM: Each warp reduces its energy and adds it to the shared memory array,
1591  // and warp zero reduces the shared memory across all warps in the block and adds it to gmem
1592 #pragma unroll
1593  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1594  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1595  if (doElect) {
1596  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
1597  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1598  }
1599  }
1600  int laneID = (threadIdx.x & (WARPSIZE - 1));
1601  int warpID = threadIdx.x / WARPSIZE;
1602  if (laneID == 0) {
1603  shEnergyVdw[warpID] = energyVdw;
1604  if (doElect) {
1605  shEnergyNbond[warpID] = energyNbond;
1606  shEnergySlow[warpID] = energySlow;
1607  }
1608  }
1609  BLOCK_SYNC;
1610  if (warpID == 0) {
1611  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
1612  if (doElect) {
1613  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
1614  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
1615  }
1616 #pragma unroll
1617  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1618  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1619  if (doElect) {
1620  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
1621  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1622  }
1623  }
1624  if (laneID == 0) {
1625  const int bin = blockIdx.x % ATOMIC_BINS;
1626  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
1627  if (doElect) {
1628  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
1629  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
1630  }
1631  }
1632  }
1633  }
1634 
1635  // Write virials to global memory
1636 #ifdef WRITE_FULL_VIRIALS
1637  if (doVirial) {
1638 #pragma unroll
1639  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1640  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
1641  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
1642  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
1643  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
1644  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
1645  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
1646  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
1647  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
1648  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
1649  if (doElect && doSlow) {
1650  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
1651  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
1652  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
1653  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
1654  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
1655  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
1656  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
1657  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
1658  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
1659  }
1660  }
1661  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
1662  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
1663  int laneID = (threadIdx.x & (WARPSIZE - 1));
1664  int warpID = threadIdx.x / WARPSIZE;
1665  if (laneID == 0) {
1666  shVirialNbond[warpID] = virialNbond;
1667  if (doElect && doSlow) {
1668  shVirialSlow[warpID] = virialSlow;
1669  }
1670  }
1671  BLOCK_SYNC;
1672 
1673  virialNbond.xx = 0.0;
1674  virialNbond.xy = 0.0;
1675  virialNbond.xz = 0.0;
1676  virialNbond.yx = 0.0;
1677  virialNbond.yy = 0.0;
1678  virialNbond.yz = 0.0;
1679  virialNbond.zx = 0.0;
1680  virialNbond.zy = 0.0;
1681  virialNbond.zz = 0.0;
1682  if (doElect && doSlow) {
1683  virialSlow.xx = 0.0;
1684  virialSlow.xy = 0.0;
1685  virialSlow.xz = 0.0;
1686  virialSlow.yx = 0.0;
1687  virialSlow.yy = 0.0;
1688  virialSlow.yz = 0.0;
1689  virialSlow.zx = 0.0;
1690  virialSlow.zy = 0.0;
1691  virialSlow.zz = 0.0;
1692  }
1693 
1694  if (warpID == 0) {
1695  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
1696  virialNbond = shVirialNbond[laneID];
1697  if (doElect && doSlow) {
1698  virialSlow = shVirialSlow[laneID];
1699  }
1700  }
1701 #pragma unroll
1702  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1703  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
1704  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
1705  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
1706  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
1707  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
1708  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
1709  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
1710  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
1711  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
1712  if (doElect && doSlow) {
1713  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
1714  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
1715  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
1716  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
1717  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
1718  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
1719  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
1720  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
1721  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
1722  }
1723  }
1724 
1725  if (laneID == 0)
1726  {
1727 #ifdef USE_FP_VIRIAL
1728  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX], llitoulli(virialNbond.xx*double_to_virial));
1729  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XY], llitoulli(virialNbond.xy*double_to_virial));
1730  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XZ], llitoulli(virialNbond.xz*double_to_virial));
1731  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YX], llitoulli(virialNbond.yx*double_to_virial));
1732  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YY], llitoulli(virialNbond.yy*double_to_virial));
1733  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_YZ], llitoulli(virialNbond.yz*double_to_virial));
1734  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZX], llitoulli(virialNbond.zx*double_to_virial));
1735  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZY], llitoulli(virialNbond.zy*double_to_virial));
1736  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], llitoulli(virialNbond.zz*double_to_virial));
1737  if (doElect && doSlow) {
1738  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX], llitoulli(virialSlow.xx*double_to_virial));
1739  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XY], llitoulli(virialSlow.xy*double_to_virial));
1740  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XZ], llitoulli(virialSlow.xz*double_to_virial));
1741  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YX], llitoulli(virialSlow.yx*double_to_virial));
1742  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YY], llitoulli(virialSlow.yy*double_to_virial));
1743  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_YZ], llitoulli(virialSlow.yz*double_to_virial));
1744  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZX], llitoulli(virialSlow.zx*double_to_virial));
1745  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZY], llitoulli(virialSlow.zy*double_to_virial));
1746  atomicAdd((unsigned long long int *)&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_ZZ], llitoulli(virialSlow.zz*double_to_virial));
1747  }
1748 #else
1749  const int bin = blockIdx.x % ATOMIC_BINS;
1750  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
1751  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
1752  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
1753  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
1754  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
1755  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
1756  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
1757  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
1758  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
1759  if (doElect && doSlow) {
1760  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
1761  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
1762  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
1763  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
1764  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
1765  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
1766  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
1767  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
1768  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
1769  }
1770 #endif
1771  }
1772  }
1773  }
1774 #endif
1775 
1776 }
1777 
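The energy write-back above uses a two-level reduction: shuffle within each warp, stage one partial sum per warp in shared memory, reduce those partials in warp 0, then a single atomicAdd per block. The following is a minimal, self-contained sketch of that pattern in plain CUDA; BLOCK_WARPS, blockSumKernel, and the use of __shfl_xor_sync in place of the WARP_SHUFFLE_XOR macro are illustrative assumptions, not code from this file.

// Minimal sketch of the two-level block reduction used for the bonded energies.
// Assumes blockDim.x == BLOCK_WARPS*32 and a device with native double atomicAdd
// (compute capability >= 6.0). Not part of the NAMD sources.
#define BLOCK_WARPS 4

__global__ void blockSumKernel(const double* __restrict__ in, int n, double* out) {
  int i = threadIdx.x + blockIdx.x*blockDim.x;
  double v = (i < n) ? in[i] : 0.0;

  // Level 1: butterfly reduction inside each warp.
  for (int offset = 16; offset >= 1; offset /= 2)
    v += __shfl_xor_sync(0xffffffff, v, offset, 32);

  // Lane 0 of every warp publishes its partial sum.
  __shared__ double shSum[BLOCK_WARPS];
  int lane = threadIdx.x & 31;
  int warp = threadIdx.x / 32;
  if (lane == 0) shSum[warp] = v;
  __syncthreads();

  // Level 2: warp 0 reduces the per-warp partials and issues one atomicAdd.
  if (warp == 0) {
    v = (lane < BLOCK_WARPS) ? shSum[lane] : 0.0;
    for (int offset = 16; offset >= 1; offset /= 2)
      v += __shfl_xor_sync(0xffffffff, v, offset, 32);
    if (lane == 0) atomicAdd(out, v);
  }
}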
1778 template <typename T>
1779 __global__ void gatherBondedForcesKernel(
1780  const int start,
1781  const int atomStorageSize,
1782  const int stride,
1783  const T* __restrict__ forceList,
1784  const int* __restrict__ forceListStarts,
1785  const int* __restrict__ forceListNexts,
1786  T* __restrict__ force) {
1787 
1788  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
1789 
1790  if (i < atomStorageSize) {
1791  T fx = 0;
1792  T fy = 0;
1793  T fz = 0;
1794  int pos = forceListStarts[i];
1795  while (pos != -1) {
1796  fx += forceList[pos * 3 + 0];
1797  fy += forceList[pos * 3 + 1];
1798  fz += forceList[pos * 3 + 2];
1799  pos = forceListNexts[pos];
1800  }
1801  force[i ] = fx;
1802  force[i + stride ] = fy;
1803  force[i + stride * 2] = fz;
1804  }
1805 }
1806 
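gatherBondedForcesKernel above consumes a per-atom singly linked list: forceListStarts[i] holds the index of atom i's most recent contribution, forceListNexts chains to older ones, and -1 terminates the chain. For context, a hedged sketch of what the producer side of such a list can look like; this illustrates the data structure only and is not the storeForces() helper used elsewhere in this file.

// Illustrative producer for the per-atom force list gathered above: a slot is
// reserved from a global counter, spliced in at the head of atom ind's chain,
// and filled with the x/y/z components. Sketch only; not NAMD's storeForces().
template <typename T>
__device__ void pushForceContribution(const T fx, const T fy, const T fz, const int ind,
    T* forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts) {
  const int newPos  = atomicAdd(forceListCounter, 1);             // reserve a slot
  const int oldHead = atomicExch(&forceListStarts[ind], newPos);  // new head of the chain
  forceListNexts[newPos] = oldHead;                               // link to the previous head (-1 ends the list)
  forceList[newPos*3 + 0] = fx;
  forceList[newPos*3 + 1] = fy;
  forceList[newPos*3 + 2] = fz;
}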
1807 __global__ void reduceBondedBinsKernel(double* energies_virials) {
1808  const int bin = threadIdx.x;
1809 
1810  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
1811  __shared__ typename WarpReduce::TempStorage tempStorage;
1812 
1813  double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
1814  if (threadIdx.x == 0) {
1815  energies_virials[blockIdx.x] = v;
1816  }
1817 }
1818 
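reduceBondedBinsKernel is launched with one block per quantity and ATOMIC_BINS threads per block; each thread reads one bin's copy of that quantity and the warp reduction folds them into bin 0. The host-side equivalent below shows the same layout arithmetic; nQuantities stands in for energies_virials_SIZE and the function name is only illustrative.

// Host-side equivalent of reduceBondedBinsKernel, for illustration only.
// The device array holds ATOMIC_BINS independent copies of the
// energies/virials block (spreading atomic contention); this folds every
// copy into copy 0, which is what gets copied back to the host.
void reduceBinsOnHost(double* energies_virials, int nQuantities, int nBins) {
  for (int q = 0; q < nQuantities; q++) {
    double sum = 0.0;
    for (int bin = 0; bin < nBins; bin++)
      sum += energies_virials[bin*nQuantities + q];
    energies_virials[q] = sum;   // result lands in bin 0, slot q
  }
}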
1819 // ##############################################################################################
1820 // ##############################################################################################
1821 // ##############################################################################################
1822 
1823 //
1824 // Class constructor
1825 //
1826 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
1827 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
1828 
1829  cudaCheck(cudaSetDevice(deviceID));
1830 
1831  tupleData = NULL;
1832  tupleDataSize = 0;
1833 
1834  numBonds = 0;
1835  numAngles = 0;
1836  numDihedrals = 0;
1837  numImpropers = 0;
1838  numModifiedExclusions = 0;
1839  numExclusions = 0;
1840  numCrossterms = 0;
1841 
1842  bondValues = NULL;
1843  angleValues = NULL;
1844  dihedralValues = NULL;
1845  improperValues = NULL;
1846  crosstermValues = NULL;
1847 
1848  xyzq = NULL;
1849  xyzqSize = 0;
1850 
1851  forces = NULL;
1852  forcesSize = 0;
1853 
1854  forceList = NULL;
1855  forceListStarts = NULL;
1856  forceListNexts = NULL;
1857  forceListSize = 0;
1858  forceListStartsSize = 0;
1859  forceListNextsSize = 0;
1860  allocate_device<int>(&forceListCounter, 1);
1861 
1862  allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
1863 }
1864 
1865 //
1866 // Class destructor
1867 //
1868 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
1869  cudaCheck(cudaSetDevice(deviceID));
1870 
1871  deallocate_device<double>(&energies_virials);
1872  // deallocate_device<BondedVirial>(&virial);
1873 
1874  if (tupleData != NULL) deallocate_device<char>(&tupleData);
1875  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
1876  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
1877 
1878  if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
1879  if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
1880  if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
1881  if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
1882 
1883  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
1884  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
1885  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
1886  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
1887  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
1888 }
1889 
1890 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
1891  allocate_device<CudaBondValue>(&bondValues, numBondValues);
1892  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
1893 }
1894 
1895 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
1896  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
1897  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
1898 }
1899 
1900 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
1901  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
1902  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
1903 }
1904 
1905 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
1906  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
1907  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
1908 }
1909 
1910 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
1911  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
1912  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
1913 }
1914 
1915 //
1916 // Update bonded lists
1917 //
1918 void ComputeBondedCUDAKernel::update(
1919  const int numBondsIn,
1920  const int numAnglesIn,
1921  const int numDihedralsIn,
1922  const int numImpropersIn,
1923  const int numModifiedExclusionsIn,
1924  const int numExclusionsIn,
1925  const int numCrosstermsIn,
1926  const char* h_tupleData,
1927  cudaStream_t stream) {
1928 
1929  numBonds = numBondsIn;
1930  numAngles = numAnglesIn;
1931  numDihedrals = numDihedralsIn;
1932  numImpropers = numImpropersIn;
1933  numModifiedExclusions = numModifiedExclusionsIn;
1934  numExclusions = numExclusionsIn;
1935  numCrossterms = numCrosstermsIn;
1936 
1937  int numBondsWA = warpAlign(numBonds);
1938  int numAnglesWA = warpAlign(numAngles);
1939  int numDihedralsWA = warpAlign(numDihedrals);
1940  int numImpropersWA = warpAlign(numImpropers);
1941  int numModifiedExclusionsWA = warpAlign(numModifiedExclusions);
1942  int numExclusionsWA = warpAlign(numExclusions);
1943  int numCrosstermsWA = warpAlign(numCrossterms);
1944 
1945  int sizeTot = numBondsWA*sizeof(CudaBond) + numAnglesWA*sizeof(CudaAngle) +
1946  numDihedralsWA*sizeof(CudaDihedral) + numImpropersWA*sizeof(CudaDihedral) +
1947  numModifiedExclusionsWA*sizeof(CudaExclusion) + numExclusionsWA*sizeof(CudaExclusion) +
1948  numCrosstermsWA*sizeof(CudaCrossterm);
1949 
1950  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, 1.4f);
1951  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
1952 
1953  // Setup pointers
1954  int pos = 0;
1955  bonds = (CudaBond *)&tupleData[pos];
1956  pos += numBondsWA*sizeof(CudaBond);
1957 
1958  angles = (CudaAngle* )&tupleData[pos];
1959  pos += numAnglesWA*sizeof(CudaAngle);
1960 
1961  dihedrals = (CudaDihedral* )&tupleData[pos];
1962  pos += numDihedralsWA*sizeof(CudaDihedral);
1963 
1964  impropers = (CudaDihedral* )&tupleData[pos];
1965  pos += numImpropersWA*sizeof(CudaDihedral);
1966 
1967  modifiedExclusions = (CudaExclusion* )&tupleData[pos];
1968  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
1969 
1970  exclusions = (CudaExclusion* )&tupleData[pos];
1971  pos += numExclusionsWA*sizeof(CudaExclusion);
1972 
1973  crossterms = (CudaCrossterm* )&tupleData[pos];
1974  pos += numCrosstermsWA*sizeof(CudaCrossterm);
1975 }
1976 
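update() packs all tuple arrays back to back in a single device buffer, rounding each count up to a multiple of WARPSIZE so that every tuple type begins at a warp-aligned element index. A small sketch of that rounding, assuming a warp size of 32; warpAlignExample mirrors the warpAlign() helper declared in the header but is not that function, and the example values are hypothetical.

// Sketch of the warp-aligned packing used by update(). Rounding each tuple
// count up to a multiple of the warp size keeps every sub-array of tupleData
// starting on a warp boundary.
static int warpAlignExample(int n, int warpSize = 32) {
  return ((n + warpSize - 1)/warpSize)*warpSize;
}
// e.g. numBonds = 70 -> warpAlignExample(70) == 96, so 96*sizeof(CudaBond)
// bytes are reserved for bonds and the angle array starts right after them.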
1977 //
1978 // Return stride for force array
1979 //
1980 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
1981 #ifdef USE_STRIDED_FORCE
1982  // Align stride to 256 bytes
1983  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
1984 #else
1985  return 1;
1986 #endif
1987 }
1988 
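A worked example of the 256-byte alignment above, assuming a 4-byte FORCE_TYPE and an atomStorageSize of 1000 (both values are hypothetical):

// getForceStride() rounds the per-component force array up to the next
// 256-byte boundary and converts back to elements:
//   bytes   = 1000*4                   = 4000
//   aligned = ((4000 - 1)/256 + 1)*256 = 4096
//   stride  = 4096/4                   = 1024 elements
// so the y and z components live at element offsets 1024 and 2048.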
1989 //
1990 // Return size of single force array
1991 //
1992 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
1993 #ifdef USE_STRIDED_FORCE
1994  return (3*getForceStride(atomStorageSize));
1995 #else
1996  return (3*atomStorageSize);
1997 #endif
1998 }
1999 
2000 //
2001 // Return size of all the force arrays
2002 //
2003 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
2004 
2005  int forceSize = getForceSize(atomStorageSize);
2006 
2007  if (numModifiedExclusions > 0 || numExclusions > 0) {
2008  if (doSlow) {
2009  // All three force arrays [normal, nbond, slow]
2010  forceSize *= 3;
2011  } else {
2012  // Two force arrays [normal, nbond]
2013  forceSize *= 2;
2014  }
2015  }
2016 
2017  return forceSize;
2018 }
2019 
2020 //
2021 // Compute bonded forces
2022 //
2023 void ComputeBondedCUDAKernel::bondedForce(
2024  const double scale14, const int atomStorageSize,
2025  const bool doEnergy, const bool doVirial, const bool doSlow,
2026  const float3 lata, const float3 latb, const float3 latc,
2027  const float cutoff2, const float r2_delta, const int r2_delta_expc,
2028  const float4* h_xyzq, FORCE_TYPE* h_forces,
2029  double *h_energies_virials,
2030  cudaStream_t stream) {
2031 
2032  int forceStorageSize = getAllForceSize(atomStorageSize, true);
2033  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
2034  int forceStride = getForceStride(atomStorageSize);
2035 
2036  int forceSize = getForceSize(atomStorageSize);
2037  int startNbond = forceSize;
2038  int startSlow = 2*forceSize;
2039 
2040  // Re-allocate coordinate and force arrays if necessary
2041  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
2042  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2043 
2044 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2045  // Number of force-list entries each force function stores per tuple:
2046  // numBonds bondForce 2
2047  // numAngles angleForce 3
2048  // numDihedrals diheForce 4
2049  // numImpropers diheForce 4
2050  // numExclusions exclusionForce 2
2051  // numCrossterms crosstermForce 8
2052  // numModifiedExclusions modifiedExclusionForce 4
2053  int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4);
2054  reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
2055  reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
2056  reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
2057  int* forceListStartsNbond = forceListStarts + atomStorageSize;
2058  int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
2059 
2060  clear_device_array<int>(forceListCounter, 1, stream);
2061  cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
2062 #else
2063  int* forceListStartsNbond = NULL;
2064  int* forceListStartsSlow = NULL;
2065 #endif
2066 
2067  // Copy coordinates to device
2068  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2069 
2070  // Clear force array
2071 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
2072  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
2073 #endif
2074  if (doEnergy || doVirial) {
2075  clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
2076  }
2077 
2078  float one_scale14 = (float)(1.0 - scale14);
2079 
2080  // If doSlow = false, these exclusions are not calculated
2081  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
2082 
2083  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2084 
2085  int numBondsTB = (numBonds + nthread - 1)/nthread;
2086  int numAnglesTB = (numAngles + nthread - 1)/nthread;
2087  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
2088  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
2089  int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
2090  int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
2091 
2092  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
2093  numExclusionsTB + numCrosstermsTB;
2094  int shmem_size = 0;
2095 
2096  // printf("%d %d %d %d %d %d nblock %d\n",
2097  // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
2098 
2099  int max_nblock = deviceCUDA->getMaxNumBlocks();
2100 
2101  int start = 0;
2102  while (start < nblock)
2103  {
2104  int nleft = nblock - start;
2105  int nblock_use = min(max_nblock, nleft);
2106 
2107 
2108 #ifdef NAMD_HIP
2109 #define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable()
2110 #else
2111 #define NONBONDEDTABLES cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
2112 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex()
2113 #endif
2114 
2115 #ifdef NAMD_HIP
2116 #define CALL(DOENERGY, DOVIRIAL) \
2117  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
2118  (start, numBonds, bonds, bondValues, \
2119  numAngles, angles, angleValues, \
2120  numDihedrals, dihedrals, dihedralValues, \
2121  numImpropers, impropers, improperValues, \
2122  numExclusionsDoSlow, exclusions, \
2123  numCrossterms, crossterms, crosstermValues, \
2124  cutoff2, \
2125  r2_delta, r2_delta_expc, \
2126  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable() , \
2127  xyzq, forceStride, \
2128  lata, latb, latc, \
2129  forces, &forces[startSlow], \
2130  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2131  energies_virials);
2132 #else
2133 #define CALL(DOENERGY, DOVIRIAL) \
2134  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL> <<< nblock_use, nthread, shmem_size, stream >>> \
2135  (start, numBonds, bonds, bondValues, \
2136  numAngles, angles, angleValues, \
2137  numDihedrals, dihedrals, dihedralValues, \
2138  numImpropers, impropers, improperValues, \
2139  numExclusionsDoSlow, exclusions, \
2140  numCrossterms, crossterms, crosstermValues, \
2141  cutoff2, \
2142  r2_delta, r2_delta_expc, \
2143  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable() , \
2144  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex() , \
2145  xyzq, forceStride, \
2146  lata, latb, latc, \
2147  forces, &forces[startSlow], \
2148  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2149  energies_virials);
2150 #endif
2151 
2152  if (!doEnergy && !doVirial) CALL(0, 0);
2153  if (!doEnergy && doVirial) CALL(0, 1);
2154  if (doEnergy && !doVirial) CALL(1, 0);
2155  if (doEnergy && doVirial) CALL(1, 1);
2156 
2157 #undef CALL
2158  cudaCheck(cudaGetLastError());
2159 
2160  start += nblock_use;
2161  }
2162 
2163  nthread = BONDEDFORCESKERNEL_NUM_WARP*WARPSIZE;
2164  nblock = (numModifiedExclusions + nthread - 1)/nthread;
2165 
2166  bool doElect = (one_scale14 == 0.0f) ? false : true;
2167 
2168  start = 0;
2169  while (start < nblock)
2170  {
2171  int nleft = nblock - start;
2172  int nblock_use = min(max_nblock, nleft);
2173 
2174 #define CALL(DOENERGY, DOVIRIAL, DOELECT) \
2175  modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT> \
2176  <<< nblock_use, nthread, shmem_size, stream >>> (\
2177  start, numModifiedExclusions, modifiedExclusions, \
2178  doSlow, one_scale14, cutoff2, \
2179  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getExclusionVdwCoefTable(), \
2180  cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
2181  cudaNonbondedTables.getModifiedExclusionForceTableTex(), cudaNonbondedTables.getModifiedExclusionEnergyTableTex(), \
2182  xyzq, forceStride, lata, latb, latc, \
2183  &forces[startNbond], &forces[startSlow], \
2184  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
2185  energies_virials);
2186 
2187 
2188  if (!doEnergy && !doVirial && !doElect) CALL(0, 0, 0);
2189  if (!doEnergy && doVirial && !doElect) CALL(0, 1, 0);
2190  if (doEnergy && !doVirial && !doElect) CALL(1, 0, 0);
2191  if (doEnergy && doVirial && !doElect) CALL(1, 1, 0);
2192 
2193  if (!doEnergy && !doVirial && doElect) CALL(0, 0, 1);
2194  if (!doEnergy && doVirial && doElect) CALL(0, 1, 1);
2195  if (doEnergy && !doVirial && doElect) CALL(1, 0, 1);
2196  if (doEnergy && doVirial && doElect) CALL(1, 1, 1);
2197 
2198 #undef CALL
2199  cudaCheck(cudaGetLastError());
2200 
2201  start += nblock_use;
2202  }
2203 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2204  nthread = BONDEDFORCESKERNEL_NUM_WARP*WARPSIZE;
2205  nblock = (atomStorageSize + nthread - 1)/nthread;
2206 
2207  start = 0;
2208  while (start < nblock)
2209  {
2210  int nleft = nblock - start;
2211  int nblock_use = min(max_nblock, nleft);
2212 
2213  // cudaCheck(hipDeviceSynchronize());
2214  // auto t0 = std::chrono::high_resolution_clock::now();
2215 
2216  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2217  start, atomStorageSize, forceStride,
2218  forceList, forceListStarts, forceListNexts,
2219  forces);
2220  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2221  start, atomStorageSize, forceStride,
2222  forceList, forceListStartsNbond, forceListNexts,
2223  &forces[startNbond]);
2224  if (doSlow) {
2225  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2226  start, atomStorageSize, forceStride,
2227  forceList, forceListStartsSlow, forceListNexts,
2228  &forces[startSlow]);
2229  }
2230  cudaCheck(cudaGetLastError());
2231 
2232  // cudaCheck(hipStreamSynchronize(stream));
2233  // auto t1 = std::chrono::high_resolution_clock::now();
2234  // std::chrono::duration<double> diff1 = t1 - t0;
2235  // std::cout << "gatherBondedForcesKernel";
2236  // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
2237 
2238  start += nblock_use;
2239  }
2240 #endif
2241 
2242  copy_DtoH<FORCE_TYPE>(forces, h_forces, forceCopySize, stream);
2243  if (doEnergy || doVirial) {
2244  if (ATOMIC_BINS > 1) {
2245  // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
2246  reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
2247  }
2248  copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
2249  }
2250 
2251 }
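
For orientation, a hedged sketch of the host-side call sequence this class expects: parameter tables are uploaded once with the setup*Values() methods, the packed tuple buffer is refreshed with update() whenever the tuple lists change, and bondedForce() runs every step. The wrapper function, its argument names, and the final synchronization are illustrative; the real driver code lives in ComputeBondedCUDA, and packing h_tupleData with warp-aligned tuple arrays is not shown.

// Illustrative host-side usage of ComputeBondedCUDAKernel (sketch only;
// error handling, the other setup*Values() calls, and tuple packing omitted).
void runBondedStep(ComputeBondedCUDAKernel& kernel, cudaStream_t stream,
    int numBondValues, CudaBondValue* h_bondValues,
    int numBonds, const char* h_tupleData,
    int atomStorageSize, const float4* h_xyzq, FORCE_TYPE* h_forces,
    double* h_energies_virials, double scale14,
    float3 lata, float3 latb, float3 latc,
    float cutoff2, float r2_delta, int r2_delta_expc) {

  // Once: upload force-field parameter tables (bonds shown, the rest analogous).
  kernel.setupBondValues(numBondValues, h_bondValues);

  // Whenever tuple lists change: upload the packed, warp-aligned tuple buffer.
  kernel.update(numBonds, 0, 0, 0, 0, 0, 0, h_tupleData, stream);

  // Every step: launch the bonded kernels; energies/virials are optional.
  kernel.bondedForce(scale14, atomStorageSize,
      true /*doEnergy*/, true /*doVirial*/, true /*doSlow*/,
      lata, latb, latc, cutoff2, r2_delta, r2_delta_expc,
      h_xyzq, h_forces, h_energies_virials, stream);

  cudaCheck(cudaStreamSynchronize(stream));
}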