NAMD
ComputeBondedCUDAKernel.cu
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #define __thread __declspec(thread)
4 #endif // WIN32
5 
6 #ifdef NAMD_HIP
7 #include <hipcub/hipcub.hpp>
8 #define cub hipcub
9 #endif // NAMD_HIP
10 
11 #ifdef NAMD_CUDA
12 #include <cuda.h>
13 #if __CUDACC_VER_MAJOR__ >= 11
14 #include <cub/cub.cuh>
15 #else
16 #include <namd_cub/cub.cuh>
17 #endif
18 #endif // NAMD_CUDA
19 
20 #include <math.h>
21 
22 #include "ComputeBondedCUDAKernel.h"
23 #include "CudaComputeNonbondedInteractions.h"
24 
25 #ifdef FUTURE_CUDADEVICE
26 #include "CudaDevice.h"
27 #else
28 #include "DeviceCUDA.h"
29 extern __thread DeviceCUDA *deviceCUDA;
30 #endif
31 
32 #ifndef USE_TABLE_ARRAYS
33 __device__ __forceinline__
34 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
35 #if defined(NAMD_CUDA)
36  return tex1D<float4>(tex, k);
37 #else
38  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
39  const float x = k * (float)tableSize - 0.5f;
40  const float f = floorf(x);
41  const float a = x - f;
42  const unsigned int i = (unsigned int)f;
43  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
44  const int i1 = i0 + 1;
45  const float4 t0 = tex1Dfetch<float4>(tex, i0);
46  const float4 t1 = tex1Dfetch<float4>(tex, i1);
47  return make_float4(
48  a * (t1.x - t0.x) + t0.x,
49  a * (t1.y - t0.y) + t0.y,
50  a * (t1.z - t0.z) + t0.z,
51  a * (t1.w - t0.w) + t0.w);
52 #endif
53 }
54 #endif
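// [Editor's illustration, not part of the original NAMD source] The NAMD_CUDA
// branch above leaves the interpolation to the texture hardware, which only
// works if the table was uploaded into a CUDA array and the texture object was
// created with linear filtering and normalized coordinates. A minimal host-side
// sketch of such a setup; the helper name and the h_table/tableSize parameters
// are hypothetical and error checking is omitted:
#if defined(NAMD_CUDA)
static cudaTextureObject_t exampleCreateInterpolatedTable(const float4* h_table, int tableSize) {
  cudaArray_t array;
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  cudaMallocArray(&array, &desc, tableSize);
  cudaMemcpy2DToArray(array, 0, 0, h_table, tableSize*sizeof(float4),
    tableSize*sizeof(float4), 1, cudaMemcpyHostToDevice);
  cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = array;
  cudaTextureDesc texDesc = {};
  texDesc.addressMode[0] = cudaAddressModeClamp;  // clamp out-of-range lookups
  texDesc.filterMode = cudaFilterModeLinear;      // hardware lerp between neighboring texels
  texDesc.readMode = cudaReadModeElementType;
  texDesc.normalizedCoords = 1;                   // address the table with k in [0, 1]
  cudaTextureObject_t tex = 0;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
  return tex;
}
#endif // NAMD_CUDA (editor's sketch)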
55 
56 __device__ __forceinline__
57 float4 tableLookup(const float4* table, const float k)
58 {
59  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
60  const float x = k * static_cast<float>(tableSize) - 0.5f;
61  const float f = floorf(x);
62  const float a = x - f;
63  const int i = static_cast<int>(f);
64  const int i0 = max(0, min(tableSize - 1, i));
65  const int i1 = max(0, min(tableSize - 1, i + 1));
66  const float4 t0 = __ldg(&table[i0]);
67  const float4 t1 = __ldg(&table[i1]);
68  return make_float4(
69  a * (t1.x - t0.x) + t0.x,
70  a * (t1.y - t0.y) + t0.y,
71  a * (t1.z - t0.z) + t0.z,
72  a * (t1.w - t0.w) + t0.w);
73 }
74 
75 // Global variables for the alchemical transformation
76 namespace AlchBondedCUDA {
77  // Alchemical parameters and lambdas
78  __constant__ CudaAlchParameters alchParams;
79  __constant__ CudaAlchLambdas alchLambdas;
80 }
81 
82 
83 template <typename T>
84 __forceinline__ __device__
85 void convertForces(const float x, const float y, const float z,
86  T &Tx, T &Ty, T &Tz) {
87 
88  Tx = (T)(x);
89  Ty = (T)(y);
90  Tz = (T)(z);
91 }
92 
93 template <typename T>
94 __forceinline__ __device__
95 void calcComponentForces(
96  float fij,
97  const float dx, const float dy, const float dz,
98  T &fxij, T &fyij, T &fzij) {
99 
100  fxij = (T)(fij*dx);
101  fyij = (T)(fij*dy);
102  fzij = (T)(fij*dz);
103 
104 }
105 
106 template <typename T>
107 __forceinline__ __device__
108 void calcComponentForces(
109  float fij1,
110  const float dx1, const float dy1, const float dz1,
111  float fij2,
112  const float dx2, const float dy2, const float dz2,
113  T &fxij, T &fyij, T &fzij) {
114 
115  fxij = (T)(fij1*dx1 + fij2*dx2);
116  fyij = (T)(fij1*dy1 + fij2*dy2);
117  fzij = (T)(fij1*dz1 + fij2*dz2);
118 }
119 
120 __forceinline__ __device__
121 int warpAggregatingAtomicInc(int* counter) {
122 #if BOUNDINGBOXSIZE == 64
123  WarpMask mask = __ballot(1);
124  int total = __popcll(mask);
125  int prefix = __popcll(mask & cub::LaneMaskLt());
126  int firstLane = __ffsll(mask) - 1;
127 #else
128  WarpMask mask = __ballot(1);
129  int total = __popc(mask);
130  int prefix = __popc(mask & cub::LaneMaskLt());
131  int firstLane = __ffs(mask) - 1;
132 #endif
133  int start = 0;
134  if (prefix == 0) {
135  start = atomicAdd(counter, total);
136  }
137  start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
138  return start + prefix;
139 }
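// [Editor's illustration, not part of the original NAMD source]
// warpAggregatingAtomicInc() above replaces one atomicAdd per active lane with
// a single atomicAdd per warp: the first active lane reserves `total` slots,
// the base offset is broadcast with WARP_SHUFFLE, and each lane then claims
// base + its rank among the active lanes. A hedged usage sketch; the kernel
// and its buffers are hypothetical (stream compaction of positive values):
__global__ void exampleWarpAggregatedCompact(const int* __restrict__ in, const int n,
  int* __restrict__ out, int* __restrict__ outCount) {
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n && in[i] > 0) {
    // Only lanes inside this branch take part in the ballot, so each of them
    // receives a unique output slot while the counter is hit once per warp.
    const int slot = warpAggregatingAtomicInc(outCount);
    out[slot] = in[i];
  }
}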
140 
141 template <typename T>
142 __forceinline__ __device__
143 void storeForces(const T fx, const T fy, const T fz,
144  const int ind, const int stride,
145  T* force,
146  T* forceList, int* forceListCounter,
147  int* forceListStarts, int* forceListNexts) {
148 
149 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
150 #if defined(NAMD_HIP)
151  // Try to decrease conflicts between lanes if there are repeating ind
152  WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
153  if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
154  const int laneId = threadIdx.x % WARPSIZE;
155  const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
156  const bool isHead = laneId == 0 || ind != prevLaneInd;
157  if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
158  // There are segments of repeating ind
159  typedef cub::WarpReduce<T> WarpReduce;
160  __shared__ typename WarpReduce::TempStorage temp_storage;
161  const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
162  const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
163  const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
164  if (isHead) {
165  atomicAdd(&force[ind ], sumfx);
166  atomicAdd(&force[ind + stride ], sumfy);
167  atomicAdd(&force[ind + stride*2], sumfz);
168  }
169  return;
170  }
171  }
172  // Not all lanes are active (the last block) or there are no repeating ind values
173  atomicAdd(&force[ind ], fx);
174  atomicAdd(&force[ind + stride ], fy);
175  atomicAdd(&force[ind + stride*2], fz);
176 #else
177  atomicAdd(&force[ind ], fx);
178  atomicAdd(&force[ind + stride ], fy);
179  atomicAdd(&force[ind + stride*2], fz);
180 #endif
181 #else
182  const int newPos = warpAggregatingAtomicInc(forceListCounter);
183  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
184  forceList[newPos * 3 + 0] = fx;
185  forceList[newPos * 3 + 1] = fy;
186  forceList[newPos * 3 + 2] = fz;
187 #endif
188 }
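// [Editor's illustration, not part of the original NAMD source]
// The NAMD_HIP branch of storeForces() above uses cub::WarpReduce's
// HeadSegmentedSum to pre-combine contributions from consecutive lanes that
// write to the same atom index, so a run of equal `ind` values costs one
// atomicAdd instead of one per lane. A hedged standalone sketch of that idea
// for a single scalar, assuming a fully active warp (hypothetical helper):
__device__ __forceinline__
void exampleSegmentedAtomicAdd(const float val, const int ind, float* out) {
  const int laneId = threadIdx.x % WARPSIZE;
  const int prevInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
  const bool isHead = (laneId == 0) || (ind != prevInd);   // first lane of each run of equal ind
  typedef cub::WarpReduce<float> WarpReduce;
  __shared__ typename WarpReduce::TempStorage temp_storage;
  const float sum = WarpReduce(temp_storage).HeadSegmentedSum(val, isHead);
  if (isHead) atomicAdd(&out[ind], sum);                   // one atomic per segment
}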
189 
190 //
191 // Calculates bonds
192 //
193 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
194 __device__ void bondForce(
195  const int index,
196  const CudaBond* __restrict__ bondList,
197  const CudaBondValue* __restrict__ bondValues,
198  const float4* __restrict__ xyzq,
199  const int stride,
200  const float3 lata, const float3 latb, const float3 latc,
201  T* __restrict__ force, double &energy,
202  T* __restrict__ forceList, int* forceListCounter,
203  int* forceListStarts, int* forceListNexts,
204 #ifdef WRITE_FULL_VIRIALS
205  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
206 #else
207  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
208 #endif
209  double &energy_F, double &energy_TI_1, double &energy_TI_2
210  ) {
211 
212  CudaBond bl = bondList[index];
213  CudaBondValue bondValue = bondValues[bl.itype];
214 // if (bondValue.x0 == 0.0f) return;
215 
216  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
217  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
218  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
219 
220  float4 xyzqi = xyzq[bl.i];
221  float4 xyzqj = xyzq[bl.j];
222 
223  float xij = xyzqi.x + shx - xyzqj.x;
224  float yij = xyzqi.y + shy - xyzqj.y;
225  float zij = xyzqi.z + shz - xyzqj.z;
226 
227  float r = sqrtf(xij*xij + yij*yij + zij*zij);
228 
229  float db = r - bondValue.x0;
230  if (bondValue.x1) {
231  // in this case, the bond represents a harmonic wall potential
232  // where x0 is the lower wall and x1 is the upper
233  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
234  }
235  float fij = db * bondValue.k * bl.scale;
236 
237  // Define a temporary variable to store the energy
238  // used by both alchemical and non-alchemical route
239  float energy_tmp = 0.0f;
240  if (doEnergy) {
241  energy_tmp += fij * db;
242  }
243 
244  // Alchemical route
245  // JM: TODO: move this out of here
246  float alch_scale = 1.0f;
247  if (doFEP || doTI) {
248  switch (bl.fepBondedType) {
249  case 1: {
250  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
251  if (doTI) {
252  energy_TI_1 += (double)(energy_tmp);
253  }
254  if (doFEP) {
255  energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
256  }
257  break;
258  }
259  case 2: {
260  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
261  if (doTI) {
262  energy_TI_2 += (double)(energy_tmp);
263  }
264  if (doFEP) {
265  energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
266  }
267  break;
268  }
269  }
270  }
271 
272  if (doEnergy) {
273  if (doFEP || doTI) {
274  energy += (double)(energy_tmp * alch_scale);
275  } else {
276  energy += (double)(energy_tmp);
277  }
278  }
279  fij *= -2.0f/r;
280  // XXX TODO: Get this off here
281  // XXX TODO: Decide on a templated parameter nomenclature
282  if (doFEP || doTI) {
283  fij *= alch_scale;
284  }
285 
286  T T_fxij, T_fyij, T_fzij;
287  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
288 
289  // Store forces
290  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
291  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
292 
293  // Store virial
294  if (doVirial) {
295 #ifdef WRITE_FULL_VIRIALS
296  float fxij = fij*xij;
297  float fyij = fij*yij;
298  float fzij = fij*zij;
299  virial.xx = (fxij*xij);
300  virial.xy = (fxij*yij);
301  virial.xz = (fxij*zij);
302  virial.yx = (fyij*xij);
303  virial.yy = (fyij*yij);
304  virial.yz = (fyij*zij);
305  virial.zx = (fzij*xij);
306  virial.zy = (fzij*yij);
307  virial.zz = (fzij*zij);
308 #endif
309  }
310 }
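// [Editor's illustration, not part of the original NAMD source]
// bondForce() above evaluates a scaled harmonic bond U(r) = k*scale*(r - x0)^2
// (or a flat-bottomed wall when x1 is set). The factor applied to the distance
// vector is dU/dr * (-1/r) = -2*k*scale*(r - x0)/r, which is the `fij *= -2.0f/r`
// step. A hedged scalar reference of the same quantities (hypothetical helper):
static __host__ __device__ void exampleHarmonicBond(const float r, const float k,
  const float x0, const float scale, float& energy, float& fOverR) {
  const float db = r - x0;        // displacement from the equilibrium length
  energy = k*scale*db*db;         // U(r), the energy_tmp accumulated above
  fOverR = -2.0f*k*scale*db/r;    // force magnitude / r, multiplied onto (xij, yij, zij)
}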
311 
312 //
313 // Calculates modified exclusions
314 //
315 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
316 __device__ void modifiedExclusionForce(
317  const int index,
318  const CudaExclusion* __restrict__ exclusionList,
319  const bool doSlow,
320  const float one_scale14, // 1 - scale14
321  const int vdwCoefTableWidth,
322 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
323  const float2* __restrict__ vdwCoefTable,
324 #else
325  cudaTextureObject_t vdwCoefTableTex,
326 #endif
327 #ifdef USE_TABLE_ARRAYS
328  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
329 #else
330  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
331 #endif
332  const float4* __restrict__ xyzq,
333  const int stride,
334  const float3 lata, const float3 latb, const float3 latc,
335  const float cutoff2,
336  const CudaNBConstants nbConstants,
337  double &energyVdw,
338  T* __restrict__ forceNbond, double &energyNbond,
339  T* __restrict__ forceSlow, double &energySlow,
340  T* __restrict__ forceList, int* forceListCounter,
341  int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
342 #ifdef WRITE_FULL_VIRIALS
343  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
344  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
345 #else
346  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
347  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
348 #endif
349  double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
350  double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
351  double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
352  ) {
353 
354  CudaExclusion bl = exclusionList[index];
355 
356  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
357  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
358  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
359 
360  float4 xyzqi = xyzq[bl.i];
361  float4 xyzqj = xyzq[bl.j];
362 
363  float xij = xyzqi.x + shx - xyzqj.x;
364  float yij = xyzqi.y + shy - xyzqj.y;
365  float zij = xyzqi.z + shz - xyzqj.z;
366 
367  float r2 = xij*xij + yij*yij + zij*zij;
368  if (r2 < cutoff2) {
369 
370  float rinv = rsqrtf(r2);
371 
372  float qq;
373  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
374 
375  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
376 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
377  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
378 #else
379  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
380 #endif
381 
382  // Alchemical route
383  float myElecLambda = 1.0f;
384  float myElecLambda2 = 1.0f;
385  float myVdwLambda = 1.0f;
386  float myVdwLambda2 = 1.0f;
387  float myVdwShift = 0.0f;
388  float myVdwShift2 = 0.0f;
389  double alch_vdw_energy;
390  double alch_vdw_energy_2;
391  float alch_vdw_force = 0.0f;
392  float alch_vdw_dUdl = 0.0f;
393  double* p_energyVdw_TI = NULL;
394  double* p_energyNbond_TI = NULL;
395  double* p_energySlow_TI = NULL;
396  float p0Factor;
397  if (doFEP || doTI) {
398  p0Factor = 0.0f;
399  /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
400  float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
401  float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
402  float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
403  float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
404  float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
405  float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
406  /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
407  float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
408  float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
409  float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
410  float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
411  float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
412  float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
413  switch (bl.pswitch) {
414  case 0: myVdwLambda = 1.0f;
415  myVdwLambda2 = 1.0f;
416  myElecLambda = 1.0f;
417  myElecLambda2 = 1.0f;
418  p0Factor = 1.0f;
419  break;
420  case 1: myVdwLambda = vdwLambdaUp;
421  myVdwLambda2 = vdwLambda2Up;
422  myElecLambda = elecLambdaUp;
423  myElecLambda2 = elecLambda2Up;
424  myVdwShift = vdwShiftUp;
425  myVdwShift2 = vdwShift2Up;
426  p_energyVdw_TI = &(energyVdw_TI_1);
427  p_energyNbond_TI= &(energyNbond_TI_1);
428  p_energySlow_TI = &(energySlow_TI_1);
429  break;
430  case 2: myVdwLambda = vdwLambdaDown;
431  myVdwLambda2 = vdwLambda2Down;
432  myElecLambda = elecLambdaDown;
433  myElecLambda2 = elecLambda2Down;
434  myVdwShift = vdwShiftDown;
435  myVdwShift2 = vdwShift2Down;
436  p_energyVdw_TI = &(energyVdw_TI_2);
437  p_energyNbond_TI= &(energyNbond_TI_2);
438  p_energySlow_TI = &(energySlow_TI_2);
439  break;
440  default: myVdwLambda = 0.0f;
441  myVdwLambda2 = 0.0f;
442  myElecLambda = 0.0f;
443  myElecLambda2 = 0.0f;
444  break;
445  }
446  if (bl.pswitch != 99 && bl.pswitch != 0) {
447  // Common part of FEP and TI
448  const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
449  const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
450  const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
451  const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
452  // A = ljab.x ; B = ljab.y
453  const float r2_1 = 1.0f / (r2 + myVdwShift);
454  const float r2_2 = 1.0f / (r2 + myVdwShift2);
455  const float r6_1 = r2_1 * r2_1 * r2_1;
456  const float r6_2 = r2_2 * r2_2 * r2_2;
457  const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
458  const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
459  // rinv is already calculated above
460  if (doEnergy) {
461  alch_vdw_energy = double(myVdwLambda * switchmul * U1);
462  if (doFEP) {
463  alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
464  }
465  }
466  alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
467  if (doTI) {
468  alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
469  }
470  } else {
471  if (doEnergy) {
472  alch_vdw_energy = 0.0;
473  alch_vdw_energy_2 = 0.0;
474  }
475  }
476  }
477 
478  float4 fi;
479  float4 ei;
480  if (doTable) {
481 #ifdef NAMD_HIP
482 #ifdef USE_TABLE_ARRAYS
483  fi = tableLookup(forceTable, rinv);
484 #else
485  fi = sampleTableTex(forceTableTex, rinv);
486 #endif
487 #else
488  fi = tex1D<float4>(forceTableTex, rinv);
489 #endif
490  }
491 
492  if (doEnergy && doTable) {
493 #ifdef NAMD_HIP
494 #ifdef USE_TABLE_ARRAYS
495  ei = tableLookup(energyTable, rinv);
496 #else
497  ei = sampleTableTex(energyTableTex, rinv);
498 #endif
499 #else
500  ei = tex1D<float4>(energyTableTex, rinv);
501 #endif
502  if (doFEP || doTI) {
503  energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
504  energyVdw += alch_vdw_energy;
505  if (doFEP) {
506  energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
507  energyVdw_F += alch_vdw_energy_2;
508  }
509  if (doTI) {
510  if (p_energyVdw_TI != NULL) {
511  (*p_energyVdw_TI) += alch_vdw_dUdl;
512  }
513  }
514  } else {
515  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
516  }
517  if (doElect) {
518  energyNbond += qq * ei.x * myElecLambda;
519  if (doFEP) {
520  energyNbond_F += qq * ei.x * myElecLambda2;
521  }
522  if (doTI) {
523  if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
524  }
525  if (doSlow) {
526  energySlow += qq * ei.w * myElecLambda;
527  if (doFEP) {
528  energySlow_F += qq * ei.w * myElecLambda2;
529  }
530  if (doTI) {
531  if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
532  }
533  }
534  }
535  }
536 
537  float fNbond;
538  float fSlow;
539  if (doTable) {
540  if (doElect) {
541  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
542  } else {
543  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
544  }
545  } else {
546  float e_vdw, e_elec, e_slow;
547  if (doEnergy) {
548  e_vdw = 0.0f;
549  e_elec = 0.0f;
550  e_slow = 0.0f;
551  }
552 
553  cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
554  doSlow, doElect, r2, rinv, qq, ljab,
555  nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);
556 
557  if (doEnergy) {
558  energyVdw += (double) (e_vdw);
559  energyNbond += (double) (e_elec * myElecLambda);
560  energySlow += (double) (e_slow * myElecLambda);
561  }
562  }
563  // fNbond = fFast + fVdw
564  if (doFEP || doTI) {
565  if (bl.pswitch != 0) {
566  fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
567  fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
568  }
569  // if pswitch == 0, myElecLambda = 1.0
570  }
571  T T_fxij, T_fyij, T_fzij;
572  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
573  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
574  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
575 
576  if (doSlow && doElect) {
577  if (doTable) {
578  fSlow = -qq * fi.w;
579  }
580  if (doFEP || doTI) {
581  fSlow *= myElecLambda;
582  }
583  T T_fxij, T_fyij, T_fzij;
584  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
585  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
586  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
587  }
588 
589  // Store virial
590  if (doVirial) {
591 #ifdef WRITE_FULL_VIRIALS
592  float fxij = fNbond*xij;
593  float fyij = fNbond*yij;
594  float fzij = fNbond*zij;
595  virialNbond.xx = (fxij*xij);
596  virialNbond.xy = (fxij*yij);
597  virialNbond.xz = (fxij*zij);
598  virialNbond.yx = (fyij*xij);
599  virialNbond.yy = (fyij*yij);
600  virialNbond.yz = (fyij*zij);
601  virialNbond.zx = (fzij*xij);
602  virialNbond.zy = (fzij*yij);
603  virialNbond.zz = (fzij*zij);
604 #endif
605  }
606 
607  // Store virial
608  if (doVirial && doSlow && doElect) {
609 #ifdef WRITE_FULL_VIRIALS
610  float fxij = fSlow*xij;
611  float fyij = fSlow*yij;
612  float fzij = fSlow*zij;
613  virialSlow.xx = (fxij*xij);
614  virialSlow.xy = (fxij*yij);
615  virialSlow.xz = (fxij*zij);
616  virialSlow.yx = (fyij*xij);
617  virialSlow.yy = (fyij*yij);
618  virialSlow.yz = (fyij*zij);
619  virialSlow.zx = (fzij*xij);
620  virialSlow.zy = (fzij*yij);
621  virialSlow.zz = (fzij*zij);
622 #endif
623  }
624 
625  }
626 }
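// [Editor's illustration, not part of the original NAMD source]
// In the alchemical (FEP/TI) branch of modifiedExclusionForce() above, the 1-4
// Lennard-Jones interaction uses a shifted separation: r2 is replaced by
// r2 + shift with shift = alchVdwShiftCoeff*(1 - vdwLambda), and the result is
// scaled by vdwLambda and by the switching function. A hedged scalar sketch of
// the shifted, lambda-scaled LJ energy before switching (hypothetical helper;
// A and B correspond to ljab.x and ljab.y):
static __host__ __device__ float exampleSoftcoreLJ(const float r2, const float A, const float B,
  const float vdwLambda, const float alchVdwShiftCoeff) {
  const float shift = alchVdwShiftCoeff*(1.0f - vdwLambda);  // vanishes as lambda approaches 1
  const float r2inv = 1.0f/(r2 + shift);
  const float r6inv = r2inv*r2inv*r2inv;
  return vdwLambda*(A*r6inv*r6inv - B*r6inv);                // lambda-scaled U, cf. U1 above
}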
627 
628 //
629 // Calculates exclusions. Here doSlow = true
630 //
631 // doFull = doFullElectrostatics = doSlow
632 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
633 __device__ void exclusionForce(
634  const int index,
635  const CudaExclusion* __restrict__ exclusionList,
636  const float r2_delta, const int r2_delta_expc,
637 
638 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
639  const float* __restrict__ r2_table,
640  const float4* __restrict__ exclusionTable,
641 #else
642  cudaTextureObject_t r2_table_tex,
643  cudaTextureObject_t exclusionTableTex,
644 #endif
645  const float4* __restrict__ xyzq,
646  const int stride,
647  const float3 lata, const float3 latb, const float3 latc,
648  const float cutoff2,
649  T* __restrict__ forceSlow, double &energySlow,
650  T* __restrict__ forceList, int* forceListCounter,
651  int* forceListStartsSlow, int* forceListNexts,
652 #ifdef WRITE_FULL_VIRIALS
653  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
654 #else
655  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
656 #endif
657  double &energy_F, double &energy_TI_1, double &energy_TI_2
658  ) {
659 
660  CudaExclusion bl = exclusionList[index];
661 
662  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
663  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
664  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
665 
666  float4 xyzqi = xyzq[bl.i];
667  float4 xyzqj = xyzq[bl.j];
668 
669  float xij = xyzqi.x + shx - xyzqj.x;
670  float yij = xyzqi.y + shy - xyzqj.y;
671  float zij = xyzqi.z + shz - xyzqj.z;
672 
673  float r2 = xij*xij + yij*yij + zij*zij;
674  if (r2 < cutoff2) {
675  r2 += r2_delta;
676 
677  union { float f; int i; } r2i;
678  r2i.f = r2;
679  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
680 
681 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
682 #ifdef NAMD_HIP
683  float r2_table_val = r2_table[table_i];
684 #else
685  float r2_table_val = __ldg(&r2_table[table_i]);
686 #endif
687 #else
688  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
689 #endif
690  float diffa = r2 - r2_table_val;
691  float qq = xyzqi.w * xyzqj.w;
692 
693 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
694 #ifdef NAMD_HIP
695  float4 slow = exclusionTable[table_i];
696 #else
697  float4 slow = __ldg(&exclusionTable[table_i]);
698 #endif
699 #else
700  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
701 #endif
702  // Alchemical route
703  float myElecLambda;
704  float myElecLambda2;
705  double* p_energy_TI;
706  if (doFEP || doTI) {
707  myElecLambda = 1.0f;
708  myElecLambda2 = 1.0f;
709  if (doTI) p_energy_TI = NULL;
710  /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
711  float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
712  float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
713  /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
714  float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
715  float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
716  switch (bl.pswitch) {
717  case 0: myElecLambda = 1.0f;
718  myElecLambda2 = 1.0f;
719  break;
720  case 1: myElecLambda = elecLambdaUp;
721  myElecLambda2 = elecLambda2Up;
722  p_energy_TI = &(energy_TI_1);
723  break;
724  case 2: myElecLambda = elecLambdaDown;
725  myElecLambda2 = elecLambda2Down;
726  p_energy_TI = &(energy_TI_2);
727  break;
728  default: myElecLambda = 0.0f;
729  myElecLambda2 = 0.0f;
730  break;
731  }
732  }
733  if (doEnergy) {
734  double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
735  if (doFEP || doTI) {
736  energySlow += energy_slow_tmp * myElecLambda;
737  if (doFEP) {
738  energy_F += energy_slow_tmp * myElecLambda2;
739  }
740  if (doTI) {
741  if (p_energy_TI != NULL) {
742  (*p_energy_TI) += energy_slow_tmp;
743  }
744  }
745  } else {
746  energySlow += energy_slow_tmp;
747  }
748  }
749 
750  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
751  if (doFEP || doTI) {
752  fSlow *= myElecLambda;
753  }
754 
755  T T_fxij, T_fyij, T_fzij;
756  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
757  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
758  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
759 
760  // Store virial
761  if (doVirial) {
762 #ifdef WRITE_FULL_VIRIALS
763  float fxij = fSlow*xij;
764  float fyij = fSlow*yij;
765  float fzij = fSlow*zij;
766  virialSlow.xx = (fxij*xij);
767  virialSlow.xy = (fxij*yij);
768  virialSlow.xz = (fxij*zij);
769  virialSlow.yx = (fyij*xij);
770  virialSlow.yy = (fyij*yij);
771  virialSlow.yz = (fyij*zij);
772  virialSlow.zx = (fzij*xij);
773  virialSlow.zy = (fzij*yij);
774  virialSlow.zz = (fzij*zij);
775 #endif
776  }
777  }
778 }
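// [Editor's illustration, not part of the original NAMD source]
// exclusionForce() above indexes its correction tables by reinterpreting the
// float r2 as an integer: the >> 17 shift keeps the 8 exponent bits plus the
// top 6 mantissa bits, giving a table that is spaced roughly logarithmically
// in r2 with 64 entries per power of two, and r2_delta_expc offsets the result
// so the smallest tabulated value maps to index 0. A hedged sketch of the same
// indexing on the host (hypothetical helper):
static int exampleR2TableIndex(const float r2, const float r2_delta, const int r2_delta_expc) {
  union { float f; int i; } r2i;
  r2i.f = r2 + r2_delta;                 // keep the argument away from zero, as in the kernel
  return (r2i.i >> 17) + r2_delta_expc;  // exponent plus top mantissa bits as a table index
}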
779 
780 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
781 __device__ void angleForce(const int index,
782  const CudaAngle* __restrict__ angleList,
783  const CudaAngleValue* __restrict__ angleValues,
784  const float4* __restrict__ xyzq,
785  const int stride,
786  const float3 lata, const float3 latb, const float3 latc,
787  T* __restrict__ force, double &energy,
788  T* __restrict__ forceList, int* forceListCounter,
789  int* forceListStarts, int* forceListNexts,
790 #ifdef WRITE_FULL_VIRIALS
791  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
792 #else
793  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
794 #endif
795  double &energy_F, double &energy_TI_1, double &energy_TI_2
796  ) {
797 
798  CudaAngle al = angleList[index];
799 
800  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
801  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
802  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
803 
804  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
805  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
806  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
807 
808  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
809  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
810  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
811 
812  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
813  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
814  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
815 
816  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
817  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
818 
819  float xijr = xij*rij_inv;
820  float yijr = yij*rij_inv;
821  float zijr = zij*rij_inv;
822  float xkjr = xkj*rkj_inv;
823  float ykjr = ykj*rkj_inv;
824  float zkjr = zkj*rkj_inv;
825  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
826 
827  CudaAngleValue angleValue = angleValues[al.itype];
828  angleValue.k *= al.scale;
829 
830  float diff;
831  if (angleValue.normal == 1) {
832  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
833  cos_theta = min(0.999f, max(-0.999f, cos_theta));
834  float theta = acosf(cos_theta);
835  diff = theta - angleValue.theta0;
836  } else {
837  diff = cos_theta - angleValue.theta0;
838  }
839 
840  float energy_tmp = 0.0f;
841  if (doEnergy) {
842  energy_tmp += angleValue.k * diff * diff;
843  }
844 
845  // Alchemical route
846  float alch_scale;
847  // Be careful: alch_scale_energy_fep is 0 here!
848  float alch_scale_energy_fep;
849  double* p_energy_TI;
850  // NOTE: Because of the Urey-Bradley terms, the energy calculation is not
851  // finished yet, so we cannot add energy_tmp to energy_TI_* and energy_F here,
852  // but we still need the scaling factor for the forces.
853  // The code here is therefore a little different from bondForce:
854  // compute a scale factor for the FEP energy first, apply it to
855  // the final result after the energy evaluation is finished, and use
856  // a pointer to the correct energy_TI_x term.
857  if (doFEP || doTI) {
858  alch_scale = 1.0f;
859  alch_scale_energy_fep = 0.0f;
860  if (doTI) p_energy_TI = NULL;
861  switch (al.fepBondedType) {
862  case 1: {
863  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
864  if (doEnergy) {
865  if (doTI) {
866  p_energy_TI = &(energy_TI_1);
867  }
868  if (doFEP) {
869  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
870  }
871  }
872  break;
873  }
874  case 2: {
875  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
876  if (doEnergy) {
877  if (doTI) {
878  p_energy_TI = &(energy_TI_2);
879  }
880  if (doFEP) {
881  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
882  }
883  }
884  break;
885  }
886  }
887  }
888 
889  if (angleValue.normal == 1) {
890  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
891  if (inv_sin_theta > 1.0e6) {
892  diff = (diff < 0.0f) ? 1.0f : -1.0f;
893  } else {
894  diff *= -inv_sin_theta;
895  }
896  }
897  diff *= -2.0f*angleValue.k;
898 
899  // Do alchemical scaling
900  if (doFEP || doTI) {
901  diff *= alch_scale;
902  }
903 
904  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
905  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
906  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
907  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
908  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
909  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
910 
911  T T_dtxi, T_dtyi, T_dtzi;
912  T T_dtxj, T_dtyj, T_dtzj;
913  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
914  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
915  T T_dtxk = -T_dtxi - T_dtxj;
916  T T_dtyk = -T_dtyi - T_dtyj;
917  T T_dtzk = -T_dtzi - T_dtzj;
918  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
919 
920  if (angleValue.k_ub) {
921  float xik = xij - xkj;
922  float yik = yij - ykj;
923  float zik = zij - zkj;
924  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
925  float rik = 1.0f/rik_inv;
926  float diff_ub = rik - angleValue.r_ub;
927  if (doEnergy) {
928  energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
929  }
930  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
931  if (doFEP || doTI) {
932  diff_ub *= alch_scale;
933  }
934  T T_dxik, T_dyik, T_dzik;
935  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
936  T_dtxi += T_dxik;
937  T_dtyi += T_dyik;
938  T_dtzi += T_dzik;
939  T_dtxj -= T_dxik;
940  T_dtyj -= T_dyik;
941  T_dtzj -= T_dzik;
942  }
943 
944 
945 
946  if (doEnergy) {
947  if (doFEP || doTI) {
948  energy += double(energy_tmp * alch_scale);
949  if (doTI) {
950  if (p_energy_TI != NULL) {
951  *(p_energy_TI) += double(energy_tmp);
952  }
953  }
954  if (doFEP) {
955  energy_F += double(energy_tmp * alch_scale_energy_fep);
956  }
957  } else {
958  energy += double(energy_tmp);
959  }
960  }
961 
962  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
963  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
964 
965  // Store virial
966  if (doVirial) {
967 #ifdef WRITE_FULL_VIRIALS
968  float fxi = ((float)T_dtxi);
969  float fyi = ((float)T_dtyi);
970  float fzi = ((float)T_dtzi);
971  float fxk = ((float)T_dtxj);
972  float fyk = ((float)T_dtyj);
973  float fzk = ((float)T_dtzj);
974  virial.xx = (fxi*xij) + (fxk*xkj);
975  virial.xy = (fxi*yij) + (fxk*ykj);
976  virial.xz = (fxi*zij) + (fxk*zkj);
977  virial.yx = (fyi*xij) + (fyk*xkj);
978  virial.yy = (fyi*yij) + (fyk*ykj);
979  virial.yz = (fyi*zij) + (fyk*zkj);
980  virial.zx = (fzi*xij) + (fzk*xkj);
981  virial.zy = (fzi*yij) + (fzk*ykj);
982  virial.zz = (fzi*zij) + (fzk*zkj);
983 #endif
984  }
985 }
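// [Editor's illustration, not part of the original NAMD source]
// angleForce() above evaluates a scaled harmonic angle term
// U = k*(theta - theta0)^2 (or k*(cos(theta) - theta0)^2 when normal == 0),
// plus an optional Urey-Bradley term k_ub*(r13 - r_ub)^2 on the 1-3 distance.
// A hedged scalar sketch of the energy alone (hypothetical helper):
static __host__ __device__ float exampleAngleEnergy(const float theta, const float theta0,
  const float k, const float r13, const float r_ub, const float k_ub) {
  const float d = theta - theta0;
  float e = k*d*d;                 // harmonic bending term, cf. energy_tmp above
  if (k_ub != 0.0f) {
    const float dub = r13 - r_ub;
    e += k_ub*dub*dub;             // Urey-Bradley 1-3 term
  }
  return e;
}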
986 
987 //
988 // Dihedral computation
989 //
990 // Out: df, e
991 //
992 template <bool doEnergy>
993 __forceinline__ __device__
994 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
995  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
996 
997  df = 0.0f;
998  if (doEnergy) e = 0.0;
999 
1000  float phi = atan2f(sin_phi, cos_phi);
1001 
1002  bool lrep = true;
1003  while (lrep) {
1004  CudaDihedralValue dihedralValue = dihedralValues[ic];
1005  dihedralValue.k *= scale;
1006 
1007  // The last dihedral term of the run has the low bit of n set to 0
1008  lrep = (dihedralValue.n & 1);
1009  dihedralValue.n >>= 1;
1010  if (dihedralValue.n) {
1011  float nf = dihedralValue.n;
1012  float x = nf*phi - dihedralValue.delta;
1013  if (doEnergy) {
1014  float sin_x, cos_x;
1015  sincosf(x, &sin_x, &cos_x);
1016  e += (double)(dihedralValue.k*(1.0f + cos_x));
1017  df += (double)(nf*dihedralValue.k*sin_x);
1018  } else {
1019  float sin_x = sinf(x);
1020  df += (double)(nf*dihedralValue.k*sin_x);
1021  }
1022  } else {
1023  float diff = phi - dihedralValue.delta;
1024  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
1025  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
1026  if (doEnergy) {
1027  e += (double)(dihedralValue.k*diff*diff);
1028  }
1029  df -= 2.0f*dihedralValue.k*diff;
1030  }
1031  ic++;
1032  }
1033 
1034 }
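// [Editor's illustration, not part of the original NAMD source]
// diheComp() above walks a run of CudaDihedralValue entries starting at ic: the
// low bit of n flags whether another term follows and the remaining bits are the
// multiplicity. Each periodic term contributes k*(1 + cos(n*phi - delta)); a term
// with multiplicity 0 is treated as the harmonic restraint k*(phi - delta)^2.
// A hedged scalar sketch of one periodic term (hypothetical helper):
static __host__ __device__ void examplePeriodicDihedralTerm(const float phi, const float k,
  const int n, const float delta, float& e, float& negDeDphi) {
  const float x = (float)n*phi - delta;
  e = k*(1.0f + cosf(x));           // energy of a single cosine term
  negDeDphi = (float)n*k*sinf(x);   // -dE/dphi, the quantity accumulated into df above
}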
1035 
1036 
1037 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1038 __device__ void diheForce(const int index,
1039  const CudaDihedral* __restrict__ diheList,
1040  const CudaDihedralValue* __restrict__ dihedralValues,
1041  const float4* __restrict__ xyzq,
1042  const int stride,
1043  const float3 lata, const float3 latb, const float3 latc,
1044  T* __restrict__ force, double &energy,
1045  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1046 #ifdef WRITE_FULL_VIRIALS
1047  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1048 #else
1049  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1050 #endif
1051  double &energy_F, double &energy_TI_1, double &energy_TI_2
1052  ) {
1053 
1054  CudaDihedral dl = diheList[index];
1055 
1056  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
1057  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
1058  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
1059 
1060  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
1061  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
1062  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
1063 
1064  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
1065  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
1066  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
1067 
1068  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
1069  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
1070  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
1071 
1072  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
1073  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
1074  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
1075 
1076  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
1077  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
1078  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
1079 
1080  // A=F^G, B=H^G.
1081  float ax = yij*zjk - zij*yjk;
1082  float ay = zij*xjk - xij*zjk;
1083  float az = xij*yjk - yij*xjk;
1084  float bx = ylk*zjk - zlk*yjk;
1085  float by = zlk*xjk - xlk*zjk;
1086  float bz = xlk*yjk - ylk*xjk;
1087 
1088  float ra2 = ax*ax + ay*ay + az*az;
1089  float rb2 = bx*bx + by*by + bz*bz;
1090  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
1091 
1092  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
1093  // nlinear = nlinear + 1
1094  // endif
1095 
1096  float rgr = 1.0f / rg;
1097  float ra2r = 1.0f / ra2;
1098  float rb2r = 1.0f / rb2;
1099  float rabr = sqrtf(ra2r*rb2r);
1100 
1101  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
1102  //
1103  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
1104  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
1105  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
1106  //
1107  // Energy and derivative contributions.
1108 
1109  float df;
1110  double e;
1111  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
1112 
1113  // Alchemical transformation
1114  float alch_scale;
1115  if (doFEP || doTI) {
1116  alch_scale = 1.0f;
1117  switch (dl.fepBondedType) {
1118  case 1: {
1119  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1120  if (doEnergy) {
1121  if (doTI) {
1122  energy_TI_1 += (double)(e);
1123  }
1124  if (doFEP) {
1125  energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1126  }
1127  }
1128  break;
1129  }
1130  case 2: {
1131  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1132  if (doEnergy) {
1133  if (doTI) {
1134  energy_TI_2 += (double)(e);
1135  }
1136  if (doFEP) {
1137  energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1138  }
1139  }
1140  break;
1141  }
1142  }
1143  }
1144 
1145  if (doEnergy) {
1146  if (doFEP || doTI) {
1147  energy += e * alch_scale;
1148  } else {
1149  energy += e;
1150  }
1151  }
1152 
1153  //
1154  // Compute derivatives wrt Cartesian coordinates.
1155  //
1156  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
1157  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
1158 
1159  float fg = xij*xjk + yij*yjk + zij*zjk;
1160  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
1161  ra2r *= df;
1162  rb2r *= df;
1163  float fga = fg*ra2r*rgr;
1164  float hgb = hg*rb2r*rgr;
1165  float gaa = ra2r*rg;
1166  float gbb = rb2r*rg;
1167  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
1168 
1169  // Remember T is long long int
1170  // Don't try to scale T_fix and similar variables directly by float
1171  if (doFEP || doTI) {
1172  fga *= alch_scale;
1173  hgb *= alch_scale;
1174  gaa *= alch_scale;
1175  gbb *= alch_scale;
1176  }
1177 
1178  // Store forces
1179  T T_fix, T_fiy, T_fiz;
1180  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
1181  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1182 
1183  T dgx, dgy, dgz;
1184  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
1185  T T_fjx = dgx - T_fix;
1186  T T_fjy = dgy - T_fiy;
1187  T T_fjz = dgz - T_fiz;
1188  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1189 
1190  T dhx, dhy, dhz;
1191  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
1192  T T_fkx = -dhx - dgx;
1193  T T_fky = -dhy - dgy;
1194  T T_fkz = -dhz - dgz;
1195  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1196  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1197 
1198  // Store virial
1199  if (doVirial) {
1200 #ifdef WRITE_FULL_VIRIALS
1201  float fix = ((float)T_fix);
1202  float fiy = ((float)T_fiy);
1203  float fiz = ((float)T_fiz);
1204  float fjx = ((float)dgx);
1205  float fjy = ((float)dgy);
1206  float fjz = ((float)dgz);
1207  float flx = ((float)dhx);
1208  float fly = ((float)dhy);
1209  float flz = ((float)dhz);
1210  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
1211  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
1212  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
1213  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
1214  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
1215  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
1216  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
1217  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
1218  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
1219 #endif
1220  }
1221 
1222 }
1223 
1224 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
1225  return make_float3(
1226  v1.y*v2.z-v2.y*v1.z,
1227  v2.x*v1.z-v1.x*v2.z,
1228  v1.x*v2.y-v2.x*v1.y
1229  );
1230 }
1231 
1232 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
1233  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
1234 }
1235 
1236 //
1237 // Calculates crossterms
1238 //
1239 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1240 __device__ void crosstermForce(
1241  const int index,
1242  const CudaCrossterm* __restrict__ crosstermList,
1243  const CudaCrosstermValue* __restrict__ crosstermValues,
1244  const float4* __restrict__ xyzq,
1245  const int stride,
1246  const float3 lata, const float3 latb, const float3 latc,
1247  T* __restrict__ force, double &energy,
1248  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1249 #ifdef WRITE_FULL_VIRIALS
1250  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1251 #else
1252  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1253 #endif
1254  double &energy_F, double &energy_TI_1, double &energy_TI_2
1255  ) {
1256 
1257  CudaCrossterm cl = crosstermList[index];
1258 
1259  // ----------------------------------------------------------------------------
1260  // Angle between 1 - 2 - 3 - 4
1261  //
1262  // 1 - 2
1263  float3 sh12 = make_float3(
1264  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
1265  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
1266  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
1267 
1268  float4 xyzq1 = xyzq[cl.i1];
1269  float4 xyzq2 = xyzq[cl.i2];
1270 
1271  float3 r12 = make_float3(
1272  xyzq1.x + sh12.x - xyzq2.x,
1273  xyzq1.y + sh12.y - xyzq2.y,
1274  xyzq1.z + sh12.z - xyzq2.z);
1275 
1276  // 2 - 3
1277  float3 sh23 = make_float3(
1278  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
1279  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
1280  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
1281 
1282  float4 xyzq3 = xyzq[cl.i3];
1283 
1284  float3 r23 = make_float3(
1285  xyzq2.x + sh23.x - xyzq3.x,
1286  xyzq2.y + sh23.y - xyzq3.y,
1287  xyzq2.z + sh23.z - xyzq3.z);
1288 
1289  // 3 - 4
1290  float3 sh34 = make_float3(
1291  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
1292  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
1293  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
1294 
1295  float4 xyzq4 = xyzq[cl.i4];
1296 
1297  float3 r34 = make_float3(
1298  xyzq3.x + sh34.x - xyzq4.x,
1299  xyzq3.y + sh34.y - xyzq4.y,
1300  xyzq3.z + sh34.z - xyzq4.z);
1301 
1302  // Calculate the cross products
1303  float3 A = cross(r12, r23);
1304  float3 B = cross(r23, r34);
1305  float3 C = cross(r23, A);
1306 
1307  // Calculate the inverse distances
1308  float inv_rA = rsqrtf(dot(A, A));
1309  float inv_rB = rsqrtf(dot(B, B));
1310  float inv_rC = rsqrtf(dot(C, C));
1311 
1312  // Calculate the sin and cos
1313  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
1314  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1315 
1316  float phi = -atan2f(sin_phi,cos_phi);
1317 
1318  // ----------------------------------------------------------------------------
1319  // Angle between 5 - 6 - 7 - 8
1320  //
1321 
1322  // 5 - 6
1323  float3 sh56 = make_float3(
1324  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1325  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1326  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1327 
1328  float4 xyzq5 = xyzq[cl.i5];
1329  float4 xyzq6 = xyzq[cl.i6];
1330 
1331  float3 r56 = make_float3(
1332  xyzq5.x + sh56.x - xyzq6.x,
1333  xyzq5.y + sh56.y - xyzq6.y,
1334  xyzq5.z + sh56.z - xyzq6.z);
1335 
1336  // 6 - 7
1337  float3 sh67 = make_float3(
1338  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1339  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1340  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1341 
1342  float4 xyzq7 = xyzq[cl.i7];
1343 
1344  float3 r67 = make_float3(
1345  xyzq6.x + sh67.x - xyzq7.x,
1346  xyzq6.y + sh67.y - xyzq7.y,
1347  xyzq6.z + sh67.z - xyzq7.z);
1348 
1349  // 7 - 8
1350  float3 sh78 = make_float3(
1351  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1352  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1353  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1354 
1355  float4 xyzq8 = xyzq[cl.i8];
1356 
1357  float3 r78 = make_float3(
1358  xyzq7.x + sh78.x - xyzq8.x,
1359  xyzq7.y + sh78.y - xyzq8.y,
1360  xyzq7.z + sh78.z - xyzq8.z);
1361 
1362  // Calculate the cross products
1363  float3 D = cross(r56, r67);
1364  float3 E = cross(r67, r78);
1365  float3 F = cross(r67, D);
1366 
1367  // Calculate the inverse distances
1368  float inv_rD = rsqrtf(dot(D, D));
1369  float inv_rE = rsqrtf(dot(E, E));
1370  float inv_rF = rsqrtf(dot(F, F));
1371 
1372  // Calculate the sin and cos
1373  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1374  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1375 
1376  float psi = -atan2f(sin_psi,cos_psi);
1377 
1378  // ----------------------------------------------------------------------------
1379  // Calculate the energy
1380 
1381  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1382 
1383  // Shift angles
1384  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1385  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1386 
1387  // distance measured in grid points between angle and smallest value
1388  float phi_h = phi * inv_h;
1389  float psi_h = psi * inv_h;
1390 
1391  // find smallest numbered grid point in stencil
1392  int iphi = (int)floor(phi_h);
1393  int ipsi = (int)floor(psi_h);
1394 
1395  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1396 
1397  // Load coefficients
1398  float4 c[4];
1399  c[0] = cp.c[iphi][ipsi][0];
1400  c[1] = cp.c[iphi][ipsi][1];
1401  c[2] = cp.c[iphi][ipsi][2];
1402  c[3] = cp.c[iphi][ipsi][3];
1403 
1404  float dphi = phi_h - (float)iphi;
1405  float dpsi = psi_h - (float)ipsi;
1406 
1407  float alch_scale;
1408  float alch_scale_energy_fep;
1409  double* p_energy_TI;
1410  if (doFEP || doTI) {
1411  alch_scale = 1.0f;
1412  alch_scale_energy_fep = 0.0f;
1413  if (doTI) p_energy_TI = NULL;
1414  switch (cl.fepBondedType) {
1415  case 1: {
1416  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1417  if (doEnergy) {
1418  if (doTI) {
1419  p_energy_TI = &(energy_TI_1);
1420  }
1421  if (doFEP) {
1422  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
1423  }
1424  }
1425  break;
1426  }
1427  case 2: {
1428  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1429  if (doEnergy) {
1430  if (doTI) {
1431  p_energy_TI = &(energy_TI_2);
1432  }
1433  if (doFEP) {
1434  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
1435  }
1436  }
1437  break;
1438  }
1439  }
1440  }
1441 
1442  if (doEnergy) {
1443  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1444  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1445  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1446  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1447  if (doFEP || doTI) {
1448  energy += energyf * cl.scale * alch_scale;
1449  if (doFEP) {
1450  energy_F += energyf * cl.scale * alch_scale_energy_fep;
1451  }
1452  if (doTI) {
1453  if (p_energy_TI != NULL) {
1454  (*p_energy_TI) += energyf * cl.scale;
1455  }
1456  }
1457  } else {
1458  energy += energyf * cl.scale;
1459  }
1460  }
1461 
1462  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1463  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1464  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1465  dEdphi *= cl.scale*inv_h;
1466 
1467  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1468  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1469  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1470  dEdpsi *= cl.scale*inv_h;
1471 
1472  // float normCross1 = dot(A, A);
1473  float square_r23 = dot(r23, r23);
1474  float norm_r23 = sqrtf(square_r23);
1475  float inv_square_r23 = 1.0f/square_r23;
1476  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1477  float ff2 = -dot(r12, r23)*inv_square_r23;
1478  float ff3 = -dot(r34, r23)*inv_square_r23;
1479  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1480 
1481  // NOTE: The reason why scaling ff1 and ff4 is enough:
1482  // f1 and f4 are already scaled, and in the formula for t1 below,
1483  // the first component is t1.x = ff2*f1.x - ff3*f4.x,
1484  // where both f1.x and f4.x
1485  // are already scaled,
1486  // so t1.x is scaled, and also t1.y and t1.z => t1 is scaled.
1487  // Then consider f2.x:
1488  // f2.x = t1.x - f1.x,
1489  // where both t1.x and f1.x
1490  // are scaled => f2.x is scaled,
1491  // and also f2.y and f2.z are scaled => f2 is scaled => similarly f3 is scaled.
1492  // As a result, scaling ff1 and ff4 is enough. DON'T scale ff2 and ff3!
1493  if (doFEP || doTI) {
1494  ff1 *= alch_scale;
1495  ff4 *= alch_scale;
1496  }
1497 
1498  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1499  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1500  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1501  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1502  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1503 
1504  T T_f1x, T_f1y, T_f1z;
1505  T T_f2x, T_f2y, T_f2z;
1506  T T_f3x, T_f3y, T_f3z;
1507  T T_f4x, T_f4y, T_f4z;
1508  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1509  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1510  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1511  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1512  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1513  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1514  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1515  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1516 
1517  float square_r67 = dot(r67, r67);
1518  float norm_r67 = sqrtf(square_r67);
1519  float inv_square_r67 = 1.0f/(square_r67);
1520  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1521  ff2 = -dot(r56, r67)*inv_square_r67;
1522  ff3 = -dot(r78, r67)*inv_square_r67;
1523  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1524 
1525  if (doFEP || doTI) {
1526  ff1 *= alch_scale;
1527  ff4 *= alch_scale;
1528  }
1529 
1530  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1531  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1532  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1533  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1534  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1535 
1536  T T_f5x, T_f5y, T_f5z;
1537  T T_f6x, T_f6y, T_f6z;
1538  T T_f7x, T_f7y, T_f7z;
1539  T T_f8x, T_f8y, T_f8z;
1540  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1541  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1542  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1543  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1544  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1545  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1546  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1547  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1548 
1549  // Store virial
1550  if (doVirial) {
1551 #ifdef WRITE_FULL_VIRIALS
1552  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1553  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1554  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1555  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1556  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1557  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1558  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1559  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1560  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1561  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1562  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1563 #endif
1564  }
1565 
1566 }
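// [Editor's illustration, not part of the original NAMD source]
// crosstermForce() above evaluates a CMAP correction: (phi, psi) is mapped onto a
// periodic grid of CudaCrosstermValue::dim points per dimension, and within each
// grid cell the energy is the bicubic polynomial
//   E(dphi, dpsi) = sum_{m=0..3} sum_{n=0..3} C[m][n] * dphi^n * dpsi^m,
// with the 16 coefficients of a cell stored as the four float4 values c[0..3].
// A hedged sketch of the same Horner-style evaluation (hypothetical helper):
static __host__ __device__ float exampleBicubicPatchEnergy(const float4 c[4],
  const float dphi, const float dpsi) {
  float e = c[3].x + dphi*(c[3].y + dphi*(c[3].z + dphi*c[3].w));
  e = e*dpsi + c[2].x + dphi*(c[2].y + dphi*(c[2].z + dphi*c[2].w));
  e = e*dpsi + c[1].x + dphi*(c[1].y + dphi*(c[1].z + dphi*c[1].w));
  e = e*dpsi + c[0].x + dphi*(c[0].y + dphi*(c[0].z + dphi*c[0].w));
  return e;   // matches the energyf accumulation in crosstermForce()
}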
1567 
1568 #ifndef NAMD_CUDA
1569 #define BONDEDFORCESKERNEL_NUM_WARP 2
1570 #else
1571 #define BONDEDFORCESKERNEL_NUM_WARP 4
1572 #endif
1573 //
1574 // Calculates all forces in a single kernel call
1575 //
1576 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1577 __global__ void bondedForcesKernel(
1578  const int start,
1579 
1580  const int numBonds,
1581  const CudaBond* __restrict__ bonds,
1582  const CudaBondValue* __restrict__ bondValues,
1583 
1584  const int numAngles,
1585  const CudaAngle* __restrict__ angles,
1586  const CudaAngleValue* __restrict__ angleValues,
1587 
1588  const int numDihedrals,
1589  const CudaDihedral* __restrict__ dihedrals,
1590  const CudaDihedralValue* __restrict__ dihedralValues,
1591 
1592  const int numImpropers,
1593  const CudaDihedral* __restrict__ impropers,
1594  const CudaDihedralValue* __restrict__ improperValues,
1595 
1596  const int numExclusions,
1597  const CudaExclusion* __restrict__ exclusions,
1598 
1599  const int numCrossterms,
1600  const CudaCrossterm* __restrict__ crossterms,
1601  const CudaCrosstermValue* __restrict__ crosstermValues,
1602 
1603  const float cutoff2,
1604  const float r2_delta, const int r2_delta_expc,
1605 
1606  const float* __restrict__ r2_table,
1607  const float4* __restrict__ exclusionTable,
1608 
1609 #if !defined(USE_TABLE_ARRAYS)
1610  cudaTextureObject_t r2_table_tex,
1611  cudaTextureObject_t exclusionTableTex,
1612 #endif
1613 
1614  const float4* __restrict__ xyzq,
1615  const int stride,
1616  const float3 lata, const float3 latb, const float3 latc,
1617  T* __restrict__ force,
1618  T* __restrict__ forceSlow,
1619  T* __restrict__ forceList,
1620  int* forceListCounter,
1621  int* forceListStarts,
1622  int* forceListStartsSlow,
1623  int* forceListNexts,
1624  double* __restrict__ energies_virials) {
1625 
1626  // Thread-block index
1627  int indexTB = start + blockIdx.x;
1628 
1629  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1630  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1631  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1632  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1633  const int numExclusionsTB = (numExclusions + blockDim.x - 1)/blockDim.x;
1634  const int numCrosstermsTB = (numCrossterms + blockDim.x - 1)/blockDim.x;
1635 
1636  // Each thread computes a single bonded interaction.
1637  // Each thread block handles a single bonded interaction type.
1638  double energy;
1639  double energy_F;
1640  double energy_TI_1;
1641  double energy_TI_2;
1642  int energyIndex;
1643  int energyIndex_F;
1644  int energyIndex_TI_1;
1645  int energyIndex_TI_2;
1646 
1647  if (doEnergy) {
1648  energy = 0.0;
1649  energyIndex = -1;
1650  if (doFEP) {
1651  energy_F = 0.0;
1652  energyIndex_F = -1;
1653  }
1654  if (doTI) {
1655  energy_TI_1 = 0.0;
1656  energy_TI_2 = 0.0;
1657  energyIndex_TI_1 = -1;
1658  energyIndex_TI_2 = -1;
1659  }
1660  }
1661 
1662 #ifdef WRITE_FULL_VIRIALS
1663  ComputeBondedCUDAKernel::BondedVirial<double> virial;
1664  int virialIndex;
1665  if (doVirial) {
1666  virial.xx = 0.0;
1667  virial.xy = 0.0;
1668  virial.xz = 0.0;
1669  virial.yx = 0.0;
1670  virial.yy = 0.0;
1671  virial.yz = 0.0;
1672  virial.zx = 0.0;
1673  virial.zy = 0.0;
1674  virial.zz = 0.0;
1675  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1676  }
1677 #endif
1678 
1679  if (indexTB < numBondsTB) {
1680  int i = threadIdx.x + indexTB*blockDim.x;
1681  if (doEnergy) {
1682  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1683  if (doFEP) {
1684  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
1685  }
1686  if (doTI) {
1687  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
1688  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
1689  }
1690  }
1691  if (i < numBonds) {
1692  bondForce<T, doEnergy, doVirial, doFEP, doTI>
1693  (i, bonds, bondValues, xyzq,
1694  stride, lata, latb, latc,
1695  force, energy,
1696  forceList, forceListCounter, forceListStarts, forceListNexts,
1697  virial,
1698  energy_F, energy_TI_1, energy_TI_2);
1699  }
1700  goto reduce;
1701  }
1702  indexTB -= numBondsTB;
1703 
1704  if (indexTB < numAnglesTB) {
1705  int i = threadIdx.x + indexTB*blockDim.x;
1706  if (doEnergy) {
1707  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1708  if (doFEP) {
1709  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
1710  }
1711  if (doTI) {
1712  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
1713  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
1714  }
1715  }
1716  if (i < numAngles) {
1717  angleForce<T, doEnergy, doVirial, doFEP, doTI>
1718  (i, angles, angleValues, xyzq, stride,
1719  lata, latb, latc,
1720  force, energy,
1721  forceList, forceListCounter, forceListStarts, forceListNexts,
1722  virial,
1723  energy_F, energy_TI_1, energy_TI_2);
1724  }
1725  goto reduce;
1726  }
1727  indexTB -= numAnglesTB;
1728 
1729  if (indexTB < numDihedralsTB) {
1730  int i = threadIdx.x + indexTB*blockDim.x;
1731  if (doEnergy) {
1732  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1733  if (doFEP) {
1734  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
1735  }
1736  if (doTI) {
1737  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
1738  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
1739  }
1740  }
1741  if (doVirial) {
1742  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1743  }
1744  if (i < numDihedrals) {
1745  diheForce<T, doEnergy, doVirial, doFEP, doTI>
1746  (i, dihedrals, dihedralValues, xyzq, stride,
1747  lata, latb, latc,
1748  force, energy,
1749  forceList, forceListCounter, forceListStarts, forceListNexts,
1750  virial,
1751  energy_F, energy_TI_1, energy_TI_2);
1752  }
1753  goto reduce;
1754  }
1755  indexTB -= numDihedralsTB;
1756 
1757  if (indexTB < numImpropersTB) {
1758  int i = threadIdx.x + indexTB*blockDim.x;
1759  if (doEnergy) {
1760  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1761  if (doFEP) {
1762  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
1763  }
1764  if (doTI) {
1765  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
1766  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
1767  }
1768  }
1769  if (i < numImpropers) {
1770  diheForce<T, doEnergy, doVirial, doFEP, doTI>
1771  (i, impropers, improperValues, xyzq, stride,
1772  lata, latb, latc,
1773  force, energy,
1774  forceList, forceListCounter, forceListStarts, forceListNexts,
1775  virial,
1776  energy_F, energy_TI_1, energy_TI_2);
1777  }
1778  goto reduce;
1779  }
1780  indexTB -= numImpropersTB;
1781 
1782  if (indexTB < numExclusionsTB) {
1783  int i = threadIdx.x + indexTB*blockDim.x;
1784  if (doEnergy) {
1785  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1786  if (doFEP) {
1787  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
1788  }
1789  if (doTI) {
1790  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
1791  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
1792  }
1793  }
1794  if (doVirial) {
1795  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1796  }
1797  if (i < numExclusions) {
1798  exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
1799  (i, exclusions, r2_delta, r2_delta_expc,
1800 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
1801  r2_table, exclusionTable,
1802 #else
1803  r2_table_tex, exclusionTableTex,
1804 #endif
1805  xyzq, stride, lata, latb, latc, cutoff2,
1806  forceSlow, energy,
1807  forceList, forceListCounter, forceListStartsSlow, forceListNexts,
1808  virial,
1809  energy_F, energy_TI_1, energy_TI_2);
1810  }
1811  goto reduce;
1812  }
1813  indexTB -= numExclusionsTB;
1814 
1815  if (indexTB < numCrosstermsTB) {
1816  int i = threadIdx.x + indexTB*blockDim.x;
1817  if (doEnergy) {
1818  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1819  if (doFEP) {
1820  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
1821  }
1822  if (doTI) {
1823  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
1824  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
1825  }
1826  }
1827  if (doVirial) {
1828  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1829  }
1830  if (i < numCrossterms) {
1831  crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
1832  (i, crossterms, crosstermValues,
1833  xyzq, stride, lata, latb, latc,
1834  force, energy,
1835  forceList, forceListCounter, forceListStarts, forceListNexts,
1836  virial,
1837  energy_F, energy_TI_1, energy_TI_2);
1838  }
1839  goto reduce;
1840  }
1841  // indexTB -= numCrosstermsTB;
1842 
1843  reduce:
1844 
1845  // Write energies to global memory
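  // Two-level reduction: each warp first sums its partial energies with
  // shuffle XOR, lane 0 of every warp stages the result in shared memory,
  // and warp 0 then reduces across warps before its lane 0 performs a single
  // atomicAdd into this block's bin of energies_virials.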
1846  if (doEnergy) {
1847  // energyIndex is constant within thread-block
1848  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1849  __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
1850  __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
1851  __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
1852 #pragma unroll
1853  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1854  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1855 
1856  if (doFEP) {
1857  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
1858  }
1859  if (doTI) {
1860  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
1861  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
1862  }
1863  }
1864  int laneID = (threadIdx.x & (WARPSIZE - 1));
1865  int warpID = threadIdx.x / WARPSIZE;
1866  if (laneID == 0) {
1867  shEnergy[warpID] = energy;
1868  if (doFEP) {
1869  shEnergy_F[warpID] = energy_F;
1870  }
1871  if (doTI) {
1872  shEnergy_TI_1[warpID] = energy_TI_1;
1873  shEnergy_TI_2[warpID] = energy_TI_2;
1874  }
1875  }
1876  BLOCK_SYNC;
1877  if (warpID == 0) {
1878  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1879  if (doFEP) {
1880  energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
1881  }
1882  if (doTI) {
1883  energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
1884  energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
1885  }
1886 #pragma unroll
1887  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1888  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1889 
1890  if (doFEP) {
1891  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
1892  }
1893  if (doTI) {
1894  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
1895  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
1896  }
1897  }
1898  if (laneID == 0) {
1899  const int bin = blockIdx.x % ATOMIC_BINS;
1900  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
1901  if (doFEP) {
1902  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
1903  }
1904  if (doTI) {
1905  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
1906  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
1907  }
1908  }
1909  }
1910  }
1911 
1912  // Write virials to global memory
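  // The nine virial tensor components are reduced with the same two-level
  // warp-shuffle / shared-memory scheme as the energies, then accumulated
  // with atomicAdd at the virialIndex offset selected for this tuple type.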
1913 #ifdef WRITE_FULL_VIRIALS
1914  if (doVirial) {
1915 #pragma unroll
1916  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1917  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1918  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1919  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1920  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1921  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1922  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1923  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1924  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1925  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1926  }
1927  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1928  int laneID = (threadIdx.x & (WARPSIZE - 1));
1929  int warpID = threadIdx.x / WARPSIZE;
1930  if (laneID == 0) {
1931  shVirial[warpID] = virial;
1932  }
1933  BLOCK_SYNC;
1934 
1935  if (warpID == 0) {
1936  virial.xx = 0.0;
1937  virial.xy = 0.0;
1938  virial.xz = 0.0;
1939  virial.yx = 0.0;
1940  virial.yy = 0.0;
1941  virial.yz = 0.0;
1942  virial.zx = 0.0;
1943  virial.zy = 0.0;
1944  virial.zz = 0.0;
1945  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1946 #pragma unroll
1947  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1948  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1949  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1950  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1951  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1952  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1953  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1954  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1955  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1956  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1957  }
1958 
1959  if (laneID == 0) {
1960  const int bin = blockIdx.x % ATOMIC_BINS;
1961  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
1962  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
1963  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
1964  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
1965  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
1966  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
1967  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
1968  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
1969  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
1970  }
1971  }
1972  }
1973 #endif
1974 
1975 }
1976 
1977 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
1978 __global__ void modifiedExclusionForcesKernel(
1979  const int start,
1980  const int numModifiedExclusions,
1981  const CudaExclusion* __restrict__ modifiedExclusions,
1982  const bool doSlow,
1983  const float one_scale14, // 1 - scale14
1984  const float cutoff2,
1985  const CudaNBConstants nbConstants,
1986  const int vdwCoefTableWidth,
1987  const float2* __restrict__ vdwCoefTable,
1988 #if !defined(USE_TABLE_ARRAYS)
1989  cudaTextureObject_t vdwCoefTableTex,
1990 #endif
1991 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
1992  const float4* __restrict__ modifiedExclusionForceTable,
1993  const float4* __restrict__ modifiedExclusionEnergyTable,
1994 #else
1995  cudaTextureObject_t modifiedExclusionForceTableTex,
1996  cudaTextureObject_t modifiedExclusionEnergyTableTex,
1997 #endif
1998  const float4* __restrict__ xyzq,
1999  const int stride,
2000  const float3 lata, const float3 latb, const float3 latc,
2001  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2002  T* __restrict__ forceList,
2003  int* forceListCounter,
2004  int* forceListStartsNbond,
2005  int* forceListStartsSlow,
2006  int* forceListNexts,
2007  double* __restrict__ energies_virials) {
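  // One thread per modified (1-4) exclusion. Each thread evaluates the
  // scaled 1-4 LJ and electrostatic correction for its pair and, when
  // doSlow is set, the corresponding slow electrostatic contribution.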
2008 
2009  // index
2010  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2011 
2012  double energyVdw, energyNbond, energySlow;
2013  // Alchemical energies
2014  double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2015  double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2016  double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2017  if (doEnergy) {
2018  energyVdw = 0.0;
2019  if (doFEP) {
2020  energyVdw_F = 0.0;
2021  }
2022  if (doTI) {
2023  energyVdw_TI_1 = 0.0;
2024  energyVdw_TI_2 = 0.0;
2025  }
2026  if (doElect) {
2027  energyNbond = 0.0;
2028  energySlow = 0.0;
2029  if (doFEP) {
2030  energyNbond_F = 0.0;
2031  energySlow_F = 0.0;
2032  }
2033  if (doTI) {
2034  energyNbond_TI_1 = 0.0;
2035  energyNbond_TI_2 = 0.0;
2036  energySlow_TI_1 = 0.0;
2037  energySlow_TI_2 = 0.0;
2038  }
2039  }
2040  }
2041 
2042 #ifdef WRITE_FULL_VIRIALS
2043  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2044  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2045  if (doVirial) {
2046  virialNbond.xx = 0.0;
2047  virialNbond.xy = 0.0;
2048  virialNbond.xz = 0.0;
2049  virialNbond.yx = 0.0;
2050  virialNbond.yy = 0.0;
2051  virialNbond.yz = 0.0;
2052  virialNbond.zx = 0.0;
2053  virialNbond.zy = 0.0;
2054  virialNbond.zz = 0.0;
2055  if (doElect) {
2056  virialSlow.xx = 0.0;
2057  virialSlow.xy = 0.0;
2058  virialSlow.xz = 0.0;
2059  virialSlow.yx = 0.0;
2060  virialSlow.yy = 0.0;
2061  virialSlow.yz = 0.0;
2062  virialSlow.zx = 0.0;
2063  virialSlow.zy = 0.0;
2064  virialSlow.zz = 0.0;
2065  }
2066  }
2067 #endif
2068 
2069  if (i < numModifiedExclusions)
2070  {
2071  modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2072  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2073 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2074  vdwCoefTable,
2075 #else
2076  vdwCoefTableTex,
2077 #endif
2078  // For modified exclusions, plain table arrays are used on HIP builds with USE_TABLE_ARRAYS; otherwise textures are used.
2079 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2080  // tables
2081  modifiedExclusionForceTable,
2082  modifiedExclusionEnergyTable,
2083 #else
2084  // CUDA build (or no table arrays): fall back to texture force/energy tables
2085  modifiedExclusionForceTableTex,
2086  modifiedExclusionEnergyTableTex,
2087 #endif
2088  xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2089  energyVdw, forceNbond, energyNbond,
2090  forceSlow, energySlow,
2091  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2092  virialNbond, virialSlow,
2093  energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2094  energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2095  energySlow_F, energySlow_TI_1, energySlow_TI_2);
2096  }
2097 
2098  // Write energies to global memory
2099  if (doEnergy) {
2100  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2101  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2102  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2103  __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2104  __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2105  __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2106  __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2107  __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2108  __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2109  __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2110  __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2111  __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2112 #pragma unroll
2113  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2114  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2115 
2116  if (doFEP) {
2117  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2118  }
2119  if (doTI) {
2120  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2121  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2122  }
2123  if (doElect) {
2124  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2125  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2126  if (doFEP) {
2127  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2128  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2129  }
2130  if (doTI) {
2131  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2132  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2133  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2134  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2135  }
2136  }
2137  }
2138  int laneID = (threadIdx.x & (WARPSIZE - 1));
2139  int warpID = threadIdx.x / WARPSIZE;
2140  if (laneID == 0) {
2141  shEnergyVdw[warpID] = energyVdw;
2142  if (doFEP) {
2143  shEnergyVdw_F[warpID] = energyVdw_F;
2144  }
2145  if (doTI) {
2146  shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2147  shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2148  }
2149  if (doElect) {
2150  shEnergyNbond[warpID] = energyNbond;
2151  shEnergySlow[warpID] = energySlow;
2152  if (doFEP) {
2153  shEnergyNbond_F[warpID] = energyNbond_F;
2154  shEnergySlow_F[warpID] = energySlow_F;
2155  }
2156  if (doTI) {
2157  shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2158  shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2159  shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2160  shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2161  }
2162  }
2163  }
2164  BLOCK_SYNC;
2165  if (warpID == 0) {
2166  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2167  if (doFEP) {
2168  energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2169  }
2170  if (doTI) {
2171  energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2172  energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2173  }
2174  if (doElect) {
2175  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2176  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2177  if (doFEP) {
2178  energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2179  energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2180  }
2181  if (doTI) {
2182  energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2183  energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2184  energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2185  energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2186  }
2187  }
2188 #pragma unroll
2189  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2190  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2191 
2192  if (doFEP) {
2193  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2194  }
2195  if (doTI) {
2196  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2197  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2198  }
2199  if (doElect) {
2200  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2201  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2202  if (doFEP) {
2203  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2204  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2205  }
2206  if (doTI) {
2207  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2208  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2209  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2210  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2211  }
2212  }
2213  }
2214  if (laneID == 0) {
2215  const int bin = blockIdx.x % ATOMIC_BINS;
2216  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2217  if (doFEP) {
2218  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2219  }
2220  if (doTI) {
2221  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2222  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2223  }
2224  if (doElect) {
2225  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2226  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2227  if (doFEP) {
2228  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2229  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2230  }
2231  if (doTI) {
2232  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2233  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2234  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2235  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2236  }
2237  }
2238  }
2239  }
2240  }
2241 
2242  // Write virials to global memory
2243 #ifdef WRITE_FULL_VIRIALS
2244  if (doVirial) {
2245 #pragma unroll
2246  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2247  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2248  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2249  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2250  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2251  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2252  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2253  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2254  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2255  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2256  if (doElect && doSlow) {
2257  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2258  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2259  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2260  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2261  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2262  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2263  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2264  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2265  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2266  }
2267  }
2268  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2269  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2270  int laneID = (threadIdx.x & (WARPSIZE - 1));
2271  int warpID = threadIdx.x / WARPSIZE;
2272  if (laneID == 0) {
2273  shVirialNbond[warpID] = virialNbond;
2274  if (doElect && doSlow) {
2275  shVirialSlow[warpID] = virialSlow;
2276  }
2277  }
2278  BLOCK_SYNC;
2279 
2280  virialNbond.xx = 0.0;
2281  virialNbond.xy = 0.0;
2282  virialNbond.xz = 0.0;
2283  virialNbond.yx = 0.0;
2284  virialNbond.yy = 0.0;
2285  virialNbond.yz = 0.0;
2286  virialNbond.zx = 0.0;
2287  virialNbond.zy = 0.0;
2288  virialNbond.zz = 0.0;
2289  if (doElect && doSlow) {
2290  virialSlow.xx = 0.0;
2291  virialSlow.xy = 0.0;
2292  virialSlow.xz = 0.0;
2293  virialSlow.yx = 0.0;
2294  virialSlow.yy = 0.0;
2295  virialSlow.yz = 0.0;
2296  virialSlow.zx = 0.0;
2297  virialSlow.zy = 0.0;
2298  virialSlow.zz = 0.0;
2299  }
2300 
2301  if (warpID == 0) {
2302  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2303  virialNbond = shVirialNbond[laneID];
2304  if (doElect && doSlow) {
2305  virialSlow = shVirialSlow[laneID];
2306  }
2307  }
2308 #pragma unroll
2309  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2310  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2311  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2312  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2313  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2314  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2315  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2316  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2317  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2318  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2319  if (doElect && doSlow) {
2320  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2321  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2322  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2323  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2324  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2325  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2326  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2327  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2328  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2329  }
2330  }
2331 
2332  if (laneID == 0)
2333  {
2334  const int bin = blockIdx.x % ATOMIC_BINS;
2335  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2336  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2337  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2338  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2339  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2340  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2341  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2342  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2343  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2344  if (doElect && doSlow) {
2345  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2346  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2347  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2348  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2349  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2350  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2351  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
2352  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
2353  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
2354  }
2355  }
2356  }
2357  }
2358 #endif
2359 
2360 }
2361 
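// Used when USE_BONDED_FORCE_ATOMIC_STORE is disabled: force contributions
// were appended to forceList as per-atom singly linked lists rooted at
// forceListStarts and chained through forceListNexts. Each thread walks one
// atom's list and sums the x, y, z components into the strided force array.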
2362 template <typename T>
2363 __global__ void gatherBondedForcesKernel(
2364  const int start,
2365  const int atomStorageSize,
2366  const int stride,
2367  const T* __restrict__ forceList,
2368  const int* __restrict__ forceListStarts,
2369  const int* __restrict__ forceListNexts,
2370  T* __restrict__ force) {
2371 
2372  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
2373 
2374  if (i < atomStorageSize) {
2375  T fx = 0;
2376  T fy = 0;
2377  T fz = 0;
2378  int pos = forceListStarts[i];
2379  while (pos != -1) {
2380  fx += forceList[pos * 3 + 0];
2381  fy += forceList[pos * 3 + 1];
2382  fz += forceList[pos * 3 + 2];
2383  pos = forceListNexts[pos];
2384  }
2385  force[i ] = fx;
2386  force[i + stride ] = fy;
2387  force[i + stride * 2] = fz;
2388  }
2389 }
2390 
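// Energies and virials are accumulated into ATOMIC_BINS separate copies to
// reduce atomicAdd contention. Launched with one block per quantity
// (energies_virials_SIZE blocks of ATOMIC_BINS threads), this kernel sums
// the bins with a warp reduction and writes the result back to bin 0.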
2391 __global__ void reduceBondedBinsKernel(double* energies_virials) {
2392  const int bin = threadIdx.x;
2393 
2394  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2395  __shared__ typename WarpReduce::TempStorage tempStorage;
2396 
2397  double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
2398  if (threadIdx.x == 0) {
2399  energies_virials[blockIdx.x] = v;
2400  }
2401 }
2402 
2403 // ##############################################################################################
2404 // ##############################################################################################
2405 // ##############################################################################################
2406 
2407 //
2408 // Class constructor
2409 //
2410 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
2411 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
2412 
2413  cudaCheck(cudaSetDevice(deviceID));
2414 
2415  tupleData = NULL;
2416  tupleDataSize = 0;
2417 
2418  bonds = NULL;
2419  angles = NULL;
2420  dihedrals = NULL;
2421  impropers = NULL;
2422  modifiedExclusions = NULL;
2423  exclusions = NULL;
2424 
2425  numBonds = 0;
2426  numAngles = 0;
2427  numDihedrals = 0;
2428  numImpropers = 0;
2429  numModifiedExclusions = 0;
2430  numExclusions = 0;
2431  numCrossterms = 0;
2432 
2433  bondValues = NULL;
2434  angleValues = NULL;
2435  dihedralValues = NULL;
2436  improperValues = NULL;
2437  crosstermValues = NULL;
2438 
2439  xyzq = NULL;
2440  xyzqSize = 0;
2441 
2442  forces = NULL;
2443  forcesSize = 0;
2444 
2445  forceList = NULL;
2446  forceListStarts = NULL;
2447  forceListNexts = NULL;
2448  forceListSize = 0;
2449  forceListStartsSize = 0;
2450  forceListNextsSize = 0;
2451  allocate_device<int>(&forceListCounter, 1);
2452 
2453  allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
2454 }
2455 
2456 //
2457 // Class destructor
2458 //
2459 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
2460  cudaCheck(cudaSetDevice(deviceID));
2461 
2462  deallocate_device<double>(&energies_virials);
2463  // deallocate_device<BondedVirial>(&virial);
2464 
2465  if (tupleData != NULL) deallocate_device<char>(&tupleData);
2466  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
2467  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
2468 
2469  if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
2470  if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
2471  if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
2472  if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
2473 
2474  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
2475  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
2476  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
2477  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
2478  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
2479 }
2480 
2481 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
2482  allocate_device<CudaBondValue>(&bondValues, numBondValues);
2483  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
2484 }
2485 
2486 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
2487  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
2488  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
2489 }
2490 
2491 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
2492  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
2493  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
2494 }
2495 
2496 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
2497  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
2498  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
2499 }
2500 
2501 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
2502  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
2503  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
2504 }
2505 
2506 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
2507  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
2508 }
2509 
2510 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
2511  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
2512 }
2513 
2514 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
2515  alchFlags = h_cudaAlchFlags;
2516 }
2517 
2518 //
2519 // Update bonded lists
2520 //
2521 void ComputeBondedCUDAKernel::setTupleCounts(
2522  const TupleCounts count
2523 ) {
2524  numBonds = count.bond;
2525  numAngles = count.angle;
2526  numDihedrals = count.dihedral;
2527  numImpropers = count.improper;
2528  numModifiedExclusions = count.modifiedExclusion;
2529  numExclusions = count.exclusion;
2530  numCrossterms = count.crossterm;
2531 }
2532 
2533 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
2534  TupleCounts count;
2535 
2536  count.bond = numBonds;
2537  count.angle = numAngles;
2538  count.dihedral = numDihedrals;
2539  count.improper = numImpropers;
2540  count.modifiedExclusion = numModifiedExclusions;
2541  count.exclusion = numExclusions;
2542  count.crossterm = numCrossterms;
2543 
2544  return count;
2545 }
2546 
2547 TupleData ComputeBondedCUDAKernel::getData() {
2548  TupleData data;
2549  data.bond = bonds;
2550  data.angle = angles;
2551  data.dihedral = dihedrals;
2552  data.improper = impropers;
2553  data.modifiedExclusion = modifiedExclusions;
2554  data.exclusion = exclusions;
2555  data.crossterm = crossterms;
2556 
2557  return data;
2558 }
2559 
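// All tuple arrays live in one contiguous device buffer; each per-type count
// is first rounded up with warpAlign() before the byte offsets of the
// per-type pointers below are computed.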
2560 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
2561  const TupleCounts countIn,
2562  cudaStream_t
2563 ) {
2564  const int numBondsWA = warpAlign(countIn.bond);
2565  const int numAnglesWA = warpAlign(countIn.angle);
2566  const int numDihedralsWA = warpAlign(countIn.dihedral);
2567  const int numImpropersWA = warpAlign(countIn.improper);
2568  const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
2569  const int numExclusionsWA = warpAlign(countIn.exclusion);
2570  const int numCrosstermsWA = warpAlign(countIn.crossterm);
2571 
2572  const size_t sizeTot = numBondsWA*sizeof(CudaBond)
2573  + numAnglesWA*sizeof(CudaAngle)
2574  + numDihedralsWA*sizeof(CudaDihedral)
2575  + numImpropersWA*sizeof(CudaDihedral)
2576  + numModifiedExclusionsWA*sizeof(CudaExclusion)
2577  + numExclusionsWA*sizeof(CudaExclusion)
2578  + numCrosstermsWA*sizeof(CudaCrossterm);
2579 
2580  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
2581 
2582  // Setup pointers
2583  size_t pos = 0;
2584  bonds = (CudaBond *)&tupleData[pos];
2585  pos += numBondsWA*sizeof(CudaBond);
2586 
2587  angles = (CudaAngle* )&tupleData[pos];
2588  pos += numAnglesWA*sizeof(CudaAngle);
2589 
2590  dihedrals = (CudaDihedral* )&tupleData[pos];
2591  pos += numDihedralsWA*sizeof(CudaDihedral);
2592 
2593  impropers = (CudaDihedral* )&tupleData[pos];
2594  pos += numImpropersWA*sizeof(CudaDihedral);
2595 
2596  modifiedExclusions = (CudaExclusion* )&tupleData[pos];
2597  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
2598 
2599  exclusions = (CudaExclusion* )&tupleData[pos];
2600  pos += numExclusionsWA*sizeof(CudaExclusion);
2601 
2602  crossterms = (CudaCrossterm* )&tupleData[pos];
2603  pos += numCrosstermsWA*sizeof(CudaCrossterm);
2604 
2605  return sizeTot;
2606 }
2607 
2608 void ComputeBondedCUDAKernel::update(
2609  const int numBondsIn,
2610  const int numAnglesIn,
2611  const int numDihedralsIn,
2612  const int numImpropersIn,
2613  const int numModifiedExclusionsIn,
2614  const int numExclusionsIn,
2615  const int numCrosstermsIn,
2616  const char* h_tupleData,
2617  cudaStream_t stream) {
2618 
2619  numBonds = numBondsIn;
2620  numAngles = numAnglesIn;
2621  numDihedrals = numDihedralsIn;
2622  numImpropers = numImpropersIn;
2623  numModifiedExclusions = numModifiedExclusionsIn;
2624  numExclusions = numExclusionsIn;
2625  numCrossterms = numCrosstermsIn;
2626 
2627  const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
2628 
2629  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
2630 }
2631 
2632 void ComputeBondedCUDAKernel::updateAtomBuffer(
2633  const int atomStorageSize,
2634  cudaStream_t stream
2635 ) {
2636  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
2637 }
2638 
2639 //
2640 // Return stride for force array
2641 //
2642 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
2643 #ifdef USE_STRIDED_FORCE
2644  // Align stride to 256 bytes
2645  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
2646 #else
2647  return 1;
2648 #endif
2649 }
2650 
2651 //
2652 // Return size of single force array
2653 //
2654 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
2655 #ifdef USE_STRIDED_FORCE
2656  return (3*getForceStride(atomStorageSize));
2657 #else
2658  return (3*atomStorageSize);
2659 #endif
2660 }
2661 
2662 //
2663 // Return total size of all force arrays
2664 //
2665 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
2666 
2667  int forceSize = getForceSize(atomStorageSize);
2668 
2669  if (numModifiedExclusions > 0 || numExclusions > 0) {
2670  if (doSlow) {
2671  // All three force arrays [normal, nbond, slow]
2672  forceSize *= 3;
2673  } else {
2674  // Two force arrays [normal, nbond]
2675  forceSize *= 2;
2676  }
2677  }
2678 
2679  return forceSize;
2680 }
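// Layout of the device force buffer used below (a minimal sketch, assuming
// USE_STRIDED_FORCE so that getForceSize() == 3*getForceStride()); the
// offsets mirror startNbond/startSlow in bondedForce():
//
//   // illustrative only, not part of the build
//   int stride = getForceStride(atomStorageSize); // padded to 256 bytes
//   FORCE_TYPE* normal = forces;                  // bonded forces
//   FORCE_TYPE* nbond  = forces + 3*stride;       // modified-exclusion nonbonded
//   FORCE_TYPE* slow   = forces + 6*stride;       // slow (exclusion) forces
//
// Each of the three blocks stores x, y and z planes of length stride.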
2681 
2682 //
2683 // Compute bonded forces
2684 //
2685 void ComputeBondedCUDAKernel::bondedForce(
2686  const double scale14, const int atomStorageSize,
2687  const bool doEnergy, const bool doVirial, const bool doSlow,
2688  const bool doTable,
2689  const float3 lata, const float3 latb, const float3 latc,
2690  const float cutoff2, const float r2_delta, const int r2_delta_expc,
2691  const CudaNBConstants nbConstants,
2692  const float4* h_xyzq, FORCE_TYPE* h_forces,
2693  double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
2694  bool useDeviceMigration, cudaStream_t stream) {
2695 
2696  int forceStorageSize = getAllForceSize(atomStorageSize, true);
2697  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
2698  int forceStride = getForceStride(atomStorageSize);
2699 
2700  int forceSize = getForceSize(atomStorageSize);
2701  int startNbond = forceSize;
2702  int startSlow = 2*forceSize;
2703  // Re-allocate coordinate and force arrays if necessary
2704  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
2705  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2706 
2707 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2708  // function stores
2709  // numBonds bondForce 2
2710  // numAngles angleForce 3
2711  // numDihedrals diheForce 4
2712  // numImpropers diheForce 4
2713  // numExclusions exclusionForce 2
2714  // numCrossterms crosstermForce 8
2715  // numModifiedExclusions modifiedExclusionForce 4
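  // Each stored contribution occupies three FORCE_TYPE slots (x, y, z),
  // hence the leading factor of 3 in listSize below.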
2716  int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4);
2717  reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
2718  reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
2719  reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
2720  int* forceListStartsNbond = forceListStarts + atomStorageSize;
2721  int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
2722 
2723  clear_device_array<int>(forceListCounter, 1, stream);
2724  cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
2725 #else
2726  int* forceListStartsNbond = NULL;
2727  int* forceListStartsSlow = NULL;
2728 #endif
2729 
2730 #ifdef NODEGROUP_FORCE_REGISTER
2731  if(CUDASOAintegrate){
2732  if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2733  }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2734 #else
2735  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2736 #endif
2737 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
2738  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
2739 #endif
2740  if (doEnergy || doVirial ) {
2741  clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
2742  }
2743 
2744  // Check if we are doing alchemical free energy calculation
2745  // Is checking alchOn required?
2746  const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
2747  const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
2748 
2749  float one_scale14 = (float)(1.0 - scale14);
2750 
2751  // If doSlow = false, these exclusions are not calculated
2752  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
2753 
2754  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2755 
2756  int numBondsTB = (numBonds + nthread - 1)/nthread;
2757  int numAnglesTB = (numAngles + nthread - 1)/nthread;
2758  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
2759  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
2760  int numExclusionsTB = (numExclusionsDoSlow + nthread - 1)/nthread;
2761  int numCrosstermsTB = (numCrossterms + nthread - 1)/nthread;
2762 
2763  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
2764  numExclusionsTB + numCrosstermsTB;
2765  int shmem_size = 0;
2766 
2767  // printf("%d %d %d %d %d %d nblock %d\n",
2768  // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
2769 
2770  int max_nblock = deviceCUDA->getMaxNumBlocks();
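  // The total grid can exceed the device's launch limit, so the kernel is
  // issued in chunks of at most max_nblock thread blocks.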
2771 
2772  int start = 0;
2773  while (start < nblock)
2774  {
2775  int nleft = nblock - start;
2776  int nblock_use = min(max_nblock, nleft);
2777 
2778 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2779  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
2780  cudaNonbondedTables.getExclusionTable()
2781 #else
2782  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
2783  cudaNonbondedTables.getExclusionTableTex()
2784 #endif
2785 
2786 #if defined(USE_TABLE_ARRAYS)
2787 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
2788  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
2789  <<< nblock_use, nthread, shmem_size, stream >>> \
2790  (start, numBonds, bonds, bondValues, \
2791  numAngles, angles, angleValues, \
2792  numDihedrals, dihedrals, dihedralValues, \
2793  numImpropers, impropers, improperValues, \
2794  numExclusionsDoSlow, exclusions, \
2795  numCrossterms, crossterms, crosstermValues, \
2796  cutoff2, \
2797  r2_delta, r2_delta_expc, \
2798  TABLE_PARAMS, \
2799  xyzq, forceStride, \
2800  lata, latb, latc, \
2801  forces, &forces[startSlow], \
2802  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2803  energies_virials);
2804 #else
2805 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
2806  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
2807  <<< nblock_use, nthread, shmem_size, stream >>> \
2808  (start, numBonds, bonds, bondValues, \
2809  numAngles, angles, angleValues, \
2810  numDihedrals, dihedrals, dihedralValues, \
2811  numImpropers, impropers, improperValues, \
2812  numExclusionsDoSlow, exclusions, \
2813  numCrossterms, crossterms, crosstermValues, \
2814  cutoff2, \
2815  r2_delta, r2_delta_expc, \
2816  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
2817  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
2818  xyzq, forceStride, \
2819  lata, latb, latc, \
2820  forces, &forces[startSlow], \
2821  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2822  energies_virials);
2823 #endif
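  // CALL() instantiates the templated kernel for one compile-time flag
  // combination; the if-chain below dispatches the runtime booleans to the
  // matching instantiation.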
2824  if (!doEnergy && !doVirial && !doFEP && !doTI) CALL(0, 0, 0, 0);
2825  if (!doEnergy && doVirial && !doFEP && !doTI) CALL(0, 1, 0, 0);
2826  if ( doEnergy && !doVirial && !doFEP && !doTI) CALL(1, 0, 0, 0);
2827  if ( doEnergy && doVirial && !doFEP && !doTI) CALL(1, 1, 0, 0);
2828 
2829  if (!doEnergy && !doVirial && doFEP && !doTI) CALL(0, 0, 1, 0);
2830  if (!doEnergy && doVirial && doFEP && !doTI) CALL(0, 1, 1, 0);
2831  if ( doEnergy && !doVirial && doFEP && !doTI) CALL(1, 0, 1, 0);
2832  if ( doEnergy && doVirial && doFEP && !doTI) CALL(1, 1, 1, 0);
2833 
2834  if (!doEnergy && !doVirial && !doFEP && doTI) CALL(0, 0, 0, 1);
2835  if (!doEnergy && doVirial && !doFEP && doTI) CALL(0, 1, 0, 1);
2836  if ( doEnergy && !doVirial && !doFEP && doTI) CALL(1, 0, 0, 1);
2837  if ( doEnergy && doVirial && !doFEP && doTI) CALL(1, 1, 0, 1);
2838 
2839  // FEP and TI cannot both be enabled, so the following combinations are not expanded.
2840 #if 0
2841  if (!doEnergy && !doVirial && doFEP && doTI) CALL(0, 0, 1, 1);
2842  if (!doEnergy && doVirial && doFEP && doTI) CALL(0, 1, 1, 1);
2843  if ( doEnergy && !doVirial && doFEP && doTI) CALL(1, 0, 1, 1);
2844  if ( doEnergy && doVirial && doFEP && doTI) CALL(1, 1, 1, 1);
2845 #endif
2846 
2847 #undef CALL
2848 #undef TABLE_PARAMS
2849  cudaCheck(cudaGetLastError());
2850 
2851  start += nblock_use;
2852  }
2853 
2854  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2855  nblock = (numModifiedExclusions + nthread - 1)/nthread;
2856 
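  // With scale14 == 1 there is no electrostatic 1-4 scaling to apply, so the
  // electrostatic part of the modified-exclusion kernel is skipped.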
2857  bool doElect = (one_scale14 != 0.0f);
2858 
2859  start = 0;
2860  while (start < nblock)
2861  {
2862  int nleft = nblock - start;
2863  int nblock_use = min(max_nblock, nleft);
2864 
2865 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2866 #define TABLE_PARAMS \
2867  cudaNonbondedTables.getExclusionVdwCoefTable(), \
2868  cudaNonbondedTables.getModifiedExclusionForceTable(), \
2869  cudaNonbondedTables.getModifiedExclusionEnergyTable()
2870 #else
2871 #define TABLE_PARAMS \
2872  cudaNonbondedTables.getExclusionVdwCoefTable(), \
2873  cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
2874  cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
2875  cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
2876 #endif
2877 
2878 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
2879  modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
2880  <<< nblock_use, nthread, shmem_size, stream >>> \
2881  (start, numModifiedExclusions, modifiedExclusions, \
2882  doSlow, one_scale14, cutoff2, nbConstants, \
2883  cudaNonbondedTables.getVdwCoefTableWidth(), \
2884  TABLE_PARAMS, \
2885  xyzq, forceStride, lata, latb, latc, \
2886  &forces[startNbond], &forces[startSlow], \
2887  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
2888  energies_virials);
2889 
2890 
2891 
2892  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
2893  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
2894  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
2895  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
2896 
2897  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
2898  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
2899  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
2900  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
2901 
2902  if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
2903  if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
2904  if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
2905  if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
2906 
2907  if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
2908  if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
2909  if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
2910  if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
2911 
2912  if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
2913  if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
2914  if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
2915  if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
2916 
2917  if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
2918  if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
2919  if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
2920  if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
2921 
2922  // Disabling doTable is only supported without FEP and TI
2923  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
2924  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
2925  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
2926  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
2927 
2928  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
2929  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
2930  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
2931  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
2932 
2933  // FEP and TI cannot both be enabled, so the following combinations are not expanded.
2934 #if 0
2935  if (!doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(0, 0, 0, 1, 1);
2936  if (!doEnergy && doVirial && !doElect && doFEP && doTI) CALL(0, 1, 0, 1, 1);
2937  if ( doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(1, 0, 0, 1, 1);
2938  if ( doEnergy && doVirial && !doElect && doFEP && doTI) CALL(1, 1, 0, 1, 1);
2939 
2940  if (!doEnergy && !doVirial && doElect && doFEP && doTI) CALL(0, 0, 1, 1, 1);
2941  if (!doEnergy && doVirial && doElect && doFEP && doTI) CALL(0, 1, 1, 1, 1);
2942  if ( doEnergy && !doVirial && doElect && doFEP && doTI) CALL(1, 0, 1, 1, 1);
2943  if ( doEnergy && doVirial && doElect && doFEP && doTI) CALL(1, 1, 1, 1, 1);
2944 #endif
2945 
2946 #undef CALL
2947  cudaCheck(cudaGetLastError());
2948 
2949  start += nblock_use;
2950  }
2951 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2952  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2953  nblock = (atomStorageSize + nthread - 1)/nthread;
2954 
2955  start = 0;
2956  while (start < nblock)
2957  {
2958  int nleft = nblock - start;
2959  int nblock_use = min(max_nblock, nleft);
2960 
2961  // cudaCheck(hipDeviceSynchronize());
2962  // auto t0 = std::chrono::high_resolution_clock::now();
2963 
2964  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2965  start, atomStorageSize, forceStride,
2966  forceList, forceListStarts, forceListNexts,
2967  forces);
2968  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2969  start, atomStorageSize, forceStride,
2970  forceList, forceListStartsNbond, forceListNexts,
2971  &forces[startNbond]);
2972  if (doSlow) {
2973  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2974  start, atomStorageSize, forceStride,
2975  forceList, forceListStartsSlow, forceListNexts,
2976  &forces[startSlow]);
2977  }
2978  cudaCheck(cudaGetLastError());
2979 
2980  // cudaCheck(hipStreamSynchronize(stream));
2981  // auto t1 = std::chrono::high_resolution_clock::now();
2982  // std::chrono::duration<double> diff1 = t1 - t0;
2983  // std::cout << "gatherBondedForcesKernel";
2984  // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
2985 
2986  start += nblock_use;
2987  }
2988 #endif
2989 
2990 #ifdef NODEGROUP_FORCE_REGISTER
2991  if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
2992  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3006  }
3007 
3008 #else
3009  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3010 #endif
3011  if (doEnergy || doVirial) {
3012  if (ATOMIC_BINS > 1) {
3013  // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3014  reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
3015  }
3016  // virial copy, is this necessary?
3017  copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
3018  }
3019 }
3020 
3021