NAMD
ComputeBondedCUDAKernel.cu
1 #include "TupleTypesCUDA.h"
2 #ifdef WIN32
3 #define _USE_MATH_DEFINES
4 #define __thread __declspec(thread)
5 #endif // WIN32
6 
7 #ifdef NAMD_HIP
8 #include <hipcub/hipcub.hpp>
9 #define cub hipcub
10 #endif // NAMD_HIP
11 
12 #ifdef NAMD_CUDA
13 #include <cuda.h>
14 #if __CUDACC_VER_MAJOR__ >= 11
15 #include <cub/cub.cuh>
16 #else
17 #include <namd_cub/cub.cuh>
18 #endif
19 #endif // NAMD_CUDA
20 
21 #include <math.h>
22 
23 #include "ComputeBondedCUDAKernel.h"
24 #include "CudaComputeNonbondedInteractions.h"
25 
26 #ifdef FUTURE_CUDADEVICE
27 #include "CudaDevice.h"
28 #else
29 #include "DeviceCUDA.h"
30 extern __thread DeviceCUDA *deviceCUDA;
31 #endif
32 
33 #ifndef USE_TABLE_ARRAYS
34 __device__ __forceinline__
35 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
36 #if defined(NAMD_CUDA)
37  return tex1D<float4>(tex, k);
38 #else
39  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
40  const float x = k * (float)tableSize - 0.5f;
41  const float f = floorf(x);
42  const float a = x - f;
43  const unsigned int i = (unsigned int)f;
44  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
45  const int i1 = i0 + 1;
46  const float4 t0 = tex1Dfetch<float4>(tex, i0);
47  const float4 t1 = tex1Dfetch<float4>(tex, i1);
48  return make_float4(
49  a * (t1.x - t0.x) + t0.x,
50  a * (t1.y - t0.y) + t0.y,
51  a * (t1.z - t0.z) + t0.z,
52  a * (t1.w - t0.w) + t0.w);
53 #endif
54 }
55 #endif
56 
57 __device__ __forceinline__
58 float4 tableLookup(const float4* table, const float k)
59 {
60  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
61  const float x = k * static_cast<float>(tableSize) - 0.5f;
62  const float f = floorf(x);
63  const float a = x - f;
64  const int i = static_cast<int>(f);
65  const int i0 = max(0, min(tableSize - 1, i));
66  const int i1 = max(0, min(tableSize - 1, i + 1));
67  const float4 t0 = __ldg(&table[i0]);
68  const float4 t1 = __ldg(&table[i1]);
69  return make_float4(
70  a * (t1.x - t0.x) + t0.x,
71  a * (t1.y - t0.y) + t0.y,
72  a * (t1.z - t0.z) + t0.z,
73  a * (t1.w - t0.w) + t0.w);
74 }
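// Both table helpers above map a normalized table coordinate k (scaled by
// FORCE_ENERGY_TABLE_SIZE) onto the force/energy tables and return a linearly
// interpolated float4: sampleTableTex uses hardware texture filtering on CUDA
// and an explicit software interpolation otherwise, while tableLookup does the
// same interpolation on a plain array with clamped indices.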
75 
76 // Global variables for alchemical transformations
77 namespace AlchBondedCUDA {
78  // Alchemical parameters and lambdas
79  __constant__ CudaAlchParameters alchParams;
80  __constant__ CudaAlchLambdas alchLambdas;
81 }
82 
83 
84 template <typename T>
85 __forceinline__ __device__
86 void convertForces(const float x, const float y, const float z,
87  T &Tx, T &Ty, T &Tz) {
88 
89  Tx = (T)(x);
90  Ty = (T)(y);
91  Tz = (T)(z);
92 }
93 
94 template <typename T>
95 __forceinline__ __device__
96 void calcComponentForces(
97  float fij,
98  const float dx, const float dy, const float dz,
99  T &fxij, T &fyij, T &fzij) {
100 
101  fxij = (T)(fij*dx);
102  fyij = (T)(fij*dy);
103  fzij = (T)(fij*dz);
104 
105 }
106 
107 template <typename T>
108 __forceinline__ __device__
109 void calcComponentForces(
110  float fij1,
111  const float dx1, const float dy1, const float dz1,
112  float fij2,
113  const float dx2, const float dy2, const float dz2,
114  T &fxij, T &fyij, T &fzij) {
115 
116  fxij = (T)(fij1*dx1 + fij2*dx2);
117  fyij = (T)(fij1*dy1 + fij2*dy2);
118  fzij = (T)(fij1*dz1 + fij2*dz2);
119 }
120 
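// Warp-aggregated atomic increment: the currently active lanes ballot, the
// first active lane issues a single atomicAdd with the warp-wide total, and
// every lane then gets its own slot as start + (number of active lanes with a
// lower lane id). For example, with active lanes 0, 1 and 3 (mask 0b1011),
// total = 3, the prefixes are 0, 1, 2, and only lane 0 touches the counter.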
121 __forceinline__ __device__
122 int warpAggregatingAtomicInc(int* counter) {
123 #if BOUNDINGBOXSIZE == 64
124  WarpMask mask = __ballot(1);
125  int total = __popcll(mask);
126  int prefix = __popcll(mask & cub::LaneMaskLt());
127  int firstLane = __ffsll(mask) - 1;
128 #else
129  WarpMask mask = __ballot(1);
130  int total = __popc(mask);
131  int prefix = __popc(mask & cub::LaneMaskLt());
132  int firstLane = __ffs(mask) - 1;
133 #endif
134  int start = 0;
135  if (prefix == 0) {
136  start = atomicAdd(counter, total);
137  }
138  start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
139  return start + prefix;
140 }
141 
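// storeForces accumulates one force contribution for atom 'ind' in one of two
// ways: with USE_BONDED_FORCE_ATOMIC_STORE it atomicAdds into the strided
// force array (on HIP, warp segments with a repeated ind are first combined
// with cub::WarpReduce::HeadSegmentedSum so only segment heads issue atomics);
// otherwise it appends the contribution to a per-atom linked list through
// forceListStarts/forceListNexts, keyed by the atom index.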
142 template <typename T>
143 __forceinline__ __device__
144 void storeForces(const T fx, const T fy, const T fz,
145  const int ind, const int stride,
146  T* force,
147  T* forceList, int* forceListCounter,
148  int* forceListStarts, int* forceListNexts) {
149 
150 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
151 #if defined(NAMD_HIP)
152  // Try to decrease conflicts between lanes if there are repeating ind values
153  WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
154  if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
155  const int laneId = threadIdx.x % WARPSIZE;
156  const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
157  const bool isHead = laneId == 0 || ind != prevLaneInd;
158  if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
159  // There are segments of repeating ind
160  typedef cub::WarpReduce<T> WarpReduce;
161  __shared__ typename WarpReduce::TempStorage temp_storage;
162  const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
163  const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
164  const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
165  if (isHead) {
166  atomicAdd(&force[ind ], sumfx);
167  atomicAdd(&force[ind + stride ], sumfy);
168  atomicAdd(&force[ind + stride*2], sumfz);
169  }
170  return;
171  }
172  }
173  // Not all lanes are active (the last block) or there are no repeating ind values
174  atomicAdd(&force[ind ], fx);
175  atomicAdd(&force[ind + stride ], fy);
176  atomicAdd(&force[ind + stride*2], fz);
177 #else
178  atomicAdd(&force[ind ], fx);
179  atomicAdd(&force[ind + stride ], fy);
180  atomicAdd(&force[ind + stride*2], fz);
181 #endif
182 #else
183  const int newPos = warpAggregatingAtomicInc(forceListCounter);
184  forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
185  forceList[newPos * 3 + 0] = fx;
186  forceList[newPos * 3 + 1] = fy;
187  forceList[newPos * 3 + 2] = fz;
188 #endif
189 }
190 
191 //
192 // Calculates bonds
193 //
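// Alchemical bookkeeping (the same pattern is used by the bonded kernels
// below): energy accumulates the lambda-scaled term, energy_TI_1/energy_TI_2
// accumulate the unscaled term for the TI derivative, and energy_F accumulates
// the term weighted by (bondLambda12 - bondLambda1) or
// (bondLambda22 - bondLambda2) for the FEP energy difference.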
194 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
195 __device__ void bondForce(
196  const int index,
197  const CudaBond* __restrict__ bondList,
198  const CudaBondValue* __restrict__ bondValues,
199  const float4* __restrict__ xyzq,
200  const int stride,
201  const float3 lata, const float3 latb, const float3 latc,
202  T* __restrict__ force, double &energy,
203  T* __restrict__ forceList, int* forceListCounter,
204  int* forceListStarts, int* forceListNexts,
205 #ifdef WRITE_FULL_VIRIALS
206  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
207 #else
208  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
209 #endif
210  double &energy_F, double &energy_TI_1, double &energy_TI_2
211  ) {
212 
213  CudaBond bl = bondList[index];
214  CudaBondValue bondValue = bondValues[bl.itype];
215 // if (bondValue.x0 == 0.0f) return;
216 
217  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
218  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
219  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
220 
221  float4 xyzqi = xyzq[bl.i];
222  float4 xyzqj = xyzq[bl.j];
223 
224  float xij = xyzqi.x + shx - xyzqj.x;
225  float yij = xyzqi.y + shy - xyzqj.y;
226  float zij = xyzqi.z + shz - xyzqj.z;
227 
228  float r = sqrtf(xij*xij + yij*yij + zij*zij);
229 
230  float db = r - bondValue.x0;
231  if (bondValue.x1) {
232  // in this case, the bond represents a harmonic wall potential
233  // where x0 is the lower wall and x1 is the upper
234  db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
235  }
236  float fij = db * bondValue.k * bl.scale;
237 
238  // Define a temporary variable to store the energy
239  // used by both the alchemical and non-alchemical routes
240  float energy_tmp = 0.0f;
241  if (doEnergy) {
242  energy_tmp += fij * db;
243  }
244 
245  // Alchemical route
246  // JM: TODO: move this out of here
247  float alch_scale = 1.0f;
248  if (doFEP || doTI) {
249  switch (bl.fepBondedType) {
250  case 1: {
251  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
252  if (doTI) {
253  energy_TI_1 += (double)(energy_tmp);
254  }
255  if (doFEP) {
256  energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
257  }
258  break;
259  }
260  case 2: {
261  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
262  if (doTI) {
263  energy_TI_2 += (double)(energy_tmp);
264  }
265  if (doFEP) {
266  energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
267  }
268  break;
269  }
270  }
271  }
272 
273  if (doEnergy) {
274  if (doFEP || doTI) {
275  energy += (double)(energy_tmp * alch_scale);
276  } else {
277  energy += (double)(energy_tmp);
278  }
279  }
280  fij *= -2.0f/r;
281  // XXX TODO: Get this off here
282  // XXX TODO: Decide on a templated parameter nomenclature
283  if (doFEP || doTI) {
284  fij *= alch_scale;
285  }
286 
287  T T_fxij, T_fyij, T_fzij;
288  calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
289 
290  // Store forces
291  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
292  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
293 
294  // Store virial
295  if (doVirial) {
296 #ifdef WRITE_FULL_VIRIALS
297  float fxij = fij*xij;
298  float fyij = fij*yij;
299  float fzij = fij*zij;
300  virial.xx = (fxij*xij);
301  virial.xy = (fxij*yij);
302  virial.xz = (fxij*zij);
303  virial.yx = (fyij*xij);
304  virial.yy = (fyij*yij);
305  virial.yz = (fyij*zij);
306  virial.zx = (fzij*xij);
307  virial.zy = (fzij*yij);
308  virial.zz = (fzij*zij);
309 #endif
310  }
311 }
312 
313 //
314 // Calculates modified exclusions
315 //
316 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
317 __device__ void modifiedExclusionForce(
318  const int index,
319  const CudaExclusion* __restrict__ exclusionList,
320  const bool doSlow,
321  const float one_scale14, // 1 - scale14
322  const int vdwCoefTableWidth,
323 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
324  const float2* __restrict__ vdwCoefTable,
325 #else
326  cudaTextureObject_t vdwCoefTableTex,
327 #endif
328 #ifdef USE_TABLE_ARRAYS
329  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
330 #else
331  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
332 #endif
333  const float4* __restrict__ xyzq,
334  const int stride,
335  const float3 lata, const float3 latb, const float3 latc,
336  const float cutoff2,
337  const CudaNBConstants nbConstants,
338  double &energyVdw,
339  T* __restrict__ forceNbond, double &energyNbond,
340  T* __restrict__ forceSlow, double &energySlow,
341  T* __restrict__ forceList, int* forceListCounter,
342  int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
343 #ifdef WRITE_FULL_VIRIALS
344  ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
345  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
346 #else
347  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
348  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
349 #endif
350  double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
351  double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
352  double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
353  ) {
354 
355  CudaExclusion bl = exclusionList[index];
356 
357  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
358  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
359  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
360 
361  float4 xyzqi = xyzq[bl.i];
362  float4 xyzqj = xyzq[bl.j];
363 
364  float xij = xyzqi.x + shx - xyzqj.x;
365  float yij = xyzqi.y + shy - xyzqj.y;
366  float zij = xyzqi.z + shz - xyzqj.z;
367 
368  float r2 = xij*xij + yij*yij + zij*zij;
369  if (r2 < cutoff2) {
370 
371  float rinv = rsqrtf(r2);
372 
373  float qq;
374  if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
375 
376  int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
377 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
378  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
379 #else
380  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
381 #endif
382 
383  // Alchemical route
384  float myElecLambda = 1.0f;
385  float myElecLambda2 = 1.0f;
386  float myVdwLambda = 1.0f;
387  float myVdwLambda2 = 1.0f;
388  float myVdwShift = 0.0f;
389  float myVdwShift2 = 0.0f;
390  double alch_vdw_energy;
391  double alch_vdw_energy_2;
392  float alch_vdw_force = 0.0f;
393  float alch_vdw_dUdl = 0.0f;
394  double* p_energyVdw_TI = NULL;
395  double* p_energyNbond_TI = NULL;
396  double* p_energySlow_TI = NULL;
397  float p0Factor;
398  if (doFEP || doTI) {
399  p0Factor = 0.0f;
400  /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
401  float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
402  float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
403  float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
404  float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
405  float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
406  float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
407  /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
408  float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
409  float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
410  float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
411  float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
412  float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
413  float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
414  switch (bl.pswitch) {
415  case 0: myVdwLambda = 1.0f;
416  myVdwLambda2 = 1.0f;
417  myElecLambda = 1.0f;
418  myElecLambda2 = 1.0f;
419  p0Factor = 1.0f;
420  break;
421  case 1: myVdwLambda = vdwLambdaUp;
422  myVdwLambda2 = vdwLambda2Up;
423  myElecLambda = elecLambdaUp;
424  myElecLambda2 = elecLambda2Up;
425  myVdwShift = vdwShiftUp;
426  myVdwShift2 = vdwShift2Up;
427  p_energyVdw_TI = &(energyVdw_TI_1);
428  p_energyNbond_TI= &(energyNbond_TI_1);
429  p_energySlow_TI = &(energySlow_TI_1);
430  break;
431  case 2: myVdwLambda = vdwLambdaDown;
432  myVdwLambda2 = vdwLambda2Down;
433  myElecLambda = elecLambdaDown;
434  myElecLambda2 = elecLambda2Down;
435  myVdwShift = vdwShiftDown;
436  myVdwShift2 = vdwShift2Down;
437  p_energyVdw_TI = &(energyVdw_TI_2);
438  p_energyNbond_TI= &(energyNbond_TI_2);
439  p_energySlow_TI = &(energySlow_TI_2);
440  break;
441  default: myVdwLambda = 0.0f;
442  myVdwLambda2 = 0.0f;
443  myElecLambda = 0.0f;
444  myElecLambda2 = 0.0f;
445  break;
446  }
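 // For alchemically transformed pairs (pswitch 1 or 2) the LJ contribution is
 // computed here with the soft-core form: r2 is shifted by
 // myVdwShift = alchVdwShiftCoeff * (1 - vdwLambda) before forming the 1/r^6
 // terms, and the CHARMM-style vdW switching function is applied between
 // switchDist2 and cutoff2.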
447  if (bl.pswitch != 99 && bl.pswitch != 0) {
448  // Common part of FEP and TI
449  const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
450  const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
451  const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
452  const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
453  // A = ljab.x ; B = ljab.y
454  const float r2_1 = 1.0f / (r2 + myVdwShift);
455  const float r2_2 = 1.0f / (r2 + myVdwShift2);
456  const float r6_1 = r2_1 * r2_1 * r2_1;
457  const float r6_2 = r2_2 * r2_2 * r2_2;
458  const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
459  const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
460  // rinv is already calculated above
461  if (doEnergy) {
462  alch_vdw_energy = double(myVdwLambda * switchmul * U1);
463  if (doFEP) {
464  alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
465  }
466  }
467  alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
468  if (doTI) {
469  alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
470  }
471  } else {
472  if (doEnergy) {
473  alch_vdw_energy = 0.0;
474  alch_vdw_energy_2 = 0.0;
475  }
476  }
477  }
478 
479  float4 fi;
480  float4 ei;
481  if (doTable) {
482 #ifdef NAMD_HIP
483 #ifdef USE_TABLE_ARRAYS
484  fi = tableLookup(forceTable, rinv);
485 #else
486  fi = sampleTableTex(forceTableTex, rinv);
487 #endif
488 #else
489  fi = tex1D<float4>(forceTableTex, rinv);
490 #endif
491  }
492 
493  if (doEnergy && doTable) {
494 #ifdef NAMD_HIP
495 #ifdef USE_TABLE_ARRAYS
496  ei = tableLookup(energyTable, rinv);
497 #else
498  ei = sampleTableTex(energyTableTex, rinv);
499 #endif
500 #else
501  ei = tex1D<float4>(energyTableTex, rinv);
502 #endif
503  if (doFEP || doTI) {
504  energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
505  energyVdw += alch_vdw_energy;
506  if (doFEP) {
507  energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
508  energyVdw_F += alch_vdw_energy_2;
509  }
510  if (doTI) {
511  if (p_energyVdw_TI != NULL) {
512  (*p_energyVdw_TI) += alch_vdw_dUdl;
513  }
514  }
515  } else {
516  energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
517  }
518  if (doElect) {
519  energyNbond += qq * ei.x * myElecLambda;
520  if (doFEP) {
521  energyNbond_F += qq * ei.x * myElecLambda2;
522  }
523  if (doTI) {
524  if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
525  }
526  if (doSlow) {
527  energySlow += qq * ei.w * myElecLambda;
528  if (doFEP) {
529  energySlow_F += qq * ei.w * myElecLambda2;
530  }
531  if (doTI) {
532  if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
533  }
534  }
535  }
536  }
537 
538  float fNbond;
539  float fSlow;
540  if (doTable) {
541  if (doElect) {
542  fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
543  } else {
544  fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
545  }
546  } else {
547  float e_vdw, e_elec, e_slow;
548  if (doEnergy) {
549  e_vdw = 0.0f;
550  e_elec = 0.0f;
551  e_slow = 0.0f;
552  }
553 
554  cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
555  doSlow, doElect, r2, rinv, qq, ljab,
556  nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);
557 
558  if (doEnergy) {
559  energyVdw += (double) (e_vdw);
560  energyNbond += (double) (e_elec * myElecLambda);
561  energySlow += (double) (e_slow * myElecLambda);
562  }
563  }
564  // fNbond = fFast + fVdw
565  if (doFEP || doTI) {
566  if (bl.pswitch != 0) {
567  fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
568  fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
569  }
570  // if pswitch == 0, myElecLambda = 1.0
571  }
572  T T_fxij, T_fyij, T_fzij;
573  calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
574  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
575  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
576 
577  if (doSlow && doElect) {
578  if (doTable) {
579  fSlow = -qq * fi.w;
580  }
581  if (doFEP || doTI) {
582  fSlow *= myElecLambda;
583  }
584  T T_fxij, T_fyij, T_fzij;
585  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
586  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
587  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
588  }
589 
590  // Store virial
591  if (doVirial) {
592 #ifdef WRITE_FULL_VIRIALS
593  float fxij = fNbond*xij;
594  float fyij = fNbond*yij;
595  float fzij = fNbond*zij;
596  virialNbond.xx = (fxij*xij);
597  virialNbond.xy = (fxij*yij);
598  virialNbond.xz = (fxij*zij);
599  virialNbond.yx = (fyij*xij);
600  virialNbond.yy = (fyij*yij);
601  virialNbond.yz = (fyij*zij);
602  virialNbond.zx = (fzij*xij);
603  virialNbond.zy = (fzij*yij);
604  virialNbond.zz = (fzij*zij);
605 #endif
606  }
607 
608  // Store virial
609  if (doVirial && doSlow && doElect) {
610 #ifdef WRITE_FULL_VIRIALS
611  float fxij = fSlow*xij;
612  float fyij = fSlow*yij;
613  float fzij = fSlow*zij;
614  virialSlow.xx = (fxij*xij);
615  virialSlow.xy = (fxij*yij);
616  virialSlow.xz = (fxij*zij);
617  virialSlow.yx = (fyij*xij);
618  virialSlow.yy = (fyij*yij);
619  virialSlow.yz = (fyij*zij);
620  virialSlow.zx = (fzij*xij);
621  virialSlow.zy = (fzij*yij);
622  virialSlow.zz = (fzij*zij);
623 #endif
624  }
625 
626  }
627 }
628 
629 //
630 // Calculates exclusions. Here doSlow = true
631 //
632 // doFull = doFullElectrostatics = doSlow
633 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
634 __device__ void exclusionForce(
635  const int index,
636  const CudaExclusion* __restrict__ exclusionList,
637  const float r2_delta, const int r2_delta_expc,
638 
639 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
640  const float* __restrict__ r2_table,
641  const float4* __restrict__ exclusionTable,
642 #else
643  cudaTextureObject_t r2_table_tex,
644  cudaTextureObject_t exclusionTableTex,
645 #endif
646  const float4* __restrict__ xyzq,
647  const int stride,
648  const float3 lata, const float3 latb, const float3 latc,
649  const float cutoff2,
650  T* __restrict__ forceSlow, double &energySlow,
651  T* __restrict__ forceList, int* forceListCounter,
652  int* forceListStartsSlow, int* forceListNexts,
653 #ifdef WRITE_FULL_VIRIALS
654  ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
655 #else
656  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
657 #endif
658  double &energy_F, double &energy_TI_1, double &energy_TI_2
659  ) {
660 
661  CudaExclusion bl = exclusionList[index];
662 
663  float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
664  float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
665  float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
666 
667  float4 xyzqi = xyzq[bl.i];
668  float4 xyzqj = xyzq[bl.j];
669 
670  float xij = xyzqi.x + shx - xyzqj.x;
671  float yij = xyzqi.y + shy - xyzqj.y;
672  float zij = xyzqi.z + shz - xyzqj.z;
673 
674  float r2 = xij*xij + yij*yij + zij*zij;
675  if (r2 < cutoff2) {
676  r2 += r2_delta;
677 
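 // The table index is derived from the IEEE-754 bit pattern of r2: shifting
 // the float bits right by 17 keeps the exponent plus the top mantissa bits,
 // so table entries are effectively exponentially spaced in r2, and
 // r2_delta_expc offsets the exponent so that table_i starts at 0.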
678  union { float f; int i; } r2i;
679  r2i.f = r2;
680  int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
681 
682 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
683 #ifdef NAMD_HIP
684  float r2_table_val = r2_table[table_i];
685 #else
686  float r2_table_val = __ldg(&r2_table[table_i]);
687 #endif
688 #else
689  float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
690 #endif
691  float diffa = r2 - r2_table_val;
692  float qq = xyzqi.w * xyzqj.w;
693 
694 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
695 #ifdef NAMD_HIP
696  float4 slow = exclusionTable[table_i];
697 #else
698  float4 slow = __ldg(&exclusionTable[table_i]);
699 #endif
700 #else
701  float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
702 #endif
703  // Alchemical route
704  float myElecLambda;
705  float myElecLambda2;
706  double* p_energy_TI;
707  if (doFEP || doTI) {
708  myElecLambda = 1.0f;
709  myElecLambda2 = 1.0f;
710  if (doTI) p_energy_TI = NULL;
711  /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
712  float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
713  float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
714  /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
715  float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
716  float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
717  switch (bl.pswitch) {
718  case 0: myElecLambda = 1.0f;
719  myElecLambda2 = 1.0f;
720  break;
721  case 1: myElecLambda = elecLambdaUp;
722  myElecLambda2 = elecLambda2Up;
723  p_energy_TI = &(energy_TI_1);
724  break;
725  case 2: myElecLambda = elecLambdaDown;
726  myElecLambda2 = elecLambda2Down;
727  p_energy_TI = &(energy_TI_2);
728  break;
729  default: myElecLambda = 0.0f;
730  myElecLambda2 = 0.0f;
731  break;
732  }
733  }
734  if (doEnergy) {
735  double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
736  if (doFEP || doTI) {
737  energySlow += energy_slow_tmp * myElecLambda;
738  if (doFEP) {
739  energy_F += energy_slow_tmp * myElecLambda2;
740  }
741  if (doTI) {
742  if (p_energy_TI != NULL) {
743  (*p_energy_TI) += energy_slow_tmp;
744  }
745  }
746  } else {
747  energySlow += energy_slow_tmp;
748  }
749  }
750 
751  float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
752  if (doFEP || doTI) {
753  fSlow *= myElecLambda;
754  }
755 
756  T T_fxij, T_fyij, T_fzij;
757  calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
758  storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
759  storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
760 
761  // Store virial
762  if (doVirial) {
763 #ifdef WRITE_FULL_VIRIALS
764  float fxij = fSlow*xij;
765  float fyij = fSlow*yij;
766  float fzij = fSlow*zij;
767  virialSlow.xx = (fxij*xij);
768  virialSlow.xy = (fxij*yij);
769  virialSlow.xz = (fxij*zij);
770  virialSlow.yx = (fyij*xij);
771  virialSlow.yy = (fyij*yij);
772  virialSlow.yz = (fyij*zij);
773  virialSlow.zx = (fzij*xij);
774  virialSlow.zy = (fzij*yij);
775  virialSlow.zz = (fzij*zij);
776 #endif
777  }
778  }
779 }
780 
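//
// Calculates angles (with optional Urey-Bradley 1-3 term)
//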
781 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
782 __device__ void angleForce(const int index,
783  const CudaAngle* __restrict__ angleList,
784  const CudaAngleValue* __restrict__ angleValues,
785  const float4* __restrict__ xyzq,
786  const int stride,
787  const float3 lata, const float3 latb, const float3 latc,
788  T* __restrict__ force, double &energy,
789  T* __restrict__ forceList, int* forceListCounter,
790  int* forceListStarts, int* forceListNexts,
791 #ifdef WRITE_FULL_VIRIALS
792  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
793 #else
794  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
795 #endif
796  double &energy_F, double &energy_TI_1, double &energy_TI_2
797  ) {
798 
799  CudaAngle al = angleList[index];
800 
801  float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
802  float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
803  float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
804 
805  float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
806  float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
807  float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
808 
809  float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
810  float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
811  float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
812 
813  float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
814  float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
815  float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
816 
817  float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
818  float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
819 
820  float xijr = xij*rij_inv;
821  float yijr = yij*rij_inv;
822  float zijr = zij*rij_inv;
823  float xkjr = xkj*rkj_inv;
824  float ykjr = ykj*rkj_inv;
825  float zkjr = zkj*rkj_inv;
826  float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
827 
828  CudaAngleValue angleValue = angleValues[al.itype];
829  angleValue.k *= al.scale;
830 
831  float diff;
832  if (angleValue.normal == 1) {
833  // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
834  cos_theta = min(0.999f, max(-0.999f, cos_theta));
835  float theta = acosf(cos_theta);
836  diff = theta - angleValue.theta0;
837  } else {
838  diff = cos_theta - angleValue.theta0;
839  }
840 
841  float energy_tmp = 0.0f;
842  if (doEnergy) {
843  energy_tmp += angleValue.k * diff * diff;
844  }
845 
846  // Alchemical route
847  float alch_scale;
848  // Be careful: alch_scale_energy_fep is 0 here!
849  float alch_scale_energy_fep;
850  double* p_energy_TI;
851  // NOTE: Because of the Urey-Bradley terms, the energy calculation is not
852  // finished yet, so we cannot add energy_tmp to energy_TI_* and energy_F here,
853  // but we still need the scaling factor for the forces.
854  // The code here therefore differs slightly from bondForce:
855  // compute the FEP energy scale factor first, apply it to the final result
856  // once the energy evaluation is finished, and use a pointer to select
857  // the correct energy_TI_x term.
858  if (doFEP || doTI) {
859  alch_scale = 1.0f;
860  alch_scale_energy_fep = 0.0f;
861  if (doTI) p_energy_TI = NULL;
862  switch (al.fepBondedType) {
863  case 1: {
864  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
865  if (doEnergy) {
866  if (doTI) {
867  p_energy_TI = &(energy_TI_1);
868  }
869  if (doFEP) {
870  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
871  }
872  }
873  break;
874  }
875  case 2: {
876  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
877  if (doEnergy) {
878  if (doTI) {
879  p_energy_TI = &(energy_TI_2);
880  }
881  if (doFEP) {
882  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
883  }
884  }
885  break;
886  }
887  }
888  }
889 
890  if (angleValue.normal == 1) {
891  float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
892  if (inv_sin_theta > 1.0e6) {
893  diff = (diff < 0.0f) ? 1.0f : -1.0f;
894  } else {
895  diff *= -inv_sin_theta;
896  }
897  }
898  diff *= -2.0f*angleValue.k;
899 
900  // Do alchemical scaling
901  if (doFEP || doTI) {
902  diff *= alch_scale;
903  }
904 
905  float dtxi = rij_inv*(xkjr - cos_theta*xijr);
906  float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
907  float dtyi = rij_inv*(ykjr - cos_theta*yijr);
908  float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
909  float dtzi = rij_inv*(zkjr - cos_theta*zijr);
910  float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
911 
912  T T_dtxi, T_dtyi, T_dtzi;
913  T T_dtxj, T_dtyj, T_dtzj;
914  calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
915  calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
916  T T_dtxk = -T_dtxi - T_dtxj;
917  T T_dtyk = -T_dtyi - T_dtyj;
918  T T_dtzk = -T_dtzi - T_dtzj;
919  storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
920 
921  if (angleValue.k_ub) {
922  float xik = xij - xkj;
923  float yik = yij - ykj;
924  float zik = zij - zkj;
925  float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
926  float rik = 1.0f/rik_inv;
927  float diff_ub = rik - angleValue.r_ub;
928  if (doEnergy) {
929  energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
930  }
931  diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
932  if (doFEP || doTI) {
933  diff_ub *= alch_scale;
934  }
935  T T_dxik, T_dyik, T_dzik;
936  calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
937  T_dtxi += T_dxik;
938  T_dtyi += T_dyik;
939  T_dtzi += T_dzik;
940  T_dtxj -= T_dxik;
941  T_dtyj -= T_dyik;
942  T_dtzj -= T_dzik;
943  }
944 
945 
946 
947  if (doEnergy) {
948  if (doFEP || doTI) {
949  energy += double(energy_tmp * alch_scale);
950  if (doTI) {
951  if (p_energy_TI != NULL) {
952  *(p_energy_TI) += double(energy_tmp);
953  }
954  }
955  if (doFEP) {
956  energy_F += double(energy_tmp * alch_scale_energy_fep);
957  }
958  } else {
959  energy += double(energy_tmp);
960  }
961  }
962 
963  storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
964  storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
965 
966  // Store virial
967  if (doVirial) {
968 #ifdef WRITE_FULL_VIRIALS
969  float fxi = ((float)T_dtxi);
970  float fyi = ((float)T_dtyi);
971  float fzi = ((float)T_dtzi);
972  float fxk = ((float)T_dtxj);
973  float fyk = ((float)T_dtyj);
974  float fzk = ((float)T_dtzj);
975  virial.xx = (fxi*xij) + (fxk*xkj);
976  virial.xy = (fxi*yij) + (fxk*ykj);
977  virial.xz = (fxi*zij) + (fxk*zkj);
978  virial.yx = (fyi*xij) + (fyk*xkj);
979  virial.yy = (fyi*yij) + (fyk*ykj);
980  virial.yz = (fyi*zij) + (fyk*zkj);
981  virial.zx = (fzi*xij) + (fzk*xkj);
982  virial.zy = (fzi*yij) + (fzk*ykj);
983  virial.zz = (fzi*zij) + (fzk*zkj);
984 #endif
985  }
986 }
987 
988 //
989 // Dihedral computation
990 //
991 // Out: df, e
992 //
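// Each CudaDihedralValue entry encodes one term of a (possibly multi-term)
// dihedral: the low bit of n flags that another term for the same dihedral
// follows, n >> 1 is the periodicity, and a periodicity of 0 marks a harmonic
// term in (phi - delta) instead of a cosine term.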
993 template <bool doEnergy>
994 __forceinline__ __device__
995 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
996  const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
997 
998  df = 0.0f;
999  if (doEnergy) e = 0.0;
1000 
1001  float phi = atan2f(sin_phi, cos_phi);
1002 
1003  bool lrep = true;
1004  while (lrep) {
1005  CudaDihedralValue dihedralValue = dihedralValues[ic];
1006  dihedralValue.k *= scale;
1007 
1008  // The last term for a dihedral has the low bit of n set to 0
1009  lrep = (dihedralValue.n & 1);
1010  dihedralValue.n >>= 1;
1011  if (dihedralValue.n) {
1012  float nf = dihedralValue.n;
1013  float x = nf*phi - dihedralValue.delta;
1014  if (doEnergy) {
1015  float sin_x, cos_x;
1016  sincosf(x, &sin_x, &cos_x);
1017  e += (double)(dihedralValue.k*(1.0f + cos_x));
1018  df += (double)(nf*dihedralValue.k*sin_x);
1019  } else {
1020  float sin_x = sinf(x);
1021  df += (double)(nf*dihedralValue.k*sin_x);
1022  }
1023  } else {
1024  float diff = phi - dihedralValue.delta;
1025  if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
1026  if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
1027  if (doEnergy) {
1028  e += (double)(dihedralValue.k*diff*diff);
1029  }
1030  df -= 2.0f*dihedralValue.k*diff;
1031  }
1032  ic++;
1033  }
1034 
1035 }
1036 
1037 
1038 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1039 __device__ void diheForce(const int index,
1040  const CudaDihedral* __restrict__ diheList,
1041  const CudaDihedralValue* __restrict__ dihedralValues,
1042  const float4* __restrict__ xyzq,
1043  const int stride,
1044  const float3 lata, const float3 latb, const float3 latc,
1045  T* __restrict__ force, double &energy,
1046  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1047 #ifdef WRITE_FULL_VIRIALS
1048  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1049 #else
1050  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1051 #endif
1052  double &energy_F, double &energy_TI_1, double &energy_TI_2
1053  ) {
1054 
1055  CudaDihedral dl = diheList[index];
1056 
1057  float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
1058  float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
1059  float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
1060 
1061  float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
1062  float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
1063  float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
1064 
1065  float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
1066  float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
1067  float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
1068 
1069  float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
1070  float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
1071  float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
1072 
1073  float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
1074  float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
1075  float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
1076 
1077  float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
1078  float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
1079  float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
1080 
1081  // A=F^G, B=H^G.
1082  float ax = yij*zjk - zij*yjk;
1083  float ay = zij*xjk - xij*zjk;
1084  float az = xij*yjk - yij*xjk;
1085  float bx = ylk*zjk - zlk*yjk;
1086  float by = zlk*xjk - xlk*zjk;
1087  float bz = xlk*yjk - ylk*xjk;
1088 
1089  float ra2 = ax*ax + ay*ay + az*az;
1090  float rb2 = bx*bx + by*by + bz*bz;
1091  float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
1092 
1093  // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
1094  // nlinear = nlinear + 1
1095  // endif
1096 
1097  float rgr = 1.0f / rg;
1098  float ra2r = 1.0f / ra2;
1099  float rb2r = 1.0f / rb2;
1100  float rabr = sqrtf(ra2r*rb2r);
1101 
1102  float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
1103  //
1104  // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
1105  // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
1106  float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
1107  //
1108  // Energy and derivative contributions.
1109 
1110  float df;
1111  double e;
1112  diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
1113 
1114  // Alchemical transformation
1115  float alch_scale;
1116  if (doFEP || doTI) {
1117  alch_scale = 1.0f;
1118  switch (dl.fepBondedType) {
1119  case 1: {
1120  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1121  if (doEnergy) {
1122  if (doTI) {
1123  energy_TI_1 += (double)(e);
1124  }
1125  if (doFEP) {
1126  energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1127  }
1128  }
1129  break;
1130  }
1131  case 2: {
1132  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1133  if (doEnergy) {
1134  if (doTI) {
1135  energy_TI_2 += (double)(e);
1136  }
1137  if (doFEP) {
1138  energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1139  }
1140  }
1141  break;
1142  }
1143  }
1144  }
1145 
1146  if (doEnergy) {
1147  if (doFEP || doTI) {
1148  energy += e * alch_scale;
1149  } else {
1150  energy += e;
1151  }
1152  }
1153 
1154  //
1155  // Compute derivatives wrt Cartesian coordinates.
1156  //
1157  // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
1158  // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
1159 
1160  float fg = xij*xjk + yij*yjk + zij*zjk;
1161  float hg = xlk*xjk + ylk*yjk + zlk*zjk;
1162  ra2r *= df;
1163  rb2r *= df;
1164  float fga = fg*ra2r*rgr;
1165  float hgb = hg*rb2r*rgr;
1166  float gaa = ra2r*rg;
1167  float gbb = rb2r*rg;
1168  // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
1169 
1170  // Remember T is long long int
1171  // Don't try to scale T_fix and similar variables directly by float
1172  if (doFEP || doTI) {
1173  fga *= alch_scale;
1174  hgb *= alch_scale;
1175  gaa *= alch_scale;
1176  gbb *= alch_scale;
1177  }
1178 
1179  // Store forces
1180  T T_fix, T_fiy, T_fiz;
1181  calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
1182  storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1183 
1184  T dgx, dgy, dgz;
1185  calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
1186  T T_fjx = dgx - T_fix;
1187  T T_fjy = dgy - T_fiy;
1188  T T_fjz = dgz - T_fiz;
1189  storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1190 
1191  T dhx, dhy, dhz;
1192  calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
1193  T T_fkx = -dhx - dgx;
1194  T T_fky = -dhy - dgy;
1195  T T_fkz = -dhz - dgz;
1196  storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1197  storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1198 
1199  // Store virial
1200  if (doVirial) {
1201 #ifdef WRITE_FULL_VIRIALS
1202  float fix = ((float)T_fix);
1203  float fiy = ((float)T_fiy);
1204  float fiz = ((float)T_fiz);
1205  float fjx = ((float)dgx);
1206  float fjy = ((float)dgy);
1207  float fjz = ((float)dgz);
1208  float flx = ((float)dhx);
1209  float fly = ((float)dhy);
1210  float flz = ((float)dhz);
1211  virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
1212  virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
1213  virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
1214  virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
1215  virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
1216  virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
1217  virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
1218  virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
1219  virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
1220 #endif
1221  }
1222 
1223 }
1224 
1225 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
1226  return make_float3(
1227  v1.y*v2.z-v2.y*v1.z,
1228  v2.x*v1.z-v1.x*v2.z,
1229  v1.x*v2.y-v2.x*v1.y
1230  );
1231 }
1232 
1233 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
1234  return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
1235 }
1236 
1237 //
1238 // Calculates crossterms
1239 //
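// A crossterm couples two dihedral angles (phi from atoms 1-2-3-4 and psi from
// atoms 5-6-7-8) through a tabulated correction-map surface: cp.c holds the
// bicubic interpolation coefficients on a dim x dim grid, from which the
// energy and the dE/dphi, dE/dpsi derivatives are evaluated below.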
1240 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1241 __device__ void crosstermForce(
1242  const int index,
1243  const CudaCrossterm* __restrict__ crosstermList,
1244  const CudaCrosstermValue* __restrict__ crosstermValues,
1245  const float4* __restrict__ xyzq,
1246  const int stride,
1247  const float3 lata, const float3 latb, const float3 latc,
1248  T* __restrict__ force, double &energy,
1249  T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1250 #ifdef WRITE_FULL_VIRIALS
1251  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1252 #else
1253  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1254 #endif
1255  double &energy_F, double &energy_TI_1, double &energy_TI_2
1256  ) {
1257 
1258  CudaCrossterm cl = crosstermList[index];
1259 
1260  // ----------------------------------------------------------------------------
1261  // Angle between 1 - 2 - 3 - 4
1262  //
1263  // 1 - 2
1264  float3 sh12 = make_float3(
1265  cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
1266  cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
1267  cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
1268 
1269  float4 xyzq1 = xyzq[cl.i1];
1270  float4 xyzq2 = xyzq[cl.i2];
1271 
1272  float3 r12 = make_float3(
1273  xyzq1.x + sh12.x - xyzq2.x,
1274  xyzq1.y + sh12.y - xyzq2.y,
1275  xyzq1.z + sh12.z - xyzq2.z);
1276 
1277  // 2 - 3
1278  float3 sh23 = make_float3(
1279  cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
1280  cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
1281  cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
1282 
1283  float4 xyzq3 = xyzq[cl.i3];
1284 
1285  float3 r23 = make_float3(
1286  xyzq2.x + sh23.x - xyzq3.x,
1287  xyzq2.y + sh23.y - xyzq3.y,
1288  xyzq2.z + sh23.z - xyzq3.z);
1289 
1290  // 3 - 4
1291  float3 sh34 = make_float3(
1292  cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
1293  cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
1294  cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
1295 
1296  float4 xyzq4 = xyzq[cl.i4];
1297 
1298  float3 r34 = make_float3(
1299  xyzq3.x + sh34.x - xyzq4.x,
1300  xyzq3.y + sh34.y - xyzq4.y,
1301  xyzq3.z + sh34.z - xyzq4.z);
1302 
1303  // Calculate the cross products
1304  float3 A = cross(r12, r23);
1305  float3 B = cross(r23, r34);
1306  float3 C = cross(r23, A);
1307 
1308  // Calculate the inverse distances
1309  float inv_rA = rsqrtf(dot(A, A));
1310  float inv_rB = rsqrtf(dot(B, B));
1311  float inv_rC = rsqrtf(dot(C, C));
1312 
1313  // Calculate the sin and cos
1314  float cos_phi = dot(A, B)*(inv_rA*inv_rB);
1315  float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1316 
1317  float phi = -atan2f(sin_phi,cos_phi);
1318 
1319  // ----------------------------------------------------------------------------
1320  // Angle between 5 - 6 - 7 - 8
1321  //
1322 
1323  // 5 - 6
1324  float3 sh56 = make_float3(
1325  cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1326  cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1327  cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1328 
1329  float4 xyzq5 = xyzq[cl.i5];
1330  float4 xyzq6 = xyzq[cl.i6];
1331 
1332  float3 r56 = make_float3(
1333  xyzq5.x + sh56.x - xyzq6.x,
1334  xyzq5.y + sh56.y - xyzq6.y,
1335  xyzq5.z + sh56.z - xyzq6.z);
1336 
1337  // 6 - 7
1338  float3 sh67 = make_float3(
1339  cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1340  cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1341  cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1342 
1343  float4 xyzq7 = xyzq[cl.i7];
1344 
1345  float3 r67 = make_float3(
1346  xyzq6.x + sh67.x - xyzq7.x,
1347  xyzq6.y + sh67.y - xyzq7.y,
1348  xyzq6.z + sh67.z - xyzq7.z);
1349 
1350  // 7 - 8
1351  float3 sh78 = make_float3(
1352  cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1353  cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1354  cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1355 
1356  float4 xyzq8 = xyzq[cl.i8];
1357 
1358  float3 r78 = make_float3(
1359  xyzq7.x + sh78.x - xyzq8.x,
1360  xyzq7.y + sh78.y - xyzq8.y,
1361  xyzq7.z + sh78.z - xyzq8.z);
1362 
1363  // Calculate the cross products
1364  float3 D = cross(r56, r67);
1365  float3 E = cross(r67, r78);
1366  float3 F = cross(r67, D);
1367 
1368  // Calculate the inverse distances
1369  float inv_rD = rsqrtf(dot(D, D));
1370  float inv_rE = rsqrtf(dot(E, E));
1371  float inv_rF = rsqrtf(dot(F, F));
1372 
1373  // Calculate the sin and cos
1374  float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1375  float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1376 
1377  float psi = -atan2f(sin_psi,cos_psi);
1378 
1379  // ----------------------------------------------------------------------------
1380  // Calculate the energy
1381 
1382  const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1383 
1384  // Shift angles
1385  phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1386  psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1387 
1388  // distance measured in grid points between angle and smallest value
1389  float phi_h = phi * inv_h;
1390  float psi_h = psi * inv_h;
1391 
1392  // find smallest numbered grid point in stencil
1393  int iphi = (int)floor(phi_h);
1394  int ipsi = (int)floor(psi_h);
1395 
1396  const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1397 
1398  // Load coefficients
1399  float4 c[4];
1400  c[0] = cp.c[iphi][ipsi][0];
1401  c[1] = cp.c[iphi][ipsi][1];
1402  c[2] = cp.c[iphi][ipsi][2];
1403  c[3] = cp.c[iphi][ipsi][3];
1404 
1405  float dphi = phi_h - (float)iphi;
1406  float dpsi = psi_h - (float)ipsi;
1407 
1408  float alch_scale;
1409  float alch_scale_energy_fep;
1410  double* p_energy_TI;
1411  if (doFEP || doTI) {
1412  alch_scale = 1.0f;
1413  alch_scale_energy_fep = 0.0f;
1414  if (doTI) p_energy_TI = NULL;
1415  switch (cl.fepBondedType) {
1416  case 1: {
1417  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1418  if (doEnergy) {
1419  if (doTI) {
1420  p_energy_TI = &(energy_TI_1);
1421  }
1422  if (doFEP) {
1423  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
1424  }
1425  }
1426  break;
1427  }
1428  case 2: {
1429  alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1430  if (doEnergy) {
1431  if (doTI) {
1432  p_energy_TI = &(energy_TI_2);
1433  }
1434  if (doFEP) {
1435  alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
1436  }
1437  }
1438  break;
1439  }
1440  }
1441  }
1442 
1443  if (doEnergy) {
1444  float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1445  energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1446  energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1447  energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1448  if (doFEP || doTI) {
1449  energy += energyf * cl.scale * alch_scale;
1450  if (doFEP) {
1451  energy_F += energyf * cl.scale * alch_scale_energy_fep;
1452  }
1453  if (doTI) {
1454  if (p_energy_TI != NULL) {
1455  (*p_energy_TI) += energyf * cl.scale;
1456  }
1457  }
1458  } else {
1459  energy += energyf * cl.scale;
1460  }
1461  }
1462 
1463  float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1464  dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1465  dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1466  dEdphi *= cl.scale*inv_h;
1467 
1468  float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1469  dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1470  dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1471  dEdpsi *= cl.scale*inv_h;
1472 
1473  // float normCross1 = dot(A, A);
1474  float square_r23 = dot(r23, r23);
1475  float norm_r23 = sqrtf(square_r23);
1476  float inv_square_r23 = 1.0f/square_r23;
1477  float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1478  float ff2 = -dot(r12, r23)*inv_square_r23;
1479  float ff3 = -dot(r34, r23)*inv_square_r23;
1480  float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1481 
1482  // NOTE: Why scaling only ff1 and ff4 is enough:
1483  // f1 and f4 are already scaled, and in the formula for t1 the first
1484  // component is t1.x = ff2*f1.x - ff3*f4.x,
1485  // where both f1.x and f4.x are already scaled,
1486  // so t1.x is scaled, and likewise t1.y and t1.z => t1 is scaled.
1487  // Then consider f2.x:
1488  // f2.x = t1.x - f1.x,
1489  // where both t1.x and f1.x are scaled => f2.x is scaled,
1490  // and also f2.y and f2.z are scaled => f2 is scaled,
1491  // and similarly f3 is scaled.
1492  // As a result, scaling ff1 and ff4 is enough.
1493  // DON'T scale ff2 and ff3!
1494  if (doFEP || doTI) {
1495  ff1 *= alch_scale;
1496  ff4 *= alch_scale;
1497  }
1498 
1499  float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1500  float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1501  float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1502  float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1503  float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1504 
1505  T T_f1x, T_f1y, T_f1z;
1506  T T_f2x, T_f2y, T_f2z;
1507  T T_f3x, T_f3y, T_f3z;
1508  T T_f4x, T_f4y, T_f4z;
1509  convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1510  convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1511  convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1512  convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1513  storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1514  storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1515  storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1516  storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1517 
1518  float square_r67 = dot(r67, r67);
1519  float norm_r67 = sqrtf(square_r67);
1520  float inv_square_r67 = 1.0f/(square_r67);
1521  ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1522  ff2 = -dot(r56, r67)*inv_square_r67;
1523  ff3 = -dot(r78, r67)*inv_square_r67;
1524  ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1525 
1526  if (doFEP || doTI) {
1527  ff1 *= alch_scale;
1528  ff4 *= alch_scale;
1529  }
1530 
1531  float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1532  float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1533  float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1534  float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1535  float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1536 
1537  T T_f5x, T_f5y, T_f5z;
1538  T T_f6x, T_f6y, T_f6z;
1539  T T_f7x, T_f7y, T_f7z;
1540  T T_f8x, T_f8y, T_f8z;
1541  convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1542  convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1543  convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1544  convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1545  storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1546  storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1547  storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1548  storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1549 
1550  // Store virial
1551  if (doVirial) {
1552 #ifdef WRITE_FULL_VIRIALS
1553  float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1554  float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1555  virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1556  virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1557  virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1558  virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1559  virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1560  virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1561  virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1562  virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1563  virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1564 #endif
1565  }
1566 
1567 }
1568 
1569 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1570 __device__ void tholeForce(
1571  const int index,
1572  const CudaThole* __restrict__ tholeList,
1573  const float4* __restrict__ xyzq,
1574  const int stride,
1575  const float3 lata, const float3 latb, const float3 latc,
1576  T* __restrict__ force, double &energy,
1577  T* __restrict__ forceList, int* forceListCounter,
1578  int* forceListStarts, int* forceListNexts,
1579 #ifdef WRITE_FULL_VIRIALS
1580  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1581 #else
1582  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1583 #endif
1584  double &energy_F, double &energy_TI_1, double &energy_TI_2
1585 ) {
1586  // For reference, similar calculation can be found in OpenMM's code repo:
1587  // https://github.com/openmm/openmm/blob/8f123ef287693bb4b553163a9d5fd101ea9e4462/plugins/drude/platforms/common/src/kernels/drudePairForce.cc
1588  // CHARMM source code: charmm/source/gener/drude.F90, FUNCTION E_THOLE
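 // Each of the four charge pairs (aa, ad, da, dd) contributes a screened
 // Coulomb term qq/r * (1 - (1 + u/2) * exp(-u)) with u = tl.aa * r (Thole
 // screening); the aa and dd pairs enter with a positive sign and the mixed
 // pairs with a negative sign.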
1589  const CudaThole tl = tholeList[index];
1590 
1591  const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1592  const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1593  const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1594 
1595  const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1596  const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1597  const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1598 
1599  const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1600  const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1601  const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1602 
1603  const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1604  const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1605  const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1606 
1607  const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1608  const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1609  const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1610 
1611  const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1612  const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1613  const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1614 
1615  const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1616  const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1617  const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1618 
1619  const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1620  const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1621  const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1622 
1623  const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1624  const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1625  const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1626  const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1627 
1628  const float auaa = tl.aa / raa_invlen;
1629  const float auad = tl.aa / rad_invlen;
1630  const float auda = tl.aa / rda_invlen;
1631  const float audd = tl.aa / rdd_invlen;
1632 
1633  // exp(-ar)
1634  const float expauaa = expf(-auaa);
1635  const float expauad = expf(-auad);
1636  const float expauda = expf(-auda);
1637  const float expaudd = expf(-audd);
1638 
1639  // (1 + ar/2)
1640  float polyauaa = 1.0f + 0.5f*auaa;
1641  float polyauad = 1.0f + 0.5f*auad;
1642  float polyauda = 1.0f + 0.5f*auda;
1643  float polyaudd = 1.0f + 0.5f*audd;
1644 
1645  if (doEnergy) {
1646  float ethole = 0;
1647  ethole += tl.qq * raa_invlen * (1.0f - polyauaa * expauaa);
1648  ethole += -tl.qq * rad_invlen * (1.0f - polyauad * expauad);
1649  ethole += -tl.qq * rda_invlen * (1.0f - polyauda * expauda);
1650  ethole += tl.qq * rdd_invlen * (1.0f - polyaudd * expaudd);
1651  if (doFEP || doTI) {
1652  switch (tl.fepBondedType) {
1653  case 1: {
1654  // 0 < typeSum && typeSum < order
1655  if (doTI) energy_TI_1 += (double)ethole;
1656  if (doFEP) {
1657  // elecLambda2Up = simParams->getElecLambda(alchLambda2)
1658  energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Up * ethole;
1659  // elecLambdaUp = simParams->getElecLambda(alchLambda)
1660  ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1661  }
1662  break;
1663  }
1664  case 2: {
1665  if (doTI) energy_TI_2 += (double)ethole;
1666  if (doFEP) {
1667  // elecLambda2Down = simParams->getElecLambda(1-alchLambda2)
1668  energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Down * ethole;
1669  // elecLambdaDown = simParams->getElecLambda(1-alchLambda)
1670  ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1671  }
1672  break;
1673  }
1674  }
1675  }
1676  energy += (double)ethole;
1677  }
1678 
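  // Promote (1 + a*r/2) to the force polynomial 1 + a*r + (a*r)^2/2 with one Horner step;
  // this is the polynomial that appears in the derivative of the screened Coulomb term.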
1679  polyauaa = 1.0f + auaa*polyauaa;
1680  polyauad = 1.0f + auad*polyauad;
1681  polyauda = 1.0f + auda*polyauda;
1682  polyaudd = 1.0f + audd*polyaudd;
1683 
1684  const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
1685  const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
1686  const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
1687  const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
1688 
1689  // Radial force factors: df = -(1/r)(dU/dr), applied along each pair vector below
1690  float dfaa = -tl.qq * raa_invlen3 * (polyauaa*expauaa - 1);
1691  float dfad = tl.qq * rad_invlen3 * (polyauad*expauad - 1);
1692  float dfda = tl.qq * rda_invlen3 * (polyauda*expauda - 1);
1693  float dfdd = -tl.qq * rdd_invlen3 * (polyaudd*expaudd - 1);
1694 
1695  if (doFEP || doTI) {
1696  switch (tl.fepBondedType) {
1697  case 1: {
1698  dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1699  dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1700  dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1701  dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1702  break;
1703  }
1704  case 2: {
1705  dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1706  dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1707  dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1708  dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1709  break;
1710  }
1711  }
1712  }
1713 
1714  T T_fx0, T_fy0, T_fz0;
1715  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
1716  storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1717 
1718  T T_fx1, T_fy1, T_fz1;
1719  calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
1720  storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1721 
1722  T T_fx2, T_fy2, T_fz2;
1723  calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
1724  storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1725 
1726  T T_fx3, T_fy3, T_fz3;
1727  calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
1728  storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1729 
1730  if (doVirial) {
1731 #ifdef WRITE_FULL_VIRIALS
1732  const float faa_x = dfaa * raa_x;
1733  const float faa_y = dfaa * raa_y;
1734  const float faa_z = dfaa * raa_z;
1735  const float fad_x = dfad * rad_x;
1736  const float fad_y = dfad * rad_y;
1737  const float fad_z = dfad * rad_z;
1738  const float fda_x = dfda * rda_x;
1739  const float fda_y = dfda * rda_y;
1740  const float fda_z = dfda * rda_z;
1741  const float fdd_x = dfdd * rdd_x;
1742  const float fdd_y = dfdd * rdd_y;
1743  const float fdd_z = dfdd * rdd_z;
1744  virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
1745  virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
1746  virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
1747  virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
1748  virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
1749  virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
1750  virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
1751  virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
1752  virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
1753 #endif
1754  }
1755 }
1756 
1757 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1758 __device__ void anisoForce(
1759  const int index,
1760  const CudaAniso* __restrict__ anisoList,
1761  const float4* __restrict__ xyzq,
1762  const int stride,
1763  const float3 lata, const float3 latb, const float3 latc,
1764  T* __restrict__ force, double &energy,
1765  T* __restrict__ forceList, int* forceListCounter,
1766  int* forceListStarts, int* forceListNexts,
1767 #ifdef WRITE_FULL_VIRIALS
1768  ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1769 #else
1770  ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1771 #endif
1772  double &energy_F, double &energy_TI_1, double &energy_TI_2
1773 ) {
1774  const CudaAniso al = anisoList[index];
1775 
1776  const float shx_il = al.offset_il.x*lata.x + al.offset_il.y*latb.x + al.offset_il.z*latc.x;
1777  const float shy_il = al.offset_il.x*lata.y + al.offset_il.y*latb.y + al.offset_il.z*latc.y;
1778  const float shz_il = al.offset_il.x*lata.z + al.offset_il.y*latb.z + al.offset_il.z*latc.z;
1779 
1780  const float shx_mn = al.offset_mn.x*lata.x + al.offset_mn.y*latb.x + al.offset_mn.z*latc.x;
1781  const float shy_mn = al.offset_mn.x*lata.y + al.offset_mn.y*latb.y + al.offset_mn.z*latc.y;
1782  const float shz_mn = al.offset_mn.x*lata.z + al.offset_mn.y*latb.z + al.offset_mn.z*latc.z;
1783 
1784  // calculate parallel and perpendicular displacement vectors
1785  // shortest vector image: ri - rl
1786  const float ril_x = xyzq[al.i].x + shx_il - xyzq[al.l].x;
1787  const float ril_y = xyzq[al.i].y + shy_il - xyzq[al.l].y;
1788  const float ril_z = xyzq[al.i].z + shz_il - xyzq[al.l].z;
1789 
1790  // shortest vector image: rm - rn
1791  const float rmn_x = xyzq[al.m].x + shx_mn - xyzq[al.n].x;
1792  const float rmn_y = xyzq[al.m].y + shy_mn - xyzq[al.n].y;
1793  const float rmn_z = xyzq[al.m].z + shz_mn - xyzq[al.n].z;
1794 
1795  // need recip lengths of r_il, r_mn
1796  const float ril_invlen = rnorm3df(ril_x, ril_y, ril_z);
1797  const float rmn_invlen = rnorm3df(rmn_x, rmn_y, rmn_z);
1798 
1799  // normalize r_il, r_mn
1800  const float u1_x = ril_x * ril_invlen;
1801  const float u1_y = ril_y * ril_invlen;
1802  const float u1_z = ril_z * ril_invlen;
1803  const float u2_x = rmn_x * rmn_invlen;
1804  const float u2_y = rmn_y * rmn_invlen;
1805  const float u2_z = rmn_z * rmn_invlen;
1806 
1807  // Drude displacement vector (ri, rj are in same patch)
1808  const float dr_x = xyzq[al.j].x - xyzq[al.i].x;
1809  const float dr_y = xyzq[al.j].y - xyzq[al.i].y;
1810  const float dr_z = xyzq[al.j].z - xyzq[al.i].z;
1811 
1812  // parallel displacement
1813  const float dpar = dr_x * u1_x + dr_y * u1_y + dr_z * u1_z;
1814 
1815  // perpendicular displacement
1816  const float dperp = dr_x * u2_x + dr_y * u2_y + dr_z * u2_z;
1817 
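  // The anisotropic spring energy evaluated below is
  //   U = 0.5*kiso*|dr|^2 + 0.5*kpar*dpar^2 + 0.5*kperp*dperp^2,
  // where dpar and dperp project the Drude displacement dr onto u1 and u2; the force
  // expressions that follow are its gradients with respect to atoms j, l and m.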
1818  // force on atom j
1819  float fj_x = -al.kiso0 * dr_x;
1820  float fj_y = -al.kiso0 * dr_y;
1821  float fj_z = -al.kiso0 * dr_z;
1822  fj_x -= al.kpar0 * dpar * u1_x;
1823  fj_y -= al.kpar0 * dpar * u1_y;
1824  fj_z -= al.kpar0 * dpar * u1_z;
1825  fj_x -= al.kperp0 * dperp * u2_x;
1826  fj_y -= al.kperp0 * dperp * u2_y;
1827  fj_z -= al.kperp0 * dperp * u2_z;
1828 
1829  // force on atom l
1830  float fl_x = al.kpar0 * dpar * ril_invlen * dr_x;
1831  float fl_y = al.kpar0 * dpar * ril_invlen * dr_y;
1832  float fl_z = al.kpar0 * dpar * ril_invlen * dr_z;
1833  fl_x -= al.kpar0 * dpar * dpar * ril_invlen * u1_x;
1834  fl_y -= al.kpar0 * dpar * dpar * ril_invlen * u1_y;
1835  fl_z -= al.kpar0 * dpar * dpar * ril_invlen * u1_z;
1836 
1837  // force on atom m
1838  float fm_x = al.kperp0 * dperp * dperp * rmn_invlen * u2_x;
1839  float fm_y = al.kperp0 * dperp * dperp * rmn_invlen * u2_y;
1840  float fm_z = al.kperp0 * dperp * dperp * rmn_invlen * u2_z;
1841  fm_x -= al.kperp0 * dperp * rmn_invlen * dr_x;
1842  fm_y -= al.kperp0 * dperp * rmn_invlen * dr_y;
1843  fm_z -= al.kperp0 * dperp * rmn_invlen * dr_z;
1844 
1845  if (doEnergy) {
1846  // aniso spring energy
1847  // kpar reduces response along carbonyl vector
1848  // kperp reduces response perp to bond vector
1849  // (regulates in- and out-of-plane response)
1850  const float dr2 = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
1851  float eaniso = 0.5f * al.kpar0 * dpar * dpar + 0.5f * al.kperp0 * dperp * dperp + 0.5f * al.kiso0 * dr2;
1852  if (doFEP || doTI) {
1853  switch (al.fepBondedType) {
1854  case 1: {
1855  if (doTI) energy_TI_1 += (double)eaniso;
1856  if (doFEP) {
1857  energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1858  eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1859  fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1860  fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1861  fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1862  fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1863  fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1864  fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1865  fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1866  fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1867  fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1868  }
1869  break;
1870  }
1871  case 2: {
1872  if (doTI) energy_TI_2 += (double)eaniso;
1873  if (doFEP) {
1874  energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1875  eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1876  fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1877  fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1878  fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1879  fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1880  fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1881  fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1882  fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1883  fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1884  fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1885  }
1886  break;
1887  }
1888  }
1889  }
1890  energy += (double)eaniso;
1891  }
1892 
1893  // accumulate forces
1894  T T_fx0, T_fy0, T_fz0;
1895  convertForces<T>(-(fj_x + fl_x), -(fj_y + fl_y), -(fj_z + fl_z), T_fx0, T_fy0, T_fz0);
1896  storeForces<T>(T_fx0, T_fy0, T_fz0, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1897 
1898  T T_fx1, T_fy1, T_fz1;
1899  convertForces<T>(fj_x, fj_y, fj_z, T_fx1, T_fy1, T_fz1);
1900  storeForces<T>(T_fx1, T_fy1, T_fz1, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1901 
1902  T T_fx2, T_fy2, T_fz2;
1903  convertForces<T>(fl_x, fl_y, fl_z, T_fx2, T_fy2, T_fz2);
1904  storeForces<T>(T_fx2, T_fy2, T_fz2, al.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1905 
1906  T T_fx3, T_fy3, T_fz3;
1907  convertForces<T>(fm_x, fm_y, fm_z, T_fx3, T_fy3, T_fz3);
1908  storeForces<T>(T_fx3, T_fy3, T_fz3, al.m, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1909 
1910  T T_fx4, T_fy4, T_fz4;
1911  convertForces<T>(-fm_x, -fm_y, -fm_z, T_fx4, T_fy4, T_fz4);
1912  storeForces<T>(T_fx4, T_fy4, T_fz4, al.n, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1913 
1914  if (doVirial) {
1915 #ifdef WRITE_FULL_VIRIALS
1916  virial.xx = fj_x * dr_x - fl_x * ril_x + fm_x * rmn_x;
1917  virial.xy = fj_x * dr_y - fl_x * ril_y + fm_x * rmn_y;
1918  virial.xz = fj_x * dr_z - fl_x * ril_z + fm_x * rmn_z;
1919  virial.yx = fj_y * dr_x - fl_y * ril_x + fm_y * rmn_x;
1920  virial.yy = fj_y * dr_y - fl_y * ril_y + fm_y * rmn_y;
1921  virial.yz = fj_y * dr_z - fl_y * ril_z + fm_y * rmn_z;
1922  virial.zx = fj_z * dr_x - fl_z * ril_x + fm_z * rmn_x;
1923  virial.zy = fj_z * dr_y - fl_z * ril_y + fm_z * rmn_y;
1924  virial.zz = fj_z * dr_z - fl_z * ril_z + fm_z * rmn_z;
1925 #endif
1926  }
1927 }
1928 
1929 #ifndef NAMD_CUDA
1930 #define BONDEDFORCESKERNEL_NUM_WARP 2
1931 #else
1932 #define BONDEDFORCESKERNEL_NUM_WARP 4
1933 #endif
1934 //
1935 // Calculates all forces in a single kernel call
1936 //
1937 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1938 __global__ void bondedForcesKernel(
1939  const int start,
1940 
1941  const int numBonds,
1942  const CudaBond* __restrict__ bonds,
1943  const CudaBondValue* __restrict__ bondValues,
1944 
1945  const int numAngles,
1946  const CudaAngle* __restrict__ angles,
1947  const CudaAngleValue* __restrict__ angleValues,
1948 
1949  const int numDihedrals,
1950  const CudaDihedral* __restrict__ dihedrals,
1951  const CudaDihedralValue* __restrict__ dihedralValues,
1952 
1953  const int numImpropers,
1954  const CudaDihedral* __restrict__ impropers,
1955  const CudaDihedralValue* __restrict__ improperValues,
1956 
1957  const int numExclusions,
1958  const CudaExclusion* __restrict__ exclusions,
1959 
1960  const int numCrossterms,
1961  const CudaCrossterm* __restrict__ crossterms,
1962  const CudaCrosstermValue* __restrict__ crosstermValues,
1963 
1964  const int numTholes,
1965  const CudaThole* __restrict__ tholes,
1966 
1967  const int numAnisos,
1968  const CudaAniso* __restrict__ anisos,
1969 
1970  const float cutoff2,
1971  const float r2_delta, const int r2_delta_expc,
1972 
1973  const float* __restrict__ r2_table,
1974  const float4* __restrict__ exclusionTable,
1975 
1976 #if !defined(USE_TABLE_ARRAYS)
1977  cudaTextureObject_t r2_table_tex,
1978  cudaTextureObject_t exclusionTableTex,
1979 #endif
1980 
1981  const float4* __restrict__ xyzq,
1982  const int stride,
1983  const float3 lata, const float3 latb, const float3 latc,
1984  T* __restrict__ force,
1985  T* __restrict__ forceSlow,
1986  T* __restrict__ forceList,
1987  int* forceListCounter,
1988  int* forceListStarts,
1989  int* forceListStartsSlow,
1990  int* forceListNexts,
1991  double* __restrict__ energies_virials) {
1992 
1993  // Thread-block index
1994  int indexTB = start + blockIdx.x;
1995 
1996  const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1997  const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1998  const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1999  const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
2000  const int numExclusionsTB = (numExclusions + blockDim.x - 1)/blockDim.x;
2001  const int numCrosstermsTB = (numCrossterms + blockDim.x - 1)/blockDim.x;
2002  const int numTholesTB = (numTholes + blockDim.x - 1)/blockDim.x;
2003  const int numAnisosTB = (numAnisos + blockDim.x - 1)/blockDim.x;
2004 
2005  // Each thread computes a single bonded interaction.
2006  // Each thread block handles a single bonded tuple type.
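  // Thread blocks are assigned to tuple types in a fixed order (bonds, angles, dihedrals,
  // impropers, exclusions, crossterms, tholes, anisos); indexTB is decremented by the
  // block count of every type that has already been passed.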
2007  double energy;
2008  double energy_F;
2009  double energy_TI_1;
2010  double energy_TI_2;
2011  int energyIndex;
2012  int energyIndex_F;
2013  int energyIndex_TI_1;
2014  int energyIndex_TI_2;
2015 
2016  if (doEnergy) {
2017  energy = 0.0;
2018  energyIndex = -1;
2019  if (doFEP) {
2020  energy_F = 0.0;
2021  energyIndex_F = -1;
2022  }
2023  if (doTI) {
2024  energy_TI_1 = 0.0;
2025  energy_TI_2 = 0.0;
2026  energyIndex_TI_1 = -1;
2027  energyIndex_TI_2 = -1;
2028  }
2029  }
2030 
2031 #ifdef WRITE_FULL_VIRIALS
2032  ComputeBondedCUDAKernel::BondedVirial<double> virial;
2033  int virialIndex;
2034  if (doVirial) {
2035  virial.xx = 0.0;
2036  virial.xy = 0.0;
2037  virial.xz = 0.0;
2038  virial.yx = 0.0;
2039  virial.yy = 0.0;
2040  virial.yz = 0.0;
2041  virial.zx = 0.0;
2042  virial.zy = 0.0;
2043  virial.zz = 0.0;
2044  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2045  }
2046 #endif
2047 
2048  if (indexTB < numBondsTB) {
2049  int i = threadIdx.x + indexTB*blockDim.x;
2050  if (doEnergy) {
2051  energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
2052  if (doFEP) {
2053  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
2054  }
2055  if (doTI) {
2056  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
2057  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
2058  }
2059  }
2060  if (i < numBonds) {
2061  bondForce<T, doEnergy, doVirial, doFEP, doTI>
2062  (i, bonds, bondValues, xyzq,
2063  stride, lata, latb, latc,
2064  force, energy,
2065  forceList, forceListCounter, forceListStarts, forceListNexts,
2066  virial,
2067  energy_F, energy_TI_1, energy_TI_2);
2068  }
2069  goto reduce;
2070  }
2071  indexTB -= numBondsTB;
2072 
2073  if (indexTB < numAnglesTB) {
2074  int i = threadIdx.x + indexTB*blockDim.x;
2075  if (doEnergy) {
2076  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
2077  if (doFEP) {
2078  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
2079  }
2080  if (doTI) {
2081  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
2082  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
2083  }
2084  }
2085  if (i < numAngles) {
2086  angleForce<T, doEnergy, doVirial, doFEP, doTI>
2087  (i, angles, angleValues, xyzq, stride,
2088  lata, latb, latc,
2089  force, energy,
2090  forceList, forceListCounter, forceListStarts, forceListNexts,
2091  virial,
2092  energy_F, energy_TI_1, energy_TI_2);
2093  }
2094  goto reduce;
2095  }
2096  indexTB -= numAnglesTB;
2097 
2098  if (indexTB < numDihedralsTB) {
2099  int i = threadIdx.x + indexTB*blockDim.x;
2100  if (doEnergy) {
2101  energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
2102  if (doFEP) {
2103  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
2104  }
2105  if (doTI) {
2106  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
2107  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
2108  }
2109  }
2110  if (doVirial) {
2111  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2112  }
2113  if (i < numDihedrals) {
2114  diheForce<T, doEnergy, doVirial, doFEP, doTI>
2115  (i, dihedrals, dihedralValues, xyzq, stride,
2116  lata, latb, latc,
2117  force, energy,
2118  forceList, forceListCounter, forceListStarts, forceListNexts,
2119  virial,
2120  energy_F, energy_TI_1, energy_TI_2);
2121  }
2122  goto reduce;
2123  }
2124  indexTB -= numDihedralsTB;
2125 
2126  if (indexTB < numImpropersTB) {
2127  int i = threadIdx.x + indexTB*blockDim.x;
2128  if (doEnergy) {
2129  energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
2130  if (doFEP) {
2131  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
2132  }
2133  if (doTI) {
2134  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
2135  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
2136  }
2137  }
2138  if (i < numImpropers) {
2139  diheForce<T, doEnergy, doVirial, doFEP, doTI>
2140  (i, impropers, improperValues, xyzq, stride,
2141  lata, latb, latc,
2142  force, energy,
2143  forceList, forceListCounter, forceListStarts, forceListNexts,
2144  virial,
2145  energy_F, energy_TI_1, energy_TI_2);
2146  }
2147  goto reduce;
2148  }
2149  indexTB -= numImpropersTB;
2150 
2151  if (indexTB < numExclusionsTB) {
2152  int i = threadIdx.x + indexTB*blockDim.x;
2153  if (doEnergy) {
2154  energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
2155  if (doFEP) {
2156  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
2157  }
2158  if (doTI) {
2159  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
2160  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
2161  }
2162  }
2163  if (doVirial) {
2164  virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
2165  }
2166  if (i < numExclusions) {
2167  exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
2168  (i, exclusions, r2_delta, r2_delta_expc,
2169 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
2170  r2_table, exclusionTable,
2171 #else
2172  r2_table_tex, exclusionTableTex,
2173 #endif
2174  xyzq, stride, lata, latb, latc, cutoff2,
2175  forceSlow, energy,
2176  forceList, forceListCounter, forceListStartsSlow, forceListNexts,
2177  virial,
2178  energy_F, energy_TI_1, energy_TI_2);
2179  }
2180  goto reduce;
2181  }
2182  indexTB -= numExclusionsTB;
2183 
2184  if (indexTB < numCrosstermsTB) {
2185  int i = threadIdx.x + indexTB*blockDim.x;
2186  if (doEnergy) {
2187  energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
2188  if (doFEP) {
2189  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
2190  }
2191  if (doTI) {
2192  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
2193  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
2194  }
2195  }
2196  if (doVirial) {
2197  virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2198  }
2199  if (i < numCrossterms) {
2200  crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
2201  (i, crossterms, crosstermValues,
2202  xyzq, stride, lata, latb, latc,
2203  force, energy,
2204  forceList, forceListCounter, forceListStarts, forceListNexts,
2205  virial,
2206  energy_F, energy_TI_1, energy_TI_2);
2207  }
2208  goto reduce;
2209  }
2210  indexTB -= numCrosstermsTB;
2211 
2212  if (indexTB < numTholesTB) {
2213  int i = threadIdx.x + indexTB*blockDim.x;
2214  if (doEnergy) {
2215  energyIndex = ComputeBondedCUDAKernel::energyIndex_THOLE;
2216  if (doFEP) {
2217  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_THOLE_F;
2218  }
2219  if (doTI) {
2220  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_1;
2221  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_2;
2222  }
2223  }
2224  if (doVirial) {
2225  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2226  }
2227  if (i < numTholes) {
2228  tholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2229  i, tholes, xyzq, stride, lata, latb, latc,
2230  force, energy, forceList, forceListCounter,
2231  forceListStarts, forceListNexts,
2232  virial, energy_F, energy_TI_1, energy_TI_2);
2233  }
2234  goto reduce;
2235  }
2236  indexTB -= numTholesTB;
2237 
2238  if (indexTB < numAnisosTB) {
2239  int i = threadIdx.x + indexTB*blockDim.x;
2240  if (doEnergy) {
2241  energyIndex = ComputeBondedCUDAKernel::energyIndex_ANISO;
2242  if (doFEP) {
2243  energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANISO_F;
2244  }
2245  if (doTI) {
2246  energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_1;
2247  energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_2;
2248  }
2249  }
2250  if (doVirial) {
2251  virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2252  }
2253  if (i < numAnisos) {
2254  anisoForce<T, doEnergy, doVirial, doFEP, doTI>(
2255  i, anisos, xyzq, stride, lata, latb, latc,
2256  force, energy, forceList, forceListCounter,
2257  forceListStarts, forceListNexts,
2258  virial, energy_F, energy_TI_1, energy_TI_2);
2259  }
2260  goto reduce;
2261  }
2262  // index -= numAnisosTB
2263 
2264  reduce:
2265 
2266  // Write energies to global memory
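  // Two-stage reduction: a butterfly shuffle reduces within each warp, lane 0 of every
  // warp parks its partial sum in shared memory, warp 0 reduces those partials, and a
  // single atomicAdd per block accumulates into the block's ATOMIC_BINS slot.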
2267  if (doEnergy) {
2268  // energyIndex is constant within thread-block
2269  __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
2270  __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2271  __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2272  __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
2273 #pragma unroll
2274  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2275  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2276 
2277  if (doFEP) {
2278  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2279  }
2280  if (doTI) {
2281  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2282  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2283  }
2284  }
2285  int laneID = (threadIdx.x & (WARPSIZE - 1));
2286  int warpID = threadIdx.x / WARPSIZE;
2287  if (laneID == 0) {
2288  shEnergy[warpID] = energy;
2289  if (doFEP) {
2290  shEnergy_F[warpID] = energy_F;
2291  }
2292  if (doTI) {
2293  shEnergy_TI_1[warpID] = energy_TI_1;
2294  shEnergy_TI_2[warpID] = energy_TI_2;
2295  }
2296  }
2297  BLOCK_SYNC;
2298  if (warpID == 0) {
2299  energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
2300  if (doFEP) {
2301  energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
2302  }
2303  if (doTI) {
2304  energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
2305  energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
2306  }
2307 #pragma unroll
2308  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2309  energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2310 
2311  if (doFEP) {
2312  energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2313  }
2314  if (doTI) {
2315  energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2316  energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2317  }
2318  }
2319  if (laneID == 0) {
2320  const int bin = blockIdx.x % ATOMIC_BINS;
2321  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
2322  if (doFEP) {
2323  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
2324  }
2325  if (doTI) {
2326  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
2327  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
2328  }
2329  }
2330  }
2331  }
2332 
2333  // Write virials to global memory
2334 #ifdef WRITE_FULL_VIRIALS
2335  if (doVirial) {
2336 #pragma unroll
2337  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2338  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2339  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2340  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2341  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2342  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2343  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2344  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2345  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2346  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2347  }
2348  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
2349  int laneID = (threadIdx.x & (WARPSIZE - 1));
2350  int warpID = threadIdx.x / WARPSIZE;
2351  if (laneID == 0) {
2352  shVirial[warpID] = virial;
2353  }
2354  BLOCK_SYNC;
2355 
2356  if (warpID == 0) {
2357  virial.xx = 0.0;
2358  virial.xy = 0.0;
2359  virial.xz = 0.0;
2360  virial.yx = 0.0;
2361  virial.yy = 0.0;
2362  virial.yz = 0.0;
2363  virial.zx = 0.0;
2364  virial.zy = 0.0;
2365  virial.zz = 0.0;
2366  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
2367 #pragma unroll
2368  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2369  virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2370  virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2371  virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2372  virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2373  virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2374  virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2375  virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2376  virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2377  virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2378  }
2379 
2380  if (laneID == 0) {
2381  const int bin = blockIdx.x % ATOMIC_BINS;
2382  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
2383  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
2384  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
2385  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
2386  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
2387  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
2388  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
2389  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
2390  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
2391  }
2392  }
2393  }
2394 #endif
2395 
2396 }
2397 
2398 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
2399 __global__ void modifiedExclusionForcesKernel(
2400  const int start,
2401  const int numModifiedExclusions,
2402  const CudaExclusion* __restrict__ modifiedExclusions,
2403  const bool doSlow,
2404  const float one_scale14, // 1 - scale14
2405  const float cutoff2,
2406  const CudaNBConstants nbConstants,
2407  const int vdwCoefTableWidth,
2408  const float2* __restrict__ vdwCoefTable,
2409 #if !defined(USE_TABLE_ARRAYS)
2410  cudaTextureObject_t vdwCoefTableTex,
2411 #endif
2412 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2413  const float4* __restrict__ modifiedExclusionForceTable,
2414  const float4* __restrict__ modifiedExclusionEnergyTable,
2415 #else
2416  cudaTextureObject_t modifiedExclusionForceTableTex,
2417  cudaTextureObject_t modifiedExclusionEnergyTableTex,
2418 #endif
2419  const float4* __restrict__ xyzq,
2420  const int stride,
2421  const float3 lata, const float3 latb, const float3 latc,
2422  T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2423  T* __restrict__ forceList,
2424  int* forceListCounter,
2425  int* forceListStartsNbond,
2426  int* forceListStartsSlow,
2427  int* forceListNexts,
2428  double* __restrict__ energies_virials) {
2429 
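  // One thread per modified (scale14-scaled 1-4) exclusion; LJ and, when doElect is set,
  // electrostatic contributions are accumulated per thread and block-reduced below.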
2430  // index
2431  int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2432 
2433  double energyVdw, energyNbond, energySlow;
2434  // Alchemical energies
2435  double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2436  double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2437  double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2438  if (doEnergy) {
2439  energyVdw = 0.0;
2440  if (doFEP) {
2441  energyVdw_F = 0.0;
2442  }
2443  if (doTI) {
2444  energyVdw_TI_1 = 0.0;
2445  energyVdw_TI_2 = 0.0;
2446  }
2447  if (doElect) {
2448  energyNbond = 0.0;
2449  energySlow = 0.0;
2450  if (doFEP) {
2451  energyNbond_F = 0.0;
2452  energySlow_F = 0.0;
2453  }
2454  if (doTI) {
2455  energyNbond_TI_1 = 0.0;
2456  energyNbond_TI_2 = 0.0;
2457  energySlow_TI_1 = 0.0;
2458  energySlow_TI_2 = 0.0;
2459  }
2460  }
2461  }
2462 
2463 #ifdef WRITE_FULL_VIRIALS
2464  ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2465  ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2466  if (doVirial) {
2467  virialNbond.xx = 0.0;
2468  virialNbond.xy = 0.0;
2469  virialNbond.xz = 0.0;
2470  virialNbond.yx = 0.0;
2471  virialNbond.yy = 0.0;
2472  virialNbond.yz = 0.0;
2473  virialNbond.zx = 0.0;
2474  virialNbond.zy = 0.0;
2475  virialNbond.zz = 0.0;
2476  if (doElect) {
2477  virialSlow.xx = 0.0;
2478  virialSlow.xy = 0.0;
2479  virialSlow.xz = 0.0;
2480  virialSlow.yx = 0.0;
2481  virialSlow.yy = 0.0;
2482  virialSlow.yz = 0.0;
2483  virialSlow.zx = 0.0;
2484  virialSlow.zy = 0.0;
2485  virialSlow.zz = 0.0;
2486  }
2487  }
2488 #endif
2489 
2490  if (i < numModifiedExclusions)
2491  {
2492  modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2493  (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2494 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2495  vdwCoefTable,
2496 #else
2497  vdwCoefTableTex,
2498 #endif
2499  // for modified exclusions, plain table arrays are used on HIP builds with USE_TABLE_ARRAYS; otherwise textures are used
2500 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2501  // tables
2502  modifiedExclusionForceTable,
2503  modifiedExclusionEnergyTable,
2504 #else
2505  // CUDA builds (and builds without table arrays) fall back to texture force/energy tables
2506  modifiedExclusionForceTableTex,
2507  modifiedExclusionEnergyTableTex,
2508 #endif
2509  xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2510  energyVdw, forceNbond, energyNbond,
2511  forceSlow, energySlow,
2512  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2513  virialNbond, virialSlow,
2514  energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2515  energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2516  energySlow_F, energySlow_TI_1, energySlow_TI_2);
2517  }
2518 
2519  // Write energies to global memory
2520  if (doEnergy) {
2521  __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2522  __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2523  __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2524  __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2525  __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2526  __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2527  __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2528  __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2529  __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2530  __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2531  __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2532  __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2533 #pragma unroll
2534  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2535  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2536 
2537  if (doFEP) {
2538  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2539  }
2540  if (doTI) {
2541  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2542  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2543  }
2544  if (doElect) {
2545  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2546  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2547  if (doFEP) {
2548  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2549  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2550  }
2551  if (doTI) {
2552  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2553  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2554  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2555  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2556  }
2557  }
2558  }
2559  int laneID = (threadIdx.x & (WARPSIZE - 1));
2560  int warpID = threadIdx.x / WARPSIZE;
2561  if (laneID == 0) {
2562  shEnergyVdw[warpID] = energyVdw;
2563  if (doFEP) {
2564  shEnergyVdw_F[warpID] = energyVdw_F;
2565  }
2566  if (doTI) {
2567  shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2568  shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2569  }
2570  if (doElect) {
2571  shEnergyNbond[warpID] = energyNbond;
2572  shEnergySlow[warpID] = energySlow;
2573  if (doFEP) {
2574  shEnergyNbond_F[warpID] = energyNbond_F;
2575  shEnergySlow_F[warpID] = energySlow_F;
2576  }
2577  if (doTI) {
2578  shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2579  shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2580  shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2581  shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2582  }
2583  }
2584  }
2585  BLOCK_SYNC;
2586  if (warpID == 0) {
2587  energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2588  if (doFEP) {
2589  energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2590  }
2591  if (doTI) {
2592  energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2593  energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2594  }
2595  if (doElect) {
2596  energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2597  energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2598  if (doFEP) {
2599  energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2600  energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2601  }
2602  if (doTI) {
2603  energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2604  energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2605  energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2606  energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2607  }
2608  }
2609 #pragma unroll
2610  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2611  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2612 
2613  if (doFEP) {
2614  energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2615  }
2616  if (doTI) {
2617  energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2618  energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2619  }
2620  if (doElect) {
2621  energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2622  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2623  if (doFEP) {
2624  energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2625  energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2626  }
2627  if (doTI) {
2628  energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2629  energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2630  energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2631  energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2632  }
2633  }
2634  }
2635  if (laneID == 0) {
2636  const int bin = blockIdx.x % ATOMIC_BINS;
2637  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2638  if (doFEP) {
2639  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2640  }
2641  if (doTI) {
2642  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2643  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2644  }
2645  if (doElect) {
2646  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2647  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2648  if (doFEP) {
2649  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2650  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2651  }
2652  if (doTI) {
2653  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2654  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2655  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2656  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2657  }
2658  }
2659  }
2660  }
2661  }
2662 
2663  // Write virials to global memory
2664 #ifdef WRITE_FULL_VIRIALS
2665  if (doVirial) {
2666 #pragma unroll
2667  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2668  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2669  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2670  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2671  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2672  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2673  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2674  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2675  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2676  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2677  if (doElect && doSlow) {
2678  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2679  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2680  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2681  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2682  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2683  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2684  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2685  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2686  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2687  }
2688  }
2689  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2690  __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2691  int laneID = (threadIdx.x & (WARPSIZE - 1));
2692  int warpID = threadIdx.x / WARPSIZE;
2693  if (laneID == 0) {
2694  shVirialNbond[warpID] = virialNbond;
2695  if (doElect && doSlow) {
2696  shVirialSlow[warpID] = virialSlow;
2697  }
2698  }
2699  BLOCK_SYNC;
2700 
2701  virialNbond.xx = 0.0;
2702  virialNbond.xy = 0.0;
2703  virialNbond.xz = 0.0;
2704  virialNbond.yx = 0.0;
2705  virialNbond.yy = 0.0;
2706  virialNbond.yz = 0.0;
2707  virialNbond.zx = 0.0;
2708  virialNbond.zy = 0.0;
2709  virialNbond.zz = 0.0;
2710  if (doElect && doSlow) {
2711  virialSlow.xx = 0.0;
2712  virialSlow.xy = 0.0;
2713  virialSlow.xz = 0.0;
2714  virialSlow.yx = 0.0;
2715  virialSlow.yy = 0.0;
2716  virialSlow.yz = 0.0;
2717  virialSlow.zx = 0.0;
2718  virialSlow.zy = 0.0;
2719  virialSlow.zz = 0.0;
2720  }
2721 
2722  if (warpID == 0) {
2723  if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2724  virialNbond = shVirialNbond[laneID];
2725  if (doElect && doSlow) {
2726  virialSlow = shVirialSlow[laneID];
2727  }
2728  }
2729 #pragma unroll
2730  for (int i=WARPSIZE/2;i >= 1;i/=2) {
2731  virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2732  virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2733  virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2734  virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2735  virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2736  virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2737  virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2738  virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2739  virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2740  if (doElect && doSlow) {
2741  virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2742  virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2743  virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2744  virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2745  virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2746  virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2747  virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2748  virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2749  virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2750  }
2751  }
2752 
2753  if (laneID == 0)
2754  {
2755  const int bin = blockIdx.x % ATOMIC_BINS;
2756  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2757  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2758  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2759  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2760  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2761  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2762  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2763  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2764  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2765  if (doElect && doSlow) {
2766  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2767  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2768  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2769  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2770  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2771  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2772  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
2773  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
2774  atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
2775  }
2776  }
2777  }
2778  }
2779 #endif
2780 
2781 }
2782 
2783 template <typename T>
2784 __global__ void gatherBondedForcesKernel(
2785  const int start,
2786  const int atomStorageSize,
2787  const int stride,
2788  const T* __restrict__ forceList,
2789  const int* __restrict__ forceListStarts,
2790  const int* __restrict__ forceListNexts,
2791  T* __restrict__ force) {
2792 
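  // Each thread walks the singly linked list of per-tuple force contributions for atom i
  // (head in forceListStarts, links in forceListNexts) and writes the summed components
  // in SoA layout with the given stride.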
2793  int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
2794 
2795  if (i < atomStorageSize) {
2796  T fx = 0;
2797  T fy = 0;
2798  T fz = 0;
2799  int pos = forceListStarts[i];
2800  while (pos != -1) {
2801  fx += forceList[pos * 3 + 0];
2802  fy += forceList[pos * 3 + 1];
2803  fz += forceList[pos * 3 + 2];
2804  pos = forceListNexts[pos];
2805  }
2806  force[i ] = fx;
2807  force[i + stride ] = fy;
2808  force[i + stride * 2] = fz;
2809  }
2810 }
2811 
2812 __global__ void reduceBondedBinsKernel(double* energies_virials) {
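  // One block per energy/virial slot (blockIdx.x); the ATOMIC_BINS partial copies of that
  // slot are summed with a warp reduction and the result is written back to the first copy.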
2813  const int bin = threadIdx.x;
2814 
2815  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2816  __shared__ typename WarpReduce::TempStorage tempStorage;
2817 
2818  double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
2819  if (threadIdx.x == 0) {
2820  energies_virials[blockIdx.x] = v;
2821  }
2822 }
2823 
2824 // ##############################################################################################
2825 // ##############################################################################################
2826 // ##############################################################################################
2827 
2828 //
2829 // Class constructor
2830 //
2831 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
2832 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
2833 
2834  cudaCheck(cudaSetDevice(deviceID));
2835 
2836  tupleData = NULL;
2837  tupleDataSize = 0;
2838 
2839  bonds = NULL;
2840  angles = NULL;
2841  dihedrals = NULL;
2842  impropers = NULL;
2843  modifiedExclusions = NULL;
2844  exclusions = NULL;
2845  tholes = NULL;
2846  anisos = NULL;
2847 
2848  numBonds = 0;
2849  numAngles = 0;
2850  numDihedrals = 0;
2851  numImpropers = 0;
2852  numModifiedExclusions = 0;
2853  numExclusions = 0;
2854  numCrossterms = 0;
2855  numTholes = 0;
2856  numAnisos = 0;
2857 
2858  bondValues = NULL;
2859  angleValues = NULL;
2860  dihedralValues = NULL;
2861  improperValues = NULL;
2862  crosstermValues = NULL;
2863 
2864  xyzq = NULL;
2865  xyzqSize = 0;
2866 
2867  forces = NULL;
2868  forcesSize = 0;
2869 
2870  forceList = NULL;
2871  forceListStarts = NULL;
2872  forceListNexts = NULL;
2873  forceListSize = 0;
2874  forceListStartsSize = 0;
2875  forceListNextsSize = 0;
2876  allocate_device<int>(&forceListCounter, 1);
2877 
2878  allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
2879 }
2880 
2881 //
2882 // Class destructor
2883 //
2884 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
2885  cudaCheck(cudaSetDevice(deviceID));
2886 
2887  deallocate_device<double>(&energies_virials);
2888  // deallocate_device<BondedVirial>(&virial);
2889 
2890  if (tupleData != NULL) deallocate_device<char>(&tupleData);
2891  if (xyzq != NULL) deallocate_device<float4>(&xyzq);
2892  if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
2893 
2894  if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
2895  if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
2896  if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
2897  if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
2898 
2899  if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
2900  if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
2901  if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
2902  if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
2903  if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
2904 }
2905 
2906 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
2907  allocate_device<CudaBondValue>(&bondValues, numBondValues);
2908  copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
2909 }
2910 
2911 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
2912  allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
2913  copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
2914 }
2915 
2916 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
2917  allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
2918  copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
2919 }
2920 
2921 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
2922  allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
2923  copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
2924 }
2925 
2926 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
2927  allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
2928  copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
2929 }
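
// The five setup*Values() routines above share one pattern: allocate the device
// parameter table once and upload it synchronously from the host. A minimal usage
// sketch follows (editor's illustration, excluded from the build; it assumes an
// already-constructed kernel instance and host tables filled elsewhere):
#if 0
#include <vector>
static void uploadBondedParameters(ComputeBondedCUDAKernel& kernel,
                                   std::vector<CudaBondValue>& bondTable,
                                   std::vector<CudaAngleValue>& angleTable) {
  // Parameter tables are indexed by each tuple's type field and change only when
  // the force field changes, so a single synchronous upload per table suffices.
  kernel.setupBondValues((int)bondTable.size(), bondTable.data());
  kernel.setupAngleValues((int)angleTable.size(), angleTable.data());
}
#endif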
2930 
2931 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
2932  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
2933 }
2934 
2935 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
2936  cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
2937 }
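
// Both update functions above push a host struct into __constant__ device memory
// with cudaMemcpyToSymbolAsync, so kernels launched later on the same stream see
// the new values without an extra kernel argument. A generic sketch of that
// mechanism (editor's illustration, excluded from the build; MyParams/myParams
// are hypothetical names, not part of NAMD):
#if 0
struct MyParams { float scale; int flags; };
__constant__ MyParams myParams;   // device-side copy read by kernels

static void updateMyParams(const MyParams* h_params, cudaStream_t stream) {
  // Asynchronous host-to-constant-memory copy, ordered with kernels on `stream`.
  cudaCheck(cudaMemcpyToSymbolAsync(myParams, h_params, sizeof(MyParams),
                                    0, cudaMemcpyHostToDevice, stream));
}
#endif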
2938 
2939 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
2940  alchFlags = h_cudaAlchFlags;
2941 }
2942 
2943 //
2944 // Update bonded tuple lists and counts
2945 //
2946 void ComputeBondedCUDAKernel::setTupleCounts(
2947  const TupleCounts count
2948 ) {
2949  numBonds = count.bond;
2950  numAngles = count.angle;
2951  numDihedrals = count.dihedral;
2952  numImpropers = count.improper;
2953  numModifiedExclusions = count.modifiedExclusion;
2954  numExclusions = count.exclusion;
2955  numCrossterms = count.crossterm;
2956  numTholes = count.thole;
2957  numAnisos = count.aniso;
2958 }
2959 
2960 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
2961  TupleCounts count;
2962 
2963  count.bond = numBonds;
2964  count.angle = numAngles;
2965  count.dihedral = numDihedrals;
2966  count.improper = numImpropers;
2967  count.modifiedExclusion = numModifiedExclusions;
2968  count.exclusion = numExclusions;
2969  count.crossterm = numCrossterms;
2970  count.thole = numTholes;
2971  count.aniso = numAnisos;
2972 
2973  return count;
2974 }
2975 
2976 TupleData ComputeBondedCUDAKernel::getData() {
2977  TupleData data;
2978  data.bond = bonds;
2979  data.angle = angles;
2980  data.dihedral = dihedrals;
2981  data.improper = impropers;
2982  data.modifiedExclusion = modifiedExclusions;
2983  data.exclusion = exclusions;
2984  data.crossterm = crossterms;
2985  data.thole = tholes;
2986  data.aniso = anisos;
2987 
2988  return data;
2989 }
2990 
2991 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
2992  const TupleCounts countIn,
2993  cudaStream_t
2994 ) {
2995  const int numBondsWA = warpAlign(countIn.bond);
2996  const int numAnglesWA = warpAlign(countIn.angle);
2997  const int numDihedralsWA = warpAlign(countIn.dihedral);
2998  const int numImpropersWA = warpAlign(countIn.improper);
2999  const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
3000  const int numExclusionsWA = warpAlign(countIn.exclusion);
3001  const int numCrosstermsWA = warpAlign(countIn.crossterm);
3002  const int numTholesWA = warpAlign(countIn.thole);
3003  const int numAnisosWA = warpAlign(countIn.aniso);
3004 
3005  const size_t sizeTot = numBondsWA*sizeof(CudaBond)
3006  + numAnglesWA*sizeof(CudaAngle)
3007  + numDihedralsWA*sizeof(CudaDihedral)
3008  + numImpropersWA*sizeof(CudaDihedral)
3009  + numModifiedExclusionsWA*sizeof(CudaExclusion)
3010  + numExclusionsWA*sizeof(CudaExclusion)
3011  + numCrosstermsWA*sizeof(CudaCrossterm)
3012  + numTholesWA*sizeof(CudaThole)
3013  + numAnisosWA*sizeof(CudaAniso);
3014 
3015  reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
3016 
3017  // Setup pointers
3018  size_t pos = 0;
3019  bonds = (CudaBond *)&tupleData[pos];
3020  pos += numBondsWA*sizeof(CudaBond);
3021 
3022  angles = (CudaAngle* )&tupleData[pos];
3023  pos += numAnglesWA*sizeof(CudaAngle);
3024 
3025  dihedrals = (CudaDihedral* )&tupleData[pos];
3026  pos += numDihedralsWA*sizeof(CudaDihedral);
3027 
3028  impropers = (CudaDihedral* )&tupleData[pos];
3029  pos += numImpropersWA*sizeof(CudaDihedral);
3030 
3031  modifiedExclusions = (CudaExclusion* )&tupleData[pos];
3032  pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
3033 
3034  exclusions = (CudaExclusion* )&tupleData[pos];
3035  pos += numExclusionsWA*sizeof(CudaExclusion);
3036 
3037  crossterms = (CudaCrossterm* )&tupleData[pos];
3038  pos += numCrosstermsWA*sizeof(CudaCrossterm);
3039 
3040  tholes = (CudaThole *)&tupleData[pos];
3041  pos += numTholesWA*sizeof(CudaThole);
3042 
3043  anisos = (CudaAniso *)&tupleData[pos];
3044  pos += numAnisosWA*sizeof(CudaAniso);
3045 
3046  return sizeTot;
3047 }
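
// reallocateTupleBuffer() packs every tuple array into a single device allocation
// and hands out warp-aligned sub-array pointers, which lets update() stage all
// tuples for a time step with one host-to-device copy. A reduced sketch of the
// same layout arithmetic for just two tuple types (editor's illustration, excluded
// from the build; layoutTwoArrays is a hypothetical helper, not part of NAMD):
#if 0
static size_t layoutTwoArrays(char* base, int nBonds, int nAngles,
                              CudaBond** bondsOut, CudaAngle** anglesOut) {
  size_t pos = 0;
  *bondsOut = (CudaBond*)&base[pos];
  pos += warpAlign(nBonds) * sizeof(CudaBond);    // pad each array to a warp multiple
  *anglesOut = (CudaAngle*)&base[pos];
  pos += warpAlign(nAngles) * sizeof(CudaAngle);
  return pos;                                     // bytes required for `base`
}
#endif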
3048 
3049 void ComputeBondedCUDAKernel::update(
3050  const int numBondsIn,
3051  const int numAnglesIn,
3052  const int numDihedralsIn,
3053  const int numImpropersIn,
3054  const int numModifiedExclusionsIn,
3055  const int numExclusionsIn,
3056  const int numCrosstermsIn,
3057  const int numTholesIn,
3058  const int numAnisosIn,
3059  const char* h_tupleData,
3060  cudaStream_t stream) {
3061 
3062  numBonds = numBondsIn;
3063  numAngles = numAnglesIn;
3064  numDihedrals = numDihedralsIn;
3065  numImpropers = numImpropersIn;
3066  numModifiedExclusions = numModifiedExclusionsIn;
3067  numExclusions = numExclusionsIn;
3068  numCrossterms = numCrosstermsIn;
3069  numTholes = numTholesIn;
3070  numAnisos = numAnisosIn;
3071 
3072  const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
3073 
3074  copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
3075 }
3076 
3077 void ComputeBondedCUDAKernel::updateAtomBuffer(
3078  const int atomStorageSize,
3079  cudaStream_t stream
3080 ) {
3081  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3082 }
3083 
3084 //
3085 // Return stride for force array
3086 //
3087 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
3088 #ifdef USE_STRIDED_FORCE
3089  // Align stride to 256 bytes
3090  return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
3091 #else
3092  return 1;
3093 #endif
3094 }
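
// Worked example of the rounding above (editor's note): with FORCE_TYPE = double
// (8 bytes) and atomStorageSize = 1000, the raw array is 8000 bytes; padding to the
// next 256-byte boundary gives 8192 bytes, i.e. a stride of 1024 elements.
#if 0
static_assert(((1000 * sizeof(double) - 1) / 256 + 1) * 256 / sizeof(double) == 1024,
              "1000 doubles pad up to a 1024-element (8192-byte) stride");
#endif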
3095 
3096 //
3097 // Return size of single force array
3098 //
3099 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
3100 #ifdef USE_STRIDED_FORCE
3101  return (3*getForceStride(atomStorageSize));
3102 #else
3103  return (3*atomStorageSize);
3104 #endif
3105 }
3106 
3107 //
3108 // Return total size of all force arrays
3109 //
3110 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
3111 
3112  int forceSize = getForceSize(atomStorageSize);
3113 
3114  if (numModifiedExclusions > 0 || numExclusions > 0) {
3115  if (doSlow) {
3116  // All three force arrays [normal, nbond, slow]
3117  forceSize *= 3;
3118  } else {
3119  // Two force arrays [normal, nbond]
3120  forceSize *= 2;
3121  }
3122  }
3123 
3124  return forceSize;
3125 }
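
// Editor's note: when exclusions are present, the factor of 2 or 3 above reflects
// the back-to-back buffer layout [normal | nbond | slow] that bondedForce() uses,
// addressed there through the startNbond = forceSize and startSlow = 2*forceSize
// offsets.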
3126 
3127 //
3128 // Compute bonded forces
3129 //
3130 void ComputeBondedCUDAKernel::bondedForce(
3131  const double scale14, const int atomStorageSize,
3132  const bool doEnergy, const bool doVirial, const bool doSlow,
3133  const bool doTable,
3134  const float3 lata, const float3 latb, const float3 latc,
3135  const float cutoff2, const float r2_delta, const int r2_delta_expc,
3136  const CudaNBConstants nbConstants,
3137  const float4* h_xyzq, FORCE_TYPE* h_forces,
3138  double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
3139  bool useDeviceMigration, cudaStream_t stream) {
3140 
3141  int forceStorageSize = getAllForceSize(atomStorageSize, true);
3142  int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
3143  int forceStride = getForceStride(atomStorageSize);
3144 
3145  int forceSize = getForceSize(atomStorageSize);
3146  int startNbond = forceSize;
3147  int startSlow = 2*forceSize;
3148  // Re-allocate coordinate and force arrays if necessary
3149  reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3150  reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
3151 
3152 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3153  // tuple count            force function          force slots stored per tuple
3154  // numBonds bondForce 2
3155  // numAngles angleForce 3
3156  // numDihedrals diheForce 4
3157  // numImpropers diheForce 4
3158  // numExclusions exclusionForce 2
3159  // numCrossterms crosstermForce 8
3160  // numModifiedExclusions modifiedExclusionForce 4
3161  // numTholes tholeForce 4
3162  // numAnisos anisoForce 5
3163  int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4 + numTholes * 4 + numAnisos * 5);
3164  reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
3165  reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
3166  reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
3167  int* forceListStartsNbond = forceListStarts + atomStorageSize;
3168  int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
3169 
3170  clear_device_array<int>(forceListCounter, 1, stream);
3171  cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
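
  // Editor's note: without atomic stores, each force contribution is appended to
  // forceList and chained into a per-atom singly linked list: forceListCounter
  // hands out slots, forceListStarts/StartsNbond/StartsSlow hold the per-atom head
  // index (-1 = empty, hence the memset above), and forceListNexts links entries.
  // At the end of this function, gatherBondedForcesKernel walks those lists to
  // accumulate the final per-atom forces.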
3172 #else
3173  int* forceListStartsNbond = NULL;
3174  int* forceListStartsSlow = NULL;
3175 #endif
3176 
3177 #ifdef NODEGROUP_FORCE_REGISTER
3178  if(CUDASOAintegrate){
3179  if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3180  }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3181 #else
3182  copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3183 #endif
3184 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
3185  clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
3186 #endif
3187  if (doEnergy || doVirial ) {
3188  clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
3189  }
3190 
3191  // Check if we are doing alchemical free energy calculation
3192  // Is checking alchOn required?
3193  const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
3194  const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
3195 
3196  float one_scale14 = (float)(1.0 - scale14);
3197 
3198  // If doSlow = false, these exclusions are not calculated
3199  int numExclusionsDoSlow = doSlow ? numExclusions : 0;
3200 
3201  int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3202 
3203  int numBondsTB = (numBonds + nthread - 1)/nthread;
3204  int numAnglesTB = (numAngles + nthread - 1)/nthread;
3205  int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
3206  int numImpropersTB = (numImpropers + nthread - 1)/nthread;
3207  int numExclusionsTB = (numExclusionsDoSlow + nthread - 1)/nthread;
3208  int numCrosstermsTB = (numCrossterms + nthread - 1)/nthread;
3209  int numTholesTB = (numTholes + nthread - 1)/nthread;
3210  int numAnisosTB = (numAnisos + nthread - 1)/nthread;
3211 
3212  int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
3213  numExclusionsTB + numCrosstermsTB + numTholesTB + numAnisosTB;
3214  int shmem_size = 0;
3215 
3216  // printf("%d %d %d %d %d %d nblock %d\n",
3217  // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
3218 
3219  int max_nblock = deviceCUDA->getMaxNumBlocks();
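
  // Editor's note: the launch loops below split the work into chunks of at most
  // max_nblock thread blocks; `start` is passed into each kernel so a chunk knows
  // which global block range it covers.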
3220 
3221  int start = 0;
3222  while (start < nblock)
3223  {
3224  int nleft = nblock - start;
3225  int nblock_use = min(max_nblock, nleft);
3226 
3227 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3228  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
3229  cudaNonbondedTables.getExclusionTable()
3230 #else
3231  #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
3232  cudaNonbondedTables.getExclusionTableTex()
3233 #endif
3234 
3235 #if defined(USE_TABLE_ARRAYS)
3236 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
3237  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
3238  <<< nblock_use, nthread, shmem_size, stream >>> \
3239  (start, numBonds, bonds, bondValues, \
3240  numAngles, angles, angleValues, \
3241  numDihedrals, dihedrals, dihedralValues, \
3242  numImpropers, impropers, improperValues, \
3243  numExclusionsDoSlow, exclusions, \
3244  numCrossterms, crossterms, crosstermValues, \
3245  numTholes, tholes, \
3246  numAnisos, anisos, \
3247  cutoff2, \
3248  r2_delta, r2_delta_expc, \
3249  TABLE_PARAMS, \
3250  xyzq, forceStride, \
3251  lata, latb, latc, \
3252  forces, &forces[startSlow], \
3253  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3254  energies_virials);
3255 #else
3256 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
3257  bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
3258  <<< nblock_use, nthread, shmem_size, stream >>> \
3259  (start, numBonds, bonds, bondValues, \
3260  numAngles, angles, angleValues, \
3261  numDihedrals, dihedrals, dihedralValues, \
3262  numImpropers, impropers, improperValues, \
3263  numExclusionsDoSlow, exclusions, \
3264  numCrossterms, crossterms, crosstermValues, \
3265  numTholes, tholes, \
3266  numAnisos, anisos, \
3267  cutoff2, \
3268  r2_delta, r2_delta_expc, \
3269  cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
3270  cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
3271  xyzq, forceStride, \
3272  lata, latb, latc, \
3273  forces, &forces[startSlow], \
3274  forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3275  energies_virials);
3276 #endif
3277  if (!doEnergy && !doVirial && !doFEP && !doTI) CALL(0, 0, 0, 0);
3278  if (!doEnergy && doVirial && !doFEP && !doTI) CALL(0, 1, 0, 0);
3279  if ( doEnergy && !doVirial && !doFEP && !doTI) CALL(1, 0, 0, 0);
3280  if ( doEnergy && doVirial && !doFEP && !doTI) CALL(1, 1, 0, 0);
3281 
3282  if (!doEnergy && !doVirial && doFEP && !doTI) CALL(0, 0, 1, 0);
3283  if (!doEnergy && doVirial && doFEP && !doTI) CALL(0, 1, 1, 0);
3284  if ( doEnergy && !doVirial && doFEP && !doTI) CALL(1, 0, 1, 0);
3285  if ( doEnergy && doVirial && doFEP && !doTI) CALL(1, 1, 1, 0);
3286 
3287  if (!doEnergy && !doVirial && !doFEP && doTI) CALL(0, 0, 0, 1);
3288  if (!doEnergy && doVirial && !doFEP && doTI) CALL(0, 1, 0, 1);
3289  if ( doEnergy && !doVirial && !doFEP && doTI) CALL(1, 0, 0, 1);
3290  if ( doEnergy && doVirial && !doFEP && doTI) CALL(1, 1, 0, 1);
3291 
3292  // FEP and TI cannot both be enabled, so the following combinations are not expanded.
3293 #if 0
3294  if (!doEnergy && !doVirial && doFEP && doTI) CALL(0, 0, 1, 1);
3295  if (!doEnergy && doVirial && doFEP && doTI) CALL(0, 1, 1, 1);
3296  if ( doEnergy && !doVirial && doFEP && doTI) CALL(1, 0, 1, 1);
3297  if ( doEnergy && doVirial && doFEP && doTI) CALL(1, 1, 1, 1);
3298 #endif
3299 
3300 #undef CALL
3301 #undef TABLE_PARAMS
3302  cudaCheck(cudaGetLastError());
3303 
3304  start += nblock_use;
3305  }
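
  // Editor's note: the CALL macro above maps the runtime flags (doEnergy, doVirial,
  // doFEP, doTI) onto compile-time template parameters, so every flag combination
  // gets a fully specialized kernel with the unused branches compiled out. A minimal
  // generic sketch of the same dispatch idiom (editor's illustration, excluded from
  // the build; fooKernel/launchFoo are hypothetical):
#if 0
template <bool DOENERGY>
__global__ void fooKernel(float* out) {
  if (DOENERGY) out[threadIdx.x] += 1.0f;   // branch resolved at compile time
}

static void launchFoo(bool doEnergy, float* out, cudaStream_t stream) {
  if (doEnergy) fooKernel<true ><<<1, 32, 0, stream>>>(out);
  else          fooKernel<false><<<1, 32, 0, stream>>>(out);
}
#endif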
3306 
3307  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3308  nblock = (numModifiedExclusions + nthread - 1)/nthread;
3309 
3310  bool doElect = (one_scale14 != 0.0f);
3311 
3312  start = 0;
3313  while (start < nblock)
3314  {
3315  int nleft = nblock - start;
3316  int nblock_use = min(max_nblock, nleft);
3317 
3318 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3319 #define TABLE_PARAMS \
3320  cudaNonbondedTables.getExclusionVdwCoefTable(), \
3321  cudaNonbondedTables.getModifiedExclusionForceTable(), \
3322  cudaNonbondedTables.getModifiedExclusionEnergyTable()
3323 #else
3324 #define TABLE_PARAMS \
3325  cudaNonbondedTables.getExclusionVdwCoefTable(), \
3326  cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
3327  cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
3328  cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
3329 #endif
3330 
3331 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
3332  modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
3333  <<< nblock_use, nthread, shmem_size, stream >>> \
3334  (start, numModifiedExclusions, modifiedExclusions, \
3335  doSlow, one_scale14, cutoff2, nbConstants, \
3336  cudaNonbondedTables.getVdwCoefTableWidth(), \
3337  TABLE_PARAMS, \
3338  xyzq, forceStride, lata, latb, latc, \
3339  &forces[startNbond], &forces[startSlow], \
3340  forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
3341  energies_virials);
3342 
3343 
3344 
3345  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
3346  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
3347  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
3348  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
3349 
3350  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
3351  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
3352  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
3353  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
3354 
3355  if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
3356  if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
3357  if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
3358  if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
3359 
3360  if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
3361  if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
3362  if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
3363  if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
3364 
3365  if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
3366  if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
3367  if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
3368  if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
3369 
3370  if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
3371  if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
3372  if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
3373  if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
3374 
3375  // Disabling doTable is supported only without FEP and TI
3376  if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
3377  if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
3378  if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
3379  if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
3380 
3381  if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
3382  if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
3383  if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
3384  if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
3385 
3386  // FEP and TI cannot both be enabled, so the following combinations are not expanded.
3387 #if 0
3388  if (!doEnergy && !doVirial && !doElect && doFEP && doTI && doTable) CALL(0, 0, 0, 1, 1, 1);
3389  if (!doEnergy && doVirial && !doElect && doFEP && doTI && doTable) CALL(0, 1, 0, 1, 1, 1);
3390  if ( doEnergy && !doVirial && !doElect && doFEP && doTI && doTable) CALL(1, 0, 0, 1, 1, 1);
3391  if ( doEnergy && doVirial && !doElect && doFEP && doTI && doTable) CALL(1, 1, 0, 1, 1, 1);
3392 
3393  if (!doEnergy && !doVirial && doElect && doFEP && doTI && doTable) CALL(0, 0, 1, 1, 1, 1);
3394  if (!doEnergy && doVirial && doElect && doFEP && doTI && doTable) CALL(0, 1, 1, 1, 1, 1);
3395  if ( doEnergy && !doVirial && doElect && doFEP && doTI && doTable) CALL(1, 0, 1, 1, 1, 1);
3396  if ( doEnergy && doVirial && doElect && doFEP && doTI && doTable) CALL(1, 1, 1, 1, 1, 1);
3397 #endif
3398 
3399 #undef CALL
#undef TABLE_PARAMS
3400  cudaCheck(cudaGetLastError());
3401 
3402  start += nblock_use;
3403  }
3404 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3405  nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3406  nblock = (atomStorageSize + nthread - 1)/nthread;
3407 
3408  start = 0;
3409  while (start < nblock)
3410  {
3411  int nleft = nblock - start;
3412  int nblock_use = min(max_nblock, nleft);
3413 
3414  // cudaCheck(hipDeviceSynchronize());
3415  // auto t0 = std::chrono::high_resolution_clock::now();
3416 
3417  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3418  start, atomStorageSize, forceStride,
3419  forceList, forceListStarts, forceListNexts,
3420  forces);
3421  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3422  start, atomStorageSize, forceStride,
3423  forceList, forceListStartsNbond, forceListNexts,
3424  &forces[startNbond]);
3425  if (doSlow) {
3426  gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3427  start, atomStorageSize, forceStride,
3428  forceList, forceListStartsSlow, forceListNexts,
3429  &forces[startSlow]);
3430  }
3431  cudaCheck(cudaGetLastError());
3432 
3433  // cudaCheck(hipStreamSynchronize(stream));
3434  // auto t1 = std::chrono::high_resolution_clock::now();
3435  // std::chrono::duration<double> diff1 = t1 - t0;
3436  // std::cout << "gatherBondedForcesKernel";
3437  // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
3438 
3439  start += nblock_use;
3440  }
3441 #endif
3442 
3443 #ifdef NODEGROUP_FORCE_REGISTER
3444  if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
3445  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3459  }
3460 
3461 #else
3462  copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3463 #endif
3464  if (doEnergy || doVirial) {
3465  if (ATOMIC_BINS > 1) {
3466  // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3467  reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
3468  }
3469  // Copy energies and virials back to the host; is this copy always necessary?
3470  copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);
3471  }
3472 }
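
// A condensed per-time-step driver for bondedForce() (editor's sketch, excluded
// from the build; the surrounding NAMD machinery supplies real tuple data, lattice
// vectors, cutoffs and the stream, so every literal below is a placeholder):
#if 0
static void runBondedStep(ComputeBondedCUDAKernel& kernel,
                          const float4* h_xyzq, FORCE_TYPE* h_forces,
                          double* h_energies_virials,
                          int atomStorageSize, cudaStream_t stream) {
  const float3 lata = make_float3(100.f, 0.f, 0.f);
  const float3 latb = make_float3(0.f, 100.f, 0.f);
  const float3 latc = make_float3(0.f, 0.f, 100.f);
  CudaNBConstants nbConstants = {};
  kernel.bondedForce(1.0, atomStorageSize,
                     /*doEnergy*/ true, /*doVirial*/ true, /*doSlow*/ true,
                     /*doTable*/ true,
                     lata, latb, latc,
                     /*cutoff2*/ 144.f, /*r2_delta*/ 1.f, /*r2_delta_expc*/ 0,
                     nbConstants, h_xyzq, h_forces, h_energies_virials,
                     /*atomsChanged*/ false, /*CUDASOAintegrate*/ false,
                     /*useDeviceMigration*/ false, stream);
}
#endif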
3473 
3474