1 #include "TupleTypesCUDA.h"
3 #define _USE_MATH_DEFINES
4 #define __thread __declspec(thread)
8 #include <hipcub/hipcub.hpp>
14 #if __CUDACC_VER_MAJOR__ >= 11
15 #include <cub/cub.cuh>
17 #include <namd_cub/cub.cuh>
23 #include "ComputeBondedCUDAKernel.h"
24 #include "CudaComputeNonbondedInteractions.h"
26 #ifdef FUTURE_CUDADEVICE
27 #include "CudaDevice.h"
29 #include "DeviceCUDA.h"
30 extern __thread DeviceCUDA *deviceCUDA;
33 #ifndef USE_TABLE_ARRAYS
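// Linearly interpolated lookup of a float4 force/energy table bound to a
// texture object: on NAMD_CUDA builds the hardware texture filtering does the
// interpolation, otherwise the two neighboring texels are fetched and blended
// manually.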
34 __device__ __forceinline__
35 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
36 #if defined(NAMD_CUDA)
37 return tex1D<float4>(tex, k);
39 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
40 const float x = k * (float)tableSize - 0.5f;
41 const float f = floorf(x);
42 const float a = x - f;
43 const unsigned int i = (unsigned int)f;
44 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
45 const int i1 = i0 + 1;
46 const float4 t0 = tex1Dfetch<float4>(tex, i0);
47 const float4 t1 = tex1Dfetch<float4>(tex, i1);
49 a * (t1.x - t0.x) + t0.x,
50 a * (t1.y - t0.y) + t0.y,
51 a * (t1.z - t0.z) + t0.z,
52 a * (t1.w - t0.w) + t0.w);
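// Same linear interpolation as sampleTableTex, but reading the table directly
// from global memory (USE_TABLE_ARRAYS path) via __ldg, with clamped indices.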
57 __device__ __forceinline__
58 float4 tableLookup(const float4* table, const float k)
60 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
61 const float x = k * static_cast<float>(tableSize) - 0.5f;
62 const float f = floorf(x);
63 const float a = x - f;
64 const int i = static_cast<int>(f);
65 const int i0 = max(0, min(tableSize - 1, i));
66 const int i1 = max(0, min(tableSize - 1, i + 1));
67 const float4 t0 = __ldg(&table[i0]);
68 const float4 t1 = __ldg(&table[i1]);
70 a * (t1.x - t0.x) + t0.x,
71 a * (t1.y - t0.y) + t0.y,
72 a * (t1.z - t0.z) + t0.z,
73 a * (t1.w - t0.w) + t0.w);
// Global variables for the alchemical transformation
77 namespace AlchBondedCUDA {
78 // Alchemical parameters and lambdas
79 __constant__ CudaAlchParameters alchParams;
80 __constant__ CudaAlchLambdas alchLambdas;
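// Converts float force components to the accumulation type T (typically a
// fixed-point long long when forces are accumulated atomically).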
85 __forceinline__ __device__
86 void convertForces(const float x, const float y, const float z,
87 T &Tx, T &Ty, T &Tz) {
95 __forceinline__ __device__
96 void calcComponentForces(
98 const float dx, const float dy, const float dz,
99 T &fxij, T &fyij, T &fzij) {
107 template <typename T>
108 __forceinline__ __device__
109 void calcComponentForces(
111 const float dx1, const float dy1, const float dz1,
113 const float dx2, const float dy2, const float dz2,
114 T &fxij, T &fyij, T &fzij) {
116 fxij = (T)(fij1*dx1 + fij2*dx2);
117 fyij = (T)(fij1*dy1 + fij2*dy2);
118 fzij = (T)(fij1*dz1 + fij2*dz2);
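// Warp-aggregated atomic increment: the active lanes of a warp perform a
// single atomicAdd for the whole group and each lane receives its own slot
// (base + its prefix within the warp), which reduces atomic traffic on the
// shared counter.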
121 __forceinline__ __device__
122 int warpAggregatingAtomicInc(int* counter) {
123 #if BOUNDINGBOXSIZE == 64
124 WarpMask mask = __ballot(1);
125 int total = __popcll(mask);
126 int prefix = __popcll(mask & cub::LaneMaskLt());
127 int firstLane = __ffsll(mask) - 1;
129 WarpMask mask = __ballot(1);
130 int total = __popc(mask);
131 int prefix = __popc(mask & cub::LaneMaskLt());
132 int firstLane = __ffs(mask) - 1;
136 start = atomicAdd(counter, total);
138 start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
139 return start + prefix;
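// Accumulates a force (fx, fy, fz) for atom "ind" into the strided force
// array. With USE_BONDED_FORCE_ATOMIC_STORE the components are added with
// atomicAdd (on HIP, lanes hitting the same index are first combined with a
// head-segmented warp reduction); otherwise the contribution is appended to a
// per-atom linked list (forceList/forceListStarts/forceListNexts) that is
// reduced later.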
142 template <typename T>
143 __forceinline__ __device__
144 void storeForces(const T fx, const T fy, const T fz,
145 const int ind, const int stride,
147 T* forceList, int* forceListCounter,
148 int* forceListStarts, int* forceListNexts) {
150 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
151 #if defined(NAMD_HIP)
// Try to decrease conflicts between lanes when there are repeated indices
153 WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
155 const int laneId = threadIdx.x % WARPSIZE;
156 const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
157 const bool isHead = laneId == 0 || ind != prevLaneInd;
158 if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
159 // There are segments of repeating ind
160 typedef cub::WarpReduce<T> WarpReduce;
161 __shared__ typename WarpReduce::TempStorage temp_storage;
162 const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
163 const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
164 const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
166 atomicAdd(&force[ind ], sumfx);
167 atomicAdd(&force[ind + stride ], sumfy);
168 atomicAdd(&force[ind + stride*2], sumfz);
// Not all lanes are active (last block) or there are no repeated indices
174 atomicAdd(&force[ind ], fx);
175 atomicAdd(&force[ind + stride ], fy);
176 atomicAdd(&force[ind + stride*2], fz);
178 atomicAdd(&force[ind ], fx);
179 atomicAdd(&force[ind + stride ], fy);
180 atomicAdd(&force[ind + stride*2], fz);
183 const int newPos = warpAggregatingAtomicInc(forceListCounter);
184 forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
185 forceList[newPos * 3 + 0] = fx;
186 forceList[newPos * 3 + 1] = fy;
187 forceList[newPos * 3 + 2] = fz;
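// Harmonic bond force/energy for bondList[index] (or a harmonic wall
// potential when an upper wall x1 is defined), with alchemical FEP/TI
// scaling and optional virial accumulation.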
194 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
195 __device__ void bondForce(
197 const CudaBond* __restrict__ bondList,
198 const CudaBondValue* __restrict__ bondValues,
199 const float4* __restrict__ xyzq,
201 const float3 lata, const float3 latb, const float3 latc,
202 T* __restrict__ force, double &energy,
203 T* __restrict__ forceList, int* forceListCounter,
204 int* forceListStarts, int* forceListNexts,
205 #ifdef WRITE_FULL_VIRIALS
206 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
208 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
210 double &energy_F, double &energy_TI_1, double &energy_TI_2
213 CudaBond bl = bondList[index];
214 CudaBondValue bondValue = bondValues[bl.itype];
215 // if (bondValue.x0 == 0.0f) return;
217 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
218 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
219 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
221 float4 xyzqi = xyzq[bl.i];
222 float4 xyzqj = xyzq[bl.j];
224 float xij = xyzqi.x + shx - xyzqj.x;
225 float yij = xyzqi.y + shy - xyzqj.y;
226 float zij = xyzqi.z + shz - xyzqj.z;
228 float r = sqrtf(xij*xij + yij*yij + zij*zij);
230 float db = r - bondValue.x0;
232 // in this case, the bond represents a harmonic wall potential
233 // where x0 is the lower wall and x1 is the upper
234 db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
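// With the wall form above, db (and hence the energy k*scale*db*db below) is:
//   r - x0  for r <= x0,   0  for x0 < r <= x1,   r - x1  for r > x1,
// i.e. the displacement from the nearer wall, zero between the walls.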
236 float fij = db * bondValue.k * bl.scale;
238 // Define a temporary variable to store the energy
// used by both the alchemical and non-alchemical routes
240 float energy_tmp = 0.0f;
242 energy_tmp += fij * db;
// JM: TODO: move this out of here
247 float alch_scale = 1.0f;
249 switch (bl.fepBondedType) {
251 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
253 energy_TI_1 += (double)(energy_tmp);
256 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
261 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
263 energy_TI_2 += (double)(energy_tmp);
266 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
275 energy += (double)(energy_tmp * alch_scale);
277 energy += (double)(energy_tmp);
// XXX TODO: Move this out of here
282 // XXX TODO: Decide on a templated parameter nomenclature
287 T T_fxij, T_fyij, T_fzij;
288 calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
291 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
292 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
296 #ifdef WRITE_FULL_VIRIALS
297 float fxij = fij*xij;
298 float fyij = fij*yij;
299 float fzij = fij*zij;
300 virial.xx = (fxij*xij);
301 virial.xy = (fxij*yij);
302 virial.xz = (fxij*zij);
303 virial.yx = (fyij*xij);
304 virial.yy = (fyij*yij);
305 virial.yz = (fyij*zij);
306 virial.zx = (fzij*xij);
307 virial.zy = (fzij*yij);
308 virial.zz = (fzij*zij);
314 // Calculates modified exclusions
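// Handles modified (1-4) exclusions: evaluates the scaled pair interaction
// using either the force/energy lookup tables or the direct nbConstants path,
// writing the nonbonded and (optionally) slow electrostatic contributions,
// with alchemical FEP/TI scaling of the vdW and electrostatic terms.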
316 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
317 __device__ void modifiedExclusionForce(
319 const CudaExclusion* __restrict__ exclusionList,
321 const float one_scale14, // 1 - scale14
322 const int vdwCoefTableWidth,
323 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
324 const float2* __restrict__ vdwCoefTable,
326 cudaTextureObject_t vdwCoefTableTex,
328 #ifdef USE_TABLE_ARRAYS
329 const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
331 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
333 const float4* __restrict__ xyzq,
335 const float3 lata, const float3 latb, const float3 latc,
337 const CudaNBConstants nbConstants,
339 T* __restrict__ forceNbond, double &energyNbond,
340 T* __restrict__ forceSlow, double &energySlow,
341 T* __restrict__ forceList, int* forceListCounter,
342 int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
343 #ifdef WRITE_FULL_VIRIALS
344 ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
345 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
347 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
348 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
350 double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
351 double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
352 double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
355 CudaExclusion bl = exclusionList[index];
357 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
358 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
359 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
361 float4 xyzqi = xyzq[bl.i];
362 float4 xyzqj = xyzq[bl.j];
364 float xij = xyzqi.x + shx - xyzqj.x;
365 float yij = xyzqi.y + shy - xyzqj.y;
366 float zij = xyzqi.z + shz - xyzqj.z;
368 float r2 = xij*xij + yij*yij + zij*zij;
371 float rinv = rsqrtf(r2);
374 if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
376 int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
377 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
378 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
380 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
384 float myElecLambda = 1.0f;
385 float myElecLambda2 = 1.0f;
386 float myVdwLambda = 1.0f;
387 float myVdwLambda2 = 1.0f;
388 float myVdwShift = 0.0f;
389 float myVdwShift2 = 0.0f;
390 double alch_vdw_energy;
391 double alch_vdw_energy_2;
392 float alch_vdw_force = 0.0f;
393 float alch_vdw_dUdl = 0.0f;
394 double* p_energyVdw_TI = NULL;
395 double* p_energyNbond_TI = NULL;
396 double* p_energySlow_TI = NULL;
400 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
401 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
402 float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
403 float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
404 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
405 float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
406 float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
407 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
408 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
409 float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
410 float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
411 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
412 float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
413 float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
414 switch (bl.pswitch) {
415 case 0: myVdwLambda = 1.0f;
418 myElecLambda2 = 1.0f;
421 case 1: myVdwLambda = vdwLambdaUp;
422 myVdwLambda2 = vdwLambda2Up;
423 myElecLambda = elecLambdaUp;
424 myElecLambda2 = elecLambda2Up;
425 myVdwShift = vdwShiftUp;
426 myVdwShift2 = vdwShift2Up;
427 p_energyVdw_TI = &(energyVdw_TI_1);
428 p_energyNbond_TI= &(energyNbond_TI_1);
429 p_energySlow_TI = &(energySlow_TI_1);
431 case 2: myVdwLambda = vdwLambdaDown;
432 myVdwLambda2 = vdwLambda2Down;
433 myElecLambda = elecLambdaDown;
434 myElecLambda2 = elecLambda2Down;
435 myVdwShift = vdwShiftDown;
436 myVdwShift2 = vdwShift2Down;
437 p_energyVdw_TI = &(energyVdw_TI_2);
438 p_energyNbond_TI= &(energyNbond_TI_2);
439 p_energySlow_TI = &(energySlow_TI_2);
441 default: myVdwLambda = 0.0f;
444 myElecLambda2 = 0.0f;
447 if (bl.pswitch != 99 && bl.pswitch != 0) {
448 // Common part of FEP and TI
449 const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
450 const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
451 const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
452 const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
453 // A = ljab.x ; B = ljab.y
454 const float r2_1 = 1.0f / (r2 + myVdwShift);
455 const float r2_2 = 1.0f / (r2 + myVdwShift2);
456 const float r6_1 = r2_1 * r2_1 * r2_1;
457 const float r6_2 = r2_2 * r2_2 * r2_2;
458 const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
459 const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
460 // rinv is already calculated above
462 alch_vdw_energy = double(myVdwLambda * switchmul * U1);
464 alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
467 alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
469 alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
473 alch_vdw_energy = 0.0;
474 alch_vdw_energy_2 = 0.0;
483 #ifdef USE_TABLE_ARRAYS
484 fi = tableLookup(forceTable, rinv);
486 fi = sampleTableTex(forceTableTex, rinv);
489 fi = tex1D<float4>(forceTableTex, rinv);
493 if (doEnergy && doTable) {
495 #ifdef USE_TABLE_ARRAYS
496 ei = tableLookup(energyTable, rinv);
498 ei = sampleTableTex(energyTableTex, rinv);
501 ei = tex1D<float4>(energyTableTex, rinv);
504 energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
505 energyVdw += alch_vdw_energy;
507 energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
508 energyVdw_F += alch_vdw_energy_2;
511 if (p_energyVdw_TI != NULL) {
512 (*p_energyVdw_TI) += alch_vdw_dUdl;
516 energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
519 energyNbond += qq * ei.x * myElecLambda;
521 energyNbond_F += qq * ei.x * myElecLambda2;
524 if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
527 energySlow += qq * ei.w * myElecLambda;
529 energySlow_F += qq * ei.w * myElecLambda2;
532 if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
542 fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
544 fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
547 float e_vdw, e_elec, e_slow;
554 cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
555 doSlow, doElect, r2, rinv, qq, ljab,
556 nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);
559 energyVdw += (double) (e_vdw);
560 energyNbond += (double) (e_elec * myElecLambda);
561 energySlow += (double) (e_slow * myElecLambda);
564 // fNbond = fFast + fVdw
566 if (bl.pswitch != 0) {
567 fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
568 fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
570 // if pswitch == 0, myElecLambda = 1.0
572 T T_fxij, T_fyij, T_fzij;
573 calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
574 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
575 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
577 if (doSlow && doElect) {
582 fSlow *= myElecLambda;
584 T T_fxij, T_fyij, T_fzij;
585 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
586 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
587 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
592 #ifdef WRITE_FULL_VIRIALS
593 float fxij = fNbond*xij;
594 float fyij = fNbond*yij;
595 float fzij = fNbond*zij;
596 virialNbond.xx = (fxij*xij);
597 virialNbond.xy = (fxij*yij);
598 virialNbond.xz = (fxij*zij);
599 virialNbond.yx = (fyij*xij);
600 virialNbond.yy = (fyij*yij);
601 virialNbond.yz = (fyij*zij);
602 virialNbond.zx = (fzij*xij);
603 virialNbond.zy = (fzij*yij);
604 virialNbond.zz = (fzij*zij);
609 if (doVirial && doSlow && doElect) {
610 #ifdef WRITE_FULL_VIRIALS
611 float fxij = fSlow*xij;
612 float fyij = fSlow*yij;
613 float fzij = fSlow*zij;
614 virialSlow.xx = (fxij*xij);
615 virialSlow.xy = (fxij*yij);
616 virialSlow.xz = (fxij*zij);
617 virialSlow.yx = (fyij*xij);
618 virialSlow.yy = (fyij*yij);
619 virialSlow.yz = (fyij*zij);
620 virialSlow.zx = (fzij*xij);
621 virialSlow.zy = (fzij*yij);
622 virialSlow.zz = (fzij*zij);
630 // Calculates exclusions. Here doSlow = true
632 // doFull = doFullElectrostatics = doSlow
633 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
634 __device__ void exclusionForce(
636 const CudaExclusion* __restrict__ exclusionList,
637 const float r2_delta, const int r2_delta_expc,
639 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
640 const float* __restrict__ r2_table,
641 const float4* __restrict__ exclusionTable,
643 cudaTextureObject_t r2_table_tex,
644 cudaTextureObject_t exclusionTableTex,
646 const float4* __restrict__ xyzq,
648 const float3 lata, const float3 latb, const float3 latc,
650 T* __restrict__ forceSlow, double &energySlow,
651 T* __restrict__ forceList, int* forceListCounter,
652 int* forceListStartsSlow, int* forceListNexts,
653 #ifdef WRITE_FULL_VIRIALS
654 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
656 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
658 double &energy_F, double &energy_TI_1, double &energy_TI_2
661 CudaExclusion bl = exclusionList[index];
663 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
664 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
665 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
667 float4 xyzqi = xyzq[bl.i];
668 float4 xyzqj = xyzq[bl.j];
670 float xij = xyzqi.x + shx - xyzqj.x;
671 float yij = xyzqi.y + shy - xyzqj.y;
672 float zij = xyzqi.z + shz - xyzqj.z;
674 float r2 = xij*xij + yij*yij + zij*zij;
678 union { float f; int i; } r2i;
680 int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
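// The float bit pattern stored in r2i is reinterpreted as an int; its high
// bits (exponent plus leading mantissa bits) form the table index, so the
// table is effectively spaced logarithmically in r2.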
682 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
684 float r2_table_val = r2_table[table_i];
686 float r2_table_val = __ldg(&r2_table[table_i]);
689 float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
691 float diffa = r2 - r2_table_val;
692 float qq = xyzqi.w * xyzqj.w;
694 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
696 float4 slow = exclusionTable[table_i];
698 float4 slow = __ldg(&exclusionTable[table_i]);
701 float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
709 myElecLambda2 = 1.0f;
710 if (doTI) p_energy_TI = NULL;
711 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
712 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
713 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
714 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
715 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
716 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
717 switch (bl.pswitch) {
718 case 0: myElecLambda = 1.0f;
719 myElecLambda2 = 1.0f;
721 case 1: myElecLambda = elecLambdaUp;
722 myElecLambda2 = elecLambda2Up;
723 p_energy_TI = &(energy_TI_1);
725 case 2: myElecLambda = elecLambdaDown;
726 myElecLambda2 = elecLambda2Down;
727 p_energy_TI = &(energy_TI_2);
729 default: myElecLambda = 0.0f;
730 myElecLambda2 = 0.0f;
735 double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
737 energySlow += energy_slow_tmp * myElecLambda;
739 energy_F += energy_slow_tmp * myElecLambda2;
742 if (p_energy_TI != NULL) {
743 (*p_energy_TI) += energy_slow_tmp;
747 energySlow += energy_slow_tmp;
751 float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
753 fSlow *= myElecLambda;
756 T T_fxij, T_fyij, T_fzij;
757 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
758 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
759 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
763 #ifdef WRITE_FULL_VIRIALS
764 float fxij = fSlow*xij;
765 float fyij = fSlow*yij;
766 float fzij = fSlow*zij;
767 virialSlow.xx = (fxij*xij);
768 virialSlow.xy = (fxij*yij);
769 virialSlow.xz = (fxij*zij);
770 virialSlow.yx = (fyij*xij);
771 virialSlow.yy = (fyij*yij);
772 virialSlow.yz = (fyij*zij);
773 virialSlow.zx = (fzij*xij);
774 virialSlow.zy = (fzij*yij);
775 virialSlow.zz = (fzij*zij);
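// Angle force/energy: harmonic in theta when angleValue.normal == 1,
// otherwise harmonic in cos(theta); includes an optional Urey-Bradley 1-3
// term (k_ub, r_ub) and alchemical FEP/TI scaling.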
781 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
782 __device__ void angleForce(const int index,
783 const CudaAngle* __restrict__ angleList,
784 const CudaAngleValue* __restrict__ angleValues,
785 const float4* __restrict__ xyzq,
787 const float3 lata, const float3 latb, const float3 latc,
788 T* __restrict__ force, double &energy,
789 T* __restrict__ forceList, int* forceListCounter,
790 int* forceListStarts, int* forceListNexts,
791 #ifdef WRITE_FULL_VIRIALS
792 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
794 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
796 double &energy_F, double &energy_TI_1, double &energy_TI_2
799 CudaAngle al = angleList[index];
801 float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
802 float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
803 float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
805 float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
806 float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
807 float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
809 float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
810 float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
811 float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
813 float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
814 float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
815 float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
817 float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
818 float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
820 float xijr = xij*rij_inv;
821 float yijr = yij*rij_inv;
822 float zijr = zij*rij_inv;
823 float xkjr = xkj*rkj_inv;
824 float ykjr = ykj*rkj_inv;
825 float zkjr = zkj*rkj_inv;
826 float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
828 CudaAngleValue angleValue = angleValues[al.itype];
829 angleValue.k *= al.scale;
832 if (angleValue.normal == 1) {
833 // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
834 cos_theta = min(0.999f, max(-0.999f, cos_theta));
835 float theta = acosf(cos_theta);
836 diff = theta - angleValue.theta0;
838 diff = cos_theta - angleValue.theta0;
841 float energy_tmp = 0.0f;
843 energy_tmp += angleValue.k * diff * diff;
848 // Be careful: alch_scale_energy_fep is 0 here!
849 float alch_scale_energy_fep;
// NOTE: Because of the Urey-Bradley terms, the energy evaluation is not
// finished at this point, so energy_tmp cannot be added to energy_TI_* and
// energy_F here. The scaling factor is still needed for the forces, so this
// code differs slightly from bondForce: compute the FEP energy scale factor
// first, apply it to the final result once the energy evaluation is complete,
// and use a pointer to select the correct energy_TI_x accumulator.
860 alch_scale_energy_fep = 0.0f;
861 if (doTI) p_energy_TI = NULL;
862 switch (al.fepBondedType) {
864 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
867 p_energy_TI = &(energy_TI_1);
870 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
876 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
879 p_energy_TI = &(energy_TI_2);
882 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
890 if (angleValue.normal == 1) {
891 float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
892 if (inv_sin_theta > 1.0e6) {
893 diff = (diff < 0.0f) ? 1.0f : -1.0f;
895 diff *= -inv_sin_theta;
898 diff *= -2.0f*angleValue.k;
900 // Do alchemical scaling
905 float dtxi = rij_inv*(xkjr - cos_theta*xijr);
906 float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
907 float dtyi = rij_inv*(ykjr - cos_theta*yijr);
908 float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
909 float dtzi = rij_inv*(zkjr - cos_theta*zijr);
910 float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
912 T T_dtxi, T_dtyi, T_dtzi;
913 T T_dtxj, T_dtyj, T_dtzj;
914 calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
915 calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
916 T T_dtxk = -T_dtxi - T_dtxj;
917 T T_dtyk = -T_dtyi - T_dtyj;
918 T T_dtzk = -T_dtzi - T_dtzj;
919 storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
921 if (angleValue.k_ub) {
922 float xik = xij - xkj;
923 float yik = yij - ykj;
924 float zik = zij - zkj;
925 float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
926 float rik = 1.0f/rik_inv;
927 float diff_ub = rik - angleValue.r_ub;
929 energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
931 diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
933 diff_ub *= alch_scale;
935 T T_dxik, T_dyik, T_dzik;
936 calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
949 energy += double(energy_tmp * alch_scale);
951 if (p_energy_TI != NULL) {
952 *(p_energy_TI) += double(energy_tmp);
956 energy_F += double(energy_tmp * alch_scale_energy_fep);
959 energy += double(energy_tmp);
963 storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
964 storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
968 #ifdef WRITE_FULL_VIRIALS
969 float fxi = ((float)T_dtxi);
970 float fyi = ((float)T_dtyi);
971 float fzi = ((float)T_dtzi);
972 float fxk = ((float)T_dtxj);
973 float fyk = ((float)T_dtyj);
974 float fzk = ((float)T_dtzj);
975 virial.xx = (fxi*xij) + (fxk*xkj);
976 virial.xy = (fxi*yij) + (fxk*ykj);
977 virial.xz = (fxi*zij) + (fxk*zkj);
978 virial.yx = (fyi*xij) + (fyk*xkj);
979 virial.yy = (fyi*yij) + (fyk*ykj);
980 virial.yz = (fyi*zij) + (fyk*zkj);
981 virial.zx = (fzi*xij) + (fzk*xkj);
982 virial.zy = (fzi*yij) + (fzk*ykj);
983 virial.zz = (fzi*zij) + (fzk*zkj);
989 // Dihedral computation
993 template <bool doEnergy>
994 __forceinline__ __device__
995 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
996 const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
999 if (doEnergy) e = 0.0;
1001 float phi = atan2f(sin_phi, cos_phi);
1005 CudaDihedralValue dihedralValue = dihedralValues[ic];
1006 dihedralValue.k *= scale;
// n is bit-packed: the low bit is 1 if another dihedral term follows (0 for the last term); the remaining bits hold the multiplicity.
1009 lrep = (dihedralValue.n & 1);
1010 dihedralValue.n >>= 1;
1011 if (dihedralValue.n) {
1012 float nf = dihedralValue.n;
1013 float x = nf*phi - dihedralValue.delta;
1016 sincosf(x, &sin_x, &cos_x);
1017 e += (double)(dihedralValue.k*(1.0f + cos_x));
1018 df += (double)(nf*dihedralValue.k*sin_x);
1020 float sin_x = sinf(x);
1021 df += (double)(nf*dihedralValue.k*sin_x);
1024 float diff = phi - dihedralValue.delta;
1025 if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
1026 if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
1028 e += (double)(dihedralValue.k*diff*diff);
1030 df -= 2.0f*dihedralValue.k*diff;
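// Proper/improper dihedral force and energy for diheList[index]: diheComp
// accumulates dE/dphi over the (possibly multi-term) dihedral parameters,
// and the result is converted to forces on the four atoms, with alchemical
// FEP/TI scaling of both energy and forces.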
1038 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1039 __device__ void diheForce(const int index,
1040 const CudaDihedral* __restrict__ diheList,
1041 const CudaDihedralValue* __restrict__ dihedralValues,
1042 const float4* __restrict__ xyzq,
1044 const float3 lata, const float3 latb, const float3 latc,
1045 T* __restrict__ force, double &energy,
1046 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1047 #ifdef WRITE_FULL_VIRIALS
1048 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1050 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1052 double &energy_F, double &energy_TI_1, double &energy_TI_2
1055 CudaDihedral dl = diheList[index];
1057 float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
1058 float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
1059 float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
1061 float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
1062 float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
1063 float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
1065 float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
1066 float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
1067 float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
1069 float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
1070 float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
1071 float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
1073 float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
1074 float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
1075 float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
1077 float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
1078 float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
1079 float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
1082 float ax = yij*zjk - zij*yjk;
1083 float ay = zij*xjk - xij*zjk;
1084 float az = xij*yjk - yij*xjk;
1085 float bx = ylk*zjk - zlk*yjk;
1086 float by = zlk*xjk - xlk*zjk;
1087 float bz = xlk*yjk - ylk*xjk;
1089 float ra2 = ax*ax + ay*ay + az*az;
1090 float rb2 = bx*bx + by*by + bz*bz;
1091 float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
1093 // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
1094 // nlinear = nlinear + 1
1097 float rgr = 1.0f / rg;
1098 float ra2r = 1.0f / ra2;
1099 float rb2r = 1.0f / rb2;
1100 float rabr = sqrtf(ra2r*rb2r);
1102 float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
1104 // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
// which simplifies to sin(phi) = |G| H.A/(|A|.|B|)
1106 float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
1108 // Energy and derivative contributions.
1112 diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
1114 // Alchemical transformation
1116 if (doFEP || doTI) {
1118 switch (dl.fepBondedType) {
1120 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1123 energy_TI_1 += (double)(e);
1126 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1132 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1135 energy_TI_2 += (double)(e);
1138 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1147 if (doFEP || doTI) {
1148 energy += e * alch_scale;
// Compute derivatives wrt Cartesian coordinates.
1157 // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
1158 // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
1160 float fg = xij*xjk + yij*yjk + zij*zjk;
1161 float hg = xlk*xjk + ylk*yjk + zlk*zjk;
1164 float fga = fg*ra2r*rgr;
1165 float hgb = hg*rb2r*rgr;
1166 float gaa = ra2r*rg;
1167 float gbb = rb2r*rg;
1168 // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
// Remember: T is a long long int (fixed-point accumulator).
// Don't try to scale T_fix and similar variables directly by a float.
1172 if (doFEP || doTI) {
1180 T T_fix, T_fiy, T_fiz;
1181 calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
1182 storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1185 calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
1186 T T_fjx = dgx - T_fix;
1187 T T_fjy = dgy - T_fiy;
1188 T T_fjz = dgz - T_fiz;
1189 storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1192 calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
1193 T T_fkx = -dhx - dgx;
1194 T T_fky = -dhy - dgy;
1195 T T_fkz = -dhz - dgz;
1196 storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1197 storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1201 #ifdef WRITE_FULL_VIRIALS
1202 float fix = ((float)T_fix);
1203 float fiy = ((float)T_fiy);
1204 float fiz = ((float)T_fiz);
1205 float fjx = ((float)dgx);
1206 float fjy = ((float)dgy);
1207 float fjz = ((float)dgz);
1208 float flx = ((float)dhx);
1209 float fly = ((float)dhy);
1210 float flz = ((float)dhz);
1211 virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
1212 virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
1213 virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
1214 virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
1215 virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
1216 virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
1217 virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
1218 virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
1219 virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
1225 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
1227 v1.y*v2.z-v2.y*v1.z,
1228 v2.x*v1.z-v1.x*v2.z,
1233 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
1234 return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
1238 // Calculates crossterms
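// CMAP-style correction: the two dihedral angles (atoms 1-2-3-4 and 5-6-7-8)
// index a bicubic patch of crosstermValues, which is evaluated for the energy
// and its derivatives dE/dphi and dE/dpsi.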
1240 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1241 __device__ void crosstermForce(
1243 const CudaCrossterm* __restrict__ crosstermList,
1244 const CudaCrosstermValue* __restrict__ crosstermValues,
1245 const float4* __restrict__ xyzq,
1247 const float3 lata, const float3 latb, const float3 latc,
1248 T* __restrict__ force, double &energy,
1249 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1250 #ifdef WRITE_FULL_VIRIALS
1251 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1253 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1255 double &energy_F, double &energy_TI_1, double &energy_TI_2
1258 CudaCrossterm cl = crosstermList[index];
1260 // ----------------------------------------------------------------------------
1261 // Angle between 1 - 2 - 3 - 4
1264 float3 sh12 = make_float3(
1265 cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
1266 cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
1267 cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
1269 float4 xyzq1 = xyzq[cl.i1];
1270 float4 xyzq2 = xyzq[cl.i2];
1272 float3 r12 = make_float3(
1273 xyzq1.x + sh12.x - xyzq2.x,
1274 xyzq1.y + sh12.y - xyzq2.y,
1275 xyzq1.z + sh12.z - xyzq2.z);
1278 float3 sh23 = make_float3(
1279 cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
1280 cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
1281 cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
1283 float4 xyzq3 = xyzq[cl.i3];
1285 float3 r23 = make_float3(
1286 xyzq2.x + sh23.x - xyzq3.x,
1287 xyzq2.y + sh23.y - xyzq3.y,
1288 xyzq2.z + sh23.z - xyzq3.z);
1291 float3 sh34 = make_float3(
1292 cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
1293 cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
1294 cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
1296 float4 xyzq4 = xyzq[cl.i4];
1298 float3 r34 = make_float3(
1299 xyzq3.x + sh34.x - xyzq4.x,
1300 xyzq3.y + sh34.y - xyzq4.y,
1301 xyzq3.z + sh34.z - xyzq4.z);
1303 // Calculate the cross products
1304 float3 A = cross(r12, r23);
1305 float3 B = cross(r23, r34);
1306 float3 C = cross(r23, A);
1308 // Calculate the inverse distances
1309 float inv_rA = rsqrtf(dot(A, A));
1310 float inv_rB = rsqrtf(dot(B, B));
1311 float inv_rC = rsqrtf(dot(C, C));
1313 // Calculate the sin and cos
1314 float cos_phi = dot(A, B)*(inv_rA*inv_rB);
1315 float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1317 float phi = -atan2f(sin_phi,cos_phi);
1319 // ----------------------------------------------------------------------------
1320 // Angle between 5 - 6 - 7 - 8
1324 float3 sh56 = make_float3(
1325 cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1326 cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1327 cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1329 float4 xyzq5 = xyzq[cl.i5];
1330 float4 xyzq6 = xyzq[cl.i6];
1332 float3 r56 = make_float3(
1333 xyzq5.x + sh56.x - xyzq6.x,
1334 xyzq5.y + sh56.y - xyzq6.y,
1335 xyzq5.z + sh56.z - xyzq6.z);
1338 float3 sh67 = make_float3(
1339 cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1340 cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1341 cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1343 float4 xyzq7 = xyzq[cl.i7];
1345 float3 r67 = make_float3(
1346 xyzq6.x + sh67.x - xyzq7.x,
1347 xyzq6.y + sh67.y - xyzq7.y,
1348 xyzq6.z + sh67.z - xyzq7.z);
1351 float3 sh78 = make_float3(
1352 cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1353 cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1354 cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1356 float4 xyzq8 = xyzq[cl.i8];
1358 float3 r78 = make_float3(
1359 xyzq7.x + sh78.x - xyzq8.x,
1360 xyzq7.y + sh78.y - xyzq8.y,
1361 xyzq7.z + sh78.z - xyzq8.z);
1363 // Calculate the cross products
1364 float3 D = cross(r56, r67);
1365 float3 E = cross(r67, r78);
1366 float3 F = cross(r67, D);
1368 // Calculate the inverse distances
1369 float inv_rD = rsqrtf(dot(D, D));
1370 float inv_rE = rsqrtf(dot(E, E));
1371 float inv_rF = rsqrtf(dot(F, F));
1373 // Calculate the sin and cos
1374 float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1375 float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1377 float psi = -atan2f(sin_psi,cos_psi);
1379 // ----------------------------------------------------------------------------
1380 // Calculate the energy
1382 const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1385 phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1386 psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
// distance, measured in grid points, from the smallest grid value to the angle
1389 float phi_h = phi * inv_h;
1390 float psi_h = psi * inv_h;
1392 // find smallest numbered grid point in stencil
1393 int iphi = (int)floor(phi_h);
1394 int ipsi = (int)floor(psi_h);
1396 const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1398 // Load coefficients
1400 c[0] = cp.c[iphi][ipsi][0];
1401 c[1] = cp.c[iphi][ipsi][1];
1402 c[2] = cp.c[iphi][ipsi][2];
1403 c[3] = cp.c[iphi][ipsi][3];
1405 float dphi = phi_h - (float)iphi;
1406 float dpsi = psi_h - (float)ipsi;
1409 float alch_scale_energy_fep;
1410 double* p_energy_TI;
1411 if (doFEP || doTI) {
1413 alch_scale_energy_fep = 0.0f;
1414 if (doTI) p_energy_TI = NULL;
1415 switch (cl.fepBondedType) {
1417 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1420 p_energy_TI = &(energy_TI_1);
1423 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
1429 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1432 p_energy_TI = &(energy_TI_2);
1435 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
1444 float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1445 energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1446 energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1447 energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
1448 if (doFEP || doTI) {
1449 energy += energyf * cl.scale * alch_scale;
1451 energy_F += energyf * cl.scale * alch_scale_energy_fep;
1454 if (p_energy_TI != NULL) {
1455 (*p_energy_TI) += energyf * cl.scale;
1459 energy += energyf * cl.scale;
1463 float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1464 dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1465 dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1466 dEdphi *= cl.scale*inv_h;
1468 float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1469 dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1470 dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1471 dEdpsi *= cl.scale*inv_h;
1473 // float normCross1 = dot(A, A);
1474 float square_r23 = dot(r23, r23);
1475 float norm_r23 = sqrtf(square_r23);
1476 float inv_square_r23 = 1.0f/square_r23;
1477 float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1478 float ff2 = -dot(r12, r23)*inv_square_r23;
1479 float ff3 = -dot(r34, r23)*inv_square_r23;
1480 float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
// NOTE: Why scaling only ff1 and ff4 is sufficient:
// f1 and f4 are already scaled, and in the formula for t1,
//   t1.x = ff2*f1.x - ff3*f4.x,
// both terms are scaled, so t1.x (and likewise t1.y, t1.z) is scaled.
// Then f2.x = t1.x - f1.x is a difference of two scaled quantities, so f2.x
// (and f2.y, f2.z) is scaled; by the same argument f3 is scaled as well.
// As a result, scaling ff1 and ff4 is enough. Do NOT scale ff2 and ff3!
1494 if (doFEP || doTI) {
1499 float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1500 float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1501 float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1502 float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1503 float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1505 T T_f1x, T_f1y, T_f1z;
1506 T T_f2x, T_f2y, T_f2z;
1507 T T_f3x, T_f3y, T_f3z;
1508 T T_f4x, T_f4y, T_f4z;
1509 convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1510 convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1511 convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1512 convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1513 storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1514 storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1515 storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1516 storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1518 float square_r67 = dot(r67, r67);
1519 float norm_r67 = sqrtf(square_r67);
1520 float inv_square_r67 = 1.0f/(square_r67);
1521 ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1522 ff2 = -dot(r56, r67)*inv_square_r67;
1523 ff3 = -dot(r78, r67)*inv_square_r67;
1524 ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1526 if (doFEP || doTI) {
1531 float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1532 float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1533 float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1534 float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1535 float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1537 T T_f5x, T_f5y, T_f5z;
1538 T T_f6x, T_f6y, T_f6z;
1539 T T_f7x, T_f7y, T_f7z;
1540 T T_f8x, T_f8y, T_f8z;
1541 convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1542 convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1543 convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1544 convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1545 storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1546 storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1547 storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1548 storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1552 #ifdef WRITE_FULL_VIRIALS
1553 float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1554 float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1555 virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1556 virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1557 virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1558 virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1559 virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1560 virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1561 virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1562 virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1563 virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
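// Thole screened dipole-dipole correction between two atom/Drude pairs
// (atom i with Drude j, atom k with Drude l): four screened Coulomb terms
// with damping factor (1 - (1 + u/2) exp(-u)), scaled by the alchemical
// electrostatic lambdas when FEP/TI is enabled.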
1569 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1570 __device__ void tholeForce(
1572 const CudaThole* __restrict__ tholeList,
1573 const float4* __restrict__ xyzq,
1575 const float3 lata, const float3 latb, const float3 latc,
1576 T* __restrict__ force, double &energy,
1577 T* __restrict__ forceList, int* forceListCounter,
1578 int* forceListStarts, int* forceListNexts,
1579 #ifdef WRITE_FULL_VIRIALS
1580 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1582 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1584 double &energy_F, double &energy_TI_1, double &energy_TI_2
1586 // For reference, similar calculation can be found in OpenMM's code repo:
1587 // https://github.com/openmm/openmm/blob/8f123ef287693bb4b553163a9d5fd101ea9e4462/plugins/drude/platforms/common/src/kernels/drudePairForce.cc
1588 // CHARMM source code: charmm/source/gener/drude.F90, FUNCTION E_THOLE
1589 const CudaThole tl = tholeList[index];
1591 const float shx_aiaj = tl.offset_aiaj.x*lata.x + tl.offset_aiaj.y*latb.x + tl.offset_aiaj.z*latc.x;
1592 const float shy_aiaj = tl.offset_aiaj.x*lata.y + tl.offset_aiaj.y*latb.y + tl.offset_aiaj.z*latc.y;
1593 const float shz_aiaj = tl.offset_aiaj.x*lata.z + tl.offset_aiaj.y*latb.z + tl.offset_aiaj.z*latc.z;
1595 const float shx_aidj = tl.offset_aidj.x*lata.x + tl.offset_aidj.y*latb.x + tl.offset_aidj.z*latc.x;
1596 const float shy_aidj = tl.offset_aidj.x*lata.y + tl.offset_aidj.y*latb.y + tl.offset_aidj.z*latc.y;
1597 const float shz_aidj = tl.offset_aidj.x*lata.z + tl.offset_aidj.y*latb.z + tl.offset_aidj.z*latc.z;
1599 const float shx_diaj = tl.offset_diaj.x*lata.x + tl.offset_diaj.y*latb.x + tl.offset_diaj.z*latc.x;
1600 const float shy_diaj = tl.offset_diaj.x*lata.y + tl.offset_diaj.y*latb.y + tl.offset_diaj.z*latc.y;
1601 const float shz_diaj = tl.offset_diaj.x*lata.z + tl.offset_diaj.y*latb.z + tl.offset_diaj.z*latc.z;
1603 const float shx_didj = tl.offset_didj.x*lata.x + tl.offset_didj.y*latb.x + tl.offset_didj.z*latc.x;
1604 const float shy_didj = tl.offset_didj.x*lata.y + tl.offset_didj.y*latb.y + tl.offset_didj.z*latc.y;
1605 const float shz_didj = tl.offset_didj.x*lata.z + tl.offset_didj.y*latb.z + tl.offset_didj.z*latc.z;
1607 const float raa_x = xyzq[tl.i].x + shx_aiaj - xyzq[tl.k].x;
1608 const float raa_y = xyzq[tl.i].y + shy_aiaj - xyzq[tl.k].y;
1609 const float raa_z = xyzq[tl.i].z + shz_aiaj - xyzq[tl.k].z;
1611 const float rad_x = xyzq[tl.i].x + shx_aidj - xyzq[tl.l].x;
1612 const float rad_y = xyzq[tl.i].y + shy_aidj - xyzq[tl.l].y;
1613 const float rad_z = xyzq[tl.i].z + shz_aidj - xyzq[tl.l].z;
1615 const float rda_x = xyzq[tl.j].x + shx_diaj - xyzq[tl.k].x;
1616 const float rda_y = xyzq[tl.j].y + shy_diaj - xyzq[tl.k].y;
1617 const float rda_z = xyzq[tl.j].z + shz_diaj - xyzq[tl.k].z;
1619 const float rdd_x = xyzq[tl.j].x + shx_didj - xyzq[tl.l].x;
1620 const float rdd_y = xyzq[tl.j].y + shy_didj - xyzq[tl.l].y;
1621 const float rdd_z = xyzq[tl.j].z + shz_didj - xyzq[tl.l].z;
1623 const float raa_invlen = rnorm3df(raa_x, raa_y, raa_z);
1624 const float rad_invlen = rnorm3df(rad_x, rad_y, rad_z);
1625 const float rda_invlen = rnorm3df(rda_x, rda_y, rda_z);
1626 const float rdd_invlen = rnorm3df(rdd_x, rdd_y, rdd_z);
1628 const float auaa = tl.aa / raa_invlen;
1629 const float auad = tl.aa / rad_invlen;
1630 const float auda = tl.aa / rda_invlen;
1631 const float audd = tl.aa / rdd_invlen;
1634 const float expauaa = expf(-auaa);
1635 const float expauad = expf(-auad);
1636 const float expauda = expf(-auda);
1637 const float expaudd = expf(-audd);
1640 float polyauaa = 1.0f + 0.5f*auaa;
1641 float polyauad = 1.0f + 0.5f*auad;
1642 float polyauda = 1.0f + 0.5f*auda;
1643 float polyaudd = 1.0f + 0.5f*audd;
1647 ethole += tl.qq * raa_invlen * (1.0f - polyauaa * expauaa);
1648 ethole += -tl.qq * rad_invlen * (1.0f - polyauad * expauad);
1649 ethole += -tl.qq * rda_invlen * (1.0f - polyauda * expauda);
1650 ethole += tl.qq * rdd_invlen * (1.0f - polyaudd * expaudd);
1651 if (doFEP || doTI) {
1652 switch (tl.fepBondedType) {
1654 // 0 < typeSum && typeSum < order
1655 if (doTI) energy_TI_1 += (double)ethole;
1657 // elecLambda2Up = simParams->getElecLambda(alchLambda2)
1658 energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Up * ethole;
1659 // elecLambdaUp = simParams->getElecLambda(alchLambda)
1660 ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1665 if (doTI) energy_TI_2 += (double)ethole;
1667 // elecLambda2Down = simParams->getElecLambda(1-alchLambda2)
1668 energy_F += (AlchBondedCUDA::alchLambdas).elecLambda2Down * ethole;
1669 // elecLambdaDown = simParams->getElecLambda(1-alchLambda)
1670 ethole *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1676 energy += (double)ethole;
1679 polyauaa = 1.0f + auaa*polyauaa;
1680 polyauad = 1.0f + auad*polyauad;
1681 polyauda = 1.0f + auda*polyauda;
1682 polyaudd = 1.0f + audd*polyaudd;
1684 const float raa_invlen3 = raa_invlen * raa_invlen * raa_invlen;
1685 const float rad_invlen3 = rad_invlen * rad_invlen * rad_invlen;
1686 const float rda_invlen3 = rda_invlen * rda_invlen * rda_invlen;
1687 const float rdd_invlen3 = rdd_invlen * rdd_invlen * rdd_invlen;
1689 // df = (1/r) (dU/dr)
1690 float dfaa = -tl.qq * raa_invlen3 * (polyauaa*expauaa - 1);
1691 float dfad = tl.qq * rad_invlen3 * (polyauad*expauad - 1);
1692 float dfda = tl.qq * rda_invlen3 * (polyauda*expauda - 1);
1693 float dfdd = -tl.qq * rdd_invlen3 * (polyaudd*expaudd - 1);
1695 if (doFEP || doTI) {
1696 switch (tl.fepBondedType) {
1698 dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1699 dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1700 dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1701 dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaUp;
1705 dfaa *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1706 dfad *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1707 dfda *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1708 dfdd *= (AlchBondedCUDA::alchLambdas).elecLambdaDown;
1714 T T_fx0, T_fy0, T_fz0;
1715 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfad, rad_x, rad_y, rad_z, T_fx0, T_fy0, T_fz0);
1716 storeForces<T>(T_fx0, T_fy0, T_fz0, tl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1718 T T_fx1, T_fy1, T_fz1;
1719 calcComponentForces<T>(dfda, rda_x, rda_y, rda_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx1, T_fy1, T_fz1);
1720 storeForces<T>(T_fx1, T_fy1, T_fz1, tl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1722 T T_fx2, T_fy2, T_fz2;
1723 calcComponentForces<T>(dfaa, raa_x, raa_y, raa_z, dfda, rda_x, rda_y, rda_z, T_fx2, T_fy2, T_fz2);
1724 storeForces<T>(-T_fx2, -T_fy2, -T_fz2, tl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1726 T T_fx3, T_fy3, T_fz3;
1727 calcComponentForces<T>(dfad, rad_x, rad_y, rad_z, dfdd, rdd_x, rdd_y, rdd_z, T_fx3, T_fy3, T_fz3);
1728 storeForces<T>(-T_fx3, -T_fy3, -T_fz3, tl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1731 #ifdef WRITE_FULL_VIRIALS
1732 const float faa_x = dfaa * raa_x;
1733 const float faa_y = dfaa * raa_y;
1734 const float faa_z = dfaa * raa_z;
1735 const float fad_x = dfad * rad_x;
1736 const float fad_y = dfad * rad_y;
1737 const float fad_z = dfad * rad_z;
1738 const float fda_x = dfda * rda_x;
1739 const float fda_y = dfda * rda_y;
1740 const float fda_z = dfda * rda_z;
1741 const float fdd_x = dfdd * rdd_x;
1742 const float fdd_y = dfdd * rdd_y;
1743 const float fdd_z = dfdd * rdd_z;
1744 virial.xx = faa_x * raa_x + fad_x * rad_x + fda_x * rda_x + fdd_x * rdd_x;
1745 virial.xy = faa_x * raa_y + fad_x * rad_y + fda_x * rda_y + fdd_x * rdd_y;
1746 virial.xz = faa_x * raa_z + fad_x * rad_z + fda_x * rda_z + fdd_x * rdd_z;
1747 virial.yx = faa_y * raa_x + fad_y * rad_x + fda_y * rda_x + fdd_y * rdd_x;
1748 virial.yy = faa_y * raa_y + fad_y * rad_y + fda_y * rda_y + fdd_y * rdd_y;
1749 virial.yz = faa_y * raa_z + fad_y * rad_z + fda_y * rda_z + fdd_y * rdd_z;
1750 virial.zx = faa_z * raa_x + fad_z * rad_x + fda_z * rda_x + fdd_z * rdd_x;
1751 virial.zy = faa_z * raa_y + fad_z * rad_y + fda_z * rda_y + fdd_z * rdd_y;
1752 virial.zz = faa_z * raa_z + fad_z * rad_z + fda_z * rda_z + fdd_z * rdd_z;
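// Anisotropic Drude spring (ANISO term): the atom-Drude displacement (i -> j)
// is projected onto the i-l axis (parallel component) and the m-n axis
// (perpendicular component), with separate force constants kiso0, kpar0 and
// kperp0; bond-lambda scaling is applied when FEP/TI is enabled.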
1757 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1758 __device__ void anisoForce(
1760 const CudaAniso* __restrict__ anisoList,
1761 const float4* __restrict__ xyzq,
1763 const float3 lata, const float3 latb, const float3 latc,
1764 T* __restrict__ force, double &energy,
1765 T* __restrict__ forceList, int* forceListCounter,
1766 int* forceListStarts, int* forceListNexts,
1767 #ifdef WRITE_FULL_VIRIALS
1768 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1770 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1772 double &energy_F, double &energy_TI_1, double &energy_TI_2
1774 const CudaAniso al = anisoList[index];
1776 const float shx_il = al.offset_il.x*lata.x + al.offset_il.y*latb.x + al.offset_il.z*latc.x;
1777 const float shy_il = al.offset_il.x*lata.y + al.offset_il.y*latb.y + al.offset_il.z*latc.y;
1778 const float shz_il = al.offset_il.x*lata.z + al.offset_il.y*latb.z + al.offset_il.z*latc.z;
1780 const float shx_mn = al.offset_mn.x*lata.x + al.offset_mn.y*latb.x + al.offset_mn.z*latc.x;
1781 const float shy_mn = al.offset_mn.x*lata.y + al.offset_mn.y*latb.y + al.offset_mn.z*latc.y;
1782 const float shz_mn = al.offset_mn.x*lata.z + al.offset_mn.y*latb.z + al.offset_mn.z*latc.z;
1784 // calculate parallel and perpendicular displacement vectors
1785 // shortest vector image: ri - rl
1786 const float ril_x = xyzq[al.i].x + shx_il - xyzq[al.l].x;
1787 const float ril_y = xyzq[al.i].y + shy_il - xyzq[al.l].y;
1788 const float ril_z = xyzq[al.i].z + shz_il - xyzq[al.l].z;
1790 // shortest vector image: rm - rn
1791 const float rmn_x = xyzq[al.m].x + shx_mn - xyzq[al.n].x;
1792 const float rmn_y = xyzq[al.m].y + shy_mn - xyzq[al.n].y;
1793 const float rmn_z = xyzq[al.m].z + shz_mn - xyzq[al.n].z;
1795 // need recip lengths of r_il, r_mn
1796 const float ril_invlen = rnorm3df(ril_x, ril_y, ril_z);
1797 const float rmn_invlen = rnorm3df(rmn_x, rmn_y, rmn_z);
1799 // normalize r_il, r_mn
1800 const float u1_x = ril_x * ril_invlen;
1801 const float u1_y = ril_y * ril_invlen;
1802 const float u1_z = ril_z * ril_invlen;
1803 const float u2_x = rmn_x * rmn_invlen;
1804 const float u2_y = rmn_y * rmn_invlen;
1805 const float u2_z = rmn_z * rmn_invlen;
1807 // Drude displacement vector (ri, rj are in same patch)
1808 const float dr_x = xyzq[al.j].x - xyzq[al.i].x;
1809 const float dr_y = xyzq[al.j].y - xyzq[al.i].y;
1810 const float dr_z = xyzq[al.j].z - xyzq[al.i].z;
1812 // parallel displacement
1813 const float dpar = dr_x * u1_x + dr_y * u1_y + dr_z * u1_z;
1815 // perpendicular displacement
1816 const float dperp = dr_x * u2_x + dr_y * u2_y + dr_z * u2_z;
1819 float fj_x = -al.kiso0 * dr_x;
1820 float fj_y = -al.kiso0 * dr_y;
1821 float fj_z = -al.kiso0 * dr_z;
1822 fj_x -= al.kpar0 * dpar * u1_x;
1823 fj_y -= al.kpar0 * dpar * u1_y;
1824 fj_z -= al.kpar0 * dpar * u1_z;
1825 fj_x -= al.kperp0 * dperp * u2_x;
1826 fj_y -= al.kperp0 * dperp * u2_y;
1827 fj_z -= al.kperp0 * dperp * u2_z;
1830 float fl_x = al.kpar0 * dpar * ril_invlen * dr_x;
1831 float fl_y = al.kpar0 * dpar * ril_invlen * dr_y;
1832 float fl_z = al.kpar0 * dpar * ril_invlen * dr_z;
1833 fl_x -= al.kpar0 * dpar * dpar * ril_invlen * u1_x;
1834 fl_y -= al.kpar0 * dpar * dpar * ril_invlen * u1_y;
1835 fl_z -= al.kpar0 * dpar * dpar * ril_invlen * u1_z;
1838 float fm_x = al.kperp0 * dperp * dperp * rmn_invlen * u2_x;
1839 float fm_y = al.kperp0 * dperp * dperp * rmn_invlen * u2_y;
1840 float fm_z = al.kperp0 * dperp * dperp * rmn_invlen * u2_z;
1841 fm_x -= al.kperp0 * dperp * rmn_invlen * dr_x;
1842 fm_y -= al.kperp0 * dperp * rmn_invlen * dr_y;
1843 fm_z -= al.kperp0 * dperp * rmn_invlen * dr_z;
1846 // aniso spring energy
1847 // kpar reduces response along carbonyl vector
1848 // kperp reduces response perp to bond vector
1849 // (regulates in- and out-of-plane response)
1850 const float dr2 = dr_x * dr_x + dr_y * dr_y + dr_z * dr_z;
1851 float eaniso = 0.5f * al.kpar0 * dpar * dpar + 0.5f * al.kperp0 * dperp * dperp + 0.5f * al.kiso0 * dr2;
1852 if (doFEP || doTI) {
1853 switch (al.fepBondedType) {
1855 if (doTI) energy_TI_1 += (double)eaniso;
1857 energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1858 eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1859 fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1860 fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1861 fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1862 fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1863 fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1864 fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1865 fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1866 fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1867 fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1872 if (doTI) energy_TI_2 += (double)eaniso;
1874 energy_F += (double)(eaniso * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1875 eaniso *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1876 fj_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1877 fj_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1878 fj_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1879 fl_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1880 fl_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1881 fl_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1882 fm_x *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1883 fm_y *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1884 fm_z *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1890 energy += (double)eaniso;
1893 // accumulate forces
1894 T T_fx0, T_fy0, T_fz0;
1895 convertForces<T>(-(fj_x + fl_x), -(fj_y + fl_y), -(fj_z + fl_z), T_fx0, T_fy0, T_fz0);
1896 storeForces<T>(T_fx0, T_fy0, T_fz0, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1898 T T_fx1, T_fy1, T_fz1;
1899 convertForces<T>(fj_x, fj_y, fj_z, T_fx1, T_fy1, T_fz1);
1900 storeForces<T>(T_fx1, T_fy1, T_fz1, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1902 T T_fx2, T_fy2, T_fz2;
1903 convertForces<T>(fl_x, fl_y, fl_z, T_fx2, T_fy2, T_fz2);
1904 storeForces<T>(T_fx2, T_fy2, T_fz2, al.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1906 T T_fx3, T_fy3, T_fz3;
1907 convertForces<T>(fm_x, fm_y, fm_z, T_fx3, T_fy3, T_fz3);
1908 storeForces<T>(T_fx3, T_fy3, T_fz3, al.m, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1910 T T_fx4, T_fy4, T_fz4;
1911 convertForces<T>(-fm_x, -fm_y, -fm_z, T_fx4, T_fy4, T_fz4);
1912 storeForces<T>(T_fx4, T_fy4, T_fz4, al.n, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1915 #ifdef WRITE_FULL_VIRIALS
1916 virial.xx = fj_x * dr_x - fl_x * ril_x + fm_x * rmn_x;
1917 virial.xy = fj_x * dr_y - fl_x * ril_y + fm_x * rmn_y;
1918 virial.xz = fj_x * dr_z - fl_x * ril_z + fm_x * rmn_z;
1919 virial.yx = fj_y * dr_x - fl_y * ril_x + fm_y * rmn_x;
1920 virial.yy = fj_y * dr_y - fl_y * ril_y + fm_y * rmn_y;
1921 virial.yz = fj_y * dr_z - fl_y * ril_z + fm_y * rmn_z;
1922 virial.zx = fj_z * dr_x - fl_z * ril_x + fm_z * rmn_x;
1923 virial.zy = fj_z * dr_y - fl_z * ril_y + fm_z * rmn_y;
1924 virial.zz = fj_z * dr_z - fl_z * ril_z + fm_z * rmn_z;
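// The anisotropic Drude spring energy computed above is
//   E = 0.5*kiso0*|dr|^2 + 0.5*kpar0*dpar^2 + 0.5*kperp0*dperp^2
// where dpar and dperp are projections of the Drude displacement dr = rj - ri
// onto the unit vectors u1 = (ri - rl)/|ri - rl| and u2 = (rm - rn)/|rm - rn|.
// The block below is a minimal host-side sketch of this energy term (periodic
// image shifts omitted), e.g. for unit testing; it is illustrative only and
// not part of the NAMD API, so the names and standalone form are assumptions.
#if 0
#include <cmath>
struct AnisoRef { float kiso0, kpar0, kperp0; };
static double anisoEnergyRef(const AnisoRef& p,
                             const double ri[3], const double rj[3],
                             const double rl[3], const double rm[3],
                             const double rn[3]) {
  double u1[3], u2[3], dr[3];
  double n1 = 0.0, n2 = 0.0;
  for (int c = 0; c < 3; c++) {
    u1[c] = ri[c] - rl[c];          // i -> l direction (unnormalized)
    u2[c] = rm[c] - rn[c];          // m -> n direction (unnormalized)
    dr[c] = rj[c] - ri[c];          // Drude displacement
    n1 += u1[c]*u1[c];
    n2 += u2[c]*u2[c];
  }
  n1 = std::sqrt(n1); n2 = std::sqrt(n2);
  double dpar = 0.0, dperp = 0.0, dr2 = 0.0;
  for (int c = 0; c < 3; c++) {
    dpar  += dr[c]*(u1[c]/n1);      // parallel projection
    dperp += dr[c]*(u2[c]/n2);      // perpendicular projection
    dr2   += dr[c]*dr[c];
  }
  return 0.5*p.kpar0*dpar*dpar + 0.5*p.kperp0*dperp*dperp + 0.5*p.kiso0*dr2;
}
#endif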
1930 #define BONDEDFORCESKERNEL_NUM_WARP 2
1932 #define BONDEDFORCESKERNEL_NUM_WARP 4
1935 // Calculates all forces in a single kernel call
1937 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1938 __global__ void bondedForcesKernel(
1942 const CudaBond* __restrict__ bonds,
1943 const CudaBondValue* __restrict__ bondValues,
1945 const int numAngles,
1946 const CudaAngle* __restrict__ angles,
1947 const CudaAngleValue* __restrict__ angleValues,
1949 const int numDihedrals,
1950 const CudaDihedral* __restrict__ dihedrals,
1951 const CudaDihedralValue* __restrict__ dihedralValues,
1953 const int numImpropers,
1954 const CudaDihedral* __restrict__ impropers,
1955 const CudaDihedralValue* __restrict__ improperValues,
1957 const int numExclusions,
1958 const CudaExclusion* __restrict__ exclusions,
1960 const int numCrossterms,
1961 const CudaCrossterm* __restrict__ crossterms,
1962 const CudaCrosstermValue* __restrict__ crosstermValues,
1964 const int numTholes,
1965 const CudaThole* __restrict__ tholes,
1967 const int numAnisos,
1968 const CudaAniso* __restrict__ anisos,
1970 const float cutoff2,
1971 const float r2_delta, const int r2_delta_expc,
1973 const float* __restrict__ r2_table,
1974 const float4* __restrict__ exclusionTable,
1976 #if !defined(USE_TABLE_ARRAYS)
1977 cudaTextureObject_t r2_table_tex,
1978 cudaTextureObject_t exclusionTableTex,
1981 const float4* __restrict__ xyzq,
1983 const float3 lata, const float3 latb, const float3 latc,
1984 T* __restrict__ force,
1985 T* __restrict__ forceSlow,
1986 T* __restrict__ forceList,
1987 int* forceListCounter,
1988 int* forceListStarts,
1989 int* forceListStartsSlow,
1990 int* forceListNexts,
1991 double* __restrict__ energies_virials) {
1993 // Thread-block index
1994 int indexTB = start + blockIdx.x;
1996 const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1997 const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1998 const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1999 const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
2000 const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
2001 const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
2002 const int numTholesTB = (numTholes + blockDim.x - 1)/blockDim.x;
2003 const int numAnisosTB = (numAnisos + blockDim.x - 1)/blockDim.x;
2005 // Each thread computes a single bonded interaction.
2006 // Each thread block handles a single bonded tuple type.
2013 int energyIndex_TI_1;
2014 int energyIndex_TI_2;
2026 energyIndex_TI_1 = -1;
2027 energyIndex_TI_2 = -1;
2031 #ifdef WRITE_FULL_VIRIALS
2032 ComputeBondedCUDAKernel::BondedVirial<double> virial;
2044 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2048 if (indexTB < numBondsTB) {
2049 int i = threadIdx.x + indexTB*blockDim.x;
2051 energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
2053 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
2056 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
2057 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
2061 bondForce<T, doEnergy, doVirial, doFEP, doTI>
2062 (i, bonds, bondValues, xyzq,
2063 stride, lata, latb, latc,
2065 forceList, forceListCounter, forceListStarts, forceListNexts,
2067 energy_F, energy_TI_1, energy_TI_2);
2071 indexTB -= numBondsTB;
2073 if (indexTB < numAnglesTB) {
2074 int i = threadIdx.x + indexTB*blockDim.x;
2076 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
2078 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
2081 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
2082 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
2085 if (i < numAngles) {
2086 angleForce<T, doEnergy, doVirial, doFEP, doTI>
2087 (i, angles, angleValues, xyzq, stride,
2090 forceList, forceListCounter, forceListStarts, forceListNexts,
2092 energy_F, energy_TI_1, energy_TI_2);
2096 indexTB -= numAnglesTB;
2098 if (indexTB < numDihedralsTB) {
2099 int i = threadIdx.x + indexTB*blockDim.x;
2101 energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
2103 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
2106 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
2107 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
2111 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2113 if (i < numDihedrals) {
2114 diheForce<T, doEnergy, doVirial, doFEP, doTI>
2115 (i, dihedrals, dihedralValues, xyzq, stride,
2118 forceList, forceListCounter, forceListStarts, forceListNexts,
2120 energy_F, energy_TI_1, energy_TI_2);
2124 indexTB -= numDihedralsTB;
2126 if (indexTB < numImpropersTB) {
2127 int i = threadIdx.x + indexTB*blockDim.x;
2129 energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
2131 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
2134 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
2135 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
2138 if (i < numImpropers) {
2139 diheForce<T, doEnergy, doVirial, doFEP, doTI>
2140 (i, impropers, improperValues, xyzq, stride,
2143 forceList, forceListCounter, forceListStarts, forceListNexts,
2145 energy_F, energy_TI_1, energy_TI_2);
2149 indexTB -= numImpropersTB;
2151 if (indexTB < numExclusionsTB) {
2152 int i = threadIdx.x + indexTB*blockDim.x;
2154 energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
2156 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
2159 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
2160 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
2164 virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
2166 if (i < numExclusions) {
2167 exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
2168 (i, exclusions, r2_delta, r2_delta_expc,
2169 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
2170 r2_table, exclusionTable,
2172 r2_table_tex, exclusionTableTex,
2174 xyzq, stride, lata, latb, latc, cutoff2,
2176 forceList, forceListCounter, forceListStartsSlow, forceListNexts,
2178 energy_F, energy_TI_1, energy_TI_2);
2182 indexTB -= numExclusionsTB;
2184 if (indexTB < numCrosstermsTB) {
2185 int i = threadIdx.x + indexTB*blockDim.x;
2187 energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
2189 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
2192 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
2193 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
2197 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
2199 if (i < numCrossterms) {
2200 crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
2201 (i, crossterms, crosstermValues,
2202 xyzq, stride, lata, latb, latc,
2204 forceList, forceListCounter, forceListStarts, forceListNexts,
2206 energy_F, energy_TI_1, energy_TI_2);
2210 indexTB -= numCrosstermsTB;
2212 if (indexTB < numTholesTB) {
2213 int i = threadIdx.x + indexTB*blockDim.x;
2215 energyIndex = ComputeBondedCUDAKernel::energyIndex_THOLE;
2217 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_THOLE_F;
2220 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_1;
2221 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_THOLE_TI_2;
2225 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2227 if (i < numTholes) {
2228 tholeForce<T, doEnergy, doVirial, doFEP, doTI>(
2229 i, tholes, xyzq, stride, lata, latb, latc,
2230 force, energy, forceList, forceListCounter,
2231 forceListStarts, forceListNexts,
2232 virial, energy_F, energy_TI_1, energy_TI_2);
2236 indexTB -= numTholesTB;
2238 if (indexTB < numAnisosTB) {
2239 int i = threadIdx.x + indexTB*blockDim.x;
2241 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANISO;
2243 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANISO_F;
2246 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_1;
2247 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANISO_TI_2;
2251 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
2254 anisoForce<T, doEnergy, doVirial, doFEP, doTI>(
2255 i, anisos, xyzq, stride, lata, latb, latc,
2256 force, energy, forceList, forceListCounter,
2257 forceListStarts, forceListNexts,
2258 virial, energy_F, energy_TI_1, energy_TI_2);
2262 // index -= numAnisosTB
2266 // Write energies to global memory
2268 // energyIndex is constant within thread-block
2269 __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
2270 __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2271 __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2272 __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
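// Two-stage reduction (explanatory note): each warp first reduces its partial
// energy with an XOR-butterfly shuffle so every lane ends up holding the warp
// sum; lane 0 of each warp then stores that sum into shared memory, and the
// first warp repeats the shuffle reduction over the per-warp partials before
// a single atomicAdd per block commits the result to global memory.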
2274 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2275 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2278 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2281 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2282 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2285 int laneID = (threadIdx.x & (WARPSIZE - 1));
2286 int warpID = threadIdx.x / WARPSIZE;
2288 shEnergy[warpID] = energy;
2290 shEnergy_F[warpID] = energy_F;
2293 shEnergy_TI_1[warpID] = energy_TI_1;
2294 shEnergy_TI_2[warpID] = energy_TI_2;
2299 energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
2301 energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
2304 energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
2305 energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
2308 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2309 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
2312 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
2315 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
2316 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
2320 const int bin = blockIdx.x % ATOMIC_BINS;
2321 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
2323 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
2326 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
2327 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
2333 // Write virials to global memory
2334 #ifdef WRITE_FULL_VIRIALS
2337 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2338 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2339 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2340 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2341 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2342 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2343 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2344 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2345 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2346 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2348 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
2349 int laneID = (threadIdx.x & (WARPSIZE - 1));
2350 int warpID = threadIdx.x / WARPSIZE;
2352 shVirial[warpID] = virial;
2366 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
2368 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2369 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
2370 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
2371 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
2372 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
2373 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
2374 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
2375 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
2376 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
2377 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
2381 const int bin = blockIdx.x % ATOMIC_BINS;
2382 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
2383 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
2384 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
2385 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
2386 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
2387 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
2388 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
2389 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
2390 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
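// Note on work decomposition: bondedForcesKernel is launched with one grid
// covering all tuple types. Each thread block owns blockDim.x consecutive
// tuples of exactly one type; the flat block index indexTB is walked through
// the per-type block counts (numBondsTB, numAnglesTB, ...) by subtracting each
// count in turn until it falls inside a type's range. The host-side sketch
// below mirrors that mapping for illustration only; the helper name and
// standalone form are assumptions, not part of the kernel interface.
#if 0
enum TupleKind { kBond, kAngle, kDihedral, kImproper,
                 kExclusion, kCrossterm, kThole, kAniso };
// Given a flat block index and per-type block counts, return the tuple type
// the block works on and its block offset within that type.
static TupleKind mapBlockToTuple(int indexTB, const int countTB[8], int& local) {
  for (int t = 0; t < 8; t++) {
    if (indexTB < countTB[t]) { local = indexTB; return static_cast<TupleKind>(t); }
    indexTB -= countTB[t];
  }
  local = -1;
  return kAniso;  // unreachable when indexTB < sum of the per-type counts
}
#endif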
2398 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
2399 __global__ void modifiedExclusionForcesKernel(
2401 const int numModifiedExclusions,
2402 const CudaExclusion* __restrict__ modifiedExclusions,
2404 const float one_scale14, // 1 - scale14
2405 const float cutoff2,
2406 const CudaNBConstants nbConstants,
2407 const int vdwCoefTableWidth,
2408 const float2* __restrict__ vdwCoefTable,
2409 #if !defined(USE_TABLE_ARRAYS)
2410 cudaTextureObject_t vdwCoefTableTex,
2412 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2413 const float4* __restrict__ modifiedExclusionForceTable,
2414 const float4* __restrict__ modifiedExclusionEnergyTable,
2416 cudaTextureObject_t modifiedExclusionForceTableTex,
2417 cudaTextureObject_t modifiedExclusionEnergyTableTex,
2419 const float4* __restrict__ xyzq,
2421 const float3 lata, const float3 latb, const float3 latc,
2422 T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2423 T* __restrict__ forceList,
2424 int* forceListCounter,
2425 int* forceListStartsNbond,
2426 int* forceListStartsSlow,
2427 int* forceListNexts,
2428 double* __restrict__ energies_virials) {
2431 int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2433 double energyVdw, energyNbond, energySlow;
2434 // Alchemical energies
2435 double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2436 double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2437 double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2444 energyVdw_TI_1 = 0.0;
2445 energyVdw_TI_2 = 0.0;
2451 energyNbond_F = 0.0;
2455 energyNbond_TI_1 = 0.0;
2456 energyNbond_TI_2 = 0.0;
2457 energySlow_TI_1 = 0.0;
2458 energySlow_TI_2 = 0.0;
2463 #ifdef WRITE_FULL_VIRIALS
2464 ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2465 ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2467 virialNbond.xx = 0.0;
2468 virialNbond.xy = 0.0;
2469 virialNbond.xz = 0.0;
2470 virialNbond.yx = 0.0;
2471 virialNbond.yy = 0.0;
2472 virialNbond.yz = 0.0;
2473 virialNbond.zx = 0.0;
2474 virialNbond.zy = 0.0;
2475 virialNbond.zz = 0.0;
2477 virialSlow.xx = 0.0;
2478 virialSlow.xy = 0.0;
2479 virialSlow.xz = 0.0;
2480 virialSlow.yx = 0.0;
2481 virialSlow.yy = 0.0;
2482 virialSlow.yz = 0.0;
2483 virialSlow.zx = 0.0;
2484 virialSlow.zy = 0.0;
2485 virialSlow.zz = 0.0;
2490 if (i < numModifiedExclusions)
2492 modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2493 (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2494 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2499 // For modified exclusions, use the table arrays when building for HIP with USE_TABLE_ARRAYS; otherwise use textures.
2500 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2502 modifiedExclusionForceTable,
2503 modifiedExclusionEnergyTable,
2505 // CUDA builds, and HIP builds without table arrays, fall back to the texture force/energy tables.
2506 modifiedExclusionForceTableTex,
2507 modifiedExclusionEnergyTableTex,
2509 xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2510 energyVdw, forceNbond, energyNbond,
2511 forceSlow, energySlow,
2512 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2513 virialNbond, virialSlow,
2514 energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2515 energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2516 energySlow_F, energySlow_TI_1, energySlow_TI_2);
2519 // Write energies to global memory
2521 __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2522 __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2523 __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2524 __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2525 __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2526 __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2527 __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2528 __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2529 __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2530 __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2531 __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2532 __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2534 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2535 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2538 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2541 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2542 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2545 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2546 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2548 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2549 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2552 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2553 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2554 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2555 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2559 int laneID = (threadIdx.x & (WARPSIZE - 1));
2560 int warpID = threadIdx.x / WARPSIZE;
2562 shEnergyVdw[warpID] = energyVdw;
2564 shEnergyVdw_F[warpID] = energyVdw_F;
2567 shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2568 shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2571 shEnergyNbond[warpID] = energyNbond;
2572 shEnergySlow[warpID] = energySlow;
2574 shEnergyNbond_F[warpID] = energyNbond_F;
2575 shEnergySlow_F[warpID] = energySlow_F;
2578 shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2579 shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2580 shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2581 shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2587 energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2589 energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2592 energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2593 energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2596 energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2597 energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2599 energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2600 energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2603 energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2604 energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2605 energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2606 energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2610 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2611 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2614 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2617 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2618 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2621 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2622 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2624 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2625 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2628 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2629 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2630 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2631 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2636 const int bin = blockIdx.x % ATOMIC_BINS;
2637 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2639 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2642 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2643 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2646 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2647 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2649 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2650 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2653 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2654 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2655 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2656 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2663 // Write virials to global memory
2664 #ifdef WRITE_FULL_VIRIALS
2667 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2668 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2669 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2670 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2671 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2672 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2673 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2674 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2675 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2676 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2677 if (doElect && doSlow) {
2678 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2679 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2680 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2681 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2682 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2683 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2684 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2685 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2686 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2689 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2690 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2691 int laneID = (threadIdx.x & (WARPSIZE - 1));
2692 int warpID = threadIdx.x / WARPSIZE;
2694 shVirialNbond[warpID] = virialNbond;
2695 if (doElect && doSlow) {
2696 shVirialSlow[warpID] = virialSlow;
2701 virialNbond.xx = 0.0;
2702 virialNbond.xy = 0.0;
2703 virialNbond.xz = 0.0;
2704 virialNbond.yx = 0.0;
2705 virialNbond.yy = 0.0;
2706 virialNbond.yz = 0.0;
2707 virialNbond.zx = 0.0;
2708 virialNbond.zy = 0.0;
2709 virialNbond.zz = 0.0;
2710 if (doElect && doSlow) {
2711 virialSlow.xx = 0.0;
2712 virialSlow.xy = 0.0;
2713 virialSlow.xz = 0.0;
2714 virialSlow.yx = 0.0;
2715 virialSlow.yy = 0.0;
2716 virialSlow.yz = 0.0;
2717 virialSlow.zx = 0.0;
2718 virialSlow.zy = 0.0;
2719 virialSlow.zz = 0.0;
2723 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2724 virialNbond = shVirialNbond[laneID];
2725 if (doElect && doSlow) {
2726 virialSlow = shVirialSlow[laneID];
2730 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2731 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2732 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2733 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2734 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2735 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2736 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2737 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2738 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2739 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2740 if (doElect && doSlow) {
2741 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2742 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2743 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2744 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2745 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2746 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2747 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2748 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2749 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2755 const int bin = blockIdx.x % ATOMIC_BINS;
2756 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2757 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2758 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2759 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2760 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2761 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2762 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2763 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2764 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2765 if (doElect && doSlow) {
2766 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2767 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2768 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2769 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2770 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2771 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2772 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
2773 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
2774 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
2783 template <typename T>
2784 __global__ void gatherBondedForcesKernel(
2786 const int atomStorageSize,
2788 const T* __restrict__ forceList,
2789 const int* __restrict__ forceListStarts,
2790 const int* __restrict__ forceListNexts,
2791 T* __restrict__ force) {
2793 int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
2795 if (i < atomStorageSize) {
2799 int pos = forceListStarts[i];
2801 fx += forceList[pos * 3 + 0];
2802 fy += forceList[pos * 3 + 1];
2803 fz += forceList[pos * 3 + 2];
2804 pos = forceListNexts[pos];
2807 force[i + stride ] = fy;
2808 force[i + stride * 2] = fz;
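// gatherBondedForcesKernel walks a per-atom singly linked list: forceListStarts[i]
// holds the head index of atom i's contributions, forceList stores (x,y,z) triples
// at pos*3, and forceListNexts[pos] links to the next contribution (-1 terminates,
// matching the -1 memset on the host side). A minimal host-side analogue under
// those assumptions, with double as a stand-in for FORCE_TYPE, purely for
// illustration:
#if 0
static void gatherBondedForcesRef(int atomStorageSize, int stride,
                                  const double* forceList,
                                  const int* forceListStarts,
                                  const int* forceListNexts,
                                  double* force) {
  for (int i = 0; i < atomStorageSize; i++) {
    double fx = 0.0, fy = 0.0, fz = 0.0;
    // Follow atom i's chain of stored contributions and accumulate them.
    for (int pos = forceListStarts[i]; pos != -1; pos = forceListNexts[pos]) {
      fx += forceList[pos * 3 + 0];
      fy += forceList[pos * 3 + 1];
      fz += forceList[pos * 3 + 2];
    }
    force[i             ] = fx;
    force[i + stride    ] = fy;
    force[i + stride * 2] = fz;
  }
}
#endif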
2812 __global__ void reduceBondedBinsKernel(double* energies_virials) {
2813 const int bin = threadIdx.x;
2815 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2816 __shared__ typename WarpReduce::TempStorage tempStorage;
2818 double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
2819 if (threadIdx.x == 0) {
2820 energies_virials[blockIdx.x] = v;
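// reduceBondedBinsKernel collapses the ATOMIC_BINS copies of each accumulator:
// threadIdx.x selects the bin and blockIdx.x selects the quantity, so the
// expected launch shape (inferred from the indexing above, not shown in this
// excerpt) is one block per entry with ATOMIC_BINS threads, e.g.
//   reduceBondedBinsKernel<<< energies_virials_SIZE, ATOMIC_BINS, 0, stream >>>(energies_virials);
// The reduced value for each quantity lands back in slot 0 of the array.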
2824 // ##############################################################################################
2825 // ##############################################################################################
2826 // ##############################################################################################
2829 // Class constructor
2831 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
2832 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
2834 cudaCheck(cudaSetDevice(deviceID));
2843 modifiedExclusions = NULL;
2852 numModifiedExclusions = 0;
2860 dihedralValues = NULL;
2861 improperValues = NULL;
2862 crosstermValues = NULL;
2871 forceListStarts = NULL;
2872 forceListNexts = NULL;
2874 forceListStartsSize = 0;
2875 forceListNextsSize = 0;
2876 allocate_device<int>(&forceListCounter, 1);
2878 allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
2884 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
2885 cudaCheck(cudaSetDevice(deviceID));
2887 deallocate_device<double>(&energies_virials);
2888 // deallocate_device<BondedVirial>(&virial);
2890 if (tupleData != NULL) deallocate_device<char>(&tupleData);
2891 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
2892 if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
2894 if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
2895 if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
2896 if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
2897 if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
2899 if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
2900 if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
2901 if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
2902 if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
2903 if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
2906 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
2907 allocate_device<CudaBondValue>(&bondValues, numBondValues);
2908 copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
2911 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
2912 allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
2913 copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
2916 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
2917 allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
2918 copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
2921 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
2922 allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
2923 copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
2926 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
2927 allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
2928 copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
2931 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
2932 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
2935 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
2936 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
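// Host-side usage sketch (illustrative, with assumed example values): the
// lambdas are pushed into the __constant__ AlchBondedCUDA::alchLambdas before
// launching the bonded kernels. Only the members referenced in this file are
// shown; the struct declares additional members.
#if 0
void exampleUpdateLambdas(ComputeBondedCUDAKernel& kernel, cudaStream_t stream) {
  CudaAlchLambdas lambdas{};          // zero-initialize all members
  lambdas.bondLambda1  = 0.75f;       // assumed example values
  lambdas.bondLambda2  = 0.25f;
  lambdas.bondLambda12 = 0.80f;
  lambdas.bondLambda22 = 0.20f;
  kernel.updateCudaAlchLambdas(&lambdas, stream);
}
#endif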
2939 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
2940 alchFlags = h_cudaAlchFlags;
2944 // Update bonded lists
2946 void ComputeBondedCUDAKernel::setTupleCounts(
2947 const TupleCounts count
2949 numBonds = count.bond;
2950 numAngles = count.angle;
2951 numDihedrals = count.dihedral;
2952 numImpropers = count.improper;
2953 numModifiedExclusions = count.modifiedExclusion;
2954 numExclusions = count.exclusion;
2955 numCrossterms = count.crossterm;
2956 numTholes = count.thole;
2957 numAnisos = count.aniso;
2960 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
2963 count.bond = numBonds;
2964 count.angle = numAngles;
2965 count.dihedral = numDihedrals;
2966 count.improper = numImpropers;
2967 count.modifiedExclusion = numModifiedExclusions;
2968 count.exclusion = numExclusions;
2969 count.crossterm = numCrossterms;
2970 count.thole = numTholes;
2971 count.aniso = numAnisos;
2976 TupleData ComputeBondedCUDAKernel::getData() {
2979 data.angle = angles;
2980 data.dihedral = dihedrals;
2981 data.improper = impropers;
2982 data.modifiedExclusion = modifiedExclusions;
2983 data.exclusion = exclusions;
2984 data.crossterm = crossterms;
2985 data.thole = tholes;
2986 data.aniso = anisos;
2991 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
2992 const TupleCounts countIn,
2995 const int numBondsWA = warpAlign(countIn.bond);
2996 const int numAnglesWA = warpAlign(countIn.angle);
2997 const int numDihedralsWA = warpAlign(countIn.dihedral);
2998 const int numImpropersWA = warpAlign(countIn.improper);
2999 const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
3000 const int numExclusionsWA = warpAlign(countIn.exclusion);
3001 const int numCrosstermsWA = warpAlign(countIn.crossterm);
3002 const int numTholesWA = warpAlign(countIn.thole);
3003 const int numAnisosWA = warpAlign(countIn.aniso);
3005 const size_t sizeTot = numBondsWA*sizeof(CudaBond)
3006 + numAnglesWA*sizeof(CudaAngle)
3007 + numDihedralsWA*sizeof(CudaDihedral)
3008 + numImpropersWA*sizeof(CudaDihedral)
3009 + numModifiedExclusionsWA*sizeof(CudaExclusion)
3010 + numExclusionsWA*sizeof(CudaExclusion)
3011 + numCrosstermsWA*sizeof(CudaCrossterm)
3012 + numTholesWA*sizeof(CudaThole)
3013 + numAnisosWA*sizeof(CudaAniso);
3015 reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
3019 bonds = (CudaBond *)&tupleData[pos];
3020 pos += numBondsWA*sizeof(CudaBond);
3022 angles = (CudaAngle* )&tupleData[pos];
3023 pos += numAnglesWA*sizeof(CudaAngle);
3025 dihedrals = (CudaDihedral* )&tupleData[pos];
3026 pos += numDihedralsWA*sizeof(CudaDihedral);
3028 impropers = (CudaDihedral* )&tupleData[pos];
3029 pos += numImpropersWA*sizeof(CudaDihedral);
3031 modifiedExclusions = (CudaExclusion* )&tupleData[pos];
3032 pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
3034 exclusions = (CudaExclusion* )&tupleData[pos];
3035 pos += numExclusionsWA*sizeof(CudaExclusion);
3037 crossterms = (CudaCrossterm* )&tupleData[pos];
3038 pos += numCrosstermsWA*sizeof(CudaCrossterm);
3040 tholes = (CudaThole *)&tupleData[pos];
3041 pos += numTholesWA*sizeof(CudaThole);
3043 anisos = (CudaAniso *)&tupleData[pos];
3044 pos += numAnisosWA*sizeof(CudaAniso);
3049 void ComputeBondedCUDAKernel::update(
3050 const int numBondsIn,
3051 const int numAnglesIn,
3052 const int numDihedralsIn,
3053 const int numImpropersIn,
3054 const int numModifiedExclusionsIn,
3055 const int numExclusionsIn,
3056 const int numCrosstermsIn,
3057 const int numTholesIn,
3058 const int numAnisosIn,
3059 const char* h_tupleData,
3060 cudaStream_t stream) {
3062 numBonds = numBondsIn;
3063 numAngles = numAnglesIn;
3064 numDihedrals = numDihedralsIn;
3065 numImpropers = numImpropersIn;
3066 numModifiedExclusions = numModifiedExclusionsIn;
3067 numExclusions = numExclusionsIn;
3068 numCrossterms = numCrosstermsIn;
3069 numTholes = numTholesIn;
3070 numAnisos = numAnisosIn;
3072 const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
3074 copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
3077 void ComputeBondedCUDAKernel::updateAtomBuffer(
3078 const int atomStorageSize,
3081 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3085 // Return stride for force array
3087 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
3088 #ifdef USE_STRIDED_FORCE
3089 // Align stride to 256 bytes
3090 return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
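// Worked example (assuming an 8-byte FORCE_TYPE): with atomStorageSize = 1000,
// 1000*8 = 8000 bytes rounds up to 8192, the next multiple of 256, giving a
// stride of 8192/8 = 1024 elements, so each of the x/y/z component sub-arrays
// starts on a 256-byte boundary.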
3097 // Return size of a single force array
3099 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
3100 #ifdef USE_STRIDED_FORCE
3101 return (3*getForceStride(atomStorageSize));
3103 return (3*atomStorageSize);
3108 // Return total size of all force arrays
3110 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
3112 int forceSize = getForceSize(atomStorageSize);
3114 if (numModifiedExclusions > 0 || numExclusions > 0) {
3116 // All three force arrays [normal, nbond, slow]
3119 // Two force arrays [normal, nbond]
3128 // Compute bonded forces
3130 void ComputeBondedCUDAKernel::bondedForce(
3131 const double scale14, const int atomStorageSize,
3132 const bool doEnergy, const bool doVirial, const bool doSlow,
3134 const float3 lata, const float3 latb, const float3 latc,
3135 const float cutoff2, const float r2_delta, const int r2_delta_expc,
3136 const CudaNBConstants nbConstants,
3137 const float4* h_xyzq, FORCE_TYPE* h_forces,
3138 double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
3139 bool useDeviceMigration, cudaStream_t stream) {
3141 int forceStorageSize = getAllForceSize(atomStorageSize, true);
3142 int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
3143 int forceStride = getForceStride(atomStorageSize);
3145 int forceSize = getForceSize(atomStorageSize);
3146 int startNbond = forceSize;
3147 int startSlow = 2*forceSize;
3148 // Re-allocate coordinate and force arrays if necessary
3149 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
3150 reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
3152 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3154 // numBonds bondForce 2
3155 // numAngles angleForce 3
3156 // numDihedrals diheForce 4
3157 // numImpropers diheForce 4
3158 // numExclusions exclusionForce 2
3159 // numCrossterms crosstermForce 8
3160 // numModifiedExclusions modifiedExclusionForce 4
3161 // numTholes tholeForce 4
3162 // numAnisos anisoForce 5
3163 int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4 + numTholes * 4 + numAnisos * 5);
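// listSize note: each tuple routine stores one (x,y,z) contribution per atom it
// updates, so the per-type multipliers listed above count atoms touched per
// tuple (e.g. 2 for a bond, 3 for an angle, 8 for a crossterm), and the factor
// of 3 accounts for the three force components kept per contribution in forceList.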
3164 reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
3165 reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
3166 reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
3167 int* forceListStartsNbond = forceListStarts + atomStorageSize;
3168 int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
3170 clear_device_array<int>(forceListCounter, 1, stream);
3171 cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
3173 int* forceListStartsNbond = NULL;
3174 int* forceListStartsSlow = NULL;
3177 #ifdef NODEGROUP_FORCE_REGISTER
3178 if(CUDASOAintegrate){
3179 if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3180 }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3182 copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
3184 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
3185 clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
3187 if (doEnergy || doVirial ) {
3188 clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
3191 // Check if we are doing alchemical free energy calculation
3192 // Is checking alchOn required?
3193 const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
3194 const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
3196 float one_scale14 = (float)(1.0 - scale14);
3198 // If doSlow = false, these exclusions are not calculated
3199 int numExclusionsDoSlow = doSlow ? numExclusions : 0;
3201 int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3203 int numBondsTB = (numBonds + nthread - 1)/nthread;
3204 int numAnglesTB = (numAngles + nthread - 1)/nthread;
3205 int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
3206 int numImpropersTB = (numImpropers + nthread - 1)/nthread;
3207 int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
3208 int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
3209 int numTholesTB = (numTholes + nthread - 1)/nthread;
3210 int numAnisosTB = (numAnisos + nthread - 1)/nthread;
3212 int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
3213 numExclusionsTB + numCrosstermsTB + numTholesTB + numAnisosTB;
3216 // printf("%d %d %d %d %d %d nblock %d\n",
3217 // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
3219 int max_nblock = deviceCUDA->getMaxNumBlocks();
3222 while (start < nblock)
3224 int nleft = nblock - start;
3225 int nblock_use = min(max_nblock, nleft);
3227 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3228 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
3229 cudaNonbondedTables.getExclusionTable()
3231 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
3232 cudaNonbondedTables.getExclusionTableTex()
3235 #if defined(USE_TABLE_ARRAYS)
3236 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
3237 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
3238 <<< nblock_use, nthread, shmem_size, stream >>> \
3239 (start, numBonds, bonds, bondValues, \
3240 numAngles, angles, angleValues, \
3241 numDihedrals, dihedrals, dihedralValues, \
3242 numImpropers, impropers, improperValues, \
3243 numExclusionsDoSlow, exclusions, \
3244 numCrossterms, crossterms, crosstermValues, \
3245 numTholes, tholes, \
3246 numAnisos, anisos, \
3248 r2_delta, r2_delta_expc, \
3250 xyzq, forceStride, \
3252 forces, &forces[startSlow], \
3253 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3256 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
3257 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
3258 <<< nblock_use, nthread, shmem_size, stream >>> \
3259 (start, numBonds, bonds, bondValues, \
3260 numAngles, angles, angleValues, \
3261 numDihedrals, dihedrals, dihedralValues, \
3262 numImpropers, impropers, improperValues, \
3263 numExclusionsDoSlow, exclusions, \
3264 numCrossterms, crossterms, crosstermValues, \
3265 numTholes, tholes, \
3266 numAnisos, anisos, \
3268 r2_delta, r2_delta_expc, \
3269 cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
3270 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
3271 xyzq, forceStride, \
3273 forces, &forces[startSlow], \
3274 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
3277 if (!doEnergy && !doVirial && !doFEP && !doTI) CALL(0, 0, 0, 0);
3278 if (!doEnergy && doVirial && !doFEP && !doTI) CALL(0, 1, 0, 0);
3279 if ( doEnergy && !doVirial && !doFEP && !doTI) CALL(1, 0, 0, 0);
3280 if ( doEnergy && doVirial && !doFEP && !doTI) CALL(1, 1, 0, 0);
3282 if (!doEnergy && !doVirial && doFEP && !doTI) CALL(0, 0, 1, 0);
3283 if (!doEnergy && doVirial && doFEP && !doTI) CALL(0, 1, 1, 0);
3284 if ( doEnergy && !doVirial && doFEP && !doTI) CALL(1, 0, 1, 0);
3285 if ( doEnergy && doVirial && doFEP && !doTI) CALL(1, 1, 1, 0);
3287 if (!doEnergy && !doVirial && !doFEP && doTI) CALL(0, 0, 0, 1);
3288 if (!doEnergy && doVirial && !doFEP && doTI) CALL(0, 1, 0, 1);
3289 if ( doEnergy && !doVirial && !doFEP && doTI) CALL(1, 0, 0, 1);
3290 if ( doEnergy && doVirial && !doFEP && doTI) CALL(1, 1, 0, 1);
3292 // Can't enable both FEP and TI, don't expand the following functions.
3294 if (!doEnergy && !doVirial && doFEP && doTI) CALL(0, 0, 1, 1);
3295 if (!doEnergy && doVirial && doFEP && doTI) CALL(0, 1, 1, 1);
3296 if ( doEnergy && !doVirial && doFEP && doTI) CALL(1, 0, 1, 1);
3297 if ( doEnergy && doVirial && doFEP && doTI) CALL(1, 1, 1, 1);
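// Dispatch note: the CALL macro instantiates bondedForcesKernel for one fixed
// combination of the doEnergy/doVirial/doFEP/doTI template flags, so the
// branches above select, at run time, one of the compile-time specialized
// kernels; code paths disabled by a template flag are removed by the compiler
// rather than branched over inside the kernel.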
3302 cudaCheck(cudaGetLastError());
3304 start += nblock_use;
3307 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3308 nblock = (numModifiedExclusions + nthread - 1)/nthread;
3310 const bool doElect = (one_scale14 != 0.0f);
3313 while (start < nblock)
3315 int nleft = nblock - start;
3316 int nblock_use = min(max_nblock, nleft);
3318 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
3319 #define TABLE_PARAMS \
3320 cudaNonbondedTables.getExclusionVdwCoefTable(), \
3321 cudaNonbondedTables.getModifiedExclusionForceTable(), \
3322 cudaNonbondedTables.getModifiedExclusionEnergyTable()
3324 #define TABLE_PARAMS \
3325 cudaNonbondedTables.getExclusionVdwCoefTable(), \
3326 cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
3327 cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
3328 cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
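// TABLE_PARAMS note: on HIP builds with USE_TABLE_ARRAYS the macro passes raw
// device arrays for the modified-exclusion force/energy tables; all other
// builds pass texture objects, matching the parameter list declared by
// modifiedExclusionForcesKernel above.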
3331 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
3332 modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
3333 <<< nblock_use, nthread, shmem_size, stream >>> \
3334 (start, numModifiedExclusions, modifiedExclusions, \
3335 doSlow, one_scale14, cutoff2, nbConstants, \
3336 cudaNonbondedTables.getVdwCoefTableWidth(), \
3338 xyzq, forceStride, lata, latb, latc, \
3339 &forces[startNbond], &forces[startSlow], \
3340 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
3345 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
3346 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
3347 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
3348 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
3350 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
3351 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
3352 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
3353 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
3355 if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
3356 if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
3357 if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
3358 if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
3360 if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
3361 if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
3362 if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
3363 if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
3365 if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
3366 if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
3367 if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
3368 if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
3370 if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
3371 if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
3372 if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
3373 if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
3375 // Disabling doTable is only supported without FEP and without TI.
3376 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
3377 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
3378 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
3379 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
3381 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
3382 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
3383 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
3384 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
3386 // Can't enable both FEP and TI; don't expand the following functions.
3388 // if (!doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(0, 0, 0, 1, 1);
3389 // if (!doEnergy && doVirial && !doElect && doFEP && doTI) CALL(0, 1, 0, 1, 1);
3390 // if ( doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(1, 0, 0, 1, 1);
3391 // if ( doEnergy && doVirial && !doElect && doFEP && doTI) CALL(1, 1, 0, 1, 1);
3393 // if (!doEnergy && !doVirial && doElect && doFEP && doTI) CALL(0, 0, 1, 1, 1);
3394 // if (!doEnergy && doVirial && doElect && doFEP && doTI) CALL(0, 1, 1, 1, 1);
3395 // if ( doEnergy && !doVirial && doElect && doFEP && doTI) CALL(1, 0, 1, 1, 1);
3396 // if ( doEnergy && doVirial && doElect && doFEP && doTI) CALL(1, 1, 1, 1, 1);
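// FEP and TI are mutually exclusive at runtime, so the (doFEP && doTI)
// combinations above are intentionally left unexpanded; instantiating them
// would only add unreachable template specializations and compile time.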
3400 cudaCheck(cudaGetLastError());
3402 start += nblock_use;
3404 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
3405 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
3406 nblock = (atomStorageSize + nthread - 1)/nthread;
3409 while (start < nblock)
3411 int nleft = nblock - start;
3412 int nblock_use = min(max_nblock, nleft);
3414 // cudaCheck(hipDeviceSynchronize());
3415 // auto t0 = std::chrono::high_resolution_clock::now();
3417 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3418 start, atomStorageSize, forceStride,
3419 forceList, forceListStarts, forceListNexts,
3420 forces);
3421 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3422 start, atomStorageSize, forceStride,
3423 forceList, forceListStartsNbond, forceListNexts,
3424 &forces[startNbond]);
3426 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
3427 start, atomStorageSize, forceStride,
3428 forceList, forceListStartsSlow, forceListNexts,
3429 &forces[startSlow]);
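// When USE_BONDED_FORCE_ATOMIC_STORE is not defined, the force kernels do not
// atomically accumulate into the output arrays; each contribution is appended
// to forceList and linked into a per-atom singly linked list through
// forceListStarts* / forceListNexts. The three launches above walk those lists
// once per atom for the normal, modified-exclusion (nonbonded) and slow force
// slots. Roughly (illustrative only; the actual entry layout and list
// terminator are defined by the kernels themselves):
//
//   int j = forceListStarts[atom];
//   while (j != -1) {
//     fx += forceList[3 * j + 0];
//     fy += forceList[3 * j + 1];
//     fz += forceList[3 * j + 2];
//     j = forceListNexts[j];
//   }
//   // write fx, fy, fz into the output with forceStride between components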
3431 cudaCheck(cudaGetLastError());
3433 // cudaCheck(hipStreamSynchronize(stream));
3434 // auto t1 = std::chrono::high_resolution_clock::now();
3435 // std::chrono::duration<double> diff1 = t1 - t0;
3436 // std::cout << "gatherBondedForcesKernel";
3437 // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
3439 start += nblock_use;
3443 #ifdef NODEGROUP_FORCE_REGISTER
3444 if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
3445 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3447 // XXX TODO: ERASE THIS AFTERWARDS
3448 // this is not numAtoms, this is something else
3449 // will print the force inside the compute and afterwards
3450 FILE* pos_nb_atoms = fopen("compute_b_dforce.txt", "w");
3451 //fprintf(pos_nb_atoms, "Calculating %d positions\n", tlKernel.getCudaPatchesSize());
3452 // positions are wrong here.
3453 //for(int i = 0; i < atomStorageSize; i++){
3454 for(int i = 29415; i < 29895; i++){
3455 fprintf(pos_nb_atoms, "%2.4lf\n", h_forcesDP[i]);
3457 fclose(pos_nb_atoms);
3462 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3464 if (doEnergy || doVirial) {
3465 if (ATOMIC_BINS > 1) {
3466 // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3467 reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
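// With ATOMIC_BINS > 1 each energy/virial term was accumulated into several
// independent bins (presumably to spread atomic updates across distinct
// addresses and reduce contention); the launch above folds the bins back into
// energies_virials[0..energies_virials_SIZE-1] before the host copy below.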
3469 // virial copy, is this necessary?
3470 copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);