2 #define _USE_MATH_DEFINES
3 #define __thread __declspec(thread)
7 #include <hipcub/hipcub.hpp>
13 #if __CUDACC_VER_MAJOR__ >= 11
14 #include <cub/cub.cuh>
16 #include <namd_cub/cub.cuh>
22 #include "ComputeBondedCUDAKernel.h"
23 #include "CudaComputeNonbondedInteractions.h"
25 #ifdef FUTURE_CUDADEVICE
26 #include "CudaDevice.h"
28 #include "DeviceCUDA.h"
29 extern __thread DeviceCUDA *deviceCUDA;
32 #ifndef USE_TABLE_ARRAYS
33 __device__ __forceinline__
34 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
35 #if defined(NAMD_CUDA)
36 return tex1D<float4>(tex, k);
38 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
39 const float x = k * (float)tableSize - 0.5f;
40 const float f = floorf(x);
41 const float a = x - f;
42 const unsigned int i = (unsigned int)f;
43 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
44 const int i1 = i0 + 1;
45 const float4 t0 = tex1Dfetch<float4>(tex, i0);
46 const float4 t1 = tex1Dfetch<float4>(tex, i1);
48 a * (t1.x - t0.x) + t0.x,
49 a * (t1.y - t0.y) + t0.y,
50 a * (t1.z - t0.z) + t0.z,
51 a * (t1.w - t0.w) + t0.w);
56 __device__ __forceinline__
57 float4 tableLookup(const float4* table, const float k)
59 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
60 const float x = k * static_cast<float>(tableSize) - 0.5f;
61 const float f = floorf(x);
62 const float a = x - f;
63 const int i = static_cast<int>(f);
64 const int i0 = max(0, min(tableSize - 1, i));
65 const int i1 = max(0, min(tableSize - 1, i + 1));
66 const float4 t0 = __ldg(&table[i0]);
67 const float4 t1 = __ldg(&table[i1]);
69 a * (t1.x - t0.x) + t0.x,
70 a * (t1.y - t0.y) + t0.y,
71 a * (t1.z - t0.z) + t0.z,
72 a * (t1.w - t0.w) + t0.w);
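// Both lookup helpers interpolate a tabulated float4 at a normalized coordinate
// k in [0,1): x = k*tableSize - 0.5 locates the sample between entries i0 and
// i1 = i0 + 1, and a = x - floor(x) is the interpolation weight, so the result
// is (1-a)*t0 + a*t1 per component. On NAMD_CUDA the same interpolation is
// presumably done in hardware by tex1D, assuming the texture object is set up
// with normalized coordinates and linear filtering.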
75 // Global variables for the alchemical transformation

76 namespace AlchBondedCUDA {
77 // Alchemical parameters and lambdas
78 __constant__ CudaAlchParameters alchParams;
79 __constant__ CudaAlchLambdas alchLambdas;
84 __forceinline__ __device__
85 void convertForces(const float x, const float y, const float z,
86 T &Tx, T &Ty, T &Tz) {
94 __forceinline__ __device__
95 void calcComponentForces(
97 const float dx, const float dy, const float dz,
98 T &fxij, T &fyij, T &fzij) {
106 template <typename T>
107 __forceinline__ __device__
108 void calcComponentForces(
110 const float dx1, const float dy1, const float dz1,
112 const float dx2, const float dy2, const float dz2,
113 T &fxij, T &fyij, T &fzij) {
115 fxij = (T)(fij1*dx1 + fij2*dx2);
116 fyij = (T)(fij1*dy1 + fij2*dy2);
117 fzij = (T)(fij1*dz1 + fij2*dz2);
120 __forceinline__ __device__
121 int warpAggregatingAtomicInc(int* counter) {
122 #if BOUNDINGBOXSIZE == 64
123 WarpMask mask = __ballot(1);
124 int total = __popcll(mask);
125 int prefix = __popcll(mask & cub::LaneMaskLt());
126 int firstLane = __ffsll(mask) - 1;
128 WarpMask mask = __ballot(1);
129 int total = __popc(mask);
130 int prefix = __popc(mask & cub::LaneMaskLt());
131 int firstLane = __ffs(mask) - 1;
135 start = atomicAdd(counter, total);
137 start = WARP_SHUFFLE(mask, start, firstLane, WARPSIZE);
138 return start + prefix;
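// Warp-aggregated atomic increment: one ballot collects the active lanes, the
// first active lane issues a single atomicAdd for the whole warp, the resulting
// base offset is broadcast back with a shuffle, and each lane returns
// base + (number of active lanes below it). This replaces up to WARPSIZE
// atomics with one atomic per warp.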
141 template <typename T>
142 __forceinline__ __device__
143 void storeForces(const T fx, const T fy, const T fz,
144 const int ind, const int stride,
146 T* forceList, int* forceListCounter,
147 int* forceListStarts, int* forceListNexts) {
149 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
150 #if defined(NAMD_HIP)
151 // Try to decrease conflicts between lanes if there are repeating ind values
152 WarpMask mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, 1);
153 if (mask == ~(WarpMask)0) { // All lanes are active (may not be true for the last block)
154 const int laneId = threadIdx.x % WARPSIZE;
155 const int prevLaneInd = WARP_SHUFFLE(WARP_FULL_MASK, ind, laneId - 1, WARPSIZE);
156 const bool isHead = laneId == 0 || ind != prevLaneInd;
157 if (!NAMD_WARP_ALL(WARP_FULL_MASK, isHead)) {
158 // There are segments of repeating ind
159 typedef cub::WarpReduce<T> WarpReduce;
160 __shared__ typename WarpReduce::TempStorage temp_storage;
161 const T sumfx = WarpReduce(temp_storage).HeadSegmentedSum(fx, isHead);
162 const T sumfy = WarpReduce(temp_storage).HeadSegmentedSum(fy, isHead);
163 const T sumfz = WarpReduce(temp_storage).HeadSegmentedSum(fz, isHead);
165 atomicAdd(&force[ind ], sumfx);
166 atomicAdd(&force[ind + stride ], sumfy);
167 atomicAdd(&force[ind + stride*2], sumfz);
172 // Not all lanes are active (the last block) or there are no repeating ind values
173 atomicAdd(&force[ind ], fx);
174 atomicAdd(&force[ind + stride ], fy);
175 atomicAdd(&force[ind + stride*2], fz);
177 atomicAdd(&force[ind ], fx);
178 atomicAdd(&force[ind + stride ], fy);
179 atomicAdd(&force[ind + stride*2], fz);
182 const int newPos = warpAggregatingAtomicInc(forceListCounter);
183 forceListNexts[newPos] = atomicExch(&forceListStarts[ind], newPos);
184 forceList[newPos * 3 + 0] = fx;
185 forceList[newPos * 3 + 1] = fy;
186 forceList[newPos * 3 + 2] = fz;
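// A sketch of the three force-accumulation strategies above, inferred from the
// code: with USE_BONDED_FORCE_ATOMIC_STORE on HIP, lanes whose ind repeats are
// first combined with a head-segmented warp reduction so only the segment heads
// issue atomicAdds; on other builds every lane does a plain atomicAdd into the
// strided force array; without atomic stores, each contribution is appended to
// a per-atom linked list (forceListStarts/forceListNexts) that is presumably
// summed in a later pass.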
193 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
194 __device__ void bondForce(
196 const CudaBond* __restrict__ bondList,
197 const CudaBondValue* __restrict__ bondValues,
198 const float4* __restrict__ xyzq,
200 const float3 lata, const float3 latb, const float3 latc,
201 T* __restrict__ force, double &energy,
202 T* __restrict__ forceList, int* forceListCounter,
203 int* forceListStarts, int* forceListNexts,
204 #ifdef WRITE_FULL_VIRIALS
205 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
207 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
209 double &energy_F, double &energy_TI_1, double &energy_TI_2
212 CudaBond bl = bondList[index];
213 CudaBondValue bondValue = bondValues[bl.itype];
214 // if (bondValue.x0 == 0.0f) return;
216 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
217 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
218 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
220 float4 xyzqi = xyzq[bl.i];
221 float4 xyzqj = xyzq[bl.j];
223 float xij = xyzqi.x + shx - xyzqj.x;
224 float yij = xyzqi.y + shy - xyzqj.y;
225 float zij = xyzqi.z + shz - xyzqj.z;
227 float r = sqrtf(xij*xij + yij*yij + zij*zij);
229 float db = r - bondValue.x0;
231 // in this case, the bond represents a harmonic wall potential
232 // where x0 is the lower wall and x1 is the upper
233 db = (r > bondValue.x1 ? r - bondValue.x1 : (r > bondValue.x0 ? 0 : db));
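// Piecewise wall restraint: db = r - x0 below the lower wall, 0 inside
// [x0, x1], and r - x1 above the upper wall, so the harmonic term k*db only
// pushes r back toward the allowed interval.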
235 float fij = db * bondValue.k * bl.scale;
237 // Define a temporary variable to store the energy
238 // used by both the alchemical and non-alchemical routes
239 float energy_tmp = 0.0f;
241 energy_tmp += fij * db;
245 // JM: TODO: move this out of here
246 float alch_scale = 1.0f;
248 switch (bl.fepBondedType) {
250 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
252 energy_TI_1 += (double)(energy_tmp);
255 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
260 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
262 energy_TI_2 += (double)(energy_tmp);
265 energy_F += (double)(energy_tmp * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
274 energy += (double)(energy_tmp * alch_scale);
276 energy += (double)(energy_tmp);
280 // XXX TODO: Get this off here
281 // XXX TODO: Decide on a templated parameter nomenclature
286 T T_fxij, T_fyij, T_fzij;
287 calcComponentForces<T>(fij, xij, yij, zij, T_fxij, T_fyij, T_fzij);
290 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
291 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
295 #ifdef WRITE_FULL_VIRIALS
296 float fxij = fij*xij;
297 float fyij = fij*yij;
298 float fzij = fij*zij;
299 virial.xx = (fxij*xij);
300 virial.xy = (fxij*yij);
301 virial.xz = (fxij*zij);
302 virial.yx = (fyij*xij);
303 virial.yy = (fyij*yij);
304 virial.yz = (fyij*zij);
305 virial.zx = (fzij*xij);
306 virial.zy = (fzij*yij);
307 virial.zz = (fzij*zij);
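// The per-bond virial contribution is the outer product of the pair force and
// the pair separation, virial_ab = (fij*r_ij)_a * (r_ij)_b, accumulated per
// thread here and reduced later in the calling kernel.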
313 // Calculates modified exclusions
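// These are presumably the 1-4 pairs: atom pairs excluded from the regular
// nonbonded kernel but still interacting through modified 1-4 Lennard-Jones
// parameters and electrostatics scaled by scale14. Note that qq below is
// formed with one_scale14 = 1 - scale14 and the tabulated terms enter with a
// negative sign, i.e. this routine supplies a correction relative to the
// regular nonbonded/PME path.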
315 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
316 __device__ void modifiedExclusionForce(
318 const CudaExclusion* __restrict__ exclusionList,
320 const float one_scale14, // 1 - scale14
321 const int vdwCoefTableWidth,
322 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
323 const float2* __restrict__ vdwCoefTable,
325 cudaTextureObject_t vdwCoefTableTex,
327 #ifdef USE_TABLE_ARRAYS
328 const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
330 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
332 const float4* __restrict__ xyzq,
334 const float3 lata, const float3 latb, const float3 latc,
336 const CudaNBConstants nbConstants,
338 T* __restrict__ forceNbond, double &energyNbond,
339 T* __restrict__ forceSlow, double &energySlow,
340 T* __restrict__ forceList, int* forceListCounter,
341 int* forceListStartsNbond, int* forceListStartsSlow, int* forceListNexts,
342 #ifdef WRITE_FULL_VIRIALS
343 ComputeBondedCUDAKernel::BondedVirial<double>& virialNbond,
344 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
346 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialNbond,
347 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
349 double &energyVdw_F, double &energyVdw_TI_1, double &energyVdw_TI_2,
350 double &energyNbond_F, double &energyNbond_TI_1, double &energyNbond_TI_2,
351 double &energySlow_F, double &energySlow_TI_1, double &energySlow_TI_2
354 CudaExclusion bl = exclusionList[index];
356 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
357 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
358 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
360 float4 xyzqi = xyzq[bl.i];
361 float4 xyzqj = xyzq[bl.j];
363 float xij = xyzqi.x + shx - xyzqj.x;
364 float yij = xyzqi.y + shy - xyzqj.y;
365 float zij = xyzqi.z + shz - xyzqj.z;
367 float r2 = xij*xij + yij*yij + zij*zij;
370 float rinv = rsqrtf(r2);
373 if (doElect) qq = one_scale14 * xyzqi.w * xyzqj.w;
375 int vdwIndex = bl.vdwtypei + bl.vdwtypej*vdwCoefTableWidth;
376 #if defined(USE_TABLE_ARRAYS) || __CUDA_ARCH__ >= 350
377 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
379 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
383 float myElecLambda = 1.0f;
384 float myElecLambda2 = 1.0f;
385 float myVdwLambda = 1.0f;
386 float myVdwLambda2 = 1.0f;
387 float myVdwShift = 0.0f;
388 float myVdwShift2 = 0.0f;
389 double alch_vdw_energy;
390 double alch_vdw_energy_2;
391 float alch_vdw_force = 0.0f;
392 float alch_vdw_dUdl = 0.0f;
393 double* p_energyVdw_TI = NULL;
394 double* p_energyNbond_TI = NULL;
395 double* p_energySlow_TI = NULL;
399 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
400 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
401 float vdwLambdaUp = (AlchBondedCUDA::alchLambdas).vdwLambdaUp;
402 float vdwShiftUp = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaUp);
403 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
404 float vdwLambda2Up = (AlchBondedCUDA::alchLambdas).vdwLambda2Up;
405 float vdwShift2Up = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Up);
406 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
407 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
408 float vdwLambdaDown = (AlchBondedCUDA::alchLambdas).vdwLambdaDown;
409 float vdwShiftDown = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambdaDown);
410 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
411 float vdwLambda2Down = (AlchBondedCUDA::alchLambdas).vdwLambda2Down;
412 float vdwShift2Down = (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (1.0f - vdwLambda2Down);
413 switch (bl.pswitch) {
414 case 0: myVdwLambda = 1.0f;
417 myElecLambda2 = 1.0f;
420 case 1: myVdwLambda = vdwLambdaUp;
421 myVdwLambda2 = vdwLambda2Up;
422 myElecLambda = elecLambdaUp;
423 myElecLambda2 = elecLambda2Up;
424 myVdwShift = vdwShiftUp;
425 myVdwShift2 = vdwShift2Up;
426 p_energyVdw_TI = &(energyVdw_TI_1);
427 p_energyNbond_TI= &(energyNbond_TI_1);
428 p_energySlow_TI = &(energySlow_TI_1);
430 case 2: myVdwLambda = vdwLambdaDown;
431 myVdwLambda2 = vdwLambda2Down;
432 myElecLambda = elecLambdaDown;
433 myElecLambda2 = elecLambda2Down;
434 myVdwShift = vdwShiftDown;
435 myVdwShift2 = vdwShift2Down;
436 p_energyVdw_TI = &(energyVdw_TI_2);
437 p_energyNbond_TI= &(energyNbond_TI_2);
438 p_energySlow_TI = &(energySlow_TI_2);
440 default: myVdwLambda = 0.0f;
443 myElecLambda2 = 0.0f;
446 if (bl.pswitch != 99 && bl.pswitch != 0) {
447 // Common part of FEP and TI
448 const float& switchOn2 = (AlchBondedCUDA::alchParams).switchDist2;
449 const float switchfactor = 1.0f / ((cutoff2 - switchOn2) * (cutoff2 - switchOn2) * (cutoff2 - switchOn2));
450 const float switchmul = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * (cutoff2 - r2) * (cutoff2 - 3.0f * switchOn2 + 2.0f * r2)) : 1.0f);
451 const float switchmul2 = (r2 > switchOn2 ? (switchfactor * (cutoff2 - r2) * 12.0f * (r2 - switchOn2)) : 0.0f);
452 // A = ljab.x ; B = ljab.y
453 const float r2_1 = 1.0f / (r2 + myVdwShift);
454 const float r2_2 = 1.0f / (r2 + myVdwShift2);
455 const float r6_1 = r2_1 * r2_1 * r2_1;
456 const float r6_2 = r2_2 * r2_2 * r2_2;
457 const float U1 = ljab.x * r6_1 * r6_1 - ljab.y * r6_1;
458 const float U2 = ljab.x * r6_2 * r6_2 - ljab.y * r6_2;
459 // rinv is already calculated above
461 alch_vdw_energy = double(myVdwLambda * switchmul * U1);
463 alch_vdw_energy_2 = double(myVdwLambda2 * switchmul * U2);
466 alch_vdw_force = myVdwLambda * (switchmul * (12.0f * U1 + 6.0f * ljab.y * r6_1) * r2_1 + switchmul2 * U1);
468 alch_vdw_dUdl = switchmul * (U1 + myVdwLambda * (AlchBondedCUDA::alchParams).alchVdwShiftCoeff * (6.0f * U1 + 3.0f * ljab.y * r6_1) * r2_1);
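// Sketch of the alchemical (soft-core) vdW path above: the 12-6 potential is
// evaluated with the shifted distance r^2 + myVdwShift, where
// myVdwShift = alchVdwShiftCoeff*(1 - lambda_vdw), giving
// U = A/(r^2+shift)^6 - B/(r^2+shift)^3, scaled by lambda_vdw and by the
// CHARMM-style switching function switchmul for r^2 > switchDist2.
// alch_vdw_force is the corresponding radial force factor and alch_vdw_dUdl
// the dU/dlambda contribution used for TI.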
472 alch_vdw_energy = 0.0;
473 alch_vdw_energy_2 = 0.0;
482 #ifdef USE_TABLE_ARRAYS
483 fi = tableLookup(forceTable, rinv);
485 fi = sampleTableTex(forceTableTex, rinv);
488 fi = tex1D<float4>(forceTableTex, rinv);
492 if (doEnergy && doTable) {
494 #ifdef USE_TABLE_ARRAYS
495 ei = tableLookup(energyTable, rinv);
497 ei = sampleTableTex(energyTableTex, rinv);
500 ei = tex1D<float4>(energyTableTex, rinv);
503 energyVdw += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda * p0Factor);
504 energyVdw += alch_vdw_energy;
506 energyVdw_F += (double)((ljab.x * ei.z + ljab.y * ei.y) * myElecLambda2 * p0Factor);
507 energyVdw_F += alch_vdw_energy_2;
510 if (p_energyVdw_TI != NULL) {
511 (*p_energyVdw_TI) += alch_vdw_dUdl;
515 energyVdw += (double)(ljab.x * ei.z + ljab.y * ei.y);
518 energyNbond += qq * ei.x * myElecLambda;
520 energyNbond_F += qq * ei.x * myElecLambda2;
523 if (p_energyNbond_TI != NULL) (*p_energyNbond_TI) += qq * ei.x;
526 energySlow += qq * ei.w * myElecLambda;
528 energySlow_F += qq * ei.w * myElecLambda2;
531 if (p_energySlow_TI != NULL) (*p_energySlow_TI) += qq * ei.w;
541 fNbond = -(ljab.x * fi.z + ljab.y * fi.y + qq * fi.x);
543 fNbond = -(ljab.x * fi.z + ljab.y * fi.y);
546 float e_vdw, e_elec, e_slow;
553 cudaModExclForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy>(
554 doSlow, doElect, r2, rinv, qq, ljab,
555 nbConstants, fNbond, fSlow, e_vdw, e_elec, e_slow);
558 energyVdw += (double) (e_vdw);
559 energyNbond += (double) (e_elec * myElecLambda);
560 energySlow += (double) (e_slow * myElecLambda);
563 // fNbond = fFast + fVdw
565 if (bl.pswitch != 0) {
566 fNbond -= -(ljab.x * fi.z + ljab.y * fi.y);
567 fNbond = fNbond * myElecLambda + alch_vdw_force * float(bl.pswitch == 1 || bl.pswitch == 2);
569 // if pswitch == 0, myElecLambda = 1.0
571 T T_fxij, T_fyij, T_fzij;
572 calcComponentForces<T>(fNbond, xij, yij, zij, T_fxij, T_fyij, T_fzij);
573 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
574 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceNbond, forceList, forceListCounter, forceListStartsNbond, forceListNexts);
576 if (doSlow && doElect) {
581 fSlow *= myElecLambda;
583 T T_fxij, T_fyij, T_fzij;
584 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
585 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
586 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
591 #ifdef WRITE_FULL_VIRIALS
592 float fxij = fNbond*xij;
593 float fyij = fNbond*yij;
594 float fzij = fNbond*zij;
595 virialNbond.xx = (fxij*xij);
596 virialNbond.xy = (fxij*yij);
597 virialNbond.xz = (fxij*zij);
598 virialNbond.yx = (fyij*xij);
599 virialNbond.yy = (fyij*yij);
600 virialNbond.yz = (fyij*zij);
601 virialNbond.zx = (fzij*xij);
602 virialNbond.zy = (fzij*yij);
603 virialNbond.zz = (fzij*zij);
608 if (doVirial && doSlow && doElect) {
609 #ifdef WRITE_FULL_VIRIALS
610 float fxij = fSlow*xij;
611 float fyij = fSlow*yij;
612 float fzij = fSlow*zij;
613 virialSlow.xx = (fxij*xij);
614 virialSlow.xy = (fxij*yij);
615 virialSlow.xz = (fxij*zij);
616 virialSlow.yx = (fyij*xij);
617 virialSlow.yy = (fyij*yij);
618 virialSlow.yz = (fyij*zij);
619 virialSlow.zx = (fzij*xij);
620 virialSlow.zy = (fzij*yij);
621 virialSlow.zz = (fzij*zij);
629 // Calculates exclusions. Here doSlow = true
631 // doFull = doFullElectrostatics = doSlow
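// This presumably supplies the "slow" (reciprocal-space / PME) correction for
// fully excluded pairs: the table indexed by r^2 below holds the correction
// potential so that excluded pairs do not contribute to the long-range
// electrostatics computed elsewhere.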
632 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
633 __device__ void exclusionForce(
635 const CudaExclusion* __restrict__ exclusionList,
636 const float r2_delta, const int r2_delta_expc,
638 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
639 const float* __restrict__ r2_table,
640 const float4* __restrict__ exclusionTable,
642 cudaTextureObject_t r2_table_tex,
643 cudaTextureObject_t exclusionTableTex,
645 const float4* __restrict__ xyzq,
647 const float3 lata, const float3 latb, const float3 latc,
649 T* __restrict__ forceSlow, double &energySlow,
650 T* __restrict__ forceList, int* forceListCounter,
651 int* forceListStartsSlow, int* forceListNexts,
652 #ifdef WRITE_FULL_VIRIALS
653 ComputeBondedCUDAKernel::BondedVirial<double>& virialSlow,
655 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virialSlow,
657 double &energy_F, double &energy_TI_1, double &energy_TI_2
660 CudaExclusion bl = exclusionList[index];
662 float shx = bl.ioffsetXYZ.x*lata.x + bl.ioffsetXYZ.y*latb.x + bl.ioffsetXYZ.z*latc.x;
663 float shy = bl.ioffsetXYZ.x*lata.y + bl.ioffsetXYZ.y*latb.y + bl.ioffsetXYZ.z*latc.y;
664 float shz = bl.ioffsetXYZ.x*lata.z + bl.ioffsetXYZ.y*latb.z + bl.ioffsetXYZ.z*latc.z;
666 float4 xyzqi = xyzq[bl.i];
667 float4 xyzqj = xyzq[bl.j];
669 float xij = xyzqi.x + shx - xyzqj.x;
670 float yij = xyzqi.y + shy - xyzqj.y;
671 float zij = xyzqi.z + shz - xyzqj.z;
673 float r2 = xij*xij + yij*yij + zij*zij;
677 union { float f; int i; } r2i;
679 int table_i = (r2i.i >> 17) + r2_delta_expc; // table_i >= 0
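// Table-index trick: the float bit pattern stored in r2i (r2, presumably offset
// by r2_delta) is reinterpreted as an int; shifting right by 17 keeps the
// exponent plus the top mantissa bits, so consecutive table entries cover
// logarithmically spaced r^2 intervals, and r2_delta_expc re-biases the
// exponent so the smallest input maps to index 0.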
681 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
683 float r2_table_val = r2_table[table_i];
685 float r2_table_val = __ldg(&r2_table[table_i]);
688 float r2_table_val = tex1Dfetch<float>(r2_table_tex, table_i);
690 float diffa = r2 - r2_table_val;
691 float qq = xyzqi.w * xyzqj.w;
693 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
695 float4 slow = exclusionTable[table_i];
697 float4 slow = __ldg(&exclusionTable[table_i]);
700 float4 slow = tex1Dfetch<float4>(exclusionTableTex, table_i);
708 myElecLambda2 = 1.0f;
709 if (doTI) p_energy_TI = NULL;
710 /*lambda values 'up' are for atoms scaled up with lambda (partition 1)*/
711 float elecLambdaUp = (AlchBondedCUDA::alchLambdas).elecLambdaUp;
712 float elecLambda2Up = (AlchBondedCUDA::alchLambdas).elecLambda2Up;
713 /*lambda values 'down' are for atoms scaled down with lambda (partition 2)*/
714 float elecLambdaDown = (AlchBondedCUDA::alchLambdas).elecLambdaDown;
715 float elecLambda2Down = (AlchBondedCUDA::alchLambdas).elecLambda2Down;
716 switch (bl.pswitch) {
717 case 0: myElecLambda = 1.0f;
718 myElecLambda2 = 1.0f;
720 case 1: myElecLambda = elecLambdaUp;
721 myElecLambda2 = elecLambda2Up;
722 p_energy_TI = &(energy_TI_1);
724 case 2: myElecLambda = elecLambdaDown;
725 myElecLambda2 = elecLambda2Down;
726 p_energy_TI = &(energy_TI_2);
728 default: myElecLambda = 0.0f;
729 myElecLambda2 = 0.0f;
734 double energy_slow_tmp = (double)(qq*(((diffa * (1.0f/6.0f)*slow.x + 0.25f*slow.y ) * diffa + 0.5f*slow.z ) * diffa + slow.w));
736 energySlow += energy_slow_tmp * myElecLambda;
738 energy_F += energy_slow_tmp * myElecLambda2;
741 if (p_energy_TI != NULL) {
742 (*p_energy_TI) += energy_slow_tmp;
746 energySlow += energy_slow_tmp;
750 float fSlow = -qq*((diffa * slow.x + slow.y) * diffa + slow.z);
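// slow.x..slow.w are spline coefficients about the table point r2_table_val.
// With diffa = r2 - r2_table_val, the energy above is the cubic
//   E = qq*(slow.w + slow.z*diffa/2 + slow.y*diffa^2/4 + slow.x*diffa^3/6)
// evaluated by Horner's rule, and fSlow = -2*dE/d(r2), so fSlow*xij etc. are
// the Cartesian force components on atom i.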
752 fSlow *= myElecLambda;
755 T T_fxij, T_fyij, T_fzij;
756 calcComponentForces<T>(fSlow, xij, yij, zij, T_fxij, T_fyij, T_fzij);
757 storeForces<T>(T_fxij, T_fyij, T_fzij, bl.i, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
758 storeForces<T>(-T_fxij, -T_fyij, -T_fzij, bl.j, stride, forceSlow, forceList, forceListCounter, forceListStartsSlow, forceListNexts);
762 #ifdef WRITE_FULL_VIRIALS
763 float fxij = fSlow*xij;
764 float fyij = fSlow*yij;
765 float fzij = fSlow*zij;
766 virialSlow.xx = (fxij*xij);
767 virialSlow.xy = (fxij*yij);
768 virialSlow.xz = (fxij*zij);
769 virialSlow.yx = (fyij*xij);
770 virialSlow.yy = (fyij*yij);
771 virialSlow.yz = (fyij*zij);
772 virialSlow.zx = (fzij*xij);
773 virialSlow.zy = (fzij*yij);
774 virialSlow.zz = (fzij*zij);
780 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
781 __device__ void angleForce(const int index,
782 const CudaAngle* __restrict__ angleList,
783 const CudaAngleValue* __restrict__ angleValues,
784 const float4* __restrict__ xyzq,
786 const float3 lata, const float3 latb, const float3 latc,
787 T* __restrict__ force, double &energy,
788 T* __restrict__ forceList, int* forceListCounter,
789 int* forceListStarts, int* forceListNexts,
790 #ifdef WRITE_FULL_VIRIALS
791 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
793 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
795 double &energy_F, double &energy_TI_1, double &energy_TI_2
798 CudaAngle al = angleList[index];
800 float ishx = al.ioffsetXYZ.x*lata.x + al.ioffsetXYZ.y*latb.x + al.ioffsetXYZ.z*latc.x;
801 float ishy = al.ioffsetXYZ.x*lata.y + al.ioffsetXYZ.y*latb.y + al.ioffsetXYZ.z*latc.y;
802 float ishz = al.ioffsetXYZ.x*lata.z + al.ioffsetXYZ.y*latb.z + al.ioffsetXYZ.z*latc.z;
804 float kshx = al.koffsetXYZ.x*lata.x + al.koffsetXYZ.y*latb.x + al.koffsetXYZ.z*latc.x;
805 float kshy = al.koffsetXYZ.x*lata.y + al.koffsetXYZ.y*latb.y + al.koffsetXYZ.z*latc.y;
806 float kshz = al.koffsetXYZ.x*lata.z + al.koffsetXYZ.y*latb.z + al.koffsetXYZ.z*latc.z;
808 float xij = xyzq[al.i].x + ishx - xyzq[al.j].x;
809 float yij = xyzq[al.i].y + ishy - xyzq[al.j].y;
810 float zij = xyzq[al.i].z + ishz - xyzq[al.j].z;
812 float xkj = xyzq[al.k].x + kshx - xyzq[al.j].x;
813 float ykj = xyzq[al.k].y + kshy - xyzq[al.j].y;
814 float zkj = xyzq[al.k].z + kshz - xyzq[al.j].z;
816 float rij_inv = rsqrtf(xij*xij + yij*yij + zij*zij);
817 float rkj_inv = rsqrtf(xkj*xkj + ykj*ykj + zkj*zkj);
819 float xijr = xij*rij_inv;
820 float yijr = yij*rij_inv;
821 float zijr = zij*rij_inv;
822 float xkjr = xkj*rkj_inv;
823 float ykjr = ykj*rkj_inv;
824 float zkjr = zkj*rkj_inv;
825 float cos_theta = xijr*xkjr + yijr*ykjr + zijr*zkjr;
827 CudaAngleValue angleValue = angleValues[al.itype];
828 angleValue.k *= al.scale;
831 if (angleValue.normal == 1) {
832 // Restrict values of cos_theta to the interval [-0.999 ... 0.999]
833 cos_theta = min(0.999f, max(-0.999f, cos_theta));
834 float theta = acosf(cos_theta);
835 diff = theta - angleValue.theta0;
837 diff = cos_theta - angleValue.theta0;
840 float energy_tmp = 0.0f;
842 energy_tmp += angleValue.k * diff * diff;
847 // Be careful: alch_scale_energy_fep is 0 here!
848 float alch_scale_energy_fep;
850 // NOTE: Because of the Urey-Bradley terms, the energy calculation is not
851 // finished yet, so we cannot add energy_tmp to energy_TI_* and energy_F here,
852 // but we still need the scaling factor for the forces.
853 // The code here therefore differs slightly from bondForce:
854 // calculate a scale factor for the FEP energy first, apply it to
855 // the final result after the energy evaluation is finished, and also use
856 // a pointer to the correct energy_TI_x term.
859 alch_scale_energy_fep = 0.0f;
860 if (doTI) p_energy_TI = NULL;
861 switch (al.fepBondedType) {
863 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
866 p_energy_TI = &(energy_TI_1);
869 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
875 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
878 p_energy_TI = &(energy_TI_2);
881 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
889 if (angleValue.normal == 1) {
890 float inv_sin_theta = rsqrtf(1.0f - cos_theta*cos_theta);
891 if (inv_sin_theta > 1.0e6) {
892 diff = (diff < 0.0f) ? 1.0f : -1.0f;
894 diff *= -inv_sin_theta;
897 diff *= -2.0f*angleValue.k;
899 // Do alchemical scaling
904 float dtxi = rij_inv*(xkjr - cos_theta*xijr);
905 float dtxj = rkj_inv*(xijr - cos_theta*xkjr);
906 float dtyi = rij_inv*(ykjr - cos_theta*yijr);
907 float dtyj = rkj_inv*(yijr - cos_theta*ykjr);
908 float dtzi = rij_inv*(zkjr - cos_theta*zijr);
909 float dtzj = rkj_inv*(zijr - cos_theta*zkjr);
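// dtx/dty/dtz are the derivatives of cos(theta) with respect to the coordinates
// of atoms i and k, e.g. d(cos theta)/dx_i = (xkjr - cos_theta*xijr)*rij_inv.
// diff already holds -dE/d(cos theta) (including the 1/sin(theta) factor and any
// alchemical scale), so diff*dt* below are the force components; the force on
// the central atom j is minus the sum of the other two.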
911 T T_dtxi, T_dtyi, T_dtzi;
912 T T_dtxj, T_dtyj, T_dtzj;
913 calcComponentForces<T>(diff, dtxi, dtyi, dtzi, T_dtxi, T_dtyi, T_dtzi);
914 calcComponentForces<T>(diff, dtxj, dtyj, dtzj, T_dtxj, T_dtyj, T_dtzj);
915 T T_dtxk = -T_dtxi - T_dtxj;
916 T T_dtyk = -T_dtyi - T_dtyj;
917 T T_dtzk = -T_dtzi - T_dtzj;
918 storeForces<T>(T_dtxk, T_dtyk, T_dtzk, al.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
920 if (angleValue.k_ub) {
921 float xik = xij - xkj;
922 float yik = yij - ykj;
923 float zik = zij - zkj;
924 float rik_inv = rsqrtf(xik*xik + yik*yik + zik*zik);
925 float rik = 1.0f/rik_inv;
926 float diff_ub = rik - angleValue.r_ub;
928 energy_tmp += angleValue.k_ub * diff_ub * diff_ub;
930 diff_ub *= -2.0f*angleValue.k_ub*rik_inv;
932 diff_ub *= alch_scale;
934 T T_dxik, T_dyik, T_dzik;
935 calcComponentForces<T>(diff_ub, xik, yik, zik, T_dxik, T_dyik, T_dzik);
948 energy += double(energy_tmp * alch_scale);
950 if (p_energy_TI != NULL) {
951 *(p_energy_TI) += double(energy_tmp);
955 energy_F += double(energy_tmp * alch_scale_energy_fep);
958 energy += double(energy_tmp);
962 storeForces<T>(T_dtxi, T_dtyi, T_dtzi, al.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
963 storeForces<T>(T_dtxj, T_dtyj, T_dtzj, al.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
967 #ifdef WRITE_FULL_VIRIALS
968 float fxi = ((float)T_dtxi);
969 float fyi = ((float)T_dtyi);
970 float fzi = ((float)T_dtzi);
971 float fxk = ((float)T_dtxj);
972 float fyk = ((float)T_dtyj);
973 float fzk = ((float)T_dtzj);
974 virial.xx = (fxi*xij) + (fxk*xkj);
975 virial.xy = (fxi*yij) + (fxk*ykj);
976 virial.xz = (fxi*zij) + (fxk*zkj);
977 virial.yx = (fyi*xij) + (fyk*xkj);
978 virial.yy = (fyi*yij) + (fyk*ykj);
979 virial.yz = (fyi*zij) + (fyk*zkj);
980 virial.zx = (fzi*xij) + (fzk*xkj);
981 virial.zy = (fzi*yij) + (fzk*ykj);
982 virial.zz = (fzi*zij) + (fzk*zkj);
988 // Dihedral computation
992 template <bool doEnergy>
993 __forceinline__ __device__
994 void diheComp(const CudaDihedralValue* dihedralValues, int ic,
995 const float sin_phi, const float cos_phi, const float scale, float& df, double& e) {
998 if (doEnergy) e = 0.0;
1000 float phi = atan2f(sin_phi, cos_phi);
1004 CudaDihedralValue dihedralValue = dihedralValues[ic];
1005 dihedralValue.k *= scale;
1007 // The last dihedral term has the low bit of n set to 0
1008 lrep = (dihedralValue.n & 1);
1009 dihedralValue.n >>= 1;
1010 if (dihedralValue.n) {
1011 float nf = dihedralValue.n;
1012 float x = nf*phi - dihedralValue.delta;
1015 sincosf(x, &sin_x, &cos_x);
1016 e += (double)(dihedralValue.k*(1.0f + cos_x));
1017 df += (double)(nf*dihedralValue.k*sin_x);
1019 float sin_x = sinf(x);
1020 df += (double)(nf*dihedralValue.k*sin_x);
1023 float diff = phi - dihedralValue.delta;
1024 if (diff < -(float)(M_PI)) diff += (float)(2.0*M_PI);
1025 if (diff > (float)(M_PI)) diff -= (float)(2.0*M_PI);
1027 e += (double)(dihedralValue.k*diff*diff);
1029 df -= 2.0f*dihedralValue.k*diff;
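// diheComp accumulates, per dihedral term, the energy and -dE/dphi into df:
// periodic terms (n > 0) use E = k*(1 + cos(n*phi - delta)) with
// -dE/dphi = n*k*sin(n*phi - delta); the n == 0 case is the harmonic
// (improper-style) form E = k*(phi - delta)^2 with the difference wrapped into
// (-pi, pi]. The low bit of n (lrep) presumably flags whether another term of
// the same dihedral follows.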
1037 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1038 __device__ void diheForce(const int index,
1039 const CudaDihedral* __restrict__ diheList,
1040 const CudaDihedralValue* __restrict__ dihedralValues,
1041 const float4* __restrict__ xyzq,
1043 const float3 lata, const float3 latb, const float3 latc,
1044 T* __restrict__ force, double &energy,
1045 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1046 #ifdef WRITE_FULL_VIRIALS
1047 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1049 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1051 double &energy_F, double &energy_TI_1, double &energy_TI_2
1054 CudaDihedral dl = diheList[index];
1056 float ishx = dl.ioffsetXYZ.x*lata.x + dl.ioffsetXYZ.y*latb.x + dl.ioffsetXYZ.z*latc.x;
1057 float ishy = dl.ioffsetXYZ.x*lata.y + dl.ioffsetXYZ.y*latb.y + dl.ioffsetXYZ.z*latc.y;
1058 float ishz = dl.ioffsetXYZ.x*lata.z + dl.ioffsetXYZ.y*latb.z + dl.ioffsetXYZ.z*latc.z;
1060 float jshx = dl.joffsetXYZ.x*lata.x + dl.joffsetXYZ.y*latb.x + dl.joffsetXYZ.z*latc.x;
1061 float jshy = dl.joffsetXYZ.x*lata.y + dl.joffsetXYZ.y*latb.y + dl.joffsetXYZ.z*latc.y;
1062 float jshz = dl.joffsetXYZ.x*lata.z + dl.joffsetXYZ.y*latb.z + dl.joffsetXYZ.z*latc.z;
1064 float lshx = dl.loffsetXYZ.x*lata.x + dl.loffsetXYZ.y*latb.x + dl.loffsetXYZ.z*latc.x;
1065 float lshy = dl.loffsetXYZ.x*lata.y + dl.loffsetXYZ.y*latb.y + dl.loffsetXYZ.z*latc.y;
1066 float lshz = dl.loffsetXYZ.x*lata.z + dl.loffsetXYZ.y*latb.z + dl.loffsetXYZ.z*latc.z;
1068 float xij = xyzq[dl.i].x + ishx - xyzq[dl.j].x;
1069 float yij = xyzq[dl.i].y + ishy - xyzq[dl.j].y;
1070 float zij = xyzq[dl.i].z + ishz - xyzq[dl.j].z;
1072 float xjk = xyzq[dl.j].x + jshx - xyzq[dl.k].x;
1073 float yjk = xyzq[dl.j].y + jshy - xyzq[dl.k].y;
1074 float zjk = xyzq[dl.j].z + jshz - xyzq[dl.k].z;
1076 float xlk = xyzq[dl.l].x + lshx - xyzq[dl.k].x;
1077 float ylk = xyzq[dl.l].y + lshy - xyzq[dl.k].y;
1078 float zlk = xyzq[dl.l].z + lshz - xyzq[dl.k].z;
1081 float ax = yij*zjk - zij*yjk;
1082 float ay = zij*xjk - xij*zjk;
1083 float az = xij*yjk - yij*xjk;
1084 float bx = ylk*zjk - zlk*yjk;
1085 float by = zlk*xjk - xlk*zjk;
1086 float bz = xlk*yjk - ylk*xjk;
1088 float ra2 = ax*ax + ay*ay + az*az;
1089 float rb2 = bx*bx + by*by + bz*bz;
1090 float rg = sqrtf(xjk*xjk + yjk*yjk + zjk*zjk);
1092 // if((ra2 <= rxmin2) .or. (rb2 <= rxmin2) .or. (rg <= rxmin)) then
1093 // nlinear = nlinear + 1
1096 float rgr = 1.0f / rg;
1097 float ra2r = 1.0f / ra2;
1098 float rb2r = 1.0f / rb2;
1099 float rabr = sqrtf(ra2r*rb2r);
1101 float cos_phi = (ax*bx + ay*by + az*bz)*rabr;
1103 // Note that sin(phi).G/|G|=B^A/(|A|.|B|)
1104 // which can be simplified to sin(phi)=|G|H.A/(|A|.|B|)
1105 float sin_phi = rg*rabr*(ax*xlk + ay*ylk + az*zlk);
1107 // Energy and derivative contributions.
1111 diheComp<doEnergy>(dihedralValues, dl.itype, sin_phi, cos_phi, dl.scale, df, e);
1113 // Alchemical transformation
1115 if (doFEP || doTI) {
1117 switch (dl.fepBondedType) {
1119 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1122 energy_TI_1 += (double)(e);
1125 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1));
1131 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1134 energy_TI_2 += (double)(e);
1137 energy_F += (double)(e * ((AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2));
1146 if (doFEP || doTI) {
1147 energy += e * alch_scale;
1154 // Compute derivatives wrt Cartesian coordinates.
1156 // GAA=dE/dphi.|G|/A^2, GBB=dE/dphi.|G|/B^2, FG=F.G, HG=H.G
1157 // FGA=dE/dphi*F.G/(|G|A^2), HGB=dE/dphi*H.G/(|G|B^2)
1159 float fg = xij*xjk + yij*yjk + zij*zjk;
1160 float hg = xlk*xjk + ylk*yjk + zlk*zjk;
1163 float fga = fg*ra2r*rgr;
1164 float hgb = hg*rb2r*rgr;
1165 float gaa = ra2r*rg;
1166 float gbb = rb2r*rg;
1167 // DFi=dE/dFi, DGi=dE/dGi, DHi=dE/dHi.
1169 // Remember T is long long int
1170 // Don't try to scale T_fix and similar variables directly by float
1171 if (doFEP || doTI) {
1179 T T_fix, T_fiy, T_fiz;
1180 calcComponentForces<T>(-gaa, ax, ay, az, T_fix, T_fiy, T_fiz);
1181 storeForces<T>(T_fix, T_fiy, T_fiz, dl.i, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1184 calcComponentForces<T>(fga, ax, ay, az, -hgb, bx, by, bz, dgx, dgy, dgz);
1185 T T_fjx = dgx - T_fix;
1186 T T_fjy = dgy - T_fiy;
1187 T T_fjz = dgz - T_fiz;
1188 storeForces<T>(T_fjx, T_fjy, T_fjz, dl.j, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1191 calcComponentForces<T>(gbb, bx, by, bz, dhx, dhy, dhz);
1192 T T_fkx = -dhx - dgx;
1193 T T_fky = -dhy - dgy;
1194 T T_fkz = -dhz - dgz;
1195 storeForces<T>(T_fkx, T_fky, T_fkz, dl.k, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1196 storeForces<T>(dhx, dhy, dhz, dl.l, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1200 #ifdef WRITE_FULL_VIRIALS
1201 float fix = ((float)T_fix);
1202 float fiy = ((float)T_fiy);
1203 float fiz = ((float)T_fiz);
1204 float fjx = ((float)dgx);
1205 float fjy = ((float)dgy);
1206 float fjz = ((float)dgz);
1207 float flx = ((float)dhx);
1208 float fly = ((float)dhy);
1209 float flz = ((float)dhz);
1210 virial.xx = (fix*xij) + (fjx*xjk) + (flx*xlk);
1211 virial.xy = (fix*yij) + (fjx*yjk) + (flx*ylk);
1212 virial.xz = (fix*zij) + (fjx*zjk) + (flx*zlk);
1213 virial.yx = (fiy*xij) + (fjy*xjk) + (fly*xlk);
1214 virial.yy = (fiy*yij) + (fjy*yjk) + (fly*ylk);
1215 virial.yz = (fiy*zij) + (fjy*zjk) + (fly*zlk);
1216 virial.zx = (fiz*xij) + (fjz*xjk) + (flz*xlk);
1217 virial.zy = (fiz*yij) + (fjz*yjk) + (flz*ylk);
1218 virial.zz = (fiz*zij) + (fjz*zjk) + (flz*zlk);
1224 __device__ __forceinline__ float3 cross(const float3 v1, const float3 v2) {
1226 v1.y*v2.z-v2.y*v1.z,
1227 v2.x*v1.z-v1.x*v2.z,
1232 __device__ __forceinline__ float dot(const float3 v1, const float3 v2) {
1233 return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
1237 // Calculates crossterms
1239 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1240 __device__ void crosstermForce(
1242 const CudaCrossterm* __restrict__ crosstermList,
1243 const CudaCrosstermValue* __restrict__ crosstermValues,
1244 const float4* __restrict__ xyzq,
1246 const float3 lata, const float3 latb, const float3 latc,
1247 T* __restrict__ force, double &energy,
1248 T* __restrict__ forceList, int* forceListCounter, int* forceListStarts, int* forceListNexts,
1249 #ifdef WRITE_FULL_VIRIALS
1250 ComputeBondedCUDAKernel::BondedVirial<double>& virial,
1252 ComputeBondedCUDAKernel::BondedVirial* __restrict__ virial,
1254 double &energy_F, double &energy_TI_1, double &energy_TI_2
1257 CudaCrossterm cl = crosstermList[index];
1259 // ----------------------------------------------------------------------------
1260 // Angle between 1 - 2 - 3 - 4
1263 float3 sh12 = make_float3(
1264 cl.offset12XYZ.x*lata.x + cl.offset12XYZ.y*latb.x + cl.offset12XYZ.z*latc.x,
1265 cl.offset12XYZ.x*lata.y + cl.offset12XYZ.y*latb.y + cl.offset12XYZ.z*latc.y,
1266 cl.offset12XYZ.x*lata.z + cl.offset12XYZ.y*latb.z + cl.offset12XYZ.z*latc.z);
1268 float4 xyzq1 = xyzq[cl.i1];
1269 float4 xyzq2 = xyzq[cl.i2];
1271 float3 r12 = make_float3(
1272 xyzq1.x + sh12.x - xyzq2.x,
1273 xyzq1.y + sh12.y - xyzq2.y,
1274 xyzq1.z + sh12.z - xyzq2.z);
1277 float3 sh23 = make_float3(
1278 cl.offset23XYZ.x*lata.x + cl.offset23XYZ.y*latb.x + cl.offset23XYZ.z*latc.x,
1279 cl.offset23XYZ.x*lata.y + cl.offset23XYZ.y*latb.y + cl.offset23XYZ.z*latc.y,
1280 cl.offset23XYZ.x*lata.z + cl.offset23XYZ.y*latb.z + cl.offset23XYZ.z*latc.z);
1282 float4 xyzq3 = xyzq[cl.i3];
1284 float3 r23 = make_float3(
1285 xyzq2.x + sh23.x - xyzq3.x,
1286 xyzq2.y + sh23.y - xyzq3.y,
1287 xyzq2.z + sh23.z - xyzq3.z);
1290 float3 sh34 = make_float3(
1291 cl.offset34XYZ.x*lata.x + cl.offset34XYZ.y*latb.x + cl.offset34XYZ.z*latc.x,
1292 cl.offset34XYZ.x*lata.y + cl.offset34XYZ.y*latb.y + cl.offset34XYZ.z*latc.y,
1293 cl.offset34XYZ.x*lata.z + cl.offset34XYZ.y*latb.z + cl.offset34XYZ.z*latc.z);
1295 float4 xyzq4 = xyzq[cl.i4];
1297 float3 r34 = make_float3(
1298 xyzq3.x + sh34.x - xyzq4.x,
1299 xyzq3.y + sh34.y - xyzq4.y,
1300 xyzq3.z + sh34.z - xyzq4.z);
1302 // Calculate the cross products
1303 float3 A = cross(r12, r23);
1304 float3 B = cross(r23, r34);
1305 float3 C = cross(r23, A);
1307 // Calculate the inverse distances
1308 float inv_rA = rsqrtf(dot(A, A));
1309 float inv_rB = rsqrtf(dot(B, B));
1310 float inv_rC = rsqrtf(dot(C, C));
1312 // Calculate the sin and cos
1313 float cos_phi = dot(A, B)*(inv_rA*inv_rB);
1314 float sin_phi = dot(C, B)*(inv_rC*inv_rB);
1316 float phi = -atan2f(sin_phi,cos_phi);
1318 // ----------------------------------------------------------------------------
1319 // Angle between 5 - 6 - 7 - 8
1323 float3 sh56 = make_float3(
1324 cl.offset56XYZ.x*lata.x + cl.offset56XYZ.y*latb.x + cl.offset56XYZ.z*latc.x,
1325 cl.offset56XYZ.x*lata.y + cl.offset56XYZ.y*latb.y + cl.offset56XYZ.z*latc.y,
1326 cl.offset56XYZ.x*lata.z + cl.offset56XYZ.y*latb.z + cl.offset56XYZ.z*latc.z);
1328 float4 xyzq5 = xyzq[cl.i5];
1329 float4 xyzq6 = xyzq[cl.i6];
1331 float3 r56 = make_float3(
1332 xyzq5.x + sh56.x - xyzq6.x,
1333 xyzq5.y + sh56.y - xyzq6.y,
1334 xyzq5.z + sh56.z - xyzq6.z);
1337 float3 sh67 = make_float3(
1338 cl.offset67XYZ.x*lata.x + cl.offset67XYZ.y*latb.x + cl.offset67XYZ.z*latc.x,
1339 cl.offset67XYZ.x*lata.y + cl.offset67XYZ.y*latb.y + cl.offset67XYZ.z*latc.y,
1340 cl.offset67XYZ.x*lata.z + cl.offset67XYZ.y*latb.z + cl.offset67XYZ.z*latc.z);
1342 float4 xyzq7 = xyzq[cl.i7];
1344 float3 r67 = make_float3(
1345 xyzq6.x + sh67.x - xyzq7.x,
1346 xyzq6.y + sh67.y - xyzq7.y,
1347 xyzq6.z + sh67.z - xyzq7.z);
1350 float3 sh78 = make_float3(
1351 cl.offset78XYZ.x*lata.x + cl.offset78XYZ.y*latb.x + cl.offset78XYZ.z*latc.x,
1352 cl.offset78XYZ.x*lata.y + cl.offset78XYZ.y*latb.y + cl.offset78XYZ.z*latc.y,
1353 cl.offset78XYZ.x*lata.z + cl.offset78XYZ.y*latb.z + cl.offset78XYZ.z*latc.z);
1355 float4 xyzq8 = xyzq[cl.i8];
1357 float3 r78 = make_float3(
1358 xyzq7.x + sh78.x - xyzq8.x,
1359 xyzq7.y + sh78.y - xyzq8.y,
1360 xyzq7.z + sh78.z - xyzq8.z);
1362 // Calculate the cross products
1363 float3 D = cross(r56, r67);
1364 float3 E = cross(r67, r78);
1365 float3 F = cross(r67, D);
1367 // Calculate the inverse distances
1368 float inv_rD = rsqrtf(dot(D, D));
1369 float inv_rE = rsqrtf(dot(E, E));
1370 float inv_rF = rsqrtf(dot(F, F));
1372 // Calculate the sin and cos
1373 float cos_psi = dot(D, E)*(inv_rD*inv_rE);
1374 float sin_psi = dot(F, E)*(inv_rF*inv_rE);
1376 float psi = -atan2f(sin_psi,cos_psi);
1378 // ----------------------------------------------------------------------------
1379 // Calculate the energy
1381 const float inv_h = (float)( (CudaCrosstermValue::dim) / (2.0*M_PI) );
1384 phi = fmod(phi + (float)M_PI, 2.0f*(float)M_PI);
1385 psi = fmod(psi + (float)M_PI, 2.0f*(float)M_PI);
1387 // distance measured in grid points between angle and smallest value
1388 float phi_h = phi * inv_h;
1389 float psi_h = psi * inv_h;
1391 // find smallest numbered grid point in stencil
1392 int iphi = (int)floor(phi_h);
1393 int ipsi = (int)floor(psi_h);
1395 const CudaCrosstermValue& cp = crosstermValues[cl.itype];
1397 // Load coefficients
1399 c[0] = cp.c[iphi][ipsi][0];
1400 c[1] = cp.c[iphi][ipsi][1];
1401 c[2] = cp.c[iphi][ipsi][2];
1402 c[3] = cp.c[iphi][ipsi][3];
1404 float dphi = phi_h - (float)iphi;
1405 float dpsi = psi_h - (float)ipsi;
1408 float alch_scale_energy_fep;
1409 double* p_energy_TI;
1410 if (doFEP || doTI) {
1412 alch_scale_energy_fep = 0.0f;
1413 if (doTI) p_energy_TI = NULL;
1414 switch (cl.fepBondedType) {
1416 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda1;
1419 p_energy_TI = &(energy_TI_1);
1422 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda12 - (AlchBondedCUDA::alchLambdas).bondLambda1;
1428 alch_scale *= (AlchBondedCUDA::alchLambdas).bondLambda2;
1431 p_energy_TI = &(energy_TI_2);
1434 alch_scale_energy_fep = (AlchBondedCUDA::alchLambdas).bondLambda22 - (AlchBondedCUDA::alchLambdas).bondLambda2;
1443 float energyf = c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) );
1444 energyf = energyf*dpsi + c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) );
1445 energyf = energyf*dpsi + c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) );
1446 energyf = energyf*dpsi + c[0].x + dphi*( c[0].y + dphi*( c[0].z + dphi*c[0].w ) );
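// Bicubic (CMAP-style) evaluation: c[j] holds the dphi-polynomial coefficients
// for the dpsi^j term, so the nested Horner evaluation above computes
//   E = sum_{j=0..3} dpsi^j * (c[j].x + c[j].y*dphi + c[j].z*dphi^2 + c[j].w*dphi^3),
// and the dEdphi/dEdpsi blocks below differentiate the same polynomial
// analytically before scaling by cl.scale and the grid spacing inv_h.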
1447 if (doFEP || doTI) {
1448 energy += energyf * cl.scale * alch_scale;
1450 energy_F += energyf * cl.scale * alch_scale_energy_fep;
1453 if (p_energy_TI != NULL) {
1454 (*p_energy_TI) += energyf * cl.scale;
1458 energy += energyf * cl.scale;
1462 float dEdphi = 3.0f*(c[0].w + dpsi*( c[1].w + dpsi*( c[2].w + dpsi*c[3].w ) ));
1463 dEdphi = dEdphi*dphi + 2.0f*(c[0].z + dpsi*( c[1].z + dpsi*( c[2].z + dpsi*c[3].z ) ));
1464 dEdphi = dEdphi*dphi + (c[0].y + dpsi*( c[1].y + dpsi*( c[2].y + dpsi*c[3].y ) )); // 13 muls
1465 dEdphi *= cl.scale*inv_h;
1467 float dEdpsi = 3.0f*(c[3].x + dphi*( c[3].y + dphi*( c[3].z + dphi*c[3].w ) ));
1468 dEdpsi = dEdpsi*dpsi + 2.0f*(c[2].x + dphi*( c[2].y + dphi*( c[2].z + dphi*c[2].w ) ));
1469 dEdpsi = dEdpsi*dpsi + (c[1].x + dphi*( c[1].y + dphi*( c[1].z + dphi*c[1].w ) )); // 13 muls
1470 dEdpsi *= cl.scale*inv_h;
1472 // float normCross1 = dot(A, A);
1473 float square_r23 = dot(r23, r23);
1474 float norm_r23 = sqrtf(square_r23);
1475 float inv_square_r23 = 1.0f/square_r23;
1476 float ff1 = dEdphi*norm_r23*inv_rA*inv_rA;
1477 float ff2 = -dot(r12, r23)*inv_square_r23;
1478 float ff3 = -dot(r34, r23)*inv_square_r23;
1479 float ff4 = -dEdphi*norm_r23*inv_rB*inv_rB;
1481 // NOTE: Why scaling only ff1 and ff4 is sufficient:
1482 // f1 and f4 are already scaled, and in the formula for t1 below
1483 // the first term is t1.x = ff2*f1.x - ff3*f4.x,
1486 // so t1.x is scaled, and likewise t1.y and t1.z => t1 is scaled.
1487 // Next consider f2.x:
1488 // f2.x = t1.x - f1.x
1490 // both terms are scaled => f2.x is scaled,
1491 // and f2.y and f2.z are scaled too => f2 is scaled; similarly f3 is scaled.
1492 // As a result, scaling ff1 and ff4 is enough. DON'T scale ff2 and ff3!
1493 if (doFEP || doTI) {
1498 float3 f1 = make_float3(ff1*A.x, ff1*A.y, ff1*A.z);
1499 float3 f4 = make_float3(ff4*B.x, ff4*B.y, ff4*B.z);
1500 float3 t1 = make_float3( ff2*f1.x - ff3*f4.x, ff2*f1.y - ff3*f4.y, ff2*f1.z - ff3*f4.z );
1501 float3 f2 = make_float3( t1.x - f1.x, t1.y - f1.y, t1.z - f1.z);
1502 float3 f3 = make_float3( -t1.x - f4.x, -t1.y - f4.y, -t1.z - f4.z);
1504 T T_f1x, T_f1y, T_f1z;
1505 T T_f2x, T_f2y, T_f2z;
1506 T T_f3x, T_f3y, T_f3z;
1507 T T_f4x, T_f4y, T_f4z;
1508 convertForces<T>(f1.x, f1.y, f1.z, T_f1x, T_f1y, T_f1z);
1509 convertForces<T>(f2.x, f2.y, f2.z, T_f2x, T_f2y, T_f2z);
1510 convertForces<T>(f3.x, f3.y, f3.z, T_f3x, T_f3y, T_f3z);
1511 convertForces<T>(f4.x, f4.y, f4.z, T_f4x, T_f4y, T_f4z);
1512 storeForces<T>(T_f1x, T_f1y, T_f1z, cl.i1, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1513 storeForces<T>(T_f2x, T_f2y, T_f2z, cl.i2, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1514 storeForces<T>(T_f3x, T_f3y, T_f3z, cl.i3, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1515 storeForces<T>(T_f4x, T_f4y, T_f4z, cl.i4, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1517 float square_r67 = dot(r67, r67);
1518 float norm_r67 = sqrtf(square_r67);
1519 float inv_square_r67 = 1.0f/(square_r67);
1520 ff1 = dEdpsi*norm_r67*inv_rD*inv_rD;
1521 ff2 = -dot(r56, r67)*inv_square_r67;
1522 ff3 = -dot(r78, r67)*inv_square_r67;
1523 ff4 = -dEdpsi*norm_r67*inv_rE*inv_rE;
1525 if (doFEP || doTI) {
1530 float3 f5 = make_float3( ff1*D.x, ff1*D.y, ff1*D.z );
1531 float3 f8 = make_float3( ff4*E.x, ff4*E.y, ff4*E.z );
1532 float3 t2 = make_float3( ff2*f5.x - ff3*f8.x, ff2*f5.y - ff3*f8.y, ff2*f5.z - ff3*f8.z );
1533 float3 f6 = make_float3( t2.x - f5.x, t2.y - f5.y, t2.z - f5.z );
1534 float3 f7 = make_float3(-t2.x - f8.x, -t2.y - f8.y, -t2.z - f8.z );
1536 T T_f5x, T_f5y, T_f5z;
1537 T T_f6x, T_f6y, T_f6z;
1538 T T_f7x, T_f7y, T_f7z;
1539 T T_f8x, T_f8y, T_f8z;
1540 convertForces<T>(f5.x, f5.y, f5.z, T_f5x, T_f5y, T_f5z);
1541 convertForces<T>(f6.x, f6.y, f6.z, T_f6x, T_f6y, T_f6z);
1542 convertForces<T>(f7.x, f7.y, f7.z, T_f7x, T_f7y, T_f7z);
1543 convertForces<T>(f8.x, f8.y, f8.z, T_f8x, T_f8y, T_f8z);
1544 storeForces<T>(T_f5x, T_f5y, T_f5z, cl.i5, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1545 storeForces<T>(T_f6x, T_f6y, T_f6z, cl.i6, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1546 storeForces<T>(T_f7x, T_f7y, T_f7z, cl.i7, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1547 storeForces<T>(T_f8x, T_f8y, T_f8z, cl.i8, stride, force, forceList, forceListCounter, forceListStarts, forceListNexts);
1551 #ifdef WRITE_FULL_VIRIALS
1552 float3 s12 = make_float3( f1.x + f2.x, f1.y + f2.y, f1.z + f2.z );
1553 float3 s56 = make_float3( f5.x + f6.x, f5.y + f6.y, f5.z + f6.z );
1554 virial.xx = f1.x*r12.x + s12.x*r23.x - f4.x*r34.x + f5.x*r56.x + s56.x*r67.x - f8.x*r78.x;
1555 virial.xy = f1.x*r12.y + s12.x*r23.y - f4.x*r34.y + f5.x*r56.y + s56.x*r67.y - f8.x*r78.y;
1556 virial.xz = f1.x*r12.z + s12.x*r23.z - f4.x*r34.z + f5.x*r56.z + s56.x*r67.z - f8.x*r78.z;
1557 virial.yx = f1.y*r12.x + s12.y*r23.x - f4.y*r34.x + f5.y*r56.x + s56.y*r67.x - f8.y*r78.x;
1558 virial.yy = f1.y*r12.y + s12.y*r23.y - f4.y*r34.y + f5.y*r56.y + s56.y*r67.y - f8.y*r78.y;
1559 virial.yz = f1.y*r12.z + s12.y*r23.z - f4.y*r34.z + f5.y*r56.z + s56.y*r67.z - f8.y*r78.z;
1560 virial.zx = f1.z*r12.x + s12.z*r23.x - f4.z*r34.x + f5.z*r56.x + s56.z*r67.x - f8.z*r78.x;
1561 virial.zy = f1.z*r12.y + s12.z*r23.y - f4.z*r34.y + f5.z*r56.y + s56.z*r67.y - f8.z*r78.y;
1562 virial.zz = f1.z*r12.z + s12.z*r23.z - f4.z*r34.z + f5.z*r56.z + s56.z*r67.z - f8.z*r78.z;
1569 #define BONDEDFORCESKERNEL_NUM_WARP 2
1571 #define BONDEDFORCESKERNEL_NUM_WARP 4
1574 // Calculates all forces in a single kernel call
1576 template <typename T, bool doEnergy, bool doVirial, bool doFEP, bool doTI>
1577 __global__ void bondedForcesKernel(
1581 const CudaBond* __restrict__ bonds,
1582 const CudaBondValue* __restrict__ bondValues,
1584 const int numAngles,
1585 const CudaAngle* __restrict__ angles,
1586 const CudaAngleValue* __restrict__ angleValues,
1588 const int numDihedrals,
1589 const CudaDihedral* __restrict__ dihedrals,
1590 const CudaDihedralValue* __restrict__ dihedralValues,
1592 const int numImpropers,
1593 const CudaDihedral* __restrict__ impropers,
1594 const CudaDihedralValue* __restrict__ improperValues,
1596 const int numExclusions,
1597 const CudaExclusion* __restrict__ exclusions,
1599 const int numCrossterms,
1600 const CudaCrossterm* __restrict__ crossterms,
1601 const CudaCrosstermValue* __restrict__ crosstermValues,
1603 const float cutoff2,
1604 const float r2_delta, const int r2_delta_expc,
1606 const float* __restrict__ r2_table,
1607 const float4* __restrict__ exclusionTable,
1609 #if !defined(USE_TABLE_ARRAYS)
1610 cudaTextureObject_t r2_table_tex,
1611 cudaTextureObject_t exclusionTableTex,
1614 const float4* __restrict__ xyzq,
1616 const float3 lata, const float3 latb, const float3 latc,
1617 T* __restrict__ force,
1618 T* __restrict__ forceSlow,
1619 T* __restrict__ forceList,
1620 int* forceListCounter,
1621 int* forceListStarts,
1622 int* forceListStartsSlow,
1623 int* forceListNexts,
1624 double* __restrict__ energies_virials) {
1626 // Thread-block index
1627 int indexTB = start + blockIdx.x;
1629 const int numBondsTB = (numBonds + blockDim.x - 1)/blockDim.x;
1630 const int numAnglesTB = (numAngles + blockDim.x - 1)/blockDim.x;
1631 const int numDihedralsTB = (numDihedrals + blockDim.x - 1)/blockDim.x;
1632 const int numImpropersTB = (numImpropers + blockDim.x - 1)/blockDim.x;
1633 const int numExclusionsTB= (numExclusions + blockDim.x - 1)/blockDim.x;
1634 const int numCrosstermsTB= (numCrossterms + blockDim.x - 1)/blockDim.x;
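// Work distribution: each bonded term type is assigned ceil(numX / blockDim.x)
// consecutive thread blocks. indexTB is compared against each type's block
// count in turn and decremented as that type is passed, so every thread block
// handles exactly one term type and every thread one list entry.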
1636 // Each thread computes a single bonded interaction.
1637 // Each thread block computes a single bonded type.
1644 int energyIndex_TI_1;
1645 int energyIndex_TI_2;
1657 energyIndex_TI_1 = -1;
1658 energyIndex_TI_2 = -1;
1662 #ifdef WRITE_FULL_VIRIALS
1663 ComputeBondedCUDAKernel::BondedVirial<double> virial;
1675 virialIndex = ComputeBondedCUDAKernel::normalVirialIndex_XX;
1679 if (indexTB < numBondsTB) {
1680 int i = threadIdx.x + indexTB*blockDim.x;
1682 energyIndex = ComputeBondedCUDAKernel::energyIndex_BOND;
1684 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_BOND_F;
1687 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_1;
1688 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_BOND_TI_2;
1692 bondForce<T, doEnergy, doVirial, doFEP, doTI>
1693 (i, bonds, bondValues, xyzq,
1694 stride, lata, latb, latc,
1696 forceList, forceListCounter, forceListStarts, forceListNexts,
1698 energy_F, energy_TI_1, energy_TI_2);
1702 indexTB -= numBondsTB;
1704 if (indexTB < numAnglesTB) {
1705 int i = threadIdx.x + indexTB*blockDim.x;
1707 energyIndex = ComputeBondedCUDAKernel::energyIndex_ANGLE;
1709 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ANGLE_F;
1712 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_1;
1713 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ANGLE_TI_2;
1716 if (i < numAngles) {
1717 angleForce<T, doEnergy, doVirial, doFEP, doTI>
1718 (i, angles, angleValues, xyzq, stride,
1721 forceList, forceListCounter, forceListStarts, forceListNexts,
1723 energy_F, energy_TI_1, energy_TI_2);
1727 indexTB -= numAnglesTB;
1729 if (indexTB < numDihedralsTB) {
1730 int i = threadIdx.x + indexTB*blockDim.x;
1732 energyIndex = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL;
1734 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_F;
1737 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_1;
1738 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_DIHEDRAL_TI_2;
1742 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1744 if (i < numDihedrals) {
1745 diheForce<T, doEnergy, doVirial, doFEP, doTI>
1746 (i, dihedrals, dihedralValues, xyzq, stride,
1749 forceList, forceListCounter, forceListStarts, forceListNexts,
1751 energy_F, energy_TI_1, energy_TI_2);
1755 indexTB -= numDihedralsTB;
1757 if (indexTB < numImpropersTB) {
1758 int i = threadIdx.x + indexTB*blockDim.x;
1760 energyIndex = ComputeBondedCUDAKernel::energyIndex_IMPROPER;
1762 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_IMPROPER_F;
1765 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_1;
1766 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_IMPROPER_TI_2;
1769 if (i < numImpropers) {
1770 diheForce<T, doEnergy, doVirial, doFEP, doTI>
1771 (i, impropers, improperValues, xyzq, stride,
1774 forceList, forceListCounter, forceListStarts, forceListNexts,
1776 energy_F, energy_TI_1, energy_TI_2);
1780 indexTB -= numImpropersTB;
1782 if (indexTB < numExclusionsTB) {
1783 int i = threadIdx.x + indexTB*blockDim.x;
1785 energyIndex = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW;
1787 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F;
1790 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1;
1791 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2;
1795 virialIndex = ComputeBondedCUDAKernel::slowVirialIndex_XX;
1797 if (i < numExclusions) {
1798 exclusionForce<T, doEnergy, doVirial, doFEP, doTI>
1799 (i, exclusions, r2_delta, r2_delta_expc,
1800 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP) || defined(USE_TABLE_ARRAYS)
1801 r2_table, exclusionTable,
1803 r2_table_tex, exclusionTableTex,
1805 xyzq, stride, lata, latb, latc, cutoff2,
1807 forceList, forceListCounter, forceListStartsSlow, forceListNexts,
1809 energy_F, energy_TI_1, energy_TI_2);
1813 indexTB -= numExclusionsTB;
1815 if (indexTB < numCrosstermsTB) {
1816 int i = threadIdx.x + indexTB*blockDim.x;
1818 energyIndex = ComputeBondedCUDAKernel::energyIndex_CROSSTERM;
1820 energyIndex_F = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_F;
1823 energyIndex_TI_1 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_1;
1824 energyIndex_TI_2 = ComputeBondedCUDAKernel::energyIndex_CROSSTERM_TI_2;
1828 virialIndex = ComputeBondedCUDAKernel::amdDiheVirialIndex_XX;
1830 if (i < numCrossterms) {
1831 crosstermForce<T, doEnergy, doVirial, doFEP, doTI>
1832 (i, crossterms, crosstermValues,
1833 xyzq, stride, lata, latb, latc,
1835 forceList, forceListCounter, forceListStarts, forceListNexts,
1837 energy_F, energy_TI_1, energy_TI_2);
1841 // indexTB -= numCrosstermsTB;
1845 // Write energies to global memory
1847 // energyIndex is constant within thread-block
1848 __shared__ double shEnergy[BONDEDFORCESKERNEL_NUM_WARP];
1849 __shared__ double shEnergy_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
1850 __shared__ double shEnergy_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
1851 __shared__ double shEnergy_F[BONDEDFORCESKERNEL_NUM_WARP];
1853 for (int i=WARPSIZE/2;i >= 1;i/=2) {
1854 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1857 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
1860 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
1861 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
1864 int laneID = (threadIdx.x & (WARPSIZE - 1));
1865 int warpID = threadIdx.x / WARPSIZE;
1867 shEnergy[warpID] = energy;
1869 shEnergy_F[warpID] = energy_F;
1872 shEnergy_TI_1[warpID] = energy_TI_1;
1873 shEnergy_TI_2[warpID] = energy_TI_2;
1878 energy = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy[laneID] : 0.0;
1880 energy_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_F[laneID] : 0.0;
1883 energy_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_1[laneID] : 0.0;
1884 energy_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergy_TI_2[laneID] : 0.0;
1887 for (int i=WARPSIZE/2;i >= 1;i/=2) {
1888 energy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy, i, WARPSIZE);
1891 energy_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_F, i, WARPSIZE);
1894 energy_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_1, i, WARPSIZE);
1895 energy_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energy_TI_2, i, WARPSIZE);
1899 const int bin = blockIdx.x % ATOMIC_BINS;
1900 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex], energy);
1902 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_F], energy_F);
1905 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_1], energy_TI_1);
1906 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + energyIndex_TI_2], energy_TI_2);
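// Energy reduction scheme: each energy is first summed across a warp with XOR
// shuffles, staged in shared memory (one slot per warp), reduced again by the
// first warp, and finally accumulated into energies_virials with atomicAdd.
// The blockIdx-based bin (ATOMIC_BINS) spreads the atomics over several copies
// of the output array, which are presumably combined afterwards on the host or
// in a follow-up kernel.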
1912 // Write virials to global memory
1913 #ifdef WRITE_FULL_VIRIALS
1916 for (int i=WARPSIZE/2;i >= 1;i/=2) {
1917 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1918 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1919 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1920 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1921 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1922 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1923 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1924 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1925 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1927 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirial[BONDEDFORCESKERNEL_NUM_WARP];
1928 int laneID = (threadIdx.x & (WARPSIZE - 1));
1929 int warpID = threadIdx.x / WARPSIZE;
1931 shVirial[warpID] = virial;
1945 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) virial = shVirial[laneID];
1947 for (int i=WARPSIZE/2;i >= 1;i/=2) {
1948 virial.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xx, i, WARPSIZE);
1949 virial.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xy, i, WARPSIZE);
1950 virial.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.xz, i, WARPSIZE);
1951 virial.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yx, i, WARPSIZE);
1952 virial.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yy, i, WARPSIZE);
1953 virial.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.yz, i, WARPSIZE);
1954 virial.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zx, i, WARPSIZE);
1955 virial.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zy, i, WARPSIZE);
1956 virial.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virial.zz, i, WARPSIZE);
1960 const int bin = blockIdx.x % ATOMIC_BINS;
1961 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 0], virial.xx);
1962 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 1], virial.xy);
1963 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 2], virial.xz);
1964 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 3], virial.yx);
1965 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 4], virial.yy);
1966 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 5], virial.yz);
1967 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 6], virial.zx);
1968 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 7], virial.zy);
1969 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + virialIndex + 8], virial.zz);
1977 template <typename T, bool doEnergy, bool doVirial, bool doElect, bool doFEP, bool doTI, bool doTable>
1978 __global__ void modifiedExclusionForcesKernel(
1980 const int numModifiedExclusions,
1981 const CudaExclusion* __restrict__ modifiedExclusions,
1983 const float one_scale14, // 1 - scale14
1984 const float cutoff2,
1985 const CudaNBConstants nbConstants,
1986 const int vdwCoefTableWidth,
1987 const float2* __restrict__ vdwCoefTable,
1988 #if !defined(USE_TABLE_ARRAYS)
1989 cudaTextureObject_t vdwCoefTableTex,
1991 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
1992 const float4* __restrict__ modifiedExclusionForceTable,
1993 const float4* __restrict__ modifiedExclusionEnergyTable,
1995 cudaTextureObject_t modifiedExclusionForceTableTex,
1996 cudaTextureObject_t modifiedExclusionEnergyTableTex,
1998 const float4* __restrict__ xyzq,
2000 const float3 lata, const float3 latb, const float3 latc,
2001 T* __restrict__ forceNbond, T* __restrict__ forceSlow,
2002 T* __restrict__ forceList,
2003 int* forceListCounter,
2004 int* forceListStartsNbond,
2005 int* forceListStartsSlow,
2006 int* forceListNexts,
2007 double* __restrict__ energies_virials) {
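// One thread per modified (1-4 scaled) exclusion: accumulates vdW and electrostatic forces
// into the nbond and (if doSlow) slow force arrays, plus the FEP/TI energy terms when the
// corresponding template flags are enabled.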
2010 int i = threadIdx.x + (start + blockIdx.x)*blockDim.x;
2012 double energyVdw, energyNbond, energySlow;
2013 // Alchemical energies
2014 double energyVdw_F, energyVdw_TI_1, energyVdw_TI_2;
2015 double energyNbond_F, energyNbond_TI_1, energyNbond_TI_2;
2016 double energySlow_F, energySlow_TI_1, energySlow_TI_2;
2023 energyVdw_TI_1 = 0.0;
2024 energyVdw_TI_2 = 0.0;
2030 energyNbond_F = 0.0;
2034 energyNbond_TI_1 = 0.0;
2035 energyNbond_TI_2 = 0.0;
2036 energySlow_TI_1 = 0.0;
2037 energySlow_TI_2 = 0.0;
2042 #ifdef WRITE_FULL_VIRIALS
2043 ComputeBondedCUDAKernel::BondedVirial<double> virialNbond;
2044 ComputeBondedCUDAKernel::BondedVirial<double> virialSlow;
2046 virialNbond.xx = 0.0;
2047 virialNbond.xy = 0.0;
2048 virialNbond.xz = 0.0;
2049 virialNbond.yx = 0.0;
2050 virialNbond.yy = 0.0;
2051 virialNbond.yz = 0.0;
2052 virialNbond.zx = 0.0;
2053 virialNbond.zy = 0.0;
2054 virialNbond.zz = 0.0;
2056 virialSlow.xx = 0.0;
2057 virialSlow.xy = 0.0;
2058 virialSlow.xz = 0.0;
2059 virialSlow.yx = 0.0;
2060 virialSlow.yy = 0.0;
2061 virialSlow.yz = 0.0;
2062 virialSlow.zx = 0.0;
2063 virialSlow.zy = 0.0;
2064 virialSlow.zz = 0.0;
2069 if (i < numModifiedExclusions)
2071 modifiedExclusionForce<T, doEnergy, doVirial, doElect, doFEP, doTI, doTable>
2072 (i, modifiedExclusions, doSlow, one_scale14, vdwCoefTableWidth,
2073 #if __CUDA_ARCH__ >= 350 || defined(USE_TABLE_ARRAYS)
2079 // For modified exclusions, plain table arrays are used when building for HIP with USE_TABLE_ARRAYS; otherwise texture tables are used
2079 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2081 modifiedExclusionForceTable,
2082 modifiedExclusionEnergyTable,
2084 // CUDA builds (and builds without table arrays) fall back to the texture force/energy tables
2085 modifiedExclusionForceTableTex,
2086 modifiedExclusionEnergyTableTex,
2088 xyzq, stride, lata, latb, latc, cutoff2, nbConstants,
2089 energyVdw, forceNbond, energyNbond,
2090 forceSlow, energySlow,
2091 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts,
2092 virialNbond, virialSlow,
2093 energyVdw_F, energyVdw_TI_1, energyVdw_TI_2,
2094 energyNbond_F, energyNbond_TI_1, energyNbond_TI_2,
2095 energySlow_F, energySlow_TI_1, energySlow_TI_2);
2098 // Write energies to global memory
2100 __shared__ double shEnergyVdw[BONDEDFORCESKERNEL_NUM_WARP];
2101 __shared__ double shEnergyNbond[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2102 __shared__ double shEnergySlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2103 __shared__ double shEnergyVdw_F[BONDEDFORCESKERNEL_NUM_WARP];
2104 __shared__ double shEnergyVdw_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2105 __shared__ double shEnergyVdw_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2106 __shared__ double shEnergyNbond_F[BONDEDFORCESKERNEL_NUM_WARP];
2107 __shared__ double shEnergyNbond_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2108 __shared__ double shEnergyNbond_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2109 __shared__ double shEnergySlow_F[BONDEDFORCESKERNEL_NUM_WARP];
2110 __shared__ double shEnergySlow_TI_1[BONDEDFORCESKERNEL_NUM_WARP];
2111 __shared__ double shEnergySlow_TI_2[BONDEDFORCESKERNEL_NUM_WARP];
2113 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2114 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2117 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2120 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2121 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2124 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2125 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2127 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2128 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2131 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2132 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2133 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2134 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2138 int laneID = (threadIdx.x & (WARPSIZE - 1));
2139 int warpID = threadIdx.x / WARPSIZE;
2141 shEnergyVdw[warpID] = energyVdw;
2143 shEnergyVdw_F[warpID] = energyVdw_F;
2146 shEnergyVdw_TI_1[warpID] = energyVdw_TI_1;
2147 shEnergyVdw_TI_2[warpID] = energyVdw_TI_2;
2150 shEnergyNbond[warpID] = energyNbond;
2151 shEnergySlow[warpID] = energySlow;
2153 shEnergyNbond_F[warpID] = energyNbond_F;
2154 shEnergySlow_F[warpID] = energySlow_F;
2157 shEnergyNbond_TI_1[warpID] = energyNbond_TI_1;
2158 shEnergyNbond_TI_2[warpID] = energyNbond_TI_2;
2159 shEnergySlow_TI_1[warpID] = energySlow_TI_1;
2160 shEnergySlow_TI_2[warpID] = energySlow_TI_2;
2166 energyVdw = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw[laneID] : 0.0;
2168 energyVdw_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_F[laneID] : 0.0;
2171 energyVdw_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_1[laneID] : 0.0;
2172 energyVdw_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyVdw_TI_2[laneID] : 0.0;
2175 energyNbond = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond[laneID] : 0.0;
2176 energySlow = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow[laneID] : 0.0;
2178 energyNbond_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_F[laneID] : 0.0;
2179 energySlow_F = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_F[laneID] : 0.0;
2182 energyNbond_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_1[laneID] : 0.0;
2183 energyNbond_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergyNbond_TI_2[laneID] : 0.0;
2184 energySlow_TI_1 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_1[laneID] : 0.0;
2185 energySlow_TI_2 = (laneID < BONDEDFORCESKERNEL_NUM_WARP) ? shEnergySlow_TI_2[laneID] : 0.0;
2189 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2190 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
2193 energyVdw_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_F, i, WARPSIZE);
2196 energyVdw_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_1, i, WARPSIZE);
2197 energyVdw_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_TI_2, i, WARPSIZE);
2200 energyNbond += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond, i, WARPSIZE);
2201 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
2203 energyNbond_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_F, i, WARPSIZE);
2204 energySlow_F += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_F, i, WARPSIZE);
2207 energyNbond_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_1, i, WARPSIZE);
2208 energyNbond_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyNbond_TI_2, i, WARPSIZE);
2209 energySlow_TI_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_1, i, WARPSIZE);
2210 energySlow_TI_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_TI_2, i, WARPSIZE);
2215 const int bin = blockIdx.x % ATOMIC_BINS;
2216 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ], energyVdw);
2218 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_F], energyVdw_F);
2221 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_1], energyVdw_TI_1);
2222 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_LJ_TI_2], energyVdw_TI_2);
2225 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT], energyNbond);
2226 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW], energySlow);
2228 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_F], energyNbond_F);
2229 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_F], energySlow_F);
2232 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_1], energyNbond_TI_1);
2233 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_TI_2], energyNbond_TI_2);
2234 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_1], energySlow_TI_1);
2235 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW_TI_2], energySlow_TI_2);
2242 // Write virials to global memory
2243 #ifdef WRITE_FULL_VIRIALS
2246 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2247 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2248 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2249 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2250 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2251 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2252 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2253 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2254 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2255 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2256 if (doElect && doSlow) {
2257 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2258 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2259 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2260 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2261 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2262 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2263 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2264 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2265 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2268 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialNbond[BONDEDFORCESKERNEL_NUM_WARP];
2269 __shared__ ComputeBondedCUDAKernel::BondedVirial<double> shVirialSlow[(doElect) ? BONDEDFORCESKERNEL_NUM_WARP : 1];
2270 int laneID = (threadIdx.x & (WARPSIZE - 1));
2271 int warpID = threadIdx.x / WARPSIZE;
2273 shVirialNbond[warpID] = virialNbond;
2274 if (doElect && doSlow) {
2275 shVirialSlow[warpID] = virialSlow;
2280 virialNbond.xx = 0.0;
2281 virialNbond.xy = 0.0;
2282 virialNbond.xz = 0.0;
2283 virialNbond.yx = 0.0;
2284 virialNbond.yy = 0.0;
2285 virialNbond.yz = 0.0;
2286 virialNbond.zx = 0.0;
2287 virialNbond.zy = 0.0;
2288 virialNbond.zz = 0.0;
2289 if (doElect && doSlow) {
2290 virialSlow.xx = 0.0;
2291 virialSlow.xy = 0.0;
2292 virialSlow.xz = 0.0;
2293 virialSlow.yx = 0.0;
2294 virialSlow.yy = 0.0;
2295 virialSlow.yz = 0.0;
2296 virialSlow.zx = 0.0;
2297 virialSlow.zy = 0.0;
2298 virialSlow.zz = 0.0;
2302 if (laneID < BONDEDFORCESKERNEL_NUM_WARP) {
2303 virialNbond = shVirialNbond[laneID];
2304 if (doElect && doSlow) {
2305 virialSlow = shVirialSlow[laneID];
2309 for (int i=WARPSIZE/2;i >= 1;i/=2) {
2310 virialNbond.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xx, i, WARPSIZE);
2311 virialNbond.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xy, i, WARPSIZE);
2312 virialNbond.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.xz, i, WARPSIZE);
2313 virialNbond.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yx, i, WARPSIZE);
2314 virialNbond.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yy, i, WARPSIZE);
2315 virialNbond.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.yz, i, WARPSIZE);
2316 virialNbond.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zx, i, WARPSIZE);
2317 virialNbond.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zy, i, WARPSIZE);
2318 virialNbond.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialNbond.zz, i, WARPSIZE);
2319 if (doElect && doSlow) {
2320 virialSlow.xx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xx, i, WARPSIZE);
2321 virialSlow.xy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xy, i, WARPSIZE);
2322 virialSlow.xz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.xz, i, WARPSIZE);
2323 virialSlow.yx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yx, i, WARPSIZE);
2324 virialSlow.yy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yy, i, WARPSIZE);
2325 virialSlow.yz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.yz, i, WARPSIZE);
2326 virialSlow.zx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zx, i, WARPSIZE);
2327 virialSlow.zy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zy, i, WARPSIZE);
2328 virialSlow.zz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, virialSlow.zz, i, WARPSIZE);
2334 const int bin = blockIdx.x % ATOMIC_BINS;
2335 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XX], virialNbond.xx);
2336 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XY], virialNbond.xy);
2337 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_XZ], virialNbond.xz);
2338 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YX], virialNbond.yx);
2339 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YY], virialNbond.yy);
2340 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_YZ], virialNbond.yz);
2341 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZX], virialNbond.zx);
2342 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZY], virialNbond.zy);
2343 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::nbondVirialIndex_ZZ], virialNbond.zz);
2344 if (doElect && doSlow) {
2345 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XX], virialSlow.xx);
2346 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XY], virialSlow.xy);
2347 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_XZ], virialSlow.xz);
2348 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YX], virialSlow.yx);
2349 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YY], virialSlow.yy);
2350 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_YZ], virialSlow.yz);
2351 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZX], virialSlow.zx);
2352 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZY], virialSlow.zy);
2353 atomicAdd(&energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + ComputeBondedCUDAKernel::slowVirialIndex_ZZ], virialSlow.zz);
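// Used only without USE_BONDED_FORCE_ATOMIC_STORE: force contributions were appended to
// forceList as per-atom singly linked chains (forceListStarts/forceListNexts, -1 terminated);
// this kernel walks each atom's chain and sums the contributions into the strided force array.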
2362 template <typename T>
2363 __global__ void gatherBondedForcesKernel(
2365 const int atomStorageSize,
2367 const T* __restrict__ forceList,
2368 const int* __restrict__ forceListStarts,
2369 const int* __restrict__ forceListNexts,
2370 T* __restrict__ force) {
2372 int i = threadIdx.x + (start + blockIdx.x) * blockDim.x;
2374 if (i < atomStorageSize) {
2375 T fx = 0;
2376 T fy = 0;
2377 T fz = 0;
2378 int pos = forceListStarts[i];
2379 while (pos != -1) {
2380 fx += forceList[pos * 3 + 0];
2381 fy += forceList[pos * 3 + 1];
2382 fz += forceList[pos * 3 + 2];
2383 pos = forceListNexts[pos];
2384 }
2385 force[i] = fx;
2386 force[i + stride] = fy;
2387 force[i + stride * 2] = fz;
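// One block per energies_virials entry with ATOMIC_BINS threads: each block sums its entry
// across all bins via a warp reduction and writes the total back to bin 0, the copy that is
// later transferred to the host.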
2391 __global__ void reduceBondedBinsKernel(double* energies_virials) {
2392 const int bin = threadIdx.x;
2394 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2395 __shared__ typename WarpReduce::TempStorage tempStorage;
2397 double v = WarpReduce(tempStorage).Sum(energies_virials[bin * ComputeBondedCUDAKernel::energies_virials_SIZE + blockIdx.x]);
2398 if (threadIdx.x == 0) {
2399 energies_virials[blockIdx.x] = v;
2403 // ##############################################################################################
2404 // ##############################################################################################
2405 // ##############################################################################################
2408 // Class constructor
2410 ComputeBondedCUDAKernel::ComputeBondedCUDAKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables) :
2411 deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables) {
2413 cudaCheck(cudaSetDevice(deviceID));
2422 modifiedExclusions = NULL;
2429 numModifiedExclusions = 0;
2435 dihedralValues = NULL;
2436 improperValues = NULL;
2437 crosstermValues = NULL;
2446 forceListStarts = NULL;
2447 forceListNexts = NULL;
2449 forceListStartsSize = 0;
2450 forceListNextsSize = 0;
2451 allocate_device<int>(&forceListCounter, 1);
2453 allocate_device<double>(&energies_virials, ATOMIC_BINS * energies_virials_SIZE);
2459 ComputeBondedCUDAKernel::~ComputeBondedCUDAKernel() {
2460 cudaCheck(cudaSetDevice(deviceID));
2462 deallocate_device<double>(&energies_virials);
2463 // deallocate_device<BondedVirial>(&virial);
2465 if (tupleData != NULL) deallocate_device<char>(&tupleData);
2466 if (xyzq != NULL) deallocate_device<float4>(&xyzq);
2467 if (forces != NULL) deallocate_device<FORCE_TYPE>(&forces);
2469 if (forceList != NULL) deallocate_device<FORCE_TYPE>(&forceList);
2470 if (forceListCounter != NULL) deallocate_device<int>(&forceListCounter);
2471 if (forceListStarts != NULL) deallocate_device<int>(&forceListStarts);
2472 if (forceListNexts != NULL) deallocate_device<int>(&forceListNexts);
2474 if (bondValues != NULL) deallocate_device<CudaBondValue>(&bondValues);
2475 if (angleValues != NULL) deallocate_device<CudaAngleValue>(&angleValues);
2476 if (dihedralValues != NULL) deallocate_device<CudaDihedralValue>(&dihedralValues);
2477 if (improperValues != NULL) deallocate_device<CudaDihedralValue>(&improperValues);
2478 if (crosstermValues != NULL) deallocate_device<CudaCrosstermValue>(&crosstermValues);
2481 void ComputeBondedCUDAKernel::setupBondValues(int numBondValues, CudaBondValue* h_bondValues) {
2482 allocate_device<CudaBondValue>(&bondValues, numBondValues);
2483 copy_HtoD_sync<CudaBondValue>(h_bondValues, bondValues, numBondValues);
2486 void ComputeBondedCUDAKernel::setupAngleValues(int numAngleValues, CudaAngleValue* h_angleValues) {
2487 allocate_device<CudaAngleValue>(&angleValues, numAngleValues);
2488 copy_HtoD_sync<CudaAngleValue>(h_angleValues, angleValues, numAngleValues);
2491 void ComputeBondedCUDAKernel::setupDihedralValues(int numDihedralValues, CudaDihedralValue* h_dihedralValues) {
2492 allocate_device<CudaDihedralValue>(&dihedralValues, numDihedralValues);
2493 copy_HtoD_sync<CudaDihedralValue>(h_dihedralValues, dihedralValues, numDihedralValues);
2496 void ComputeBondedCUDAKernel::setupImproperValues(int numImproperValues, CudaDihedralValue* h_improperValues) {
2497 allocate_device<CudaDihedralValue>(&improperValues, numImproperValues);
2498 copy_HtoD_sync<CudaDihedralValue>(h_improperValues, improperValues, numImproperValues);
2501 void ComputeBondedCUDAKernel::setupCrosstermValues(int numCrosstermValues, CudaCrosstermValue* h_crosstermValues) {
2502 allocate_device<CudaCrosstermValue>(&crosstermValues, numCrosstermValues);
2503 copy_HtoD_sync<CudaCrosstermValue>(h_crosstermValues, crosstermValues, numCrosstermValues);
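// Alchemical state: the parameter and lambda structs are device constant symbols
// (AlchBondedCUDA::alchParams / alchLambdas) refreshed asynchronously with
// cudaMemcpyToSymbolAsync, while alchFlags stays host-side and only selects the
// doFEP/doTI kernel template instantiations in bondedForce().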
2506 void ComputeBondedCUDAKernel::updateCudaAlchParameters(const CudaAlchParameters* h_cudaAlchParameters, cudaStream_t stream) {
2507 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchParams, h_cudaAlchParameters, 1*sizeof(AlchBondedCUDA::alchParams), 0, cudaMemcpyHostToDevice, stream));
2510 void ComputeBondedCUDAKernel::updateCudaAlchLambdas(const CudaAlchLambdas* h_cudaAlchLambdas, cudaStream_t stream) {
2511 cudaCheck(cudaMemcpyToSymbolAsync(AlchBondedCUDA::alchLambdas, h_cudaAlchLambdas, 1*sizeof(AlchBondedCUDA::alchLambdas), 0, cudaMemcpyHostToDevice, stream));
2514 void ComputeBondedCUDAKernel::updateCudaAlchFlags(const CudaAlchFlags& h_cudaAlchFlags) {
2515 alchFlags = h_cudaAlchFlags;
2519 // Update bonded lists
2521 void ComputeBondedCUDAKernel::setTupleCounts(
2522 const TupleCounts count
2524 numBonds = count.bond;
2525 numAngles = count.angle;
2526 numDihedrals = count.dihedral;
2527 numImpropers = count.improper;
2528 numModifiedExclusions = count.modifiedExclusion;
2529 numExclusions = count.exclusion;
2530 numCrossterms = count.crossterm;
2533 TupleCounts ComputeBondedCUDAKernel::getTupleCounts() {
2536 count.bond = numBonds;
2537 count.angle = numAngles;
2538 count.dihedral = numDihedrals;
2539 count.improper = numImpropers;
2540 count.modifiedExclusion = numModifiedExclusions;
2541 count.exclusion = numExclusions;
2542 count.crossterm = numCrossterms;
2547 TupleData ComputeBondedCUDAKernel::getData() {
2550 data.angle = angles;
2551 data.dihedral = dihedrals;
2552 data.improper = impropers;
2553 data.modifiedExclusion = modifiedExclusions;
2554 data.exclusion = exclusions;
2555 data.crossterm = crossterms;
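// All tuple arrays live in one contiguous device buffer: each count is padded by warpAlign
// (presumably up to a multiple of WARPSIZE) and the sub-arrays are carved out of tupleData
// back to back in the order bonds, angles, dihedrals, impropers, modified exclusions,
// exclusions, crossterms.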
2560 size_t ComputeBondedCUDAKernel::reallocateTupleBuffer(
2561 const TupleCounts countIn,
2564 const int numBondsWA = warpAlign(countIn.bond);
2565 const int numAnglesWA = warpAlign(countIn.angle);
2566 const int numDihedralsWA = warpAlign(countIn.dihedral);
2567 const int numImpropersWA = warpAlign(countIn.improper);
2568 const int numModifiedExclusionsWA = warpAlign(countIn.modifiedExclusion);
2569 const int numExclusionsWA = warpAlign(countIn.exclusion);
2570 const int numCrosstermsWA = warpAlign(countIn.crossterm);
2572 const size_t sizeTot = numBondsWA*sizeof(CudaBond)
2573 + numAnglesWA*sizeof(CudaAngle)
2574 + numDihedralsWA*sizeof(CudaDihedral)
2575 + numImpropersWA*sizeof(CudaDihedral)
2576 + numModifiedExclusionsWA*sizeof(CudaExclusion)
2577 + numExclusionsWA*sizeof(CudaExclusion)
2578 + numCrosstermsWA*sizeof(CudaCrossterm);
2580 reallocate_device<char>(&tupleData, &tupleDataSize, sizeTot, kTupleOveralloc);
2584 bonds = (CudaBond *)&tupleData[pos];
2585 pos += numBondsWA*sizeof(CudaBond);
2587 angles = (CudaAngle* )&tupleData[pos];
2588 pos += numAnglesWA*sizeof(CudaAngle);
2590 dihedrals = (CudaDihedral* )&tupleData[pos];
2591 pos += numDihedralsWA*sizeof(CudaDihedral);
2593 impropers = (CudaDihedral* )&tupleData[pos];
2594 pos += numImpropersWA*sizeof(CudaDihedral);
2596 modifiedExclusions = (CudaExclusion* )&tupleData[pos];
2597 pos += numModifiedExclusionsWA*sizeof(CudaExclusion);
2599 exclusions = (CudaExclusion* )&tupleData[pos];
2600 pos += numExclusionsWA*sizeof(CudaExclusion);
2602 crossterms = (CudaCrossterm* )&tupleData[pos];
2603 pos += numCrosstermsWA*sizeof(CudaCrossterm);
2608 void ComputeBondedCUDAKernel::update(
2609 const int numBondsIn,
2610 const int numAnglesIn,
2611 const int numDihedralsIn,
2612 const int numImpropersIn,
2613 const int numModifiedExclusionsIn,
2614 const int numExclusionsIn,
2615 const int numCrosstermsIn,
2616 const char* h_tupleData,
2617 cudaStream_t stream) {
2619 numBonds = numBondsIn;
2620 numAngles = numAnglesIn;
2621 numDihedrals = numDihedralsIn;
2622 numImpropers = numImpropersIn;
2623 numModifiedExclusions = numModifiedExclusionsIn;
2624 numExclusions = numExclusionsIn;
2625 numCrossterms = numCrosstermsIn;
2627 const size_t sizeTot = reallocateTupleBuffer(getTupleCounts(), stream);
2629 copy_HtoD<char>(h_tupleData, tupleData, sizeTot, stream);
2632 void ComputeBondedCUDAKernel::updateAtomBuffer(
2633 const int atomStorageSize,
2636 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
2640 // Return stride for force array
2642 int ComputeBondedCUDAKernel::getForceStride(const int atomStorageSize) {
2643 #ifdef USE_STRIDED_FORCE
2644 // Align stride to 256 bytes
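// Worked example (assuming FORCE_TYPE is an 8-byte double): atomStorageSize = 1000 gives
// ((8000 - 1)/256 + 1)*256/8 = 1024 elements of stride between the x, y, and z components.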
2645 return ((atomStorageSize*sizeof(FORCE_TYPE) - 1)/256 + 1)*256/sizeof(FORCE_TYPE);
2652 // Return size of single force array
2654 int ComputeBondedCUDAKernel::getForceSize(const int atomStorageSize) {
2655 #ifdef USE_STRIDED_FORCE
2656 return (3*getForceStride(atomStorageSize));
2658 return (3*atomStorageSize);
2663 // Return total size of all force arrays
2665 int ComputeBondedCUDAKernel::getAllForceSize(const int atomStorageSize, const bool doSlow) {
2667 int forceSize = getForceSize(atomStorageSize);
2669 if (numModifiedExclusions > 0 || numExclusions > 0) {
2671 // All three force arrays [normal, nbond, slow]
2674 // Two force arrays [normal, nbond]
2683 // Compute bonded forces
2685 void ComputeBondedCUDAKernel::bondedForce(
2686 const double scale14, const int atomStorageSize,
2687 const bool doEnergy, const bool doVirial, const bool doSlow,
2689 const float3 lata, const float3 latb, const float3 latc,
2690 const float cutoff2, const float r2_delta, const int r2_delta_expc,
2691 const CudaNBConstants nbConstants,
2692 const float4* h_xyzq, FORCE_TYPE* h_forces,
2693 double *h_energies_virials, bool atomsChanged, bool CUDASOAintegrate,
2694 bool useDeviceMigration, cudaStream_t stream) {
2696 int forceStorageSize = getAllForceSize(atomStorageSize, true);
2697 int forceCopySize = getAllForceSize(atomStorageSize, doSlow);
2698 int forceStride = getForceStride(atomStorageSize);
2700 int forceSize = getForceSize(atomStorageSize);
2701 int startNbond = forceSize;
2702 int startSlow = 2*forceSize;
2703 // Re-allocate coordinate and force arrays if necessary
2704 reallocate_device<float4>(&xyzq, &xyzqSize, atomStorageSize, 1.4f);
2705 reallocate_device<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
2707 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2709 // numBonds bondForce 2
2710 // numAngles angleForce 3
2711 // numDihedrals diheForce 4
2712 // numImpropers diheForce 4
2713 // numExclusions exclusionForce 2
2714 // numCrossterms crosstermForce 8
2715 // numModifiedExclusions modifiedExclusionForce 4
2716 int listSize = 3 * (numBonds * 2 + numAngles * 3 + numDihedrals * 4 + numImpropers * 4 + numExclusions * 2 + numCrossterms * 8 + numModifiedExclusions * 4);
2717 reallocate_device<FORCE_TYPE>(&forceList, &forceListSize, listSize, 1.4f);
2718 reallocate_device<int>(&forceListNexts, &forceListNextsSize, listSize, 1.4f);
2719 reallocate_device<int>(&forceListStarts, &forceListStartsSize, 3 * atomStorageSize, 1.4f);
2720 int* forceListStartsNbond = forceListStarts + atomStorageSize;
2721 int* forceListStartsSlow = forceListStarts + 2 * atomStorageSize;
2723 clear_device_array<int>(forceListCounter, 1, stream);
2724 cudaCheck(cudaMemsetAsync(forceListStarts, -1, sizeof(int) * 3 * atomStorageSize, stream));
2726 int* forceListStartsNbond = NULL;
2727 int* forceListStartsSlow = NULL;
2730 #ifdef NODEGROUP_FORCE_REGISTER
2731 if(CUDASOAintegrate){
2732 if(atomsChanged && !useDeviceMigration) copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2733 }else copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2735 copy_HtoD<float4>(h_xyzq, xyzq, atomStorageSize, stream);
2737 #if defined(USE_BONDED_FORCE_ATOMIC_STORE)
2738 clear_device_array<FORCE_TYPE>(forces, forceCopySize, stream);
2740 if (doEnergy || doVirial ) {
2741 clear_device_array<double>(energies_virials, ATOMIC_BINS * energies_virials_SIZE, stream);
2744 // Check if we are doing alchemical free energy calculation
2745 // Is checking alchOn required?
2746 const bool doFEP = (alchFlags.alchOn) && (alchFlags.alchFepOn);
2747 const bool doTI = (alchFlags.alchOn) && (alchFlags.alchThermIntOn);
2749 float one_scale14 = (float)(1.0 - scale14);
2751 // If doSlow = false, these exclusions are not calculated
2752 int numExclusionsDoSlow = doSlow ? numExclusions : 0;
2754 int nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2756 int numBondsTB = (numBonds + nthread - 1)/nthread;
2757 int numAnglesTB = (numAngles + nthread - 1)/nthread;
2758 int numDihedralsTB = (numDihedrals + nthread - 1)/nthread;
2759 int numImpropersTB = (numImpropers + nthread - 1)/nthread;
2760 int numExclusionsTB= (numExclusionsDoSlow + nthread - 1)/nthread;
2761 int numCrosstermsTB= (numCrossterms + nthread - 1)/nthread;
2763 int nblock = numBondsTB + numAnglesTB + numDihedralsTB + numImpropersTB +
2764 numExclusionsTB + numCrosstermsTB;
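// A single bondedForcesKernel launch covers all tuple types: blocks [0, numBondsTB) handle
// bonds, the next numAnglesTB blocks handle angles, and so on; inside the kernel the block
// index is offset through these ranges (the indexTB bookkeeping) to select the tuple type.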
2767 // printf("%d %d %d %d %d %d nblock %d\n",
2768 // numBonds, numAngles, numDihedrals, numImpropers, numModifiedExclusions, numExclusions, nblock);
2770 int max_nblock = deviceCUDA->getMaxNumBlocks();
2773 while (start < nblock)
2775 int nleft = nblock - start;
2776 int nblock_use = min(max_nblock, nleft);
2778 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2779 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table(), \
2780 cudaNonbondedTables.getExclusionTable()
2782 #define TABLE_PARAMS cudaNonbondedTables.get_r2_table_tex(), \
2783 cudaNonbondedTables.getExclusionTableTex()
2786 #if defined(USE_TABLE_ARRAYS)
2787 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
2788 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
2789 <<< nblock_use, nthread, shmem_size, stream >>> \
2790 (start, numBonds, bonds, bondValues, \
2791 numAngles, angles, angleValues, \
2792 numDihedrals, dihedrals, dihedralValues, \
2793 numImpropers, impropers, improperValues, \
2794 numExclusionsDoSlow, exclusions, \
2795 numCrossterms, crossterms, crosstermValues, \
2797 r2_delta, r2_delta_expc, \
2799 xyzq, forceStride, \
2801 forces, &forces[startSlow], \
2802 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2805 #define CALL(DOENERGY, DOVIRIAL, DOFEP, DOTI) \
2806 bondedForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOFEP, DOTI> \
2807 <<< nblock_use, nthread, shmem_size, stream >>> \
2808 (start, numBonds, bonds, bondValues, \
2809 numAngles, angles, angleValues, \
2810 numDihedrals, dihedrals, dihedralValues, \
2811 numImpropers, impropers, improperValues, \
2812 numExclusionsDoSlow, exclusions, \
2813 numCrossterms, crossterms, crosstermValues, \
2815 r2_delta, r2_delta_expc, \
2816 cudaNonbondedTables.get_r2_table(), cudaNonbondedTables.getExclusionTable(), \
2817 cudaNonbondedTables.get_r2_table_tex(), cudaNonbondedTables.getExclusionTableTex(), \
2818 xyzq, forceStride, \
2820 forces, &forces[startSlow], \
2821 forceList, forceListCounter, forceListStarts, forceListStartsSlow, forceListNexts, \
2824 if (!doEnergy && !doVirial && !doFEP && !doTI) CALL(0, 0, 0, 0);
2825 if (!doEnergy && doVirial && !doFEP && !doTI) CALL(0, 1, 0, 0);
2826 if ( doEnergy && !doVirial && !doFEP && !doTI) CALL(1, 0, 0, 0);
2827 if ( doEnergy && doVirial && !doFEP && !doTI) CALL(1, 1, 0, 0);
2829 if (!doEnergy && !doVirial && doFEP && !doTI) CALL(0, 0, 1, 0);
2830 if (!doEnergy && doVirial && doFEP && !doTI) CALL(0, 1, 1, 0);
2831 if ( doEnergy && !doVirial && doFEP && !doTI) CALL(1, 0, 1, 0);
2832 if ( doEnergy && doVirial && doFEP && !doTI) CALL(1, 1, 1, 0);
2834 if (!doEnergy && !doVirial && !doFEP && doTI) CALL(0, 0, 0, 1);
2835 if (!doEnergy && doVirial && !doFEP && doTI) CALL(0, 1, 0, 1);
2836 if ( doEnergy && !doVirial && !doFEP && doTI) CALL(1, 0, 0, 1);
2837 if ( doEnergy && doVirial && !doFEP && doTI) CALL(1, 1, 0, 1);
2839 // Can't enable both FEP and TI, don't expand the following functions.
2841 if (!doEnergy && !doVirial && doFEP && doTI) CALL(0, 0, 1, 1);
2842 if (!doEnergy && doVirial && doFEP && doTI) CALL(0, 1, 1, 1);
2843 if ( doEnergy && !doVirial && doFEP && doTI) CALL(1, 0, 1, 1);
2844 if ( doEnergy && doVirial && doFEP && doTI) CALL(1, 1, 1, 1);
2849 cudaCheck(cudaGetLastError());
2851 start += nblock_use;
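// Second pass: modified 1-4 exclusions are launched separately because they need the
// nonbonded vdW coefficient table and the modified-exclusion force/energy tables
// (array or texture form, selected by TABLE_PARAMS below).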
2854 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2855 nblock = (numModifiedExclusions + nthread - 1)/nthread;
2857 bool doElect = (one_scale14 != 0.0f);
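// When scale14 == 1, one_scale14 is zero and the 1-4 electrostatic correction vanishes,
// so doElect disables the electrostatic path in the kernel.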
2860 while (start < nblock)
2862 int nleft = nblock - start;
2863 int nblock_use = min(max_nblock, nleft);
2865 #if defined(USE_TABLE_ARRAYS) && defined(NAMD_HIP)
2866 #define TABLE_PARAMS \
2867 cudaNonbondedTables.getExclusionVdwCoefTable(), \
2868 cudaNonbondedTables.getModifiedExclusionForceTable(), \
2869 cudaNonbondedTables.getModifiedExclusionEnergyTable()
2871 #define TABLE_PARAMS \
2872 cudaNonbondedTables.getExclusionVdwCoefTable(), \
2873 cudaNonbondedTables.getExclusionVdwCoefTableTex(), \
2874 cudaNonbondedTables.getModifiedExclusionForceTableTex(), \
2875 cudaNonbondedTables.getModifiedExclusionEnergyTableTex()
2878 #define CALL(DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE) \
2879 modifiedExclusionForcesKernel<FORCE_TYPE, DOENERGY, DOVIRIAL, DOELECT, DOFEP, DOTI, DOTABLE> \
2880 <<< nblock_use, nthread, shmem_size, stream >>> \
2881 (start, numModifiedExclusions, modifiedExclusions, \
2882 doSlow, one_scale14, cutoff2, nbConstants, \
2883 cudaNonbondedTables.getVdwCoefTableWidth(), \
2885 xyzq, forceStride, lata, latb, latc, \
2886 &forces[startNbond], &forces[startSlow], \
2887 forceList, forceListCounter, forceListStartsNbond, forceListStartsSlow, forceListNexts, \
2892 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 0, 0, 0, 0, 1);
2893 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(0, 1, 0, 0, 0, 1);
2894 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 0, 0, 0, 0, 1);
2895 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && doTable) CALL(1, 1, 0, 0, 0, 1);
2897 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 0, 1, 0, 0, 1);
2898 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(0, 1, 1, 0, 0, 1);
2899 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 0, 1, 0, 0, 1);
2900 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && doTable) CALL(1, 1, 1, 0, 0, 1);
2902 if (!doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 0, 0, 1, 0, 1);
2903 if (!doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(0, 1, 0, 1, 0, 1);
2904 if ( doEnergy && !doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 0, 0, 1, 0, 1);
2905 if ( doEnergy && doVirial && !doElect && doFEP && !doTI && doTable) CALL(1, 1, 0, 1, 0, 1);
2907 if (!doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 0, 1, 1, 0, 1);
2908 if (!doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(0, 1, 1, 1, 0, 1);
2909 if ( doEnergy && !doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 0, 1, 1, 0, 1);
2910 if ( doEnergy && doVirial && doElect && doFEP && !doTI && doTable) CALL(1, 1, 1, 1, 0, 1);
2912 if (!doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 0, 0, 0, 1, 1);
2913 if (!doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(0, 1, 0, 0, 1, 1);
2914 if ( doEnergy && !doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 0, 0, 0, 1, 1);
2915 if ( doEnergy && doVirial && !doElect && !doFEP && doTI && doTable) CALL(1, 1, 0, 0, 1, 1);
2917 if (!doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 0, 1, 0, 1, 1);
2918 if (!doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(0, 1, 1, 0, 1, 1);
2919 if ( doEnergy && !doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 0, 1, 0, 1, 1);
2920 if ( doEnergy && doVirial && doElect && !doFEP && doTI && doTable) CALL(1, 1, 1, 0, 1, 1);
2922 // Disabling doTable is only supported without FEP and TI
2923 if (!doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 0, 0, 0, 0);
2924 if (!doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 0, 0, 0, 0);
2925 if ( doEnergy && !doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 0, 0, 0, 0);
2926 if ( doEnergy && doVirial && !doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 0, 0, 0, 0);
2928 if (!doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 0, 1, 0, 0, 0);
2929 if (!doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(0, 1, 1, 0, 0, 0);
2930 if ( doEnergy && !doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 0, 1, 0, 0, 0);
2931 if ( doEnergy && doVirial && doElect && !doFEP && !doTI && !doTable) CALL(1, 1, 1, 0, 0, 0);
2933 // Can't enable both FEP and TI, so the following combinations are left unexpanded
2933 // (they also lack the DOTABLE argument this CALL macro expects):
2935 // if (!doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(0, 0, 0, 1, 1);
2936 // if (!doEnergy && doVirial && !doElect && doFEP && doTI) CALL(0, 1, 0, 1, 1);
2937 // if ( doEnergy && !doVirial && !doElect && doFEP && doTI) CALL(1, 0, 0, 1, 1);
2938 // if ( doEnergy && doVirial && !doElect && doFEP && doTI) CALL(1, 1, 0, 1, 1);
2940 // if (!doEnergy && !doVirial && doElect && doFEP && doTI) CALL(0, 0, 1, 1, 1);
2941 // if (!doEnergy && doVirial && doElect && doFEP && doTI) CALL(0, 1, 1, 1, 1);
2942 // if ( doEnergy && !doVirial && doElect && doFEP && doTI) CALL(1, 0, 1, 1, 1);
2943 // if ( doEnergy && doVirial && doElect && doFEP && doTI) CALL(1, 1, 1, 1, 1);
2947 cudaCheck(cudaGetLastError());
2949 start += nblock_use;
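// Third pass (only without USE_BONDED_FORCE_ATOMIC_STORE): gather the per-contribution force
// lists into the output arrays, one gatherBondedForcesKernel launch each for the normal,
// nbond, and slow lists.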
2951 #if !defined(USE_BONDED_FORCE_ATOMIC_STORE)
2952 nthread = BONDEDFORCESKERNEL_NUM_WARP * WARPSIZE;
2953 nblock = (atomStorageSize + nthread - 1)/nthread;
2956 while (start < nblock)
2958 int nleft = nblock - start;
2959 int nblock_use = min(max_nblock, nleft);
2961 // cudaCheck(hipDeviceSynchronize());
2962 // auto t0 = std::chrono::high_resolution_clock::now();
2964 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2965 start, atomStorageSize, forceStride,
2966 forceList, forceListStarts, forceListNexts,
2968 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2969 start, atomStorageSize, forceStride,
2970 forceList, forceListStartsNbond, forceListNexts,
2971 &forces[startNbond]);
2973 gatherBondedForcesKernel<FORCE_TYPE><<<nblock_use, nthread, 0, stream>>>(
2974 start, atomStorageSize, forceStride,
2975 forceList, forceListStartsSlow, forceListNexts,
2976 &forces[startSlow]);
2978 cudaCheck(cudaGetLastError());
2980 // cudaCheck(hipStreamSynchronize(stream));
2981 // auto t1 = std::chrono::high_resolution_clock::now();
2982 // std::chrono::duration<double> diff1 = t1 - t0;
2983 // std::cout << "gatherBondedForcesKernel";
2984 // std::cout << " " << std::setprecision(3) << diff1.count() * 1e3 << " ms" << std::endl;
2986 start += nblock_use;
2990 #ifdef NODEGROUP_FORCE_REGISTER
2991 if((atomsChanged && !useDeviceMigration) || !CUDASOAintegrate){
2992 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
2994 // XXX TODO: ERASE THIS AFTERWARDS
2995 // this is not numAtoms, this is something else
2996 // will print the force inside the compute and afterwards
2997 // FILE* pos_nb_atoms = fopen("compute_b_dforce.txt", "w");  // disabled debug dump: h_forcesDP is not declared in this scope
2998 //fprintf(pos_nb_atoms, "Calculating %d positions\n", tlKernel.getCudaPatchesSize());
2999 // positions are wrong here.
3000 //for(int i = 0; i < atomStorageSize; i++){
3001 // for(int i = 29415; i < 29895; i++){
3002 //   fprintf(pos_nb_atoms, "%2.4lf\n", h_forcesDP[i]);
3004 // fclose(pos_nb_atoms);
3009 copy_DtoH<double>(forces, h_forces, forceCopySize, stream);
3011 if (doEnergy || doVirial) {
3012 if (ATOMIC_BINS > 1) {
3013 // Reduce energies_virials[ATOMIC_BINS][energies_virials_SIZE] in-place (results are in energies_virials[0])
3014 reduceBondedBinsKernel<<<energies_virials_SIZE, ATOMIC_BINS, 0, stream>>>(energies_virials);
3016 // virial copy, is this necessary?
3017 copy_DtoH<double>(energies_virials, h_energies_virials, energies_virials_SIZE, stream);