2 #include <hip/hip_runtime.h>
3 #include <hipcub/hipcub.hpp>
7 #include "HipDefines.h"
8 #include "CudaComputeNonbondedKernel.hip.h"
9 #include "CudaTileListKernel.hip.h"
10 #include "DeviceCUDA.h"
11 #include "CudaComputeNonbondedInteractions.h"
16 #define __thread __declspec(thread)
18 extern __thread DeviceCUDA *deviceCUDA;
20 #define NONBONDKERNEL_NUM_WARP 1
21 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 4
22 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 4
23 #define REDUCEGBISENERGYKERNEL_NUM_WARP 4
25 #define NONBONDKERNEL_SWEEPS_PER_TILE WARPSIZE/BOUNDINGBOXSIZE
27 #define OVERALLOC 1.2f
29 void NAMD_die(const char *);
30 void NAMD_bug(const char *);
32 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
33 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
36 __constant__ AlchData alchflags;
40 typedef float __attribute__((ext_vector_type(2))) Native_float2_;
44 struct __attribute__((packed)) { Native_float2_ dxy; float dz; };
45 struct { float x, y, z; };
49 fast_float3() = default;
52 fast_float3(float x_, float y_, float z_) : dxy{ x_, y_ }, dz{ z_ } {}
55 fast_float3(Native_float2_ xy_, float z_) : dxy{ xy_ }, dz{ z_ } {}
58 operator float3() const
60 return float3{ x, y, z };
63 __forceinline__ __host__ __device__
64 fast_float3& operator=(float4 y)
72 static_assert(sizeof(fast_float3) == 12);
74 __forceinline__ __host__ __device__
75 fast_float3 operator*(fast_float3 x, fast_float3 y)
77 return fast_float3{ x.dxy * y.dxy, x.dz * y.dz };
80 __forceinline__ __host__ __device__
81 fast_float3 operator*(fast_float3 x, float y)
83 return fast_float3{ x.dxy * y, x.dz * y };
86 __forceinline__ __host__ __device__
87 fast_float3 operator*(float x, fast_float3 y)
89 return fast_float3{ x * y.dxy, x * y.dz };
92 __forceinline__ __host__ __device__
93 fast_float3 operator+(fast_float3 x, fast_float3 y)
95 return fast_float3{ x.dxy + y.dxy, x.dz + y.dz };
98 __forceinline__ __host__ __device__
99 fast_float3 operator-(fast_float3 x, fast_float3 y)
101 return fast_float3{ x.dxy - y.dxy, x.dz - y.dz };
105 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float x)
107 return fast_float3{ x, x, x };
110 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float3 x)
112 return fast_float3{ x.x, x.y, x.z };
115 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float4 x)
117 return fast_float3{ x.x, x.y, x.z };
120 static __forceinline__ __host__ __device__ float norm2(fast_float3 a)
122 fast_float3 b = a * a;
123 return (b.x + b.y + b.z);
126 static __forceinline__ __host__ __device__ float4 make_float4(fast_float3 a)
128 return make_float4(a.x, a.y, a.z, 0.f);
132 #ifndef USE_TABLE_ARRAYS
133 __device__ __forceinline__
134 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
135 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
136 const float x = k * (float)tableSize - 0.5f;
137 const float f = floorf(x);
138 const float a = x - f;
139 const unsigned int i = (unsigned int)f;
140 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
141 const int i1 = i0 + 1;
142 const float4 t0 = tex1Dfetch<float4>(tex, i0);
143 const float4 t1 = tex1Dfetch<float4>(tex, i1);
145 a * (t1.x - t0.x) + t0.x,
146 a * (t1.y - t0.y) + t0.y,
147 a * (t1.z - t0.z) + t0.z,
148 a * (t1.w - t0.w) + t0.w);
151 __device__ __forceinline__
152 float4 sampleTableTexInv(cudaTextureObject_t tex, float k) {
153 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
154 const float x = k * (float)tableSize - 0.5f;
155 const float f = floorf(x);
156 const float a = x - f;
157 const unsigned int i = (unsigned int)f;
158 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
159 const int i1 = i0 + 1;
160 const float4 t0 = tex1Dfetch<float4>(tex, i0);
161 const float4 t1 = tex1Dfetch<float4>(tex, i1);
162 float4 t2 = (t1 - t0) * a + t0;
163 return make_float4(t2.z, t2.y, t2.x, t2.w);
170 // HIP implementation of tex1D has lower performance than this custom implementation of
173 //#define USE_TABLE_ARRAYS
177 static __forceinline__ __device__ const T& fast_load(const T* buffer, unsigned int idx, unsigned int offset = 0)
179 return *reinterpret_cast<const T*>(reinterpret_cast<const char*>(buffer) + idx * static_cast<unsigned int>(sizeof(T)) + offset * static_cast<unsigned int>(sizeof(T)));
182 __device__ __forceinline__
183 float4 tableLookup(const float4* table, const float k)
185 const int tableSize = FORCE_ENERGY_TABLE_SIZE;
186 const float x = k * static_cast<float>(tableSize) - 0.5f;
187 const float f = floorf(x);
188 const float a = x - f;
189 const int i = static_cast<int>(f);
190 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
191 const int i1 = i0 + 1;
192 // const float4 t0 = __ldg(&table[i0]);
193 // const float4 t1 = __ldg(&table[i1]);
194 const float4 t0 = fast_load(table, i0);
195 const float4 t1 = fast_load(table, i1);
197 a * (t1.x - t0.x) + t0.x,
198 a * (t1.y - t0.y) + t0.y,
199 a * (t1.z - t0.z) + t0.z,
200 a * (t1.w - t0.w) + t0.w);
203 __device__ __forceinline__
204 float4 tableLookupInv(const float4* table, const float k)
206 constexpr int tableSize = FORCE_ENERGY_TABLE_SIZE;
207 const float x = k * static_cast<float>(tableSize) - 0.5f;
208 const float f = floorf(x);
209 const float a = x - f;
210 const int i = static_cast<int>(f);
211 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
212 const int i1 = i0 + 1;
213 // const float4 t0 = __ldg(&table[i0]);
214 // const float4 t1 = __ldg(&table[i1]);
215 const float4 t0 = fast_load(table, i0);
216 const float4 t1 = fast_load(table, i1);
217 float4 t2 = (t1 - t0) * a + t0;
218 return make_float4(t2.z, t2.y, t2.x, t2.w);
222 template<bool doEnergy, bool doSlow>
223 __device__ __forceinline__
224 void calcForceEnergy(const float r2, const float qi, const float qj,
225 const float dx, const float dy, const float dz,
226 const int vdwtypei, const int vdwtypej,
227 #ifdef USE_TABLE_ARRAYS
228 const float2* __restrict__ vdwCoefTable,
229 const float4* __restrict__ forceTable,
230 const float4* __restrict__ energyTable,
232 cudaTextureObject_t vdwCoefTableTex,
233 cudaTextureObject_t forceTableTex,
234 cudaTextureObject_t energyTableTex,
236 fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
237 float& energyVdw, float& energyElec, float& energySlow) {
239 int vdwIndex = vdwtypej + vdwtypei;
241 #if defined(USE_TABLE_ARRAYS)
242 // float2 ljab = vdwCoefTable[vdwIndex];
243 float2 ljab = fast_load(vdwCoefTable, vdwIndex);
245 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
247 float rinv = __frsqrt_rn(r2);
250 #if defined(USE_TABLE_ARRAYS)
251 float4 fi = tableLookupInv(forceTable, rinv);
252 if (doEnergy) ei = tableLookup(energyTable, rinv);
254 float4 fi = sampleTableTexInv(forceTableTex, rinv);
255 if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
258 float fSlow = qi * qj;
259 fast_float3 f3(fi.x, fi.y, fi.z); // no waster registers, we already allocated stuff
260 fast_float3 lj(ljab.x, ljab.y, fSlow);
261 // we want to operate on reverse data here, the order of the float4 needs to be
262 // fast_float3 fi = f.z, f.y, f.x;
263 // fast_float3 lj = ljab.x, ljab.y, fSlow
264 // so if we do fi = fi * lj;
265 // f is the norm(fi) // x y z
267 // float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
269 float f = f3.x + f3.y + f3.z;
272 energyVdw += ljab.x * ei.z + ljab.y * ei.y;
273 energyElec += fSlow * ei.x;
274 if (doSlow) energySlow += fSlow * ei.w;
276 if (doSlow) fSlow *= fi.w;
278 // dx, dy, and dz are already store in contiguous registers -> wrap them in a fast_float3 and operate
279 fast_float3 dr(dx, dy, dz);
280 fast_float3 fr = dr *f;
281 // float fx = dx * f;
282 // float fy = dy * f;
283 // float fz = dz * f;
290 iforce = iforce + fr;
291 jforce = jforce - fr;
294 fast_float3 sr = dr*fSlow;
295 // float fxSlow = dx * fSlow;
296 // float fySlow = dy * fSlow;
297 // float fzSlow = dz * fSlow;
298 // iforceSlow.x += sr.x;
299 // iforceSlow.y += sr.y;
300 // iforceSlow.z += sr.z;
301 // jforceSlow.x -= sr.x;
302 // jforceSlow.y -= sr.y;
303 // jforceSlow.z -= sr.z;
304 iforceSlow = iforceSlow + sr;
305 jforceSlow = jforceSlow - sr;
309 template<bool doEnergy, bool doSlow>
310 __device__ __forceinline__
311 void calcForceEnergyMath(const float r2, const float qi, const float qj,
312 const float dx, const float dy, const float dz,
313 const int vdwtypei, const int vdwtypej,
314 #if defined(USE_TABLE_ARRAYS)
315 const float2* __restrict__ vdwCoefTable,
317 cudaTextureObject_t vdwCoefTableTex,
319 fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
320 float& energyVdw, float& energyElec, float& energySlow,
321 const CudaNBConstants nbConstants) {
323 int vdwIndex = vdwtypej + vdwtypei;
325 #if defined(USE_TABLE_ARRAYS)
326 float2 ljab = vdwCoefTable[vdwIndex];
328 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
332 float rinv = rsqrtf(r2);
334 float charge = qi * qj;
336 cudaNBForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy, doSlow>(
337 r2, rinv, charge, ljab, nbConstants,
338 f, fSlow, energyVdw, energyElec, energySlow);
339 // dx, dy, and dz are already store in contiguous registers -> wrap them in a fast_float3 and operate
340 fast_float3 ff(dx, dy, dz);
343 // float fx = dx * f;
344 // float fy = dy * f;
345 // float fz = dz * f;
346 fast_float3 iff = make_fast_float3(iforce);
347 fast_float3 jff = make_fast_float3(jforce);
363 float fxSlow = dx * fSlow;
364 float fySlow = dy * fSlow;
365 float fzSlow = dz * fSlow;
366 iforceSlow.x += fxSlow;
367 iforceSlow.y += fySlow;
368 iforceSlow.z += fzSlow;
369 jforceSlow.x -= fxSlow;
370 jforceSlow.y -= fySlow;
371 jforceSlow.z -= fzSlow;
375 /* JM: Special __device__ function to compute VDW forces for alchemy.
376 * Partially swiped from ComputeNonbondedFEP.C
378 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch>
379 __device__ __forceinline__
380 void calcForceEnergyFEP(const float r2, const float qi, const float qj,
381 const float dx, const float dy, const float dz,
382 const int vdwtypei, const int vdwtypej,
384 /*const AlchData &alchflags, */
385 #ifdef USE_TABLE_ARRAYS
386 const float2* __restrict__ vdwCoefTable,
387 const float4* __restrict__ forceTable,
388 const float4* __restrict__ energyTable,
390 cudaTextureObject_t vdwCoefTableTex,
391 cudaTextureObject_t forceTableTex,
392 cudaTextureObject_t energyTableTex,
394 fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
395 float& energyVdw, float &energyVdw_s, float& energyElec, float& energySlow,
396 float& energyElec_s, float& energySlow_s) {
399 int vdwIndex = vdwtypej + vdwtypei;
400 #if defined(USE_TABLE_ARRAYS)
401 float2 ljab = fast_load(vdwCoefTable, vdwIndex);
403 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
406 float myVdwLambda = 0.0f;
407 float myVdwLambda2 = 0.0f;
408 float myElecLambda = 0.0f;
409 float myElecLambda2 = 0.0f;
410 float rinv = __frsqrt_rn(r2);
412 float alch_vdw_energy = 0.0f;
413 float alch_vdw_energy_2 = 0.0f;
414 float alch_vdw_force = 0.0f;
415 float fSlow = qi * qj;
418 #ifdef USE_TABLE_ARRAYS
419 float4 fi = tableLookup(forceTable, rinv);
420 if (doEnergy) ei = tableLookup(energyTable, rinv);
422 float4 fi = sampleTableTex(forceTableTex, rinv);
423 if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
427 //John said that there is a better way to avoid divergences here
428 //alch: true if => 1-0, 1-1, 2-0, 2-2
429 //dec: true if => 1-1, 2-2 && decouple
430 //up: true if => 1-0 && 1,1
431 //down: true if => 2-0, && 2,2
432 int ref = (p1 == 0 && p2 == 0);
433 int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
434 int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
435 int up = (alch && (p1 == 1 || p2 == 1) && !dec);
436 int down = (alch && (p1 == 2 || p2 == 2) && !dec);
441 /*--------------- VDW SPECIAL ALCH FORCES (Swiped from ComputeNonbondedFEP.C) ---------------*/
443 myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
444 myVdwLambda2 = alchflags.vdwLambda2Up*(up) + alchflags.vdwLambda2Down*(down) + 1.f*(ref || dec);
445 myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
446 myElecLambda2 = alchflags.elecLambda2Up*(up) + alchflags.elecLambda2Down*(down) + 1.f*(ref || dec);
449 if (vdwForceSwitch) {
451 float switchdist6_1, switchdist6_2;
452 const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
454 //Templated parameter. No control divergence here
456 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
457 const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
458 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
459 r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
460 switchdist6_1 = alchflags.switchdist2 + myVdwShift;
461 switchdist6_1 = switchdist6_1 * switchdist6_1 * switchdist6_1;
462 switchdist6_2 = alchflags.switchdist2 + myVdwShift2;
463 switchdist6_2 = switchdist6_2 * switchdist6_2 * switchdist6_2;
467 switchdist6_1 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
468 switchdist6_2 = switchdist6_1;
470 const float r6_1 = r2_1*r2_1*r2_1;
471 const float r6_2 = r2_2*r2_2*r2_2;
472 if (r2 <= alchflags.switchdist2) {
473 const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
474 const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
475 // A == ljab.x, B == ljab.y
476 const float dU_1 = -ljab.x / (cutoff6 * switchdist6_1) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_1));
477 const float dU_2 = -ljab.x / (cutoff6 * switchdist6_2) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_2));
478 alch_vdw_energy = myVdwLambda * (U1 + dU_1);
479 alch_vdw_energy_2 = myVdwLambda2 * (U2 + dU_2);
481 //Multiplied by -1.0 to match CPU values
482 alch_vdw_force =-1.f*myVdwLambda*((12.f*U1 + 6.f*ljab.y*r6_1)*r2_1);
484 const float r3_1 = sqrtf(r6_1);
485 const float r3_2 = sqrtf(r6_2);
486 const float inv_cutoff6 = 1.0f / cutoff6;
487 const float inv_cutoff3 = rsqrtf(cutoff6);
488 // A == ljab.x, B == ljab.y
489 const float k_vdwa_1 = ljab.x / (1.0f - switchdist6_1 * inv_cutoff6);
490 const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6_1 * inv_cutoff6));
491 const float k_vdwa_2 = ljab.x / (1.0f - switchdist6_2 * inv_cutoff6);
492 const float k_vdwb_2 = ljab.y / (1.0f - sqrtf(switchdist6_2 * inv_cutoff6));
493 const float tmpa_1 = r6_1 - inv_cutoff6;
494 const float tmpb_1 = r3_1 - inv_cutoff3;
495 const float tmpa_2 = r6_2 - inv_cutoff6;
496 const float tmpb_2 = r3_2 - inv_cutoff3;
497 alch_vdw_energy = myVdwLambda * (k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1);
498 alch_vdw_energy_2 = myVdwLambda2 * (k_vdwa_2 * tmpa_2 * tmpa_2 - k_vdwb_2 * tmpb_2 * tmpb_2);
499 //Multiplied by -1.0 to match CPU values
500 alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
501 } // r2 <= alchflags.switchdist2
503 // potential switching
504 const float diff = alchflags.cutoff2 - r2;
506 const float switchmul = (alchflags.switchfactor*(diff)*(diff)*(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2))*(r2 > alchflags.switchdist2) + (1.f)*(r2 <= alchflags.switchdist2);
507 const float switchmul2 = (12.f*alchflags.switchfactor*(diff)*(r2 - alchflags.switchdist2))*(r2 > alchflags.switchdist2) + (0.f) * (r2 <= alchflags.switchdist2);
509 //Templated parameter. No control divergence here
511 //This templated parameter lets me get away with not making 2 divisions. But for myVdwShift != 0, how do I do this?
512 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
513 const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
514 //r2_1 = 1.0/(r2 + myVdwShift);
515 //r2_2 = 1.0/(r2 + myVdwShift2);
516 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
517 r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
523 const float r6_1 = r2_1*r2_1*r2_1;
524 const float r6_2 = r2_2*r2_2*r2_2;
525 const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
526 const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
527 alch_vdw_energy = myVdwLambda*switchmul*U1;
528 alch_vdw_energy_2 = myVdwLambda2*switchmul*U2;
530 //Multiplied by -1.0 to match CPU values
531 alch_vdw_force =-1.f*myVdwLambda*((switchmul*(12.f*U1 + 6.f*ljab.y*r6_1)*r2_1+ switchmul2*U1));
535 /*-----------------------------------------------------------*/
538 //All energies should be scaled by the corresponding lambda
539 energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
540 energyElec += (fSlow * ei.x)*myElecLambda;
541 energyVdw_s += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy_2*(alch && !dec);
542 energyElec_s += (fSlow * ei.x)*myElecLambda2;
544 energySlow += (fSlow * ei.w)*myElecLambda;
545 energySlow_s += (fSlow * ei.w)*myElecLambda2;
549 if (doSlow) fSlow *= fi.w;
551 //We should include the regular VDW forces if not dealing with alch pairs
552 f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(!alch || dec)))*myElecLambda
553 + alch_vdw_force*(alch && !dec);
567 /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
568 fSlow = myElecLambda*fSlow;
569 float fxSlow = dx * fSlow;
570 float fySlow = dy * fSlow;
571 float fzSlow = dz * fSlow;
572 iforceSlow.x += fxSlow;
573 iforceSlow.y += fySlow;
574 iforceSlow.z += fzSlow;
575 jforceSlow.x -= fxSlow;
576 jforceSlow.y -= fySlow;
577 jforceSlow.z -= fzSlow;
581 /* JM: Special __device__ function to compute VDW forces for TI.
584 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch>
585 __device__ __forceinline__
586 void calcForceEnergyTI(const float r2, const float qi, const float qj,
587 const float dx, const float dy, const float dz,
588 const int vdwtypei, const int vdwtypej,
590 #ifdef USE_TABLE_ARRAYS
591 const float2* __restrict__ vdwCoefTable,
592 const float4* __restrict__ forceTable,
593 const float4* __restrict__ energyTable,
595 cudaTextureObject_t vdwCoefTableTex,
596 cudaTextureObject_t forceTableTex,
597 cudaTextureObject_t energyTableTex,
599 fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
600 float& energyVdw, float& energyVdw_ti_1, float& energyVdw_ti_2,
601 float& energyElec, float& energyElec_ti_1, float& energyElec_ti_2,
602 float& energySlow, float& energySlow_ti_1, float& energySlow_ti_2) {
604 int vdwIndex = vdwtypej + vdwtypei;
605 #if defined(USE_TABLE_ARRAYS)
606 float2 ljab = fast_load(vdwCoefTable, vdwIndex);
608 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
611 /* JM: For TI, we have to deal ALCH1 OR ALCH2 during ComputeNonbondedBase2
612 * ALCH1 for appearing terms;
613 * ALCH2 for dissapearing terms;
614 * Instead of the _s energy terms, we need the to calculate:
616 * vdwEnergy_ti_1 and _2 for VDW energies. For those we need to add the special terms calculated on
617 * ComputeNonbondedTI.C
619 * elecEnergy_ti_1 and _2 for electrostatic energy. No correction needed here though.
623 float myVdwLambda = 0.0f;
624 float myElecLambda = 0.0f;
625 float rinv = __frsqrt_rn(r2);
627 float alch_vdw_energy = 0.0f;
628 float alch_vdw_force = 0.0f;
629 float alch_vdw_dUdl = 0.0f;
630 float fSlow = qi * qj;
632 #if defined(USE_TABLE_ARRAYS)
633 float4 fi = tableLookup(forceTable, rinv);
634 if (doEnergy) ei = tableLookup(energyTable, rinv);
636 float4 fi = sampleTableTex(forceTableTex, rinv);
637 if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
640 //John said that there is a better way to avoid divergences here
641 //alch: true if => 1-0, 1-1, 2-0, 2-2
642 //dec: true if => 1-1, 2-2 && decouple
643 //up: true if => 1-0 && 1,1
644 //down: true if => 2-0, && 2,2
645 int ref = (p1 == 0 && p2 == 0);
646 int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
647 int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
648 int up = (alch && (p1 == 1 || p2 == 1) && !dec);
649 int down = (alch && (p1 == 2 || p2 == 2) && !dec);
654 /*--------------- VDW SPECIAL ALCH STUFF (Swiped from ComputeNonbondedTI.C) ---------------*/
655 myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
656 myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
658 if (vdwForceSwitch) {
659 const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
662 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
663 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
664 switchdist6 = alchflags.switchdist2 + myVdwShift;
665 switchdist6 = switchdist6 * switchdist6 * switchdist6;
668 switchdist6 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
670 const float r6_1 = r2_1*r2_1*r2_1;
671 if (r2 <= alchflags.switchdist2) {
672 const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1;
673 const float dU = -ljab.x / (cutoff6 * switchdist6) - (-ljab.y * rsqrtf(cutoff6 * switchdist6));
674 alch_vdw_force = -1.f*(myVdwLambda*((12.f*U + 6.f*ljab.y*r6_1)*r2_1));
675 alch_vdw_energy = myVdwLambda * (U + dU);
676 alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (6.f*U + 3.f*ljab.y*r6_1)*r2_1 + dU;
678 const float r3_1 = sqrtf(r6_1);
679 const float inv_cutoff6 = 1.0f / cutoff6;
680 const float inv_cutoff3 = sqrtf(inv_cutoff6);
681 const float k_vdwa_1 = ljab.x / (1.0f - switchdist6 * inv_cutoff6);
682 const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6 * inv_cutoff6));
683 const float tmpa_1 = r6_1 - inv_cutoff6;
684 const float tmpb_1 = r3_1 - inv_cutoff3;
685 const float U = k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1;
686 alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
687 alch_vdw_energy = myVdwLambda * U;
688 alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (3.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
689 } // r2 <= alchflags.switchdist2
691 // potential switching
692 const float diff = alchflags.cutoff2 - r2;
693 const float switchmul = (r2 > alchflags.switchdist2 ? alchflags.switchfactor*(diff)*(diff) \
694 *(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2) : 1.f);
696 const float switchmul2 = (r2 > alchflags.switchdist2 ? \
697 12.f*alchflags.switchfactor*(diff) \
698 *(r2 - alchflags.switchdist2) : 0.f);
699 //Templated parameter. No control divergence here
701 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
702 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
703 }else r2_1 = rinv*rinv;
705 const float r6_1 = r2_1*r2_1*r2_1;
706 const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled! for shorthand only!
707 alch_vdw_energy = myVdwLambda*switchmul*U;
708 //Multiplied by -1.0 to match CPU values
709 alch_vdw_force = -1.f*(myVdwLambda*(switchmul*(12.f*U + 6.f*ljab.y*r6_1)*r2_1 \
711 alch_vdw_dUdl = (switchmul*(U + myVdwLambda*alchflags.alchVdwShiftCoeff \
712 *(6.f*U + 3.f*ljab.y*r6_1)*r2_1));
715 /*-------------------------------------------------------------------------*/
718 //All energies should be scaled by the corresponding lambda
719 energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
720 energyElec += (fSlow * ei.x)*myElecLambda;
722 energyVdw_ti_1 += alch_vdw_dUdl*up;
723 energyVdw_ti_2 += alch_vdw_dUdl*down;
724 energyElec_ti_1 += (fSlow * ei.x)*up;
725 energyElec_ti_2 += (fSlow * ei.x)*down;
728 energySlow += (fSlow * ei.w)*myElecLambda;
730 energySlow_ti_1 += (fSlow * ei.w)*up;
731 energySlow_ti_2 += (fSlow * ei.w)*down;
736 if (doSlow) fSlow *= fi.w;
737 //We should include the regular VDW forces if not dealing with alch pairs
738 f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(ref || dec)))*myElecLambda
739 + alch_vdw_force*(alch && !dec);
753 /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
754 fSlow = myElecLambda*fSlow; /* FAST(NOSHORT(+alch_vdw_force))*/ //Those should also be zeroed
755 float fxSlow = dx * fSlow;
756 float fySlow = dy * fSlow;
757 float fzSlow = dz * fSlow;
758 iforceSlow.x += fxSlow;
759 iforceSlow.y += fySlow;
760 iforceSlow.z += fzSlow;
761 jforceSlow.x -= fxSlow;
762 jforceSlow.y -= fySlow;
763 jforceSlow.z -= fzSlow;
767 template<bool doSlow, typename T>
768 __device__ __forceinline__
769 void storeForces(const int pos, const T force, const T forceSlow,
770 float* __restrict__ devForces_x,
771 float* __restrict__ devForces_y,
772 float* __restrict__ devForces_z,
773 float* __restrict__ devForcesSlow_x,
774 float* __restrict__ devForcesSlow_y,
775 float* __restrict__ devForcesSlow_z)
777 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
778 if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
779 atomicAdd(&devForces_x[pos], force.x);
780 atomicAdd(&devForces_y[pos], force.y);
781 atomicAdd(&devForces_z[pos], force.z);
784 if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
785 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
786 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
787 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
791 atomicAdd(&devForces_x[pos], force.x);
792 atomicAdd(&devForces_y[pos], force.y);
793 atomicAdd(&devForces_z[pos], force.z);
795 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
796 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
797 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
802 template<bool doSlow>
803 __device__ __forceinline__
804 void storeForces(const int pos, const float4 force, const float4 forceSlow,
805 float* __restrict__ devForces_x,
806 float* __restrict__ devForces_y,
807 float* __restrict__ devForces_z,
808 float* __restrict__ devForcesSlow_x,
809 float* __restrict__ devForcesSlow_y,
810 float* __restrict__ devForcesSlow_z)
812 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
813 if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
814 atomicAdd(&devForces_x[pos], force.x);
815 atomicAdd(&devForces_y[pos], force.y);
816 atomicAdd(&devForces_z[pos], force.z);
819 if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
820 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
821 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
822 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
826 atomicAdd(&devForces_x[pos], force.x);
827 atomicAdd(&devForces_y[pos], force.y);
828 atomicAdd(&devForces_z[pos], force.z);
830 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
831 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
832 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
837 //#define USE_NEW_EXCL_METHOD
840 // Returns the lower estimate for the distance between a bounding box and a set of atoms
842 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
843 float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
844 float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
845 float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
846 float r2 = dx*dx + dy*dy + dz*dz;
850 #define LARGE_FLOAT (float)(1.0e10)
853 // Nonbonded force kernel
855 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doAlch, bool doFEP, bool doTI, bool doStreaming, bool doTable, bool doAlchVdwForceSwitching>
857 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
858 doPairlist ? (10) : (doEnergy ? (10) : (10) ))
859 nonbondedForceKernel(
860 const int start, const int numTileLists,
861 const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
862 const int* __restrict__ tileJatomStart,
863 const int vdwCoefTableWidth,
864 #if defined(USE_TABLE_ARRAYS)
865 const float2* __restrict__ vdwCoefTable,
867 cudaTextureObject_t vdwCoefTableTex,
869 const int* __restrict__ vdwTypes,
870 const float3 lata, const float3 latb, const float3 latc,
871 const float4* __restrict__ xyzq, const float cutoff2, const CudaNBConstants nbConstants,
872 #ifdef USE_TABLE_ARRAYS
873 const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
875 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
879 float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
880 const int* __restrict__ atomIndex,
881 const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
882 unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
883 int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
884 const BoundingBox* __restrict__ boundingBoxes,
885 #ifdef USE_NEW_EXCL_METHOD
886 const int* __restrict__ minmaxExclAtom,
889 float * __restrict__ devForce_x,
890 float * __restrict__ devForce_y,
891 float * __restrict__ devForce_z,
892 float * __restrict__ devForce_w,
893 float * __restrict__ devForceSlow_x,
894 float * __restrict__ devForceSlow_y,
895 float * __restrict__ devForceSlow_z,
896 float * __restrict__ devForceSlow_w,
897 // ---- USE_STREAMING_FORCES ----
898 const int numPatches,
899 unsigned int* __restrict__ patchNumCount,
900 const CudaPatchRecord* __restrict__ cudaPatches,
901 float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
902 int* __restrict__ mapPatchReadyQueue,
903 int* __restrict__ outputOrder,
904 // ------------------------------
905 TileListVirialEnergy* __restrict__ virialEnergy,
910 // Single warp takes care of one list of tiles
911 // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
912 // int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;
913 // The line above is the CUDA-exclusive version. HIPification in this case is selecting b const CudaNBConstants nbConstants, ased on specific parameters
914 // In the hip scheme: Each warp gets an itile and 2 jtiles
915 // if NONBONDEDKERNEL_NUM_WARP == 1, each warp does 2 tiles, so we do 2* blockIdx
916 // int itileList = start + (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x);
918 int itileList = start + (WARPSIZE == 64 ? blockIdx.x : (threadIdx.x/BOUNDINGBOXSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x));
919 if (itileList < numTileLists)
921 fast_float3 iforce, iforceSlow;
922 fast_float3 jforce, jforceSlow;
923 float energyVdw, energyElec, energySlow;
925 float energyVdw_s, energyElec_s, energySlow_s;
927 float energyVdw_ti_1, energyVdw_ti_2, energyElec_ti_1, energyElec_ti_2, energySlow_ti_1, energySlow_ti_2;
929 unsigned int itileListLen;
932 char part1, part2, p2;
933 bool doShift = doAlch && (alchflags.alchVdwShiftCoeff != 0.0f);
934 __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
935 __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
936 __shared__ float4 s_jforce[NONBONDKERNEL_SWEEPS_PER_TILE][BOUNDINGBOXSIZE];
937 __shared__ float4 s_jforceSlow[NONBONDKERNEL_SWEEPS_PER_TILE][BOUNDINGBOXSIZE];
938 __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
940 // Warp index (0...WARPSIZE-1)
941 // JM: For BoundingBoxSize32 and WARPSIZE == 64, each warp gets two tile lists!
943 const int wid = threadIdx.x % BOUNDINGBOXSIZE;
944 const int iwarp = 0; // one workgroup does two sweeps
945 const int isweep = threadIdx.x / BOUNDINGBOXSIZE; // 2 sweeps for 32-sized bounding boxes
946 constexpr int nsweep = NONBONDKERNEL_SWEEPS_PER_TILE;
950 TileList tmp = tileLists[itileList];
951 int iatomStart = tmp.iatomStart;
952 int jtileStart = tmp.jtileStart;
953 int jtileEnd = tmp.jtileEnd;
954 patchInd = tmp.patchInd;
955 patchNumList = tmp.patchNumList;
957 float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
958 float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
959 float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
961 // DH - set zeroShift flag if magnitude of shift vector is zero
962 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
964 int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
966 PatchPairRecord PPStmp = patchPairs[itileList];
967 iatomSize = PPStmp.iatomSize;
968 iatomFreeSize = PPStmp.iatomFreeSize;
969 jatomSize = PPStmp.jatomSize;
970 jatomFreeSize = PPStmp.jatomFreeSize;
973 // Write to global memory here to avoid register spilling
976 virialEnergy[itileList].shx = shx;
977 virialEnergy[itileList].shy = shy;
978 virialEnergy[itileList].shz = shz;
982 // Load i-atom data (and shift coordinates)
984 float4 xyzq_i = xyzq[iatomStart + wid];
985 if (doAlch) part1 = p[iatomStart + wid];
989 xyz_i.x = xyzq_i.x; // no wasted registers here, just aliasing as float4s are vectorized
992 int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
994 // Load i-atom data (and shift coordinates)
995 BoundingBox boundingBoxI;
997 boundingBoxI = boundingBoxes[iatomStart/BOUNDINGBOXSIZE];
998 boundingBoxI.x += shx;
999 boundingBoxI.y += shy;
1000 boundingBoxI.z += shz;
1003 // Get i-atom global index
1004 #ifdef USE_NEW_EXCL_METHOD
1005 int iatomIndex, minExclAtom, maxExclAtom;
1010 #ifdef USE_NEW_EXCL_METHOD
1011 iatomIndex = atomIndex[iatomStart + wid];
1012 int2 tmp = minmaxExclAtom[iatomStart + wid];
1013 minExclAtom = tmp.x;
1014 maxExclAtom = tmp.y;
1016 iatomIndex = atomIndex[iatomStart + wid];
1020 // i-forces in registers
1026 // float3 iforceSlow;
1028 iforceSlow.x = 0.0f;
1029 iforceSlow.y = 0.0f;
1030 iforceSlow.z = 0.0f;
1033 // float energyVdw, energyElec, energySlow;
1037 energyVdw_ti_1 = 0.0f;
1038 energyVdw_ti_2 = 0.0f;
1040 energyElec_ti_1 = 0.0f;
1041 energyElec_ti_2 = 0.0f;
1042 energyElec_s = 0.0f;
1045 energySlow_s = 0.0f;
1046 energySlow_ti_1 = 0.0f;
1047 energySlow_ti_2 = 0.0f;
1051 // Number of exclusions
1052 // NOTE: Lowest bit is used as indicator bit for tile pairs:
1053 // bit 0 tile has no atoms within pairlist cutoff
1054 // bit 1 tile has atoms within pairlist cutoff
1056 if (doPairlist) nexcluded = 0;
1058 // Number of i loops and free atoms
1061 int nloopi = min(iatomSize - iatomStart, BOUNDINGBOXSIZE);
1062 nfreei = max(iatomFreeSize - iatomStart, 0);
1063 if (wid >= nloopi) {
1064 xyzq_i.x = -LARGE_FLOAT;
1065 xyzq_i.y = -LARGE_FLOAT;
1066 xyzq_i.z = -LARGE_FLOAT;
1071 // int itileListLen;
1072 // int minJatomStart;
1074 // minJatomStart = tileJatomStart[jtileStart];
1078 // Exclusion index and maxdiff
1079 int iexclIndex, iexclMaxdiff;
1081 int2 tmp = exclIndexMaxDiff[iatomStart + wid];
1083 iexclMaxdiff = tmp.y;
1085 // fetch two tiles at once here
1086 for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
1087 // Load j-atom starting index and exclusion mask
1088 int jatomStart = tileJatomStart[jtile];
1090 float4 xyzq_j = xyzq[jatomStart + wid];
1091 NAMD_WARP_SYNC(WARP_FULL_MASK);
1092 if (doAlch) p2 = p[jatomStart + wid];
1094 // Check for early bail
1096 float r2bb = distsq(boundingBoxI, xyzq_j);
1097 if (NAMD_WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
1099 WarpMask excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
1100 int vdwtypej = vdwTypes[jatomStart + wid];
1101 s_vdwtypej[iwarp][wid] = vdwtypej;
1103 // Get i-atom global index
1105 s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
1108 // Number of j loops and free atoms
1111 int nloopj = min(jatomSize - jatomStart, BOUNDINGBOXSIZE);
1112 nfreej = max(jatomFreeSize - jatomStart, 0);
1113 //if (nfreei == 0 && nfreej == 0) continue;
1114 if (wid >= nloopj) {
1115 xyzq_j.x = LARGE_FLOAT;
1116 xyzq_j.y = LARGE_FLOAT;
1117 xyzq_j.z = LARGE_FLOAT;
1120 s_xyzq[iwarp][wid] = xyzq_j;
1122 // DH - self requires that zeroShift is also set
1123 const bool self = zeroShift && (iatomStart == jatomStart);
1124 const int modval = (self) ? 2*BOUNDINGBOXSIZE-1 : BOUNDINGBOXSIZE-1;
1126 s_jforce[isweep][wid] = make_float4(0.0f, 0.0f, 0.0f, 0.f);
1128 s_jforceSlow[isweep][wid] = make_float4(0.0f, 0.0f, 0.0f, 0.f);
1129 NAMD_WARP_SYNC(WARP_FULL_MASK);
1133 // NOTE: Pairlist update, we must also include the diagonal since this is used
1135 // Clear the lowest (indicator) bit
1138 // For self tiles, do the diagonal term (t=0).
1139 // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
1140 if (self && !isweep) {
1141 int j = (0 + wid) & modval;
1142 xyzq_j = s_xyzq[iwarp][j];
1143 float dx = xyzq_j.x - xyzq_i.x;
1144 float dy = xyzq_j.y - xyzq_i.y;
1145 float dz = xyzq_j.z - xyzq_i.z;
1146 float r2 = dx*dx + dy*dy + dz*dz;
1148 if (j < BOUNDINGBOXSIZE && r2 < plcutoff2) {
1149 // We have atom pair within the pairlist cutoff => Set indicator bit
1153 for (int t = (self && !isweep) ? isweep+nsweep : isweep;t < BOUNDINGBOXSIZE; t+=nsweep) {
1154 int j = (t + wid) & modval;
1156 // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
1157 // However, if (j < WARPSIZE ..) below makes sure that these non-sense
1158 // results are not used
1159 if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1161 excl >>= nsweep;// moves it two sweeps
1162 if (j < BOUNDINGBOXSIZE) {
1163 xyzq_j = s_xyzq[iwarp][j];
1164 float dx = xyzq_j.x - xyzq_i.x;
1165 float dy = xyzq_j.y - xyzq_i.y;
1166 float dz = xyzq_j.z - xyzq_i.z;
1167 float r2 = dx*dx + dy*dy + dz*dz;
1168 if (r2 < plcutoff2) {
1169 // We have atom pair within the pairlist cutoff => Set indicator bit
1171 if (j < nfreej || wid < nfreei) {
1172 bool excluded = false;
1173 int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
1174 if ( abs(indexdiff) <= iexclMaxdiff) {
1175 indexdiff += iexclIndex;
1176 int indexword = ((unsigned int) indexdiff) >> 5;
1178 indexword = overflowExclusions[indexword];
1181 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
1183 if (excluded) nexcluded += 2;
1184 if (!excluded) excl |= (WarpMask)1 << (BOUNDINGBOXSIZE-(nsweep - isweep)); // good luck understanding this
1186 if(!excluded && r2 < cutoff2){
1187 jforce = float4(s_jforce[isweep][j]);
1188 if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1191 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1192 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1193 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1194 #ifdef USE_TABLE_ARRAYS
1199 #ifdef USE_TABLE_ARRAYS
1200 forceTable, energyTable,
1202 forceTableTex, energyTableTex,
1206 energyVdw, energyVdw_s,
1207 energyElec, energySlow, energyElec_s, energySlow_s);
1209 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1210 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1211 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1212 #ifdef USE_TABLE_ARRAYS
1217 #ifdef USE_TABLE_ARRAYS
1218 forceTable, energyTable,
1220 forceTableTex, energyTableTex,
1224 energyVdw, energyVdw_ti_1,
1225 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1226 energySlow, energySlow_ti_1, energySlow_ti_2);
1230 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1231 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1232 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1233 #ifdef USE_TABLE_ARRAYS
1238 #ifdef USE_TABLE_ARRAYS
1239 forceTable, energyTable,
1241 forceTableTex, energyTableTex,
1245 energyVdw, energyVdw_s,
1246 energyElec, energySlow, energyElec_s, energySlow_s);
1248 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1249 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1250 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1251 #ifdef USE_TABLE_ARRAYS
1256 #ifdef USE_TABLE_ARRAYS
1257 forceTable, energyTable,
1259 forceTableTex, energyTableTex,
1263 energyVdw, energyVdw_ti_1,
1264 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1265 energySlow, energySlow_ti_1, energySlow_ti_2);
1268 s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1269 if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1270 }//if !excluded && r2 < cutoff2
1272 if (!excluded && r2 < cutoff2) {
1273 jforce = float4(s_jforce[isweep][j]);
1274 if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1276 calcForceEnergy<doEnergy, doSlow>(
1277 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1278 vdwtypei, s_vdwtypej[iwarp][j],
1279 #ifdef USE_TABLE_ARRAYS
1284 #ifdef USE_TABLE_ARRAYS
1285 forceTable, energyTable,
1287 forceTableTex, energyTableTex,
1292 energyElec, energySlow);
1294 calcForceEnergyMath<doEnergy, doSlow>(
1295 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1296 vdwtypei, s_vdwtypej[iwarp][j],
1297 #ifdef USE_TABLE_ARRAYS
1305 energyElec, energySlow, nbConstants);
1308 s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1309 if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1315 NAMD_WARP_SYNC(WARP_FULL_MASK);
1318 // Just compute forces
1320 // Clear the first bit
1321 excl = excl & (~(WarpMask)1);
1324 int ind_sweep = isweep + wid;
1326 for (int t = 0; t < BOUNDINGBOXSIZE;t+= nsweep) {
1328 int j = (ind_sweep + t) & (BOUNDINGBOXSIZE-1);
1329 part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1333 int j = (ind_sweep + t) & (BOUNDINGBOXSIZE-1);
1334 xyzq_j = (float4)s_xyzq[iwarp][j];
1338 fast_float3 dr = xyz_j - xyz_i;
1339 // float dx = xyzq_j.x - xyzq_i.x;
1340 // float dy = xyzq_j.y - xyzq_i.y;
1341 // float dz = xyzq_j.z - xyzq_i.z;
1342 // float r2 = dx*dx + dy*dy + dz*dz;
1346 float r2 = norm2(dr);
1348 if(r2 < cutoff2){ // (r2 < cutoff2)
1349 jforce = float4(s_jforce[isweep][j]);
1350 if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1353 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1354 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1355 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1356 #ifdef USE_TABLE_ARRAYS
1361 #ifdef USE_TABLE_ARRAYS
1362 forceTable, energyTable,
1364 forceTableTex, energyTableTex,
1368 energyVdw, energyVdw_s,
1369 energyElec, energySlow, energyElec_s, energySlow_s);
1371 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1372 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1373 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1374 #ifdef USE_TABLE_ARRAYS
1379 #ifdef USE_TABLE_ARRAYS
1380 forceTable, energyTable,
1382 forceTableTex, energyTableTex,
1386 energyVdw, energyVdw_ti_1,
1387 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1388 energySlow, energySlow_ti_1, energySlow_ti_2);
1392 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1393 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1394 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1395 #ifdef USE_TABLE_ARRAYS
1400 #ifdef USE_TABLE_ARRAYS
1401 forceTable, energyTable,
1403 forceTableTex, energyTableTex,
1407 energyVdw, energyVdw_s,
1408 energyElec, energySlow, energyElec_s, energySlow_s);
1410 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1411 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1412 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1413 #ifdef USE_TABLE_ARRAYS
1418 #ifdef USE_TABLE_ARRAYS
1419 forceTable, energyTable,
1421 forceTableTex, energyTableTex,
1425 energyVdw, energyVdw_ti_1,
1426 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1427 energySlow, energySlow_ti_1, energySlow_ti_2);
1430 s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1431 if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1435 jforce = float4(s_jforce[isweep][j]);
1436 if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1438 calcForceEnergy<doEnergy, doSlow>(
1439 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1440 vdwtypei, s_vdwtypej[iwarp][j],
1441 #ifdef USE_TABLE_ARRAYS
1446 #ifdef USE_TABLE_ARRAYS
1447 forceTable, energyTable,
1449 forceTableTex, energyTableTex,
1453 energyVdw, energyElec, energySlow);
1456 calcForceEnergyMath<doEnergy, doSlow>(
1457 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1458 vdwtypei, s_vdwtypej[iwarp][j],
1459 #ifdef USE_TABLE_ARRAYS
1467 energyElec, energySlow, nbConstants);
1469 s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 0.f);
1470 if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 0.f);
1475 NAMD_WARP_SYNC(WARP_FULL_MASK);
1479 // Write j-forces - shuffle them before storing to global memory
1480 jforce = s_jforce[isweep][wid];
1481 if(doSlow) jforceSlow = s_jforceSlow[isweep][wid];
1482 float4 jforceAccum = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1483 float4 jforceSlowAccum;
1484 if (doSlow) jforceSlowAccum = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1486 jforce.x += WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, wid + BOUNDINGBOXSIZE, WARPSIZE);
1487 jforce.y += WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, wid + BOUNDINGBOXSIZE, WARPSIZE);
1488 jforce.z += WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, wid + BOUNDINGBOXSIZE, WARPSIZE);
1490 jforceSlow.x += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, wid + BOUNDINGBOXSIZE, WARPSIZE);
1491 jforceSlow.y += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, wid + BOUNDINGBOXSIZE, WARPSIZE);
1492 jforceSlow.z += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, wid + BOUNDINGBOXSIZE, WARPSIZE);
1495 for(int k = 1 ; k < nsweep; k++){
1496 jforceAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1497 jforceAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1498 jforceAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1500 jforceSlowAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1501 jforceSlowAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1502 jforceSlowAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1507 storeForces<doSlow, float4>(jatomStart + wid, jforceAccum, jforceSlowAccum,
1508 devForce_x, devForce_y, devForce_z,
1509 devForceSlow_x, devForceSlow_y, devForceSlow_z);
1514 // We have to calculate the global flags of this part here, which is
1515 WarpMask res_excl = excl;
1516 for(int k = 1 ; k < nsweep; k++){
1517 // shuffle the integers to make sure we have complete flags
1518 res_excl |= WARP_SHUFFLE(WARP_FULL_MASK, excl, wid + BOUNDINGBOXSIZE*k, WARPSIZE);
1520 // after that, everyone grabs the value from the lower warp
1521 int anyexcl = (65536 | NAMD_WARP_ANY(WARP_FULL_MASK, res_excl != 0));
1522 // Mark this jtile as non-empty:
1523 // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1524 // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1525 if (!isweep && anyexcl){ // lower threads in warp updates the values
1526 if (wid) jtiles[jtile] = anyexcl;
1528 tileExcls[jtile].excl[wid] = res_excl;
1530 // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1531 // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1532 // low sweep has the correct value for this, so all good
1533 itileListLen += anyexcl;
1534 // NOTE, this minJatomStart is only stored once for the first tile list entry
1535 // minJatomStart = min(minJatomStart, jatomStart);
1543 // shfl before writing - lower tile gets high tile value
1544 float3 iforceAccum = iforce;
1545 float3 iforceSlowAccum = iforceSlow;
1546 for(int k = 1 ; k < nsweep; k++){
1547 iforceAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, iforce.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1548 iforceAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, iforce.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1549 iforceAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, iforce.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1551 iforceSlowAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1552 iforceSlowAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1553 iforceSlowAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1557 storeForces<doSlow, float3>(iatomStart + wid, iforceAccum, iforceSlowAccum,
1558 devForce_x, devForce_y, devForce_z,
1559 devForceSlow_x, devForceSlow_y, devForceSlow_z);
1563 // Done with computation
1565 // Save pairlist stuff
1567 if (wid == 0 && isweep == 0){
1568 // minJatomStart is in range [0 ... atomStorageSize-1]
1569 //int atom0 = (minJatomStart)/WARPSIZE;
1571 // int storageOffset = atomStorageSize/WARPSIZE;
1572 // int itileListLen = 0;
1573 // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
1574 // Store 0 if itileListLen == 0
1575 // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
1576 // race condition here if we have the warp fetching two tiles from the same list
1577 tileListDepth[itileList] = itileListLen;
1578 tileListOrder[itileList] = itileList;
1579 // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
1580 if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
1581 // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1582 if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
1583 // NOTE: always numTileListsGBIS >= numTileLists
1586 typedef cub::WarpReduce<int> WarpReduceInt;
1587 __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1588 const int warpId = threadIdx.x / WARPSIZE;
1589 // Remove indicator bit
1591 volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
1592 if (threadIdx.x % WARPSIZE == 0){
1593 atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
1598 typedef cub::WarpReduce<float> WarpReduce;
1599 __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1600 const int warpId = threadIdx.x / WARPSIZE;
1601 volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
1602 NAMD_WARP_SYNC(WARP_FULL_MASK);
1603 volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
1604 NAMD_WARP_SYNC(WARP_FULL_MASK);
1605 volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
1606 NAMD_WARP_SYNC(WARP_FULL_MASK);
1608 virialEnergy[itileList].forcex = iforcexSum;
1609 virialEnergy[itileList].forcey = iforceySum;
1610 virialEnergy[itileList].forcez = iforcezSum;
1614 iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
1615 NAMD_WARP_SYNC(WARP_FULL_MASK);
1616 iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
1617 NAMD_WARP_SYNC(WARP_FULL_MASK);
1618 iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
1619 NAMD_WARP_SYNC(WARP_FULL_MASK);
1621 virialEnergy[itileList].forceSlowx = iforcexSum;
1622 virialEnergy[itileList].forceSlowy = iforceySum;
1623 virialEnergy[itileList].forceSlowz = iforcezSum;
1630 // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
1631 // (Why does CUB suck here?)
1633 for (int i=WARPSIZE/2;i >= 1;i/=2) {
1634 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1635 energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, WARPSIZE);
1636 if(doFEP) energyVdw_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_s, i, WARPSIZE);
1637 if(doFEP) energyElec_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_s, i, WARPSIZE);
1639 energyVdw_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_1, i, WARPSIZE);
1640 energyVdw_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_2, i, WARPSIZE);
1641 energyElec_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_1, i, WARPSIZE);
1642 energyElec_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_2, i, WARPSIZE);
1645 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1646 if(doFEP) energySlow_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_s, i, WARPSIZE);
1648 energySlow_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_1, i, WARPSIZE);
1649 energySlow_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_2, i, WARPSIZE);
1654 if (threadIdx.x % WARPSIZE == 0) {
1655 virialEnergy[itileList].energyVdw = energyVdw;
1656 virialEnergy[itileList].energyElec = energyElec;
1657 if (doFEP) virialEnergy[itileList].energyVdw_s = energyVdw_s;
1658 if (doFEP) virialEnergy[itileList].energyElec_s = energyElec_s;
1660 virialEnergy[itileList].energyVdw_ti_1 = energyVdw_ti_1;
1661 virialEnergy[itileList].energyVdw_ti_2 = energyVdw_ti_2;
1662 virialEnergy[itileList].energyElec_ti_1 = energyElec_ti_1;
1663 virialEnergy[itileList].energyElec_ti_2 = energyElec_ti_2;
1666 virialEnergy[itileList].energySlow = energySlow;
1667 if(doFEP) virialEnergy[itileList].energySlow_s = energySlow_s;
1669 virialEnergy[itileList].energySlow_ti_1 = energySlow_ti_1;
1670 virialEnergy[itileList].energySlow_ti_2 = energySlow_ti_2;
1675 // XXX TODO: Disable streaming and see what happens
1678 // Make sure devForces and devForcesSlow have been written into device memory
1679 NAMD_WARP_SYNC(WARP_FULL_MASK);
1682 int patchDone[2] = {false, false};
1683 const int wid = threadIdx.x % WARPSIZE;
1685 int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
1686 patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
1687 if (patchInd.x != patchInd.y) {
1688 int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
1689 patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
1693 patchDone[0] = NAMD_WARP_ANY(WARP_FULL_MASK, patchDone[0]);
1694 patchDone[1] = NAMD_WARP_ANY(WARP_FULL_MASK, patchDone[1]);
1697 // Patch 1 is done, write onto host-mapped memory
1698 CudaPatchRecord patch = cudaPatches[patchInd.x];
1699 int start = patch.atomStart;
1700 int end = start + patch.numAtoms;
1701 for (int i=start+wid;i < end;i+=WARPSIZE) {
1702 mapForces[i] = make_float4(devForce_x[i],
1703 devForce_y[i], devForce_z[i], devForce_w[i]);
1705 mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1706 devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1712 CudaPatchRecord patch = cudaPatches[patchInd.y];
1713 int start = patch.atomStart;
1714 int end = start + patch.numAtoms;
1715 for (int i=start+wid;i < end;i+=WARPSIZE) {
1716 mapForces[i] = make_float4(devForce_x[i],
1717 devForce_y[i], devForce_z[i], devForce_w[i]);
1719 mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1720 devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1725 if (patchDone[0] || patchDone[1]) {
1726 // Make sure mapForces and mapForcesSlow are up-to-date
1727 NAMD_WARP_SYNC(WARP_FULL_MASK);
1728 __threadfence_system();
1729 // Add patch into "patchReadyQueue"
1732 int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1733 // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1734 mapPatchReadyQueue[ind] = patchInd.x;
1737 int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1738 // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1739 mapPatchReadyQueue[ind] = patchInd.y;
1745 if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
1746 int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
1747 outputOrder[index] = itileList;
1749 } // if (itileList < numTileLists)
1753 // Finish up - reduce virials from nonbonded kernel
1755 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
1756 const int atomStorageSize,
1757 const float4* __restrict__ xyzq,
1758 const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
1759 VirialEnergy* __restrict__ virialEnergy) {
1761 for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
1763 int i = ibase + threadIdx.x;
1765 // Set to zero to avoid nan*0
1770 float4 force, forceSlow;
1777 if (i < atomStorageSize) {
1779 force = devForces[i];
1780 if (doSlow) forceSlow = devForcesSlow[i];
1782 // Reduce across the entire thread block
1783 float vxxt = force.x*pos.x;
1784 float vxyt = force.x*pos.y;
1785 float vxzt = force.x*pos.z;
1786 float vyxt = force.y*pos.x;
1787 float vyyt = force.y*pos.y;
1788 float vyzt = force.y*pos.z;
1789 float vzxt = force.z*pos.x;
1790 float vzyt = force.z*pos.y;
1791 float vzzt = force.z*pos.z;
1793 const int bin = blockIdx.x % ATOMIC_BINS;
1795 typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1796 __shared__ typename BlockReduce::TempStorage tempStorage;
1797 float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1798 float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1799 float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1800 float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1801 float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1802 float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1803 float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1804 float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1805 float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1806 if (threadIdx.x == 0) {
1807 atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
1808 atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
1809 atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
1810 atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
1811 atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
1812 atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
1813 atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
1814 atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
1815 atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
1819 // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
1820 float vxxSlowt = forceSlow.x*pos.x;
1821 float vxySlowt = forceSlow.x*pos.y;
1822 float vxzSlowt = forceSlow.x*pos.z;
1823 float vyxSlowt = forceSlow.y*pos.x;
1824 float vyySlowt = forceSlow.y*pos.y;
1825 float vyzSlowt = forceSlow.y*pos.z;
1826 float vzxSlowt = forceSlow.z*pos.x;
1827 float vzySlowt = forceSlow.z*pos.y;
1828 float vzzSlowt = forceSlow.z*pos.z;
1829 float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
1830 float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
1831 float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
1832 float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
1833 float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
1834 float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
1835 float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
1836 float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
1837 float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
1838 if (threadIdx.x == 0) {
1839 atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxxSlow);
1840 atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxySlow);
1841 atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxzSlow);
1842 atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyxSlow);
1843 atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyySlow);
1844 atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyzSlow);
1845 atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzxSlow);
1846 atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzySlow);
1847 atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzzSlow);
1854 __global__ void reduceVirialEnergyKernel(
1855 const bool doEnergy, const bool doVirial, const bool doSlow,
1856 const int numTileLists,
1857 const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1858 VirialEnergy* __restrict__ virialEnergy) {
1860 for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1862 int itileList = ibase + threadIdx.x;
1863 TileListVirialEnergy ve;
1864 if (itileList < numTileLists) {
1865 ve = tileListVirialEnergy[itileList];
1867 // Set to zero to avoid nan*0
1875 ve.forceSlowx = 0.0f;
1876 ve.forceSlowy = 0.0f;
1877 ve.forceSlowz = 0.0f;
1881 ve.energyVdw_s = 0.0;
1882 ve.energyElec = 0.0;
1883 ve.energySlow = 0.0;
1884 ve.energyElec_s = 0.0;
1885 ve.energySlow_s = 0.0;
1888 ve.energyVdw_ti_1 = 0.0;
1889 ve.energyVdw_ti_2 = 0.0;
1890 ve.energyElec_ti_1 = 0.0;
1891 ve.energyElec_ti_2 = 0.0;
1892 ve.energySlow_ti_1 = 0.0;
1893 ve.energySlow_ti_2 = 0.0;
1894 // ve.energyGBIS = 0.0;
1898 const int bin = blockIdx.x % ATOMIC_BINS;
1901 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1902 __shared__ typename BlockReduce::TempStorage tempStorage;
1903 float vxxt = ve.forcex*ve.shx;
1904 float vxyt = ve.forcex*ve.shy;
1905 float vxzt = ve.forcex*ve.shz;
1906 float vyxt = ve.forcey*ve.shx;
1907 float vyyt = ve.forcey*ve.shy;
1908 float vyzt = ve.forcey*ve.shz;
1909 float vzxt = ve.forcez*ve.shx;
1910 float vzyt = ve.forcez*ve.shy;
1911 float vzzt = ve.forcez*ve.shz;
1912 float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1913 float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1914 float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1915 float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1916 float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1917 float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1918 float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1919 float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1920 float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1921 if (threadIdx.x == 0) {
1922 atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
1923 atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
1924 atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
1925 atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
1926 atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
1927 atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
1928 atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
1929 atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
1930 atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
1934 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1935 __shared__ typename BlockReduce::TempStorage tempStorage;
1936 float vxxt = ve.forceSlowx*ve.shx;
1937 float vxyt = ve.forceSlowx*ve.shy;
1938 float vxzt = ve.forceSlowx*ve.shz;
1939 float vyxt = ve.forceSlowy*ve.shx;
1940 float vyyt = ve.forceSlowy*ve.shy;
1941 float vyzt = ve.forceSlowy*ve.shz;
1942 float vzxt = ve.forceSlowz*ve.shx;
1943 float vzyt = ve.forceSlowz*ve.shy;
1944 float vzzt = ve.forceSlowz*ve.shz;
1945 float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1946 float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1947 float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1948 float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1949 float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1950 float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1951 float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1952 float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1953 float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1954 if (threadIdx.x == 0) {
1955 atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxx);
1956 atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxy);
1957 atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxz);
1958 atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyx);
1959 atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyy);
1960 atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyz);
1961 atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzx);
1962 atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzy);
1963 atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzz);
1969 typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1970 /* Maybe we should guard the TI and FEP energies, since those are not to be calculated on regular MDs */
1971 __shared__ typename BlockReduce::TempStorage tempStorage;
1972 double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1973 double energyVdw_s = BlockReduce(tempStorage).Sum(ve.energyVdw_s); BLOCK_SYNC;
1974 double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1975 double energyElec_s = BlockReduce(tempStorage).Sum(ve.energyElec_s); BLOCK_SYNC;
1976 double energyVdw_ti_1 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_1); BLOCK_SYNC;
1977 double energyVdw_ti_2 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_2); BLOCK_SYNC;
1978 double energyElec_ti_1 = BlockReduce(tempStorage).Sum(ve.energyElec_ti_1); BLOCK_SYNC;
1979 double energyElec_ti_2 = BlockReduce(tempStorage).Sum(ve.energyElec_ti_2); BLOCK_SYNC;
1980 if (threadIdx.x == 0){
1981 atomicAdd(&virialEnergy[bin].energyVdw, energyVdw);
1982 atomicAdd(&virialEnergy[bin].energyVdw_s, energyVdw_s);
1983 atomicAdd(&virialEnergy[bin].energyElec, energyElec);
1984 atomicAdd(&virialEnergy[bin].energyElec_s, energyElec_s);
1985 atomicAdd(&virialEnergy[bin].energyVdw_ti_1, energyVdw_ti_1);
1986 atomicAdd(&virialEnergy[bin].energyVdw_ti_2, energyVdw_ti_2);
1987 atomicAdd(&virialEnergy[bin].energyElec_ti_1, energyElec_ti_1);
1988 atomicAdd(&virialEnergy[bin].energyElec_ti_2, energyElec_ti_2);
1991 double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1992 double energySlow_s = BlockReduce(tempStorage).Sum(ve.energySlow_s); BLOCK_SYNC;
1993 double energySlow_ti_1 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_1); BLOCK_SYNC;
1994 double energySlow_ti_2 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_2); BLOCK_SYNC;
1995 if (threadIdx.x == 0) {
1996 atomicAdd(&virialEnergy[bin].energySlow, energySlow);
1997 atomicAdd(&virialEnergy[bin].energySlow_s, energySlow_s);
1998 atomicAdd(&virialEnergy[bin].energySlow_ti_1, energySlow_ti_1);
1999 atomicAdd(&virialEnergy[bin].energySlow_ti_2, energySlow_ti_2);
2008 __global__ void reduceGBISEnergyKernel(const int numTileLists,
2009 const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
2010 VirialEnergy* __restrict__ virialEnergy) {
2012 for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
2014 int itileList = ibase + threadIdx.x;
2015 double energyGBISt = 0.0;
2016 if (itileList < numTileLists) {
2017 energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
2020 const int bin = blockIdx.x % ATOMIC_BINS;
2022 typedef cub::BlockReduce<double, REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
2023 __shared__ typename BlockReduce::TempStorage tempStorage;
2024 double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
2025 if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energyGBIS, energyGBIS);
2029 __global__ void reduceNonbondedBinsKernel(
2030 const bool doVirial,
2031 const bool doEnergy,
2034 VirialEnergy* __restrict__ virialEnergy) {
2036 const int bin = threadIdx.x;
2038 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2039 __shared__ typename WarpReduce::TempStorage tempStorage;
2042 double vxx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[0]);
2043 double vxy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[1]);
2044 double vxz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[2]);
2045 double vyx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[3]);
2046 double vyy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[4]);
2047 double vyz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[5]);
2048 double vzx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[6]);
2049 double vzy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[7]);
2050 double vzz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[8]);
2051 if (threadIdx.x == 0) {
2052 virialEnergy->virial[0] = vxx;
2053 virialEnergy->virial[1] = vxy;
2054 virialEnergy->virial[2] = vxz;
2055 virialEnergy->virial[3] = vyx;
2056 virialEnergy->virial[4] = vyy;
2057 virialEnergy->virial[5] = vyz;
2058 virialEnergy->virial[6] = vzx;
2059 virialEnergy->virial[7] = vzy;
2060 virialEnergy->virial[8] = vzz;
2064 double vxxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[0]);
2065 double vxySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[1]);
2066 double vxzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[2]);
2067 double vyxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[3]);
2068 double vyySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[4]);
2069 double vyzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[5]);
2070 double vzxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[6]);
2071 double vzySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[7]);
2072 double vzzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[8]);
2073 if (threadIdx.x == 0) {
2074 virialEnergy->virialSlow[0] = vxxSlow;
2075 virialEnergy->virialSlow[1] = vxySlow;
2076 virialEnergy->virialSlow[2] = vxzSlow;
2077 virialEnergy->virialSlow[3] = vyxSlow;
2078 virialEnergy->virialSlow[4] = vyySlow;
2079 virialEnergy->virialSlow[5] = vyzSlow;
2080 virialEnergy->virialSlow[6] = vzxSlow;
2081 virialEnergy->virialSlow[7] = vzySlow;
2082 virialEnergy->virialSlow[8] = vzzSlow;
2088 double energyVdw = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw);
2089 double energyVdw_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_s);
2090 double energyElec = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec);
2091 double energyElec_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_s);
2092 double energyVdw_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_ti_1);
2093 double energyVdw_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_ti_2);
2094 double energyElec_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_ti_1);
2095 double energyElec_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_ti_2);
2096 if (threadIdx.x == 0) {
2097 virialEnergy->energyVdw = energyVdw;
2098 virialEnergy->energyVdw_s = energyVdw_s;
2099 virialEnergy->energyElec = energyElec;
2100 virialEnergy->energyElec_s = energyElec_s;
2101 virialEnergy->energyVdw_ti_1 = energyVdw_ti_1;
2102 virialEnergy->energyVdw_ti_2 = energyVdw_ti_2;
2103 virialEnergy->energyElec_ti_1 = energyElec_ti_1;
2104 virialEnergy->energyElec_ti_2 = energyElec_ti_2;
2107 double energySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow);
2108 double energySlow_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_s);
2109 double energySlow_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_ti_1);
2110 double energySlow_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_ti_2);
2111 if (threadIdx.x == 0) {
2112 virialEnergy->energySlow = energySlow;
2113 virialEnergy->energySlow_s = energySlow_s;
2114 virialEnergy->energySlow_ti_1 = energySlow_ti_1;
2115 virialEnergy->energySlow_ti_2 = energySlow_ti_2;
2119 double energyGBIS = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyGBIS);
2120 if (threadIdx.x == 0) {
2121 virialEnergy->energyGBIS = energyGBIS;
2127 // ##############################################################################################
2128 // ##############################################################################################
2129 // ##############################################################################################
2131 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
2132 bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
2134 cudaCheck(cudaSetDevice(deviceID));
2136 overflowExclusions = NULL;
2137 overflowExclusionsSize = 0;
2139 exclIndexMaxDiff = NULL;
2140 exclIndexMaxDiffSize = 0;
2148 patchNumCount = NULL;
2149 patchNumCountSize = 0;
2151 patchReadyQueue = NULL;
2152 patchReadyQueueSize = 0;
2154 force_x = force_y = force_z = force_w = NULL;
2156 forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
2160 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
2163 reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
2164 reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
2165 reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
2166 reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
2167 reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
2168 reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
2169 reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
2170 reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
2172 reallocate_device<float>(&force_x, &forceSize, atomStorageSize*8, 1.4f);
2173 force_y = force_x + atomStorageSize;
2174 force_z = force_y + atomStorageSize;
2175 force_w = force_z + atomStorageSize;
2176 forceSlow_x = force_w + atomStorageSize;
2177 forceSlow_y = forceSlow_x + atomStorageSize;
2178 forceSlow_z = forceSlow_y + atomStorageSize;
2179 forceSlow_w = forceSlow_z + atomStorageSize;
2183 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
2184 cudaCheck(cudaSetDevice(deviceID));
2185 if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
2186 if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
2187 if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
2188 if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
2189 if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
2190 if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
2192 if (force_x != NULL) deallocate_device<float>(&force_x);
2193 if (force_y != NULL) deallocate_device<float>(&force_y);
2194 if (force_z != NULL) deallocate_device<float>(&force_z);
2195 if (force_w != NULL) deallocate_device<float>(&force_w);
2196 if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
2197 if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
2198 if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
2199 if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
2201 if (force_x != NULL) deallocate_device<float>(&force_x);
2205 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
2206 const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
2208 reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
2209 reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
2210 reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
2212 copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
2213 copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
2214 copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
2217 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
2219 NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
2221 return patchReadyQueue;
2224 template <int doSlow>
2225 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
2226 float *fx, float *fy, float *fz, float *fw,
2227 float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
2230 int tid = blockIdx.x*blockDim.x + threadIdx.x;
2232 f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
2233 fx[tid] = 0.f; fy[tid] = 0.f; fz[tid] = 0.f; fw[tid] = 0.f;
2235 fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
2236 fSlowx[tid] = 0.f; fSlowy[tid] = 0.f; fSlowz[tid] = 0.f; fSloww[tid] = 0.f;
2243 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
2244 const int atomStorageSize, const bool atomsChanged, const bool doMinimize,
2245 const bool doPairlist,
2246 const bool doEnergy, const bool doVirial, const bool doSlow, const bool doAlch,
2247 const bool doAlchVdwForceSwitching,
2248 const bool doFEP, const bool doTI, const bool doTable,
2249 const float3 lata, const float3 latb, const float3 latc,
2250 const float4* h_xyzq, const float cutoff2,
2251 const CudaNBConstants nbConstants,
2252 float4* d_forces, float4* d_forcesSlow,
2253 float4* h_forces, float4* h_forcesSlow, AlchData *srcFlags,
2254 bool lambdaWindowUpdated, char *part,
2255 bool CUDASOAintegrator, bool useDeviceMigration,
2256 cudaStream_t stream) {
2258 #ifdef NODEGROUP_FORCE_REGISTER
2259 if (!atomsChanged && !CUDASOAintegrator) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2261 if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2265 // Copy partition to device. This is not necessary if both CUDASOAintegrator and useDeviceMigration
2267 if (doPairlist && (!CUDASOAintegrator || !useDeviceMigration)) {
2268 copy_HtoD< char>(part, tlKernel.get_part(), atomStorageSize, stream);
2270 //Copies flags to constant memory
2271 if(lambdaWindowUpdated) cudaCheck(cudaMemcpyToSymbol(alchflags, srcFlags, sizeof(AlchData)));
2274 // XXX TODO: Get rid of the clears
2276 if (doStreaming) tlKernel.clearTileListStat(stream);
2277 if(atomsChanged || doMinimize){
2278 clear_device_array<float>(force_x, atomStorageSize*4, stream);
2279 if(doSlow) clear_device_array<float>(forceSlow_x, atomStorageSize*4, stream);
2283 // --- streaming ----
2284 float4* m_forces = NULL;
2285 float4* m_forcesSlow = NULL;
2286 int* m_patchReadyQueue = NULL;
2288 unsigned long long *calculatedPairs;
2289 unsigned long long *skippedPairs;
2290 unsigned int* patchNumCountPtr = NULL;
2293 numPatches = tlKernel.getNumPatches();
2294 if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
2295 // If re-allocated, clear array
2296 clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
2298 patchNumCountPtr = patchNumCount;
2299 bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
2301 // If re-allocated, re-set to "-1"
2302 for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
2304 cudaCheck(cudaHostGetDevicePointer((void**)&m_patchReadyQueue, patchReadyQueue, 0));
2305 cudaCheck(cudaHostGetDevicePointer((void**)&m_forces, h_forces, 0));
2306 cudaCheck(cudaHostGetDevicePointer((void**)&m_forcesSlow, h_forcesSlow, 0));
2308 // -----------------
2310 if (doVirial || doEnergy) {
2311 tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
2316 int* outputOrderPtr = tlKernel.getOutputOrder();
2318 int nwarp = NONBONDKERNEL_NUM_WARP;
2319 int nthread = WARPSIZE*nwarp;
2327 int options = doEnergy + (doVirial << 1) + (doSlow << 2) +
2328 (doPairlist << 3) + (doAlch << 4) + (doFEP << 5) + (doTI << 6) + (doStreaming << 7) + (doTable << 8) + (doAlchVdwForceSwitching << 9);
2331 while (start < tlKernel.getNumTileLists()) {
2333 int nleft = tlKernel.getNumTileLists() - start;
2334 int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
2338 #ifdef USE_TABLE_ARRAYS
2339 #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTable()
2340 #define TABLE_PARAMS \
2341 cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
2343 #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTableTex()
2344 #define TABLE_PARAMS \
2345 cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
2349 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING) \
2350 nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING > \
2351 <<< nblock, nthread, shMemSize, stream >>> \
2352 (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2353 cudaNonbondedTables.getVdwCoefTableWidth(), \
2355 vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2357 tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2358 tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2359 tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2360 force_x, force_y, force_z, force_w, \
2361 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2362 numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2363 outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part(), calculatedPairs, skippedPairs); called=true
2365 bool called = false;
2368 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 1, 0);
2369 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 1, 0);
2370 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 1, 0);
2371 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 1, 0);
2372 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 1, 0);
2373 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 1, 0);
2374 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 1, 0);
2375 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 1, 0);
2378 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 1, 0);
2379 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 1, 0);
2380 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 1, 0);
2381 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 1, 0);
2382 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 1, 0);
2383 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 1, 0);
2384 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 1, 0);
2385 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 1, 0);
2388 if (doAlchVdwForceSwitching) {
2389 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 1);
2390 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 1);
2391 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 1);
2392 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 1);
2393 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 1);
2394 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 1);
2395 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 1);
2396 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 1);
2398 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 1);
2399 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 1);
2400 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 1);
2401 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 1);
2402 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 1);
2403 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 1);
2404 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 1);
2405 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 1);
2407 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 0);
2408 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 0);
2409 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 0);
2410 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 0);
2411 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 0);
2412 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 0);
2413 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 0);
2414 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 0);
2416 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 0);
2417 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 0);
2418 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 0);
2419 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 0);
2420 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 0);
2421 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 0);
2422 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 0);
2423 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 0);
2426 if (doAlchVdwForceSwitching) {
2427 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 1);
2428 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 1);
2429 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 1);
2430 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 1);
2431 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 1);
2432 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 1);
2433 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 1);
2434 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 1);
2436 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 1);
2437 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 1);
2438 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 1);
2439 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 1);
2440 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 1);
2441 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 1);
2442 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 1);
2443 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 1);
2445 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 0);
2446 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 0);
2447 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 0);
2448 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 0);
2449 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 0);
2450 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 0);
2451 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 0);
2452 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 0);
2454 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 0);
2455 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 0);
2456 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 0);
2457 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 0);
2458 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 0);
2459 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 0);
2460 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 0);
2461 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 0);
2468 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 0, 0);
2469 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 0, 0);
2470 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 0, 0);
2471 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 0, 0);
2472 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 0, 0);
2473 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 0, 0);
2474 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 0, 0);
2475 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 0, 0);
2478 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 0, 0);
2479 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 0, 0);
2480 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 0, 0);
2481 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 0, 0);
2482 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 0, 0);
2483 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 0, 0);
2484 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 0, 0);
2485 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 0, 0);
2488 if (doAlchVdwForceSwitching) {
2489 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 1);
2490 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 1);
2491 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 1);
2492 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 1);
2493 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 1);
2494 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 1);
2495 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 1);
2496 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 1);
2498 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 1);
2499 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 1);
2500 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 1);
2501 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 1);
2502 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 1);
2503 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 1);
2504 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 1);
2505 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 1);
2507 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 0);
2508 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 0);
2509 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 0);
2510 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 0);
2511 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 0);
2512 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 0);
2513 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 0);
2514 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 0);
2516 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 0);
2517 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 0);
2518 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 0);
2519 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 0);
2520 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 0);
2521 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 0);
2522 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 0);
2523 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 0);
2526 if (doAlchVdwForceSwitching) {
2527 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 1);
2528 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 1);
2529 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 1);
2530 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 1);
2531 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 1);
2532 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 1);
2533 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 1);
2534 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 1);
2536 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 1);
2537 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 1);
2538 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 1);
2539 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 1);
2540 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 1);
2541 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 1);
2542 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 1);
2543 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 1);
2545 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 0);
2546 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 0);
2547 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 0);
2548 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 0);
2549 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 0);
2550 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 0);
2551 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 0);
2552 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 0);
2554 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 0);
2555 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 0);
2556 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 0);
2557 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 0);
2558 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 0);
2559 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 0);
2560 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 0);
2561 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 0);
2568 NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
2573 #ifdef USE_TABLE_ARRAYS
2574 #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTable()
2575 #define TABLE_PARAMS \
2576 cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
2578 #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTableTex()
2579 #define TABLE_PARAMS \
2580 cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
2583 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING) \
2584 nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING> \
2585 <<< nblock, nthread, shMemSize, stream >>> \
2586 (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2587 cudaNonbondedTables.getVdwCoefTableWidth(), \
2589 vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2591 tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2592 tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2593 tlKernel.getBoundingBoxes(), \
2594 force_x, force_y, force_z, force_w, \
2595 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2596 numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2597 outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part())
2604 case 0: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2605 case 1: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2606 case 2: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2607 case 3: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2608 case 4: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2609 case 5: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2610 case 6: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2611 case 7: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2612 case 8: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2613 case 9: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2614 case 10: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2615 case 11: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2616 case 12: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2617 case 13: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2618 case 14: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2619 case 15: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2622 case 16: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2623 case 17: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2624 case 18: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2625 case 19: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2626 case 20: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2627 case 21: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2628 case 22: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2629 case 23: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2630 case 24: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2631 case 25: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2632 case 26: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2633 case 27: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2634 case 28: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2635 case 29: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2636 case 30: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2637 case 31: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2639 case 32: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2640 case 33: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2641 case 34: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2642 case 35: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2643 case 36: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2644 case 37: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2645 case 38: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2646 case 39: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2647 case 40: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2648 case 41: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2649 case 42: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2650 case 43: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2651 case 44: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2652 case 45: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2653 case 46: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2654 case 47: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2656 case 48: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2657 case 49: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2658 case 50: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2659 case 51: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2660 case 52: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2661 case 53: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2662 case 54: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2663 case 55: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2664 case 56: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2665 case 57: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2666 case 58: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2667 case 59: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2668 case 60: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2669 case 61: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2670 case 62: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2671 case 63: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2673 case 64: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2674 case 65: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2675 case 66: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2676 case 67: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2677 case 68: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2678 case 69: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2679 case 70: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2680 case 71: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2681 case 72: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2682 case 73: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2683 case 74: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2684 case 75: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2685 case 76: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2686 case 77: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2687 case 78: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2688 case 79: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2690 case 80: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2691 case 81: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2692 case 82: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2693 case 83: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2694 case 84: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2695 case 85: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2696 case 86: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2697 case 87: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2698 case 88: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2699 case 89: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2700 case 90: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2701 case 91: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2702 case 92: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2703 case 93: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2704 case 94: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2705 case 95: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2707 case 96: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2708 case 97: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2709 case 98: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2710 case 99: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2711 case 100: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2712 case 101: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2713 case 102: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2714 case 103: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2715 case 104: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2716 case 105: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2717 case 106: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2718 case 107: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2719 case 108: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2720 case 109: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2721 case 110: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2722 case 111: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2724 case 112: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2725 case 113: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2726 case 114: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2727 case 115: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2728 case 116: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2729 case 117: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2730 case 118: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2731 case 119: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2732 case 120: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2733 case 121: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2734 case 122: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2735 case 123: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2736 case 124: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2737 case 125: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2738 case 126: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2739 case 127: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2742 case 128: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2743 case 129: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2744 case 130: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2745 case 131: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2746 case 132: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2747 case 133: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2748 case 134: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2749 case 135: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2750 case 136: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2751 case 137: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2752 case 138: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2753 case 139: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2754 case 140: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2755 case 141: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2756 case 142: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2757 case 143: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2760 case 144: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2761 case 145: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2762 case 146: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2763 case 147: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2764 case 148: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2765 case 149: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2766 case 150: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2767 case 151: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2768 case 152: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2769 case 153: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2770 case 154: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2771 case 155: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2772 case 156: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2773 case 157: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2774 case 158: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2775 case 159: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2777 case 160: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2778 case 161: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2779 case 162: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2780 case 163: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2781 case 164: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2782 case 165: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2783 case 166: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2784 case 167: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2785 case 168: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2786 case 169: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2787 case 170: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2788 case 171: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2789 case 172: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2790 case 173: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2791 case 174: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2792 case 175: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2794 case 176: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2795 case 177: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2796 case 178: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2797 case 179: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2798 case 180: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2799 case 181: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2800 case 182: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2801 case 183: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2802 case 184: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2803 case 185: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2804 case 186: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2805 case 187: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2806 case 188: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2807 case 189: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2808 case 190: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2809 case 191: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2811 case 192: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2812 case 193: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2813 case 194: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2814 case 195: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2815 case 196: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2816 case 197: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2817 case 198: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2818 case 199: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2819 case 200: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2820 case 201: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2821 case 202: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2822 case 203: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2823 case 204: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2824 case 205: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2825 case 206: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2826 case 207: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2828 case 208: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2829 case 209: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2830 case 210: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2831 case 211: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2832 case 212: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2833 case 213: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2834 case 214: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2835 case 215: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2836 case 216: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2837 case 217: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2838 case 218: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2839 case 219: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2840 case 220: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2841 case 221: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2842 case 222: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2843 case 223: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2845 case 224: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2846 case 225: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2847 case 226: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2848 case 227: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2849 case 228: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2850 case 229: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2851 case 230: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2852 case 231: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2853 case 232: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2854 case 233: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2855 case 234: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2856 case 235: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2857 case 236: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2858 case 237: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2859 case 238: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2860 case 239: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2862 case 240: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2863 case 241: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2864 case 242: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2865 case 243: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2866 case 244: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2867 case 245: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2868 case 246: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2869 case 247: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2870 case 248: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2871 case 249: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2872 case 250: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2873 case 251: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2874 case 252: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2875 case 253: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2876 case 254: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2877 case 255: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2880 case 256: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2881 case 257: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2882 case 258: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2883 case 259: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2884 case 260: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2885 case 261: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2886 case 262: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2887 case 263: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2888 case 264: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2889 case 265: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2890 case 266: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2891 case 267: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2892 case 268: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2893 case 269: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2894 case 270: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2895 case 271: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2898 case 272: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2899 case 273: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2900 case 274: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2901 case 275: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2902 case 276: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2903 case 277: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2904 case 278: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2905 case 279: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2906 case 280: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2907 case 281: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2908 case 282: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2909 case 283: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2910 case 284: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2911 case 285: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2912 case 286: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2913 case 287: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2915 case 288: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2916 case 289: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2917 case 290: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2918 case 291: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2919 case 292: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2920 case 293: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2921 case 294: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2922 case 295: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2923 case 296: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2924 case 297: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2925 case 298: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2926 case 299: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2927 case 300: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2928 case 301: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2929 case 302: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2930 case 303: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2933 case 304: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2934 case 305: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2935 case 306: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2936 case 307: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2937 case 308: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2938 case 309: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2939 case 310: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2940 case 311: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2941 case 312: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2942 case 313: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2943 case 314: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2944 case 315: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2945 case 316: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2946 case 317: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2947 case 318: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2948 case 319: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2951 case 320: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2952 case 321: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2953 case 322: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2954 case 323: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2955 case 324: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2956 case 325: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2957 case 326: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2958 case 327: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2959 case 328: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2960 case 329: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2961 case 330: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2962 case 331: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2963 case 332: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2964 case 333: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2965 case 334: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2966 case 335: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2969 case 336: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2970 case 337: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2971 case 338: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2972 case 339: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2973 case 340: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2974 case 341: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2975 case 342: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2976 case 343: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2977 case 344: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2978 case 345: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2979 case 346: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2980 case 347: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2981 case 348: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2982 case 349: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2983 case 350: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2984 case 351: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2987 case 352: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2988 case 353: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2989 case 354: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2990 case 355: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2991 case 356: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2992 case 357: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2993 case 358: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2994 case 359: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2995 case 360: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2996 case 361: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2997 case 362: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2998 case 363: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2999 case 364: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
3000 case 365: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
3001 case 366: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
3002 case 367: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
3004 case 368: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
3005 case 369: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
3006 case 370: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
3007 case 371: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
3008 case 372: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
3009 case 373: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
3010 case 374: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
3011 case 375: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
3012 case 376: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
3013 case 377: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
3014 case 378: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
3015 case 379: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
3016 case 380: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
3017 case 381: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
3018 case 382: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
3019 case 383: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
3022 case 384: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
3023 case 385: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
3024 case 386: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
3025 case 387: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
3026 case 388: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
3027 case 389: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
3028 case 390: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
3029 case 391: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
3030 case 392: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
3031 case 393: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
3032 case 394: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
3033 case 395: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
3034 case 396: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
3035 case 397: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
3036 case 398: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
3037 case 399: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
3040 case 400: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
3041 case 401: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
3042 case 402: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
3043 case 403: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
3044 case 404: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
3045 case 405: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
3046 case 406: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
3047 case 407: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
3048 case 408: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
3049 case 409: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
3050 case 410: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
3051 case 411: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
3052 case 412: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
3053 case 413: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
3054 case 414: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
3055 case 415: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
3057 case 416: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
3058 case 417: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
3059 case 418: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
3060 case 419: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
3061 case 420: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
3062 case 421: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
3063 case 422: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
3064 case 423: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
3065 case 424: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
3066 case 425: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
3067 case 426: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
3068 case 427: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
3069 case 428: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
3070 case 429: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
3071 case 430: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
3072 case 431: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
3075 case 432: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
3076 case 433: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
3077 case 434: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
3078 case 435: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
3079 case 436: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
3080 case 437: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
3081 case 438: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
3082 case 439: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
3083 case 440: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
3084 case 441: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
3085 case 442: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
3086 case 443: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
3087 case 444: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
3088 case 445: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
3089 case 446: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
3090 case 447: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
3093 case 448: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
3094 case 449: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
3095 case 450: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
3096 case 451: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
3097 case 452: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
3098 case 453: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
3099 case 454: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
3100 case 455: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
3101 case 456: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
3102 case 457: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
3103 case 458: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
3104 case 459: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
3105 case 460: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
3106 case 461: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
3107 case 462: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
3108 case 463: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
3111 case 464: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
3112 case 465: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
3113 case 466: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
3114 case 467: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
3115 case 468: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
3116 case 469: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
3117 case 470: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
3118 case 471: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
3119 case 472: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
3120 case 473: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
3121 case 474: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
3122 case 475: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
3123 case 476: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
3124 case 477: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
3125 case 478: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
3126 case 479: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
3129 case 480: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
3130 case 481: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
3131 case 482: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
3132 case 483: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
3133 case 484: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
3134 case 485: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
3135 case 486: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
3136 case 487: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
3137 case 488: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
3138 case 489: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
3139 case 490: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
3140 case 491: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
3141 case 492: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
3142 case 493: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
3143 case 494: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
3144 case 495: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
3146 case 496: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
3147 case 497: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
3148 case 498: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
3149 case 499: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
3150 case 500: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
3151 case 501: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
3152 case 502: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
3153 case 503: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
3154 case 504: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
3155 case 505: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
3156 case 506: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
3157 case 507: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
3158 case 508: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
3159 case 509: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
3160 case 510: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
3161 case 511: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
3164 * Haochuan: the calls starting from 512 to 1023 were generated by the following python script
3165 * #!/usr/bin/env python3
3166 * def gen_call(option: int):
3167 * doEnergy = option & 1
3168 * doVirial = (option >> 1) & 1
3169 * doSlow = (option >> 2) & 1
3170 * doPairlist = (option >> 3) & 1
3171 * doAlch = (option >> 4) & 1
3172 * doFEP = (option >> 5) & 1
3173 * doTI = (option >> 6) & 1
3174 * doStreaming = (option >> 7) & 1
3175 * doTable = (option >> 8) & 1
3176 * doAlchVdwForceSwitching = (option >> 9) & 1
3177 * incompatible = False
3178 * incompatible = incompatible | (doFEP and doTI)
3179 * incompatible = incompatible | (doAlch and ((not doFEP) and (not doTI)))
3180 * incompatible = incompatible | ((not doAlch) and (doFEP or doTI or doAlchVdwForceSwitching))
3181 * incompatible = incompatible | ((not doTable) and (doAlch or doTI or doFEP or doAlchVdwForceSwitching))
3184 * print(f' // case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3186 * print(f' case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3189 * for i in range(512, 1024):
3193 // case 512: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3194 // case 513: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3195 // case 514: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3196 // case 515: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3197 // case 516: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3198 // case 517: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3199 // case 518: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3200 // case 519: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3201 // case 520: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3202 // case 521: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3203 // case 522: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3204 // case 523: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3205 // case 524: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3206 // case 525: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3207 // case 526: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3208 // case 527: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3209 // case 528: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3210 // case 529: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3211 // case 530: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3212 // case 531: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3213 // case 532: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3214 // case 533: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3215 // case 534: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3216 // case 535: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3217 // case 536: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3218 // case 537: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3219 // case 538: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3220 // case 539: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3221 // case 540: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3222 // case 541: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3223 // case 542: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3224 // case 543: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3225 // case 544: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3226 // case 545: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3227 // case 546: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3228 // case 547: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3229 // case 548: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3230 // case 549: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3231 // case 550: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3232 // case 551: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3233 // case 552: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3234 // case 553: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3235 // case 554: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3236 // case 555: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3237 // case 556: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3238 // case 557: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3239 // case 558: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3240 // case 559: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3241 // case 560: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3242 // case 561: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3243 // case 562: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3244 // case 563: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3245 // case 564: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3246 // case 565: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3247 // case 566: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3248 // case 567: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3249 // case 568: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3250 // case 569: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3251 // case 570: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3252 // case 571: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3253 // case 572: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3254 // case 573: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3255 // case 574: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3256 // case 575: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3257 // case 576: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3258 // case 577: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3259 // case 578: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3260 // case 579: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3261 // case 580: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3262 // case 581: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3263 // case 582: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3264 // case 583: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3265 // case 584: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3266 // case 585: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3267 // case 586: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3268 // case 587: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3269 // case 588: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3270 // case 589: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3271 // case 590: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3272 // case 591: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3273 // case 592: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3274 // case 593: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3275 // case 594: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3276 // case 595: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3277 // case 596: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3278 // case 597: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3279 // case 598: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3280 // case 599: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3281 // case 600: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3282 // case 601: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3283 // case 602: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3284 // case 603: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3285 // case 604: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3286 // case 605: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3287 // case 606: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3288 // case 607: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3289 // case 608: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3290 // case 609: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3291 // case 610: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3292 // case 611: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3293 // case 612: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3294 // case 613: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3295 // case 614: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3296 // case 615: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3297 // case 616: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3298 // case 617: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3299 // case 618: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3300 // case 619: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3301 // case 620: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3302 // case 621: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3303 // case 622: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3304 // case 623: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3305 // case 624: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3306 // case 625: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3307 // case 626: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3308 // case 627: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3309 // case 628: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3310 // case 629: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3311 // case 630: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3312 // case 631: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3313 // case 632: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3314 // case 633: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3315 // case 634: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3316 // case 635: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3317 // case 636: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3318 // case 637: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3319 // case 638: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3320 // case 639: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3321 // case 640: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3322 // case 641: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3323 // case 642: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3324 // case 643: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3325 // case 644: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3326 // case 645: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3327 // case 646: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3328 // case 647: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3329 // case 648: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3330 // case 649: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3331 // case 650: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3332 // case 651: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3333 // case 652: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3334 // case 653: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3335 // case 654: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3336 // case 655: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3337 // case 656: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3338 // case 657: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3339 // case 658: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3340 // case 659: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3341 // case 660: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3342 // case 661: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3343 // case 662: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3344 // case 663: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3345 // case 664: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3346 // case 665: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3347 // case 666: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3348 // case 667: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3349 // case 668: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3350 // case 669: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3351 // case 670: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3352 // case 671: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3353 // case 672: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3354 // case 673: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3355 // case 674: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3356 // case 675: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3357 // case 676: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3358 // case 677: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3359 // case 678: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3360 // case 679: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3361 // case 680: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3362 // case 681: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3363 // case 682: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3364 // case 683: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3365 // case 684: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3366 // case 685: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3367 // case 686: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3368 // case 687: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3369 // case 688: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3370 // case 689: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3371 // case 690: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3372 // case 691: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3373 // case 692: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3374 // case 693: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3375 // case 694: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3376 // case 695: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3377 // case 696: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3378 // case 697: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3379 // case 698: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3380 // case 699: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3381 // case 700: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3382 // case 701: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3383 // case 702: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3384 // case 703: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3385 // case 704: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3386 // case 705: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3387 // case 706: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3388 // case 707: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3389 // case 708: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3390 // case 709: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3391 // case 710: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3392 // case 711: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3393 // case 712: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3394 // case 713: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3395 // case 714: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3396 // case 715: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3397 // case 716: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3398 // case 717: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3399 // case 718: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3400 // case 719: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3401 // case 720: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3402 // case 721: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3403 // case 722: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3404 // case 723: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3405 // case 724: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3406 // case 725: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3407 // case 726: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3408 // case 727: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3409 // case 728: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3410 // case 729: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3411 // case 730: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3412 // case 731: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3413 // case 732: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3414 // case 733: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3415 // case 734: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3416 // case 735: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3417 // case 736: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3418 // case 737: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3419 // case 738: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3420 // case 739: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3421 // case 740: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3422 // case 741: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3423 // case 742: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3424 // case 743: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3425 // case 744: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3426 // case 745: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3427 // case 746: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3428 // case 747: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3429 // case 748: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3430 // case 749: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3431 // case 750: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3432 // case 751: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3433 // case 752: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3434 // case 753: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3435 // case 754: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3436 // case 755: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3437 // case 756: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3438 // case 757: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3439 // case 758: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3440 // case 759: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3441 // case 760: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3442 // case 761: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3443 // case 762: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3444 // case 763: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3445 // case 764: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3446 // case 765: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3447 // case 766: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3448 // case 767: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3449 // case 768: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3450 // case 769: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3451 // case 770: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3452 // case 771: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3453 // case 772: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3454 // case 773: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3455 // case 774: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3456 // case 775: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3457 // case 776: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3458 // case 777: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3459 // case 778: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3460 // case 779: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3461 // case 780: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3462 // case 781: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3463 // case 782: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3464 // case 783: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3465 // case 784: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3466 // case 785: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3467 // case 786: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3468 // case 787: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3469 // case 788: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3470 // case 789: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3471 // case 790: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3472 // case 791: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3473 // case 792: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3474 // case 793: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3475 // case 794: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3476 // case 795: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3477 // case 796: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3478 // case 797: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3479 // case 798: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3480 // case 799: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3481 // case 800: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3482 // case 801: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3483 // case 802: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3484 // case 803: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3485 // case 804: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3486 // case 805: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3487 // case 806: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3488 // case 807: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3489 // case 808: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3490 // case 809: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3491 // case 810: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3492 // case 811: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3493 // case 812: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3494 // case 813: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3495 // case 814: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3496 // case 815: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3497 case 816: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3498 case 817: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3499 case 818: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3500 case 819: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3501 case 820: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3502 case 821: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3503 case 822: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3504 case 823: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3505 case 824: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3506 case 825: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3507 case 826: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3508 case 827: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3509 case 828: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3510 case 829: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3511 case 830: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3512 case 831: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3513 // case 832: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3514 // case 833: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3515 // case 834: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3516 // case 835: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3517 // case 836: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3518 // case 837: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3519 // case 838: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3520 // case 839: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3521 // case 840: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3522 // case 841: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3523 // case 842: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3524 // case 843: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3525 // case 844: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3526 // case 845: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3527 // case 846: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3528 // case 847: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3529 case 848: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3530 case 849: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3531 case 850: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3532 case 851: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3533 case 852: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3534 case 853: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3535 case 854: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3536 case 855: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3537 case 856: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3538 case 857: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3539 case 858: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3540 case 859: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3541 case 860: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3542 case 861: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3543 case 862: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3544 case 863: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3545 // case 864: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3546 // case 865: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3547 // case 866: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3548 // case 867: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3549 // case 868: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3550 // case 869: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3551 // case 870: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3552 // case 871: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3553 // case 872: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3554 // case 873: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3555 // case 874: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3556 // case 875: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3557 // case 876: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3558 // case 877: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3559 // case 878: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3560 // case 879: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3561 // case 880: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3562 // case 881: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3563 // case 882: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3564 // case 883: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3565 // case 884: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3566 // case 885: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3567 // case 886: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3568 // case 887: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3569 // case 888: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3570 // case 889: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3571 // case 890: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3572 // case 891: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3573 // case 892: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3574 // case 893: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3575 // case 894: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3576 // case 895: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3577 // case 896: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3578 // case 897: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3579 // case 898: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3580 // case 899: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3581 // case 900: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3582 // case 901: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3583 // case 902: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3584 // case 903: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3585 // case 904: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3586 // case 905: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3587 // case 906: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3588 // case 907: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3589 // case 908: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3590 // case 909: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3591 // case 910: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3592 // case 911: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3593 // case 912: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3594 // case 913: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3595 // case 914: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3596 // case 915: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3597 // case 916: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3598 // case 917: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3599 // case 918: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3600 // case 919: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3601 // case 920: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3602 // case 921: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3603 // case 922: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3604 // case 923: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3605 // case 924: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3606 // case 925: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3607 // case 926: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3608 // case 927: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3609 // case 928: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3610 // case 929: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3611 // case 930: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3612 // case 931: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3613 // case 932: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3614 // case 933: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3615 // case 934: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3616 // case 935: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3617 // case 936: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3618 // case 937: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3619 // case 938: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3620 // case 939: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3621 // case 940: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3622 // case 941: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3623 // case 942: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3624 // case 943: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3625 case 944: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3626 case 945: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3627 case 946: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3628 case 947: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3629 case 948: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3630 case 949: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3631 case 950: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3632 case 951: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3633 case 952: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3634 case 953: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3635 case 954: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3636 case 955: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3637 case 956: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3638 case 957: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3639 case 958: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3640 case 959: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3641 // case 960: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3642 // case 961: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3643 // case 962: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3644 // case 963: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3645 // case 964: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3646 // case 965: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3647 // case 966: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3648 // case 967: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3649 // case 968: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3650 // case 969: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3651 // case 970: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3652 // case 971: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3653 // case 972: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3654 // case 973: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3655 // case 974: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3656 // case 975: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3657 case 976: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3658 case 977: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3659 case 978: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3660 case 979: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3661 case 980: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3662 case 981: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3663 case 982: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3664 case 983: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3665 case 984: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3666 case 985: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3667 case 986: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3668 case 987: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3669 case 988: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3670 case 989: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3671 case 990: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3672 case 991: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3673 // case 992: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3674 // case 993: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3675 // case 994: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3676 // case 995: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3677 // case 996: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3678 // case 997: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3679 // case 998: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3680 // case 999: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3681 // case 1000: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3682 // case 1001: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3683 // case 1002: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3684 // case 1003: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3685 // case 1004: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3686 // case 1005: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3687 // case 1006: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3688 // case 1007: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3689 // case 1008: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3690 // case 1009: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3691 // case 1010: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3692 // case 1011: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3693 // case 1012: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3694 // case 1013: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3695 // case 1014: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3696 // case 1015: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3697 // case 1016: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3698 // case 1017: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3699 // case 1018: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3700 // case 1019: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3701 // case 1020: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3702 // case 1021: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3703 // case 1022: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3704 // case 1023: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3708 "doEnergy=%d doVirial=%d doSlow=%d doPairlist=%d "
3709 "doAlch=%d doFEP=%d doTI=%d doStreaming=%d doTable=%d"
3711 doEnergy, doVirial, doSlow, doPairlist, doAlch, doFEP, doTI,
3712 doStreaming, doTable, options);
3715 std::string call_options;
3716 call_options += "doEnergy = " + std::to_string(int(doEnergy));
3717 call_options += ", doVirial = " + std::to_string(int(doVirial));
3718 call_options += ", doSlow = " + std::to_string(int(doSlow));
3719 call_options += ", doPairlist = " + std::to_string(int(doPairlist));
3720 call_options += ", doAlch = " + std::to_string(int(doAlch));
3721 call_options += ", doFEP = " + std::to_string(int(doFEP));
3722 call_options += ", doTI = " + std::to_string(int(doTI));
3723 call_options += ", doStreaming = " + std::to_string(int(doStreaming));
3724 // call_options += ", doTable = " + std::to_string(int(doTable));
3725 call_options += ", doAlchVdwForceSwitching = " + std::to_string(int(doAlchVdwForceSwitching));
3726 const std::string error = "CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called. Options are:\n" + call_options;
3727 NAMD_bug(error.c_str());
3736 cudaCheck(cudaGetLastError());
3738 start += nblock*nwarp;
3741 if ( doVirial || ! doStreaming ){
3743 int grid = (atomStorageSize + block - 1)/block;
3745 transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3746 force_x, force_y, force_z, force_w,
3747 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3750 transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3751 force_x, force_y, force_z, force_w,
3752 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3758 // Perform virial and energy reductions for non-bonded force calculation
3760 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
3761 const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
3762 float4* d_forces, float4* d_forcesSlow,
3763 VirialEnergy* d_virialEnergy, cudaStream_t stream) {
3765 if (doEnergy || doVirial) {
3766 clear_device_array<VirialEnergy>(d_virialEnergy, ATOMIC_BINS, stream);
3771 int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
3772 int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
3773 reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
3774 (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
3775 cudaCheck(cudaGetLastError());
3778 if (doVirial || doEnergy)
3780 int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
3781 int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
3782 reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
3783 (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3784 cudaCheck(cudaGetLastError());
3787 if (doGBIS && doEnergy)
3789 int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
3790 int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
3791 reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
3792 (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3793 cudaCheck(cudaGetLastError());
3796 if (ATOMIC_BINS > 1)
3798 // Reduce d_virialEnergy[ATOMIC_BINS] in-place (results are in d_virialEnergy[0])
3799 reduceNonbondedBinsKernel<<<1, ATOMIC_BINS, 0, stream>>>(doVirial, doEnergy, doSlow, doGBIS, d_virialEnergy);
3803 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
3804 reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
3805 copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
3809 void CudaComputeNonbondedKernel::setExclusionsByAtom(int2* h_data, const int num_atoms) {
3810 // Global data structure shouldn't be reallocated
3811 if (d_exclusionsByAtom == NULL) allocate_device<int2>(&d_exclusionsByAtom, num_atoms);
3812 copy_HtoD_sync<int2>(h_data, d_exclusionsByAtom, num_atoms);
3817 template<bool kDoAlch>
3818 __global__ void updateVdwTypesExclKernel(
3819 const int numPatches,
3820 const CudaLocalRecord* localRecords,
3821 const int* global_vdwTypes,
3822 const int* global_id,
3823 const int* patchSortOrder,
3824 const int2* exclusionsByAtom,
3825 const int* global_partition,
3831 __shared__ CudaLocalRecord s_record;
3832 using AccessType = int32_t;
3833 AccessType* s_record_buffer = (AccessType*) &s_record;
3835 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
3836 // Read in the CudaLocalRecord using multiple threads. This should
3838 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
3839 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
3843 const int numAtoms = s_record.numAtoms;
3844 const int offset = s_record.bufferOffset;
3845 const int offsetNB = s_record.bufferOffsetNBPad;
3847 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
3848 const int order = patchSortOrder[offset + i];
3849 const int id = global_id[offset + order];
3850 vdwTypes [offsetNB + i] = global_vdwTypes[offset + order];
3851 atomIndex [offsetNB + i] = id;
3852 exclusions[offsetNB + i].x = exclusionsByAtom[id].y;
3853 exclusions[offsetNB + i].y = exclusionsByAtom[id].x;
3855 part [offsetNB + i] = global_partition[offset + order];
3863 void CudaComputeNonbondedKernel::updateVdwTypesExclOnGPU(CudaTileListKernel& tlKernel,
3864 const int numPatches, const int atomStorageSize, const bool alchOn,
3865 CudaLocalRecord* localRecords,
3866 const int* d_vdwTypes, const int* d_id, const int* d_sortOrder,
3867 const int* d_partition,
3870 reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
3871 reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
3872 reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
3874 const int numBlocks = numPatches;
3875 const int numThreads = 512;
3878 updateVdwTypesExclKernel<true><<<numBlocks, numThreads, 0, stream>>>(
3879 numPatches, localRecords,
3880 d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3881 vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3884 updateVdwTypesExclKernel<false><<<numBlocks, numThreads, 0, stream>>>(
3885 numPatches, localRecords,
3886 d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3887 vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()