2 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <namd_cub/cub.cuh>
10 #include "CudaComputeNonbondedKernel.h"
11 #include "CudaTileListKernel.h"
12 #include "DeviceCUDA.h"
13 #include "CudaComputeNonbondedInteractions.h"
15 #if defined(NAMD_CUDA)
18 #define __thread __declspec(thread)
20 extern __thread DeviceCUDA *deviceCUDA;
22 #define OVERALLOC 1.2f
24 void NAMD_die(const char *);
25 void NAMD_bug(const char *);
27 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
28 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
31 __constant__ AlchData alchflags;
32 #define NONBONDKERNEL_NUM_WARP 4
35 __device__ __forceinline__
39 __device__ __forceinline__
40 float3 make_zero<float3>() {
41 return make_float3(0.0f, 0.0f, 0.0f);
45 __device__ __forceinline__
46 float4 make_zero<float4>() {
47 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
50 template<bool doEnergy, bool doSlow, typename jForceType>
51 __device__ __forceinline__
52 void calcForceEnergyMath(const float r2, const float qi, const float qj,
53 const float dx, const float dy, const float dz,
54 const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
55 cudaTextureObject_t vdwCoefTableTex,
56 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
57 float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
58 float& energyVdw, float& energyElec, float& energySlow,
59 const CudaNBConstants nbConstants) {
61 int vdwIndex = vdwtypej + vdwtypei;
62 #if __CUDA_ARCH__ >= 350
63 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
65 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
68 float rinv = rsqrtf(r2);
70 float charge = qi * qj;
72 cudaNBForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy, doSlow>(
73 r2, rinv, charge, ljab, nbConstants,
74 f, fSlow, energyVdw, energyElec, energySlow);
86 float fxSlow = dx * fSlow;
87 float fySlow = dy * fSlow;
88 float fzSlow = dz * fSlow;
89 iforceSlow.x += fxSlow;
90 iforceSlow.y += fySlow;
91 iforceSlow.z += fzSlow;
92 jforceSlow.x -= fxSlow;
93 jforceSlow.y -= fySlow;
94 jforceSlow.z -= fzSlow;
99 template<bool doEnergy, bool doSlow, typename jForceType>
100 __device__ __forceinline__
101 void calcForceEnergy(const float r2, const float qi, const float qj,
102 const float dx, const float dy, const float dz,
103 const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
104 cudaTextureObject_t vdwCoefTableTex,
105 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
106 float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
107 float& energyVdw, float& energyElec, float& energySlow) {
109 int vdwIndex = vdwtypej + vdwtypei;
110 #if __CUDA_ARCH__ >= 350
111 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
113 float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
116 float rinv = rsqrtf(r2);
120 fi = tex1D<float4>(forceTableTex, rinv);
121 if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
124 f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
127 energyVdw += ljab.x * ei.z + ljab.y * ei.y;
128 energyElec += fSlow * ei.x;
131 energySlow += fSlow * ei.w;
134 if (doSlow) fSlow *= fi.w;
147 float fxSlow = dx * fSlow;
148 float fySlow = dy * fSlow;
149 float fzSlow = dz * fSlow;
150 iforceSlow.x += fxSlow;
151 iforceSlow.y += fySlow;
152 iforceSlow.z += fzSlow;
153 jforceSlow.x -= fxSlow;
154 jforceSlow.y -= fySlow;
155 jforceSlow.z -= fzSlow;
160 /* JM: Special __device__ function to compute VDW forces for alchemy.
161 * Partially swiped from ComputeNonbondedFEP.C
163 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch, typename jForceType>
164 __device__ __forceinline__
165 void calcForceEnergyFEP(const float r2, const float qi, const float qj,
166 const float dx, const float dy, const float dz,
167 const int vdwtypei, const int vdwtypej,
169 /*const AlchData &alchflags, */
170 const float2* __restrict__ vdwCoefTable,
171 cudaTextureObject_t vdwCoefTableTex,
172 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
173 float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
174 float& energyVdw, float &energyVdw_s, float& energyElec, float& energySlow,
175 float& energyElec_s, float& energySlow_s) {
178 int vdwIndex = vdwtypej + vdwtypei;
179 #if __CUDA_ARCH__ >= 350
180 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
182 float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
185 float myVdwLambda = 0.0f;
186 float myVdwLambda2 = 0.0f;
187 float myElecLambda = 0.0f;
188 float myElecLambda2 = 0.0f;
189 float rinv = rsqrtf(r2);
191 float alch_vdw_energy = 0.0f;
192 float alch_vdw_energy_2 = 0.0f;
193 float alch_vdw_force = 0.0f;
194 float fSlow = qi * qj;
196 float4 fi = tex1D<float4>(forceTableTex, rinv);
197 if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
199 //John said that there is a better way to avoid divergences here
200 //alch: true if => 1-0, 1-1, 2-0, 2-2
201 //dec: true if => 1-1, 2-2 && decouple
202 //up: true if => 1-0 && 1,1
203 //down: true if => 2-0, && 2,2
204 int ref = (p1 == 0 && p2 == 0);
205 int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
206 int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
207 int up = (alch && (p1 == 1 || p2 == 1) && !dec);
208 int down = (alch && (p1 == 2 || p2 == 2) && !dec);
213 /*--------------- VDW SPECIAL ALCH FORCES (Swiped from ComputeNonbondedFEP.C) ---------------*/
215 myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
216 myVdwLambda2 = alchflags.vdwLambda2Up*(up) + alchflags.vdwLambda2Down*(down) + 1.f*(ref || dec);
217 myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
218 myElecLambda2 = alchflags.elecLambda2Up*(up) + alchflags.elecLambda2Down*(down) + 1.f*(ref || dec);
221 if (vdwForceSwitch) {
223 float switchdist6_1, switchdist6_2;
224 const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
226 //Templated parameter. No control divergence here
228 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
229 const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
230 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
231 r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
232 switchdist6_1 = alchflags.switchdist2 + myVdwShift;
233 switchdist6_1 = switchdist6_1 * switchdist6_1 * switchdist6_1;
234 switchdist6_2 = alchflags.switchdist2 + myVdwShift2;
235 switchdist6_2 = switchdist6_2 * switchdist6_2 * switchdist6_2;
239 switchdist6_1 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
240 switchdist6_2 = switchdist6_1;
242 const float r6_1 = r2_1*r2_1*r2_1;
243 const float r6_2 = r2_2*r2_2*r2_2;
244 if (r2 <= alchflags.switchdist2) {
245 const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
246 const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
247 // A == ljab.x, B == ljab.y
248 const float dU_1 = -ljab.x / (cutoff6 * switchdist6_1) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_1));
249 const float dU_2 = -ljab.x / (cutoff6 * switchdist6_2) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_2));
250 alch_vdw_energy = myVdwLambda * (U1 + dU_1);
251 alch_vdw_energy_2 = myVdwLambda2 * (U2 + dU_2);
253 //Multiplied by -1.0 to match CPU values
254 alch_vdw_force =-1.f*myVdwLambda*((12.f*U1 + 6.f*ljab.y*r6_1)*r2_1);
256 const float r3_1 = sqrtf(r6_1);
257 const float r3_2 = sqrtf(r6_2);
258 const float inv_cutoff6 = 1.0f / cutoff6;
259 const float inv_cutoff3 = rsqrtf(cutoff6);
260 // A == ljab.x, B == ljab.y
261 const float k_vdwa_1 = ljab.x / (1.0f - switchdist6_1 * inv_cutoff6);
262 const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6_1 * inv_cutoff6));
263 const float k_vdwa_2 = ljab.x / (1.0f - switchdist6_2 * inv_cutoff6);
264 const float k_vdwb_2 = ljab.y / (1.0f - sqrtf(switchdist6_2 * inv_cutoff6));
265 const float tmpa_1 = r6_1 - inv_cutoff6;
266 const float tmpb_1 = r3_1 - inv_cutoff3;
267 const float tmpa_2 = r6_2 - inv_cutoff6;
268 const float tmpb_2 = r3_2 - inv_cutoff3;
269 alch_vdw_energy = myVdwLambda * (k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1);
270 alch_vdw_energy_2 = myVdwLambda2 * (k_vdwa_2 * tmpa_2 * tmpa_2 - k_vdwb_2 * tmpb_2 * tmpb_2);
271 //Multiplied by -1.0 to match CPU values
272 alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
273 } // r2 <= alchflags.switchdist2
275 // potential switching
276 const float diff = alchflags.cutoff2 - r2;
278 const float switchmul = (alchflags.switchfactor*(diff)*(diff)*(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2))*(r2 > alchflags.switchdist2) + (1.f)*(r2 <= alchflags.switchdist2);
279 const float switchmul2 = (12.f*alchflags.switchfactor*(diff)*(r2 - alchflags.switchdist2))*(r2 > alchflags.switchdist2) + (0.f) * (r2 <= alchflags.switchdist2);
281 //Templated parameter. No control divergence here
283 //This templated parameter lets me get away with not making 2 divisions. But for myVdwShift != 0, how do I do this?
284 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
285 const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
286 //r2_1 = 1.0/(r2 + myVdwShift);
287 //r2_2 = 1.0/(r2 + myVdwShift2);
288 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
289 r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
295 const float r6_1 = r2_1*r2_1*r2_1;
296 const float r6_2 = r2_2*r2_2*r2_2;
297 const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
298 const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
299 alch_vdw_energy = myVdwLambda*switchmul*U1;
300 alch_vdw_energy_2 = myVdwLambda2*switchmul*U2;
302 //Multiplied by -1.0 to match CPU values
303 alch_vdw_force =-1.f*myVdwLambda*((switchmul*(12.f*U1 + 6.f*ljab.y*r6_1)*r2_1+ switchmul2*U1));
307 /*-----------------------------------------------------------*/
310 //All energies should be scaled by the corresponding lambda
311 energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
312 energyElec += (fSlow * ei.x)*myElecLambda;
313 energyVdw_s += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy_2*(alch && !dec);
314 energyElec_s += (fSlow * ei.x)*myElecLambda2;
316 energySlow += (fSlow * ei.w)*myElecLambda;
317 energySlow_s += (fSlow * ei.w)*myElecLambda2;
321 if (doSlow) fSlow *= fi.w;
323 //We should include the regular VDW forces if not dealing with alch pairs
324 f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(!alch || dec)))*myElecLambda
325 + alch_vdw_force*(alch && !dec);
339 /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
340 fSlow = myElecLambda*fSlow;
341 float fxSlow = dx * fSlow;
342 float fySlow = dy * fSlow;
343 float fzSlow = dz * fSlow;
344 iforceSlow.x += fxSlow;
345 iforceSlow.y += fySlow;
346 iforceSlow.z += fzSlow;
347 jforceSlow.x -= fxSlow;
348 jforceSlow.y -= fySlow;
349 jforceSlow.z -= fzSlow;
353 /* JM: Special __device__ function to compute VDW forces for TI.
356 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch, typename jForceType>
357 __device__ __forceinline__
358 void calcForceEnergyTI(const float r2, const float qi, const float qj,
359 const float dx, const float dy, const float dz,
360 const int vdwtypei, const int vdwtypej,
362 const float2* __restrict__ vdwCoefTable,
363 cudaTextureObject_t vdwCoefTableTex,
364 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
365 float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
366 float& energyVdw, float& energyVdw_ti_1, float& energyVdw_ti_2,
367 float& energyElec, float& energyElec_ti_1, float& energyElec_ti_2,
368 float& energySlow, float& energySlow_ti_1, float& energySlow_ti_2) {
370 int vdwIndex = vdwtypej + vdwtypei;
371 #if __CUDA_ARCH__ >= 350
372 float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
374 float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
377 /* JM: For TI, we have to deal ALCH1 OR ALCH2 during ComputeNonbondedBase2
378 * ALCH1 for appearing terms;
379 * ALCH2 for dissapearing terms;
380 * Instead of the _s energy terms, we need the to calculate:
382 * vdwEnergy_ti_1 and _2 for VDW energies. For those we need to add the special terms calculated on
383 * ComputeNonbondedTI.C
385 * elecEnergy_ti_1 and _2 for electrostatic energy. No correction needed here though.
389 float myVdwLambda = 0.0f;
390 float myElecLambda = 0.0f;
391 float rinv = rsqrtf(r2);
393 float alch_vdw_energy = 0.0f;
394 float alch_vdw_force = 0.0f;
395 float alch_vdw_dUdl = 0.0f;
396 float fSlow = qi * qj;
398 float4 fi = tex1D<float4>(forceTableTex, rinv);
399 if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
401 //John said that there is a better way to avoid divergences here
402 //alch: true if => 1-0, 1-1, 2-0, 2-2
403 //dec: true if => 1-1, 2-2 && decouple
404 //up: true if => 1-0 && 1,1
405 //down: true if => 2-0, && 2,2
406 int ref = (p1 == 0 && p2 == 0);
407 int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
408 int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
409 int up = (alch && (p1 == 1 || p2 == 1) && !dec);
410 int down = (alch && (p1 == 2 || p2 == 2) && !dec);
415 /*--------------- VDW SPECIAL ALCH STUFF (Swiped from ComputeNonbondedTI.C) ---------------*/
416 myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
417 myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
419 if (vdwForceSwitch) {
420 const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
423 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
424 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
425 switchdist6 = alchflags.switchdist2 + myVdwShift;
426 switchdist6 = switchdist6 * switchdist6 * switchdist6;
429 switchdist6 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
431 const float r6_1 = r2_1*r2_1*r2_1;
432 if (r2 <= alchflags.switchdist2) {
433 const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1;
434 const float dU = -ljab.x / (cutoff6 * switchdist6) - (-ljab.y * rsqrtf(cutoff6 * switchdist6));
435 alch_vdw_force = -1.f*(myVdwLambda*((12.f*U + 6.f*ljab.y*r6_1)*r2_1));
436 alch_vdw_energy = myVdwLambda * (U + dU);
437 alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (6.f*U + 3.f*ljab.y*r6_1)*r2_1 + dU;
439 const float r3_1 = sqrtf(r6_1);
440 const float inv_cutoff6 = 1.0f / cutoff6;
441 const float inv_cutoff3 = sqrtf(inv_cutoff6);
442 const float k_vdwa_1 = ljab.x / (1.0f - switchdist6 * inv_cutoff6);
443 const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6 * inv_cutoff6));
444 const float tmpa_1 = r6_1 - inv_cutoff6;
445 const float tmpb_1 = r3_1 - inv_cutoff3;
446 const float U = k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1;
447 alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
448 alch_vdw_energy = myVdwLambda * U;
449 alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (3.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
450 } // r2 <= alchflags.switchdist2
452 const float diff = alchflags.cutoff2 - r2;
453 const float switchmul = (r2 > alchflags.switchdist2 ? alchflags.switchfactor*(diff)*(diff) \
454 *(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2) : 1.f);
456 const float switchmul2 = (r2 > alchflags.switchdist2 ? \
457 12.f*alchflags.switchfactor*(diff) \
458 *(r2 - alchflags.switchdist2) : 0.f);
459 //Templated parameter. No control divergence here
461 const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
462 r2_1 = __fdividef(1.f,(r2 + myVdwShift));
463 }else r2_1 = rinv*rinv;
465 const float r6_1 = r2_1*r2_1*r2_1;
466 const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled! for shorthand only!
467 alch_vdw_energy = myVdwLambda*switchmul*U;
468 //Multiplied by -1.0 to match CPU values
469 alch_vdw_force = -1.f*(myVdwLambda*(switchmul*(12.f*U + 6.f*ljab.y*r6_1)*r2_1 \
471 alch_vdw_dUdl = (switchmul*(U + myVdwLambda*alchflags.alchVdwShiftCoeff \
472 *(6.f*U + 3.f*ljab.y*r6_1)*r2_1));
475 /*-------------------------------------------------------------------------*/
478 //All energies should be scaled by the corresponding lambda
479 energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
480 energyElec += (fSlow * ei.x)*myElecLambda;
482 energyVdw_ti_1 += alch_vdw_dUdl*up;
483 energyVdw_ti_2 += alch_vdw_dUdl*down;
484 energyElec_ti_1 += (fSlow * ei.x)*up;
485 energyElec_ti_2 += (fSlow * ei.x)*down;
488 energySlow += (fSlow * ei.w)*myElecLambda;
490 energySlow_ti_1 += (fSlow * ei.w)*up;
491 energySlow_ti_2 += (fSlow * ei.w)*down;
496 if (doSlow) fSlow *= fi.w;
497 //We should include the regular VDW forces if not dealing with alch pairs
498 f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(ref || dec)))*myElecLambda
499 + alch_vdw_force*(alch && !dec);
513 /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
514 fSlow = myElecLambda*fSlow; /* FAST(NOSHORT(+alch_vdw_force))*/ //Those should also be zeroed
515 float fxSlow = dx * fSlow;
516 float fySlow = dy * fSlow;
517 float fzSlow = dz * fSlow;
518 iforceSlow.x += fxSlow;
519 iforceSlow.y += fySlow;
520 iforceSlow.z += fzSlow;
521 jforceSlow.x -= fxSlow;
522 jforceSlow.y -= fySlow;
523 jforceSlow.z -= fzSlow;
528 template<bool doSlow, typename jForceType>
529 __device__ __forceinline__
530 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
531 float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {
532 atomicAdd(&devForces[pos].x, force.x);
533 atomicAdd(&devForces[pos].y, force.y);
534 atomicAdd(&devForces[pos].z, force.z);
536 atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
537 atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
538 atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
542 template<bool doSlow, typename jForceType>
543 __device__ __forceinline__
544 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
545 float* __restrict__ devForces_x,
546 float* __restrict__ devForces_y,
547 float* __restrict__ devForces_z,
548 float* __restrict__ devForcesSlow_x,
549 float* __restrict__ devForcesSlow_y,
550 float* __restrict__ devForcesSlow_z)
552 atomicAdd(&devForces_x[pos], force.x);
553 atomicAdd(&devForces_y[pos], force.y);
554 atomicAdd(&devForces_z[pos], force.z);
556 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
557 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
558 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
562 template<bool doSlow, typename jForceType>
563 __device__ __forceinline__
564 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
565 float3* __restrict__ forces, float3* __restrict__ forcesSlow) {
566 atomicAdd(&forces[pos].x, force.x);
567 atomicAdd(&forces[pos].y, force.y);
568 atomicAdd(&forces[pos].z, force.z);
570 atomicAdd(&forcesSlow[pos].x, forceSlow.x);
571 atomicAdd(&forcesSlow[pos].y, forceSlow.y);
572 atomicAdd(&forcesSlow[pos].z, forceSlow.z);
576 template<bool doPairlist>
577 __device__ __forceinline__
578 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex) {
579 xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
580 vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
582 jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
583 jexclIndex = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
584 jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
588 template<bool doPairlist>
589 __device__ __forceinline__
590 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {
591 xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
592 vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
594 jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
598 template<bool doSlow, typename jForceType>
599 __device__ __forceinline__
600 void shuffleNext(jForceType& jforce, jForceType& jforceSlow) {
601 jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
602 jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
603 jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
605 jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
606 jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
607 jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
611 //#define USE_NEW_EXCL_METHOD
614 // Returns the lower estimate for the distance between a bounding box and a set of atoms
616 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
617 float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
618 float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
619 float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
620 float r2 = dx*dx + dy*dy + dz*dz;
624 #define LARGE_FLOAT (float)(1.0e10)
627 // Nonbonded force kernel
629 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doAlch, bool doFEP, bool doTI, bool doStreaming, bool doTable, bool doAlchVdwForceSwitching>
631 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
632 doPairlist ? (10) : (doEnergy ? (10) : (12) )
634 nonbondedForceKernel(
635 const int start, const int numTileLists,
636 const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
637 const int* __restrict__ tileJatomStart,
638 const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,
639 const float3 lata, const float3 latb, const float3 latc,
640 const float4* __restrict__ xyzq,
641 const float cutoff2, const CudaNBConstants nbConstants,
642 cudaTextureObject_t vdwCoefTableTex,
643 cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
646 const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
647 const int* __restrict__ atomIndex,
648 const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
649 unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
650 int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
651 const BoundingBox* __restrict__ boundingBoxes,
652 #ifdef USE_NEW_EXCL_METHOD
653 const int* __restrict__ minmaxExclAtom,
656 float4* __restrict__ devForces, float4* __restrict__ devForcesSlow,
657 float * __restrict__ devForce_x,
658 float * __restrict__ devForce_y,
659 float * __restrict__ devForce_z,
660 float * __restrict__ devForce_w,
661 float * __restrict__ devForceSlow_x,
662 float * __restrict__ devForceSlow_y,
663 float * __restrict__ devForceSlow_z,
664 float * __restrict__ devForceSlow_w,
665 // ---- USE_STREAMING_FORCES ----
666 const int numPatches,
667 unsigned int* __restrict__ patchNumCount,
668 const CudaPatchRecord* __restrict__ cudaPatches,
669 float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
670 int* __restrict__ mapPatchReadyQueue,
671 int* __restrict__ outputOrder,
672 // ------------------------------
673 TileListVirialEnergy* __restrict__ virialEnergy,
677 using jForceType = typename std::conditional<doSlow, float3, float4>::type;
678 // Single warp takes care of one list of tiles
679 // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
680 const int itileListTemp = start + (threadIdx.x + blockDim.x*blockIdx.x) / WARPSIZE;
681 const int itileList = WARP_SHUFFLE(WARP_FULL_MASK, itileListTemp, 0, WARPSIZE);
682 if (itileList < numTileLists)
687 float energyVdw, energyElec, energySlow;
689 float energyVdw_s, energyElec_s, energySlow_s;
691 float energyVdw_ti_1, energyVdw_ti_2, energyElec_ti_1, energyElec_ti_2, energySlow_ti_1, energySlow_ti_2;
693 unsigned int itileListLen;
696 char part1, part2, p2;
697 bool doShift = (alchflags.alchVdwShiftCoeff != 0.0f);
698 __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][WARPSIZE];
699 __shared__ jForceType s_jforce[NONBONDKERNEL_NUM_WARP][WARPSIZE];
700 __shared__ jForceType s_jforceSlow[NONBONDKERNEL_NUM_WARP][WARPSIZE];
701 __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][WARPSIZE];
702 __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][WARPSIZE];
704 __shared__ int s_iatomStart[NONBONDKERNEL_NUM_WARP];
705 __shared__ int s_jatomStart[NONBONDKERNEL_NUM_WARP];
709 // Warp index (0...warpsize-1)
710 const int wid = threadIdx.x & (WARPSIZE-1);
711 const int iwarp = WARP_SHUFFLE(WARP_FULL_MASK, threadIdx.x / WARPSIZE, 0, WARPSIZE);
713 TileList tmp = tileLists[itileList];
714 int iatomStart = tmp.iatomStart;
715 int jtileStart = tmp.jtileStart;
716 int jtileEnd = tmp.jtileEnd;
717 patchInd = tmp.patchInd;
718 patchNumList = tmp.patchNumList;
720 float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
721 float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
722 float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
724 // DH - set zeroShift flag if magnitude of shift vector is zero
725 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
727 int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
729 PatchPairRecord PPStmp = patchPairs[itileList];
730 iatomSize = PPStmp.iatomSize;
731 iatomFreeSize = PPStmp.iatomFreeSize;
732 jatomSize = PPStmp.jatomSize;
733 jatomFreeSize = PPStmp.jatomFreeSize;
736 // Write to global memory here to avoid register spilling
739 virialEnergy[itileList].shx = shx;
740 virialEnergy[itileList].shy = shy;
741 virialEnergy[itileList].shz = shz;
745 // Load i-atom data (and shift coordinates)
746 float4 xyzq_i = xyzq[iatomStart + wid];
747 if (doAlch) part1 = p[iatomStart + wid];
751 int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
753 // Load i-atom data (and shift coordinates)
754 BoundingBox boundingBoxI;
756 boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
757 boundingBoxI.x += shx;
758 boundingBoxI.y += shy;
759 boundingBoxI.z += shz;
762 // Get i-atom global index
763 #ifdef USE_NEW_EXCL_METHOD
764 int iatomIndex, minExclAtom, maxExclAtom;
769 #ifdef USE_NEW_EXCL_METHOD
770 iatomIndex = atomIndex[iatomStart + wid];
771 int2 tmp = minmaxExclAtom[iatomStart + wid];
775 iatomIndex = atomIndex[iatomStart + wid];
779 // i-forces in registers
785 // float3 iforceSlow;
792 // float energyVdw, energyElec, energySlow;
796 energyVdw_ti_1 = 0.0f;
797 energyVdw_ti_2 = 0.0f;
799 energyElec_ti_1 = 0.0f;
800 energyElec_ti_2 = 0.0f;
805 energySlow_ti_1 = 0.0f;
806 energySlow_ti_2 = 0.0f;
810 // Number of exclusions
811 // NOTE: Lowest bit is used as indicator bit for tile pairs:
812 // bit 0 tile has no atoms within pairlist cutoff
813 // bit 1 tile has atoms within pairlist cutoff
815 if (doPairlist) nexcluded = 0;
817 // Number of i loops and free atoms
820 int nloopi = min(iatomSize - iatomStart, WARPSIZE);
821 nfreei = max(iatomFreeSize - iatomStart, 0);
823 xyzq_i.x = -LARGE_FLOAT;
824 xyzq_i.y = -LARGE_FLOAT;
825 xyzq_i.z = -LARGE_FLOAT;
831 // int minJatomStart;
833 // minJatomStart = tileJatomStart[jtileStart];
837 // Exclusion index and maxdiff
838 int iexclIndex, iexclMaxdiff;
840 int2 tmp = exclIndexMaxDiff[iatomStart + wid];
842 iexclMaxdiff = tmp.y;
844 s_iatomStart[iwarp] = iatomStart;
846 // If the tile is within a patch, then the first jtile is a self tile
847 if (patchInd.x == patchInd.y & zeroShift) {
848 int jtile = jtileStart;
849 // Load j-atom starting index and exclusion mask
850 int jatomStart = tileJatomStart[jtile];
852 float4 xyzq_j = xyzq[jatomStart + wid];
853 WARP_SYNC(WARP_FULL_MASK);
854 if (doAlch) p2 = p[jatomStart + wid];
856 // Check for early bail
857 // No point of early bail for self
859 unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
860 int vdwtypej = vdwTypes[jatomStart + wid];
861 s_vdwtypej[iwarp][wid] = vdwtypej;
863 // Get i-atom global index
865 s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
868 // Number of j loops and free atoms
871 int nloopj = min(jatomSize - jatomStart, WARPSIZE);
872 nfreej = max(jatomFreeSize - jatomStart, 0);
873 //if (nfreei == 0 && nfreej == 0) continue;
875 xyzq_j.x = LARGE_FLOAT;
876 xyzq_j.y = LARGE_FLOAT;
877 xyzq_j.z = LARGE_FLOAT;
880 s_xyzq[iwarp][wid] = xyzq_j;
882 // DH - self requires that zeroShift is also set
883 // DC - In this case self is always true
884 const int modval = 2*WARPSIZE-1;
886 s_jforce[iwarp][wid] = make_zero<jForceType>();
888 s_jforceSlow[iwarp][wid] = make_zero<jForceType>();
889 WARP_SYNC(WARP_FULL_MASK);
893 // NOTE: Pairlist update, we must also include the diagonal since this is used
895 // Clear the lowest (indicator) bit
898 // For self tiles, do the diagonal term (t=0).
899 // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
900 int j = (0 + wid) & modval;
901 xyzq_j = s_xyzq[iwarp][j];
902 float dx = xyzq_j.x - xyzq_i.x;
903 float dy = xyzq_j.y - xyzq_i.y;
904 float dz = xyzq_j.z - xyzq_i.z;
905 float r2 = dx*dx + dy*dy + dz*dz;
907 if (j < WARPSIZE && r2 < plcutoff2) {
908 // We have atom pair within the pairlist cutoff => Set indicator bit
911 WARP_SYNC(WARP_FULL_MASK);
913 // TODO this can be done in fewer iterations if we take advantage of Newtons's 3rd
915 for (int t = 1;t < WARPSIZE;t++) {
916 int j = (t + wid) & modval;
918 // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
919 // However, if (j < WARPSIZE ..) below makes sure that these non-sense
920 // results are not used
921 if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
925 xyzq_j = s_xyzq[iwarp][j];
926 float dx = xyzq_j.x - xyzq_i.x;
927 float dy = xyzq_j.y - xyzq_i.y;
928 float dz = xyzq_j.z - xyzq_i.z;
929 float r2 = dx*dx + dy*dy + dz*dz;
930 if (r2 < plcutoff2) {
931 // We have atom pair within the pairlist cutoff => Set indicator bit
933 if (j < nfreej || wid < nfreei) {
934 bool excluded = false;
935 int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
936 if ( abs(indexdiff) <= iexclMaxdiff) {
937 indexdiff += iexclIndex;
938 int indexword = ((unsigned int) indexdiff) >> 5;
940 if ( indexword < MAX_CONST_EXCLUSIONS ) {
941 indexword = constExclusions[indexword];
943 indexword = overflowExclusions[indexword];
946 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
948 if (excluded) nexcluded += 2;
949 if (!excluded) excl |= 0x80000000;
951 if(!excluded && r2 < cutoff2){
954 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
955 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
956 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
957 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
959 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
960 energyVdw, energyVdw_s,
961 energyElec, energySlow, energyElec_s, energySlow_s);
963 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
964 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
965 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
966 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
968 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
969 energyVdw, energyVdw_ti_1,
970 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
971 energySlow, energySlow_ti_1, energySlow_ti_2);
975 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
976 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
977 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
978 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
980 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
981 energyVdw, energyVdw_s,
982 energyElec, energySlow, energyElec_s, energySlow_s);
984 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
985 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
986 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
987 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
989 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
990 energyVdw, energyVdw_ti_1,
991 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
992 energySlow, energySlow_ti_1, energySlow_ti_2);
995 }//if !excluded && r2 < cutoff2
997 if (!excluded && r2 < cutoff2) {
999 calcForceEnergy<doEnergy, doSlow>(
1000 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1001 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1002 vdwCoefTableTex, forceTableTex, energyTableTex,
1005 s_jforceSlow[iwarp][j],
1006 energyVdw, energyElec, energySlow);
1008 calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1009 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1010 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1011 vdwCoefTableTex, forceTableTex, energyTableTex,
1014 s_jforceSlow[iwarp][j],
1015 energyVdw, energyElec, energySlow,
1023 WARP_SYNC(WARP_FULL_MASK);
1026 // Just compute forces
1029 for (int t = 1;t < WARPSIZE;t++) {
1031 int j = (t + wid) & modval;
1032 part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1035 int j = ((t + wid) & (WARPSIZE-1));
1036 xyzq_j = s_xyzq[iwarp][j];
1037 float dx = xyzq_j.x - xyzq_i.x;
1038 float dy = xyzq_j.y - xyzq_i.y;
1039 float dz = xyzq_j.z - xyzq_i.z;
1041 float r2 = dx*dx + dy*dy + dz*dz;
1043 if(r2 < cutoff2){ // (r2 < cutoff2)
1046 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1047 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1048 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1049 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1052 s_jforceSlow[iwarp][j],
1053 energyVdw, energyVdw_s,
1054 energyElec, energySlow, energyElec_s, energySlow_s);
1056 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1057 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1058 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1059 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1062 s_jforceSlow[iwarp][j],
1063 energyVdw, energyVdw_ti_1,
1064 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1065 energySlow, energySlow_ti_1, energySlow_ti_2);
1069 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1070 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1071 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1072 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1075 s_jforceSlow[iwarp][j],
1076 energyVdw, energyVdw_s,
1077 energyElec, energySlow, energyElec_s, energySlow_s);
1079 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1080 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1081 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1082 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1085 s_jforceSlow[iwarp][j],
1086 energyVdw, energyVdw_ti_1,
1087 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1088 energySlow, energySlow_ti_1, energySlow_ti_2);
1095 calcForceEnergy<doEnergy, doSlow>(
1096 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1097 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1098 vdwCoefTableTex, forceTableTex, energyTableTex,
1101 s_jforceSlow[iwarp][j],
1102 energyVdw, energyElec, energySlow);
1104 calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1105 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1106 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1107 vdwCoefTableTex, forceTableTex, energyTableTex,
1110 s_jforceSlow[iwarp][j],
1111 energyVdw, energyElec, energySlow,
1118 WARP_SYNC(WARP_FULL_MASK);
1121 WARP_SYNC(WARP_FULL_MASK);
1124 storeForces<doSlow, jForceType>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
1125 devForce_x, devForce_y, devForce_z,
1126 devForceSlow_x, devForceSlow_y, devForceSlow_z);
1129 const unsigned int warp_exclude = WARP_BALLOT(WARP_FULL_MASK, nexcluded & 1);
1130 const unsigned int warp_any_exclude = WARP_BALLOT(WARP_FULL_MASK, excl);
1132 int anyexcl = warp_any_exclude ? 1 : 0;
1134 // Mark this jtile as non-empty:
1135 // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1136 // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1137 if (wid == 0 && anyexcl) jtiles[jtile] = anyexcl;
1139 tileExcls[jtile].excl[wid] = excl;
1141 // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1142 // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1143 itileListLen += anyexcl;
1144 // NOTE, this minJatomStart is only stored once for the first tile list entry
1145 // minJatomStart = min(minJatomStart, jatomStart);
1151 WARP_SYNC(WARP_FULL_MASK);
1153 for (int jtile=jtileStart; jtile <= jtileEnd; jtile++) {
1155 unsigned int excl = 0;
1159 // Load j-atom starting index and exclusion mask
1160 jatomStart = tileJatomStart[jtile];
1162 xyzq_j = xyzq[jatomStart + wid];
1163 if (doAlch) p2 = p[jatomStart + wid];
1165 // Check for early bail
1166 // DC - I found this was slower
1168 // float r2bb = distsq(boundingBoxI, xyzq_j);
1169 // if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
1172 excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
1173 vdwtypej = vdwTypes[jatomStart + wid];
1174 s_vdwtypej[iwarp][wid] = vdwtypej;
1176 // Get i-atom global index
1178 s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
1181 // Number of j loops and free atoms
1184 int nloopj = min(jatomSize - jatomStart, WARPSIZE);
1185 nfreej = max(jatomFreeSize - jatomStart, 0);
1186 //if (nfreei == 0 && nfreej == 0) continue;
1187 if (wid >= nloopj) {
1188 xyzq_j.x = LARGE_FLOAT;
1189 xyzq_j.y = LARGE_FLOAT;
1190 xyzq_j.z = LARGE_FLOAT;
1194 s_jatomStart[iwarp] = jatomStart;
1196 WARP_SYNC(WARP_FULL_MASK);
1197 s_xyzq[iwarp][wid] = xyzq_j;
1199 // DH - self requires that zeroShift is also set
1200 // DC - In this case self is always false
1201 const int modval = WARPSIZE-1;
1203 s_jforce[iwarp][wid] = make_zero<jForceType>();
1205 s_jforceSlow[iwarp][wid] = make_zero<jForceType>();
1206 WARP_SYNC(WARP_FULL_MASK);
1210 // NOTE: Pairlist update, we must also include the diagonal since this is used
1212 // Clear the lowest (indicator) bit
1216 for (int t = 0;t < WARPSIZE;t++) {
1217 const int j = (t + wid) & modval;
1219 // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
1220 // However, if (j < WARPSIZE ..) below makes sure that these non-sense
1221 // results are not used
1222 if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1225 xyzq_j = s_xyzq[iwarp][j];
1226 float dx = xyzq_j.x - xyzq_i.x;
1227 float dy = xyzq_j.y - xyzq_i.y;
1228 float dz = xyzq_j.z - xyzq_i.z;
1229 float r2 = dx*dx + dy*dy + dz*dz;
1230 if (r2 < plcutoff2) {
1231 // We have atom pair within the pairlist cutoff => Set indicator bit
1233 if (j < nfreej || wid < nfreei) {
1234 bool excluded = false;
1235 int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
1236 if ( abs(indexdiff) <= iexclMaxdiff) {
1237 indexdiff += iexclIndex;
1238 int indexword = ((unsigned int) indexdiff) >> 5;
1240 if ( indexword < MAX_CONST_EXCLUSIONS ) {
1241 indexword = constExclusions[indexword];
1243 indexword = overflowExclusions[indexword];
1246 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
1248 if (excluded) nexcluded += 2;
1249 if (!excluded) excl |= 0x80000000;
1251 if(!excluded && r2 < cutoff2){
1254 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1255 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1256 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1257 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1259 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1260 energyVdw, energyVdw_s,
1261 energyElec, energySlow, energyElec_s, energySlow_s);
1263 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1264 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1265 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1266 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1268 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1269 energyVdw, energyVdw_ti_1,
1270 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1271 energySlow, energySlow_ti_1, energySlow_ti_2);
1275 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1276 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1277 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1278 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1280 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1281 energyVdw, energyVdw_s,
1282 energyElec, energySlow, energyElec_s, energySlow_s);
1284 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1285 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1286 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1287 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1289 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1290 energyVdw, energyVdw_ti_1,
1291 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1292 energySlow, energySlow_ti_1, energySlow_ti_2);
1295 }//if !excluded && r2 < cutoff2
1297 if (!excluded && r2 < cutoff2) {
1299 calcForceEnergy<doEnergy, doSlow>(
1300 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1301 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1302 vdwCoefTableTex, forceTableTex, energyTableTex,
1305 s_jforceSlow[iwarp][j],
1306 energyVdw, energyElec, energySlow);
1308 calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1309 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1310 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1311 vdwCoefTableTex, forceTableTex, energyTableTex,
1314 s_jforceSlow[iwarp][j],
1315 energyVdw, energyElec, energySlow,
1322 WARP_SYNC(WARP_FULL_MASK);
1325 // Just compute forces
1327 for (int t = 0; t < WARPSIZE; t++) {
1328 const int j = ((t + wid) & (WARPSIZE-1));
1330 part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1333 xyzq_j = s_xyzq[iwarp][j];
1334 float dx = xyzq_j.x - xyzq_i.x;
1335 float dy = xyzq_j.y - xyzq_i.y;
1336 float dz = xyzq_j.z - xyzq_i.z;
1338 float r2 = dx*dx + dy*dy + dz*dz;
1340 if(r2 < cutoff2){ // (r2 < cutoff2)
1343 calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1344 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1345 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1346 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1349 s_jforceSlow[iwarp][j],
1350 energyVdw, energyVdw_s,
1351 energyElec, energySlow, energyElec_s, energySlow_s);
1353 calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1354 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1355 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1356 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1359 s_jforceSlow[iwarp][j],
1360 energyVdw, energyVdw_ti_1,
1361 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1362 energySlow, energySlow_ti_1, energySlow_ti_2);
1366 calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1367 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1368 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1369 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1372 s_jforceSlow[iwarp][j],
1373 energyVdw, energyVdw_s,
1374 energyElec, energySlow, energyElec_s, energySlow_s);
1376 calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1377 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1378 vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1379 vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1382 s_jforceSlow[iwarp][j],
1383 energyVdw, energyVdw_ti_1,
1384 energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1385 energySlow, energySlow_ti_1, energySlow_ti_2);
1392 calcForceEnergy<doEnergy, doSlow>(
1393 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1394 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1395 vdwCoefTableTex, forceTableTex, energyTableTex,
1398 s_jforceSlow[iwarp][j],
1399 energyVdw, energyElec, energySlow);
1401 calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1402 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1403 vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1404 vdwCoefTableTex, forceTableTex, energyTableTex,
1407 s_jforceSlow[iwarp][j],
1408 energyVdw, energyElec, energySlow,
1415 WARP_SYNC(WARP_FULL_MASK);
1420 storeForces<doSlow, jForceType>(s_jatomStart[iwarp] + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
1421 devForce_x, devForce_y, devForce_z,
1422 devForceSlow_x, devForceSlow_y, devForceSlow_z);
1425 const unsigned int warp_exclude = WARP_BALLOT(WARP_FULL_MASK, nexcluded & 1);
1426 const unsigned int warp_any_exclude = WARP_BALLOT(WARP_FULL_MASK, excl);
1428 int anyexcl = warp_any_exclude ? 1 : 0;
1430 // Mark this jtile as non-empty:
1431 // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1432 // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1433 if (wid == 0 && anyexcl) jtiles[jtile] = anyexcl;
1435 tileExcls[jtile].excl[wid] = excl;
1437 // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1438 // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1439 itileListLen += anyexcl;
1440 // NOTE, this minJatomStart is only stored once for the first tile list entry
1441 // minJatomStart = min(minJatomStart, jatomStart);
1443 WARP_SYNC(WARP_FULL_MASK);
1448 storeForces<doSlow, float3>(s_iatomStart[iwarp] + wid, iforce, iforceSlow,
1449 devForce_x, devForce_y, devForce_z,
1450 devForceSlow_x, devForceSlow_y, devForceSlow_z);
1452 // Done with computation
1454 // Save pairlist stuff
1457 // Warp index (0...warpsize-1)
1458 const int wid = threadIdx.x % WARPSIZE;
1461 // minJatomStart is in range [0 ... atomStorageSize-1]
1462 //int atom0 = (minJatomStart)/WARPSIZE;
1464 // int storageOffset = atomStorageSize/WARPSIZE;
1465 // int itileListLen = 0;
1466 // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
1467 // Store 0 if itileListLen == 0
1468 // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
1469 tileListDepth[itileList] = itileListLen;
1470 tileListOrder[itileList] = itileList;
1471 // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
1472 if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
1473 // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1474 if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
1475 // NOTE: always numTileListsGBIS >= numTileLists
1478 typedef cub::WarpReduce<int> WarpReduceInt;
1479 __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1480 const int warpId = threadIdx.x / WARPSIZE;
1481 // Remove indicator bit
1483 volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
1484 if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
1489 // Warp index (0...warpsize-1)
1490 const int wid = threadIdx.x % WARPSIZE;
1492 typedef cub::WarpReduce<float> WarpReduce;
1493 __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1494 const int warpId = threadIdx.x / WARPSIZE;
1495 volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
1496 WARP_SYNC(WARP_FULL_MASK);
1497 volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
1498 WARP_SYNC(WARP_FULL_MASK);
1499 volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
1500 WARP_SYNC(WARP_FULL_MASK);
1502 virialEnergy[itileList].forcex = iforcexSum;
1503 virialEnergy[itileList].forcey = iforceySum;
1504 virialEnergy[itileList].forcez = iforcezSum;
1508 iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
1509 WARP_SYNC(WARP_FULL_MASK);
1510 iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
1511 WARP_SYNC(WARP_FULL_MASK);
1512 iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
1513 WARP_SYNC(WARP_FULL_MASK);
1515 virialEnergy[itileList].forceSlowx = iforcexSum;
1516 virialEnergy[itileList].forceSlowy = iforceySum;
1517 virialEnergy[itileList].forceSlowz = iforcezSum;
1524 // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
1525 // (Why does CUB suck here?)
1527 for (int i=16;i >= 1;i/=2) {
1528 energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1529 energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, 32);
1530 if(doFEP) energyVdw_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_s, i, 32);
1531 if(doFEP) energyElec_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_s, i, 32);
1533 energyVdw_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_1, i, 32);
1534 energyVdw_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_2, i, 32);
1535 energyElec_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_1, i, 32);
1536 energyElec_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_2, i, 32);
1539 energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1540 if(doFEP) energySlow_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_s, i, 32);
1542 energySlow_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_1, i, 32);
1543 energySlow_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_2, i, 32);
1548 if (threadIdx.x % WARPSIZE == 0) {
1549 virialEnergy[itileList].energyVdw = energyVdw;
1550 virialEnergy[itileList].energyElec = energyElec;
1551 if (doFEP) virialEnergy[itileList].energyVdw_s = energyVdw_s;
1552 if (doFEP) virialEnergy[itileList].energyElec_s = energyElec_s;
1554 virialEnergy[itileList].energyVdw_ti_1 = energyVdw_ti_1;
1555 virialEnergy[itileList].energyVdw_ti_2 = energyVdw_ti_2;
1556 virialEnergy[itileList].energyElec_ti_1 = energyElec_ti_1;
1557 virialEnergy[itileList].energyElec_ti_2 = energyElec_ti_2;
1560 virialEnergy[itileList].energySlow = energySlow;
1561 if(doFEP) virialEnergy[itileList].energySlow_s = energySlow_s;
1563 virialEnergy[itileList].energySlow_ti_1 = energySlow_ti_1;
1564 virialEnergy[itileList].energySlow_ti_2 = energySlow_ti_2;
1569 // XXX TODO: Disable streaming and see what happens
1572 // Make sure devForces and devForcesSlow have been written into device memory
1573 WARP_SYNC(WARP_FULL_MASK);
1576 int patchDone[2] = {false, false};
1577 const int wid = threadIdx.x % WARPSIZE;
1579 int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
1580 patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
1581 if (patchInd.x != patchInd.y) {
1582 int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
1583 patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
1587 patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
1588 patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
1591 // Patch 1 is done, write onto host-mapped memory
1592 CudaPatchRecord patch = cudaPatches[patchInd.x];
1593 int start = patch.atomStart;
1594 int end = start + patch.numAtoms;
1595 for (int i=start+wid;i < end;i+=WARPSIZE) {
1596 mapForces[i] = make_float4(devForce_x[i],
1597 devForce_y[i], devForce_z[i], devForce_w[i]);
1599 mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1600 devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1606 CudaPatchRecord patch = cudaPatches[patchInd.y];
1607 int start = patch.atomStart;
1608 int end = start + patch.numAtoms;
1609 for (int i=start+wid;i < end;i+=WARPSIZE) {
1610 mapForces[i] = make_float4(devForce_x[i],
1611 devForce_y[i], devForce_z[i], devForce_w[i]);
1613 mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1614 devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1619 if (patchDone[0] || patchDone[1]) {
1620 // Make sure mapForces and mapForcesSlow are up-to-date
1621 WARP_SYNC(WARP_FULL_MASK);
1622 __threadfence_system();
1623 // Add patch into "patchReadyQueue"
1626 int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1627 // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1628 mapPatchReadyQueue[ind] = patchInd.x;
1631 int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1632 // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1633 mapPatchReadyQueue[ind] = patchInd.y;
1639 if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
1640 int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
1641 outputOrder[index] = itileList;
1643 } // if (itileList < numTileLists)
1647 // Finish up - reduce virials from nonbonded kernel
1649 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
1650 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
1651 const int atomStorageSize,
1652 const float4* __restrict__ xyzq,
1653 const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
1654 VirialEnergy* __restrict__ virialEnergy) {
1656 for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
1658 int i = ibase + threadIdx.x;
1660 // Set to zero to avoid nan*0
1665 float4 force, forceSlow;
1672 if (i < atomStorageSize) {
1674 force = devForces[i];
1675 if (doSlow) forceSlow = devForcesSlow[i];
1677 // Reduce across the entire thread block
1678 float vxxt = force.x*pos.x;
1679 float vxyt = force.x*pos.y;
1680 float vxzt = force.x*pos.z;
1681 float vyxt = force.y*pos.x;
1682 float vyyt = force.y*pos.y;
1683 float vyzt = force.y*pos.z;
1684 float vzxt = force.z*pos.x;
1685 float vzyt = force.z*pos.y;
1686 float vzzt = force.z*pos.z;
1687 // atomicAdd(&virialEnergy->virial[0], (double)vxx);
1688 // atomicAdd(&virialEnergy->virial[1], (double)vxy);
1689 // atomicAdd(&virialEnergy->virial[2], (double)vxz);
1690 // atomicAdd(&virialEnergy->virial[3], (double)vyx);
1691 // atomicAdd(&virialEnergy->virial[4], (double)vyy);
1692 // atomicAdd(&virialEnergy->virial[5], (double)vyz);
1693 // atomicAdd(&virialEnergy->virial[6], (double)vzx);
1694 // atomicAdd(&virialEnergy->virial[7], (double)vzy);
1695 // atomicAdd(&virialEnergy->virial[8], (double)vzz);
1697 typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1698 __shared__ typename BlockReduce::TempStorage tempStorage;
1699 volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1700 volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1701 volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1702 volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1703 volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1704 volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1705 volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1706 volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1707 volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1708 if (threadIdx.x == 0) {
1709 atomicAdd(&virialEnergy->virial[0], (double)vxx);
1710 atomicAdd(&virialEnergy->virial[1], (double)vxy);
1711 atomicAdd(&virialEnergy->virial[2], (double)vxz);
1712 atomicAdd(&virialEnergy->virial[3], (double)vyx);
1713 atomicAdd(&virialEnergy->virial[4], (double)vyy);
1714 atomicAdd(&virialEnergy->virial[5], (double)vyz);
1715 atomicAdd(&virialEnergy->virial[6], (double)vzx);
1716 atomicAdd(&virialEnergy->virial[7], (double)vzy);
1717 atomicAdd(&virialEnergy->virial[8], (double)vzz);
1721 // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
1722 float vxxSlowt = forceSlow.x*pos.x;
1723 float vxySlowt = forceSlow.x*pos.y;
1724 float vxzSlowt = forceSlow.x*pos.z;
1725 float vyxSlowt = forceSlow.y*pos.x;
1726 float vyySlowt = forceSlow.y*pos.y;
1727 float vyzSlowt = forceSlow.y*pos.z;
1728 float vzxSlowt = forceSlow.z*pos.x;
1729 float vzySlowt = forceSlow.z*pos.y;
1730 float vzzSlowt = forceSlow.z*pos.z;
1731 // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
1732 // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
1733 // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
1734 // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
1735 // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
1736 // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
1737 // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
1738 // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
1739 // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
1740 volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
1741 volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
1742 volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
1743 volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
1744 volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
1745 volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
1746 volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
1747 volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
1748 volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
1749 if (threadIdx.x == 0) {
1750 atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
1751 atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
1752 atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
1753 atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
1754 atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
1755 atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
1756 atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
1757 atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
1758 atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
1765 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
1766 __global__ void reduceVirialEnergyKernel(
1767 const bool doEnergy, const bool doVirial, const bool doSlow,
1768 const int numTileLists,
1769 const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1770 VirialEnergy* __restrict__ virialEnergy) {
1772 for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1774 int itileList = ibase + threadIdx.x;
1775 TileListVirialEnergy ve;
1776 if (itileList < numTileLists) {
1777 ve = tileListVirialEnergy[itileList];
1779 // Set to zero to avoid nan*0
1787 ve.forceSlowx = 0.0f;
1788 ve.forceSlowy = 0.0f;
1789 ve.forceSlowz = 0.0f;
1793 ve.energyVdw_s = 0.0;
1794 ve.energyElec = 0.0;
1795 ve.energySlow = 0.0;
1796 ve.energyElec_s = 0.0;
1797 ve.energySlow_s = 0.0;
1800 ve.energyVdw_ti_1 = 0.0;
1801 ve.energyVdw_ti_2 = 0.0;
1802 ve.energyElec_ti_1 = 0.0;
1803 ve.energyElec_ti_2 = 0.0;
1804 ve.energySlow_ti_1 = 0.0;
1805 ve.energySlow_ti_2 = 0.0;
1806 // ve.energyGBIS = 0.0;
1811 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1812 __shared__ typename BlockReduce::TempStorage tempStorage;
1813 float vxxt = ve.forcex*ve.shx;
1814 float vxyt = ve.forcex*ve.shy;
1815 float vxzt = ve.forcex*ve.shz;
1816 float vyxt = ve.forcey*ve.shx;
1817 float vyyt = ve.forcey*ve.shy;
1818 float vyzt = ve.forcey*ve.shz;
1819 float vzxt = ve.forcez*ve.shx;
1820 float vzyt = ve.forcez*ve.shy;
1821 float vzzt = ve.forcez*ve.shz;
1822 volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1823 volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1824 volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1825 volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1826 volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1827 volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1828 volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1829 volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1830 volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1831 if (threadIdx.x == 0) {
1832 atomicAdd(&virialEnergy->virial[0], (double)vxx);
1833 atomicAdd(&virialEnergy->virial[1], (double)vxy);
1834 atomicAdd(&virialEnergy->virial[2], (double)vxz);
1835 atomicAdd(&virialEnergy->virial[3], (double)vyx);
1836 atomicAdd(&virialEnergy->virial[4], (double)vyy);
1837 atomicAdd(&virialEnergy->virial[5], (double)vyz);
1838 atomicAdd(&virialEnergy->virial[6], (double)vzx);
1839 atomicAdd(&virialEnergy->virial[7], (double)vzy);
1840 atomicAdd(&virialEnergy->virial[8], (double)vzz);
1844 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1845 __shared__ typename BlockReduce::TempStorage tempStorage;
1846 float vxxt = ve.forceSlowx*ve.shx;
1847 float vxyt = ve.forceSlowx*ve.shy;
1848 float vxzt = ve.forceSlowx*ve.shz;
1849 float vyxt = ve.forceSlowy*ve.shx;
1850 float vyyt = ve.forceSlowy*ve.shy;
1851 float vyzt = ve.forceSlowy*ve.shz;
1852 float vzxt = ve.forceSlowz*ve.shx;
1853 float vzyt = ve.forceSlowz*ve.shy;
1854 float vzzt = ve.forceSlowz*ve.shz;
1855 volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1856 volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1857 volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1858 volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1859 volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1860 volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1861 volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1862 volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1863 volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1864 if (threadIdx.x == 0) {
1865 atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
1866 atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
1867 atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
1868 atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
1869 atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
1870 atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
1871 atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
1872 atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
1873 atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
1879 typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1880 /* Maybe we should guard the TI and FEP energies, since those are not to be calculated on regular MDs */
1881 __shared__ typename BlockReduce::TempStorage tempStorage;
1882 volatile double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1883 volatile double energyVdw_s = BlockReduce(tempStorage).Sum(ve.energyVdw_s); BLOCK_SYNC;
1884 volatile double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1885 volatile double energyElec_s = BlockReduce(tempStorage).Sum(ve.energyElec_s); BLOCK_SYNC;
1886 volatile double energyVdw_ti_1 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_1); BLOCK_SYNC;
1887 volatile double energyVdw_ti_2 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_2); BLOCK_SYNC;
1888 volatile double energyElec_ti_1= BlockReduce(tempStorage).Sum(ve.energyElec_ti_1); BLOCK_SYNC;
1889 volatile double energyElec_ti_2= BlockReduce(tempStorage).Sum(ve.energyElec_ti_2); BLOCK_SYNC;
1890 if (threadIdx.x == 0){
1891 atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
1892 atomicAdd(&virialEnergy->energyVdw_s, (double)energyVdw_s);
1893 atomicAdd(&virialEnergy->energyElec, (double)energyElec);
1894 atomicAdd(&virialEnergy->energyElec_s, (double)energyElec_s);
1895 atomicAdd(&virialEnergy->energyVdw_ti_1, (double)energyVdw_ti_1);
1896 atomicAdd(&virialEnergy->energyVdw_ti_2, (double)energyVdw_ti_2);
1897 atomicAdd(&virialEnergy->energyElec_ti_1, (double)energyElec_ti_1);
1898 atomicAdd(&virialEnergy->energyElec_ti_2, (double)energyElec_ti_2);
1901 volatile double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1902 volatile double energySlow_s = BlockReduce(tempStorage).Sum(ve.energySlow_s); BLOCK_SYNC;
1903 volatile double energySlow_ti_1 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_1); BLOCK_SYNC;
1904 volatile double energySlow_ti_2 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_2); BLOCK_SYNC;
1905 if (threadIdx.x == 0) {
1906 atomicAdd(&virialEnergy->energySlow, (double)energySlow);
1907 atomicAdd(&virialEnergy->energySlow_s, (double)energySlow_s);
1908 atomicAdd(&virialEnergy->energySlow_ti_1,(double)energySlow_ti_1);
1909 atomicAdd(&virialEnergy->energySlow_ti_2,(double)energySlow_ti_2);
1913 // double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
1914 // if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1922 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
1923 __global__ void reduceGBISEnergyKernel(const int numTileLists,
1924 const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1925 VirialEnergy* __restrict__ virialEnergy) {
1927 for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1929 int itileList = ibase + threadIdx.x;
1930 double energyGBISt = 0.0;
1931 if (itileList < numTileLists) {
1932 energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
1935 typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1936 __shared__ typename BlockReduce::TempStorage tempStorage;
1937 volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
1938 if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1943 // ##############################################################################################
1944 // ##############################################################################################
1945 // ##############################################################################################
1947 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
1948 bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
1950 cudaCheck(cudaSetDevice(deviceID));
1952 d_exclusionsByAtom = NULL;
1954 overflowExclusions = NULL;
1955 overflowExclusionsSize = 0;
1957 exclIndexMaxDiff = NULL;
1958 exclIndexMaxDiffSize = 0;
1966 patchNumCount = NULL;
1967 patchNumCountSize = 0;
1969 patchReadyQueue = NULL;
1970 patchReadyQueueSize = 0;
1972 force_x = force_y = force_z = force_w = NULL;
1974 forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1978 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
1981 size_t forceSizeCurrent;
1983 // reallocate_device will update forceSizeCurrent, so we need to reset it to the current
1984 // value for each array
1985 forceSizeCurrent = forceSize;
1986 reallocate_device<float>(&force_x, &forceSizeCurrent, atomStorageSize, 1.4f);
1987 forceSizeCurrent = forceSize;
1988 reallocate_device<float>(&force_y, &forceSizeCurrent, atomStorageSize, 1.4f);
1989 forceSizeCurrent = forceSize;
1990 reallocate_device<float>(&force_z, &forceSizeCurrent, atomStorageSize, 1.4f);
1991 forceSizeCurrent = forceSize;
1992 reallocate_device<float>(&force_w, &forceSizeCurrent, atomStorageSize, 1.4f);
1995 size_t forceSlowSizeCurrent;
1996 forceSlowSizeCurrent = forceSlowSize;
1997 reallocate_device<float>(&forceSlow_x, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
1998 forceSlowSizeCurrent = forceSlowSize;
1999 reallocate_device<float>(&forceSlow_y, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2000 forceSlowSizeCurrent = forceSlowSize;
2001 reallocate_device<float>(&forceSlow_z, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2002 forceSlowSizeCurrent = forceSlowSize;
2003 reallocate_device<float>(&forceSlow_w, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2005 reallocate_device<float>(&force_x, &forceSize, atomStorageSize*8, 1.4f);
2006 force_y = force_x + atomStorageSize;
2007 force_z = force_y + atomStorageSize;
2008 force_w = force_z + atomStorageSize;
2009 forceSlow_x = force_w + atomStorageSize;
2010 forceSlow_y = forceSlow_x + atomStorageSize;
2011 forceSlow_z = forceSlow_y + atomStorageSize;
2012 forceSlow_w = forceSlow_z + atomStorageSize;
2016 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
2017 cudaCheck(cudaSetDevice(deviceID));
2018 if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
2019 if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
2020 if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
2021 if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
2022 if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
2023 if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
2025 if (force_x != NULL) deallocate_device<float>(&force_x);
2026 if (force_y != NULL) deallocate_device<float>(&force_y);
2027 if (force_z != NULL) deallocate_device<float>(&force_z);
2028 if (force_w != NULL) deallocate_device<float>(&force_w);
2029 if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
2030 if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
2031 if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
2032 if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
2034 if (force_x != NULL) deallocate_device<float>(&force_x);
2038 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
2039 const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
2041 reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
2042 reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
2043 reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
2045 copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
2046 copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
2047 copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
2050 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
2052 NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
2054 return patchReadyQueue;
2057 template <int doSlow>
2058 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
2059 float *fx, float *fy, float *fz, float *fw,
2060 float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
2063 int tid = blockIdx.x*blockDim.x + threadIdx.x;
2065 f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
2066 fx[tid] = 0.f; fy[tid] = 0.f; fz[tid] = 0.f; fw[tid] = 0.f;
2068 fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
2069 fSlowx[tid] = 0.f; fSlowy[tid] = 0.f; fSlowz[tid] = 0.f; fSloww[tid] = 0.f;
2076 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
2077 const int atomStorageSize, const bool atomsChanged, const bool doMinimize,
2078 const bool doPairlist, const bool doEnergy, const bool doVirial,
2079 const bool doSlow, const bool doAlch, const bool doAlchVdwForceSwitching,
2080 const bool doFEP, const bool doTI, const bool doTable,
2081 const float3 lata, const float3 latb, const float3 latc,
2082 const float4* h_xyzq, const float cutoff2,
2083 const CudaNBConstants nbConstants,
2084 float4* d_forces, float4* d_forcesSlow,
2085 float4* h_forces, float4* h_forcesSlow, AlchData *srcFlags,
2086 bool lambdaWindowUpdated, char *part,
2087 bool CUDASOAintegrator, bool useDeviceMigration,
2088 cudaStream_t stream) {
2090 #ifdef NODEGROUP_FORCE_REGISTER
2091 if (!atomsChanged && !CUDASOAintegrator) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2093 if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2097 // Copy partition to device. This is not necessary if both CUDASOAintegrator and useDeviceMigration
2099 if (doPairlist && (!CUDASOAintegrator || !useDeviceMigration)) {
2100 copy_HtoD< char>(part, tlKernel.get_part(), atomStorageSize, stream);
2102 //Copies flags to constant memory
2103 if(lambdaWindowUpdated) cudaCheck(cudaMemcpyToSymbol(alchflags, srcFlags, sizeof(AlchData)));
2106 // XXX TODO: Get rid of the clears
2108 // clear_device_array<float4>(d_forces, atomStorageSize, stream);
2109 // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
2111 // This needs to go.
2112 if (doStreaming) tlKernel.clearTileListStat(stream);
2113 if(atomsChanged || doMinimize){
2114 clear_device_array<float>(force_x, atomStorageSize*4, stream);
2115 if(doSlow) clear_device_array<float>(forceSlow_x, atomStorageSize*4, stream);
2119 // --- streaming ----
2120 float4* m_forces = NULL;
2121 float4* m_forcesSlow = NULL;
2122 int* m_patchReadyQueue = NULL;
2124 unsigned int* patchNumCountPtr = NULL;
2126 numPatches = tlKernel.getNumPatches();
2127 if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
2128 // If re-allocated, clear array
2129 clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
2131 patchNumCountPtr = patchNumCount;
2132 bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
2134 // If re-allocated, re-set to "-1"
2135 for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
2137 cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
2138 cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
2139 cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
2141 // -----------------
2143 if (doVirial || doEnergy) {
2144 tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
2149 int* outputOrderPtr = tlKernel.getOutputOrder();
2151 int nwarp = NONBONDKERNEL_NUM_WARP;
2152 int nthread = WARPSIZE*nwarp;
2160 int options = doEnergy + (doVirial << 1) + (doSlow << 2) +
2161 (doPairlist << 3) + (doAlch << 4) + (doFEP << 5) + (doTI << 6) + (doStreaming << 7) + (doTable << 8) + (doAlchVdwForceSwitching << 9);
2164 while (start < tlKernel.getNumTileLists()) {
2166 int nleft = tlKernel.getNumTileLists() - start;
2167 int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
2169 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING) \
2170 nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING> \
2171 <<< nblock, nthread, shMemSize, stream >>> \
2172 (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2173 cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
2174 vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2175 cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
2176 atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2177 tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2178 tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2179 force_x, force_y, force_z, force_w, \
2180 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2181 numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2182 outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part()); called=true
2184 bool called = false;
2187 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 1, 0);
2188 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 1, 0);
2189 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 1, 0);
2190 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 1, 0);
2191 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 1, 0);
2192 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 1, 0);
2193 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 1, 0);
2194 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 1, 0);
2197 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 1, 0);
2198 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 1, 0);
2199 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 1, 0);
2200 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 1, 0);
2201 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 1, 0);
2202 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 1, 0);
2203 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 1, 0);
2204 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 1, 0);
2207 if (doAlchVdwForceSwitching) {
2208 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 1);
2209 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 1);
2210 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 1);
2211 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 1);
2212 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 1);
2213 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 1);
2214 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 1);
2215 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 1);
2217 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 1);
2218 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 1);
2219 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 1);
2220 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 1);
2221 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 1);
2222 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 1);
2223 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 1);
2224 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 1);
2226 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 0);
2227 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 0);
2228 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 0);
2229 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 0);
2230 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 0);
2231 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 0);
2232 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 0);
2233 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 0);
2235 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 0);
2236 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 0);
2237 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 0);
2238 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 0);
2239 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 0);
2240 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 0);
2241 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 0);
2242 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 0);
2243 } // doAlchVdwForceSwitching
2246 if (doAlchVdwForceSwitching) {
2247 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 1);
2248 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 1);
2249 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 1);
2250 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 1);
2251 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 1);
2252 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 1);
2253 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 1);
2254 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 1);
2256 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 1);
2257 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 1);
2258 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 1);
2259 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 1);
2260 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 1);
2261 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 1);
2262 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 1);
2263 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 1);
2265 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 0);
2266 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 0);
2267 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 0);
2268 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 0);
2269 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 0);
2270 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 0);
2271 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 0);
2272 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 0);
2274 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 0);
2275 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 0);
2276 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 0);
2277 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 0);
2278 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 0);
2279 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 0);
2280 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 0);
2281 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 0);
2282 } // doAlchVdwForceSwitching
2289 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 0, 0);
2290 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 0, 0);
2291 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 0, 0);
2292 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 0, 0);
2293 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 0, 0);
2294 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 0, 0);
2295 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 0, 0);
2296 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 0, 0);
2299 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 0, 0);
2300 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 0, 0);
2301 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 0, 0);
2302 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 0, 0);
2303 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 0, 0);
2304 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 0, 0);
2305 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 0, 0);
2306 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 0, 0);
2309 if (doAlchVdwForceSwitching) {
2310 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 1);
2311 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 1);
2312 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 1);
2313 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 1);
2314 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 1);
2315 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 1);
2316 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 1);
2317 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 1);
2319 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 1);
2320 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 1);
2321 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 1);
2322 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 1);
2323 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 1);
2324 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 1);
2325 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 1);
2326 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 1);
2328 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 0);
2329 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 0);
2330 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 0);
2331 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 0);
2332 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 0);
2333 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 0);
2334 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 0);
2335 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 0);
2337 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 0);
2338 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 0);
2339 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 0);
2340 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 0);
2341 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 0);
2342 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 0);
2343 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 0);
2344 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 0);
2348 if (doAlchVdwForceSwitching) {
2349 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 1);
2350 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 1);
2351 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 1);
2352 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 1);
2353 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 1);
2354 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 1);
2355 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 1);
2356 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 1);
2358 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 1);
2359 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 1);
2360 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 1);
2361 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 1);
2362 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 1);
2363 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 1);
2364 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 1);
2365 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 1);
2367 if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 0);
2368 if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 0);
2369 if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 0);
2370 if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 0);
2371 if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 0);
2372 if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 0);
2373 if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 0);
2374 if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 0);
2376 if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 0);
2377 if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 0);
2378 if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 0);
2379 if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 0);
2380 if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 0);
2381 if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 0);
2382 if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 0);
2383 if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 0);
2390 NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
2396 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING) \
2397 nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING> \
2398 <<< nblock, nthread, shMemSize, stream >>> \
2399 (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2400 cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
2401 vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2402 cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
2403 atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2404 tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2405 tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2406 force_x, force_y, force_z, force_w, \
2407 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2408 numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2409 outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part())
2417 case 0: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2418 case 1: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2419 case 2: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2420 case 3: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2421 case 4: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2422 case 5: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2423 case 6: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2424 case 7: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2425 case 8: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2426 case 9: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2427 case 10: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2428 case 11: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2429 case 12: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2430 case 13: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2431 case 14: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2432 case 15: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2435 case 16: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2436 case 17: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2437 case 18: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2438 case 19: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2439 case 20: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2440 case 21: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2441 case 22: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2442 case 23: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2443 case 24: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2444 case 25: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2445 case 26: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2446 case 27: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2447 case 28: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2448 case 29: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2449 case 30: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2450 case 31: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2452 case 32: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2453 case 33: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2454 case 34: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2455 case 35: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2456 case 36: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2457 case 37: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2458 case 38: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2459 case 39: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2460 case 40: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2461 case 41: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2462 case 42: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2463 case 43: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2464 case 44: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2465 case 45: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2466 case 46: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2467 case 47: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2469 case 48: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2470 case 49: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2471 case 50: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2472 case 51: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2473 case 52: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2474 case 53: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2475 case 54: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2476 case 55: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2477 case 56: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2478 case 57: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2479 case 58: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2480 case 59: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2481 case 60: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2482 case 61: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2483 case 62: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2484 case 63: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2486 case 64: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2487 case 65: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2488 case 66: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2489 case 67: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2490 case 68: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2491 case 69: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2492 case 70: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2493 case 71: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2494 case 72: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2495 case 73: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2496 case 74: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2497 case 75: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2498 case 76: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2499 case 77: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2500 case 78: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2501 case 79: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2503 case 80: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2504 case 81: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2505 case 82: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2506 case 83: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2507 case 84: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2508 case 85: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2509 case 86: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2510 case 87: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2511 case 88: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2512 case 89: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2513 case 90: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2514 case 91: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2515 case 92: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2516 case 93: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2517 case 94: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2518 case 95: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2520 case 96: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2521 case 97: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2522 case 98: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2523 case 99: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2524 case 100: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2525 case 101: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2526 case 102: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2527 case 103: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2528 case 104: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2529 case 105: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2530 case 106: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2531 case 107: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2532 case 108: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2533 case 109: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2534 case 110: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2535 case 111: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2537 case 112: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2538 case 113: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2539 case 114: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2540 case 115: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2541 case 116: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2542 case 117: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2543 case 118: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2544 case 119: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2545 case 120: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2546 case 121: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2547 case 122: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2548 case 123: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2549 case 124: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2550 case 125: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2551 case 126: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2552 case 127: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2555 case 128: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2556 case 129: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2557 case 130: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2558 case 131: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2559 case 132: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2560 case 133: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2561 case 134: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2562 case 135: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2563 case 136: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2564 case 137: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2565 case 138: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2566 case 139: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2567 case 140: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2568 case 141: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2569 case 142: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2570 case 143: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2573 case 144: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2574 case 145: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2575 case 146: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2576 case 147: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2577 case 148: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2578 case 149: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2579 case 150: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2580 case 151: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2581 case 152: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2582 case 153: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2583 case 154: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2584 case 155: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2585 case 156: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2586 case 157: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2587 case 158: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2588 case 159: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2590 case 160: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2591 case 161: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2592 case 162: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2593 case 163: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2594 case 164: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2595 case 165: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2596 case 166: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2597 case 167: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2598 case 168: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2599 case 169: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2600 case 170: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2601 case 171: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2602 case 172: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2603 case 173: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2604 case 174: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2605 case 175: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2607 case 176: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2608 case 177: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2609 case 178: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2610 case 179: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2611 case 180: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2612 case 181: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2613 case 182: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2614 case 183: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2615 case 184: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2616 case 185: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2617 case 186: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2618 case 187: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2619 case 188: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2620 case 189: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2621 case 190: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2622 case 191: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2624 case 192: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2625 case 193: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2626 case 194: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2627 case 195: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2628 case 196: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2629 case 197: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2630 case 198: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2631 case 199: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2632 case 200: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2633 case 201: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2634 case 202: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2635 case 203: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2636 case 204: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2637 case 205: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2638 case 206: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2639 case 207: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2641 case 208: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2642 case 209: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2643 case 210: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2644 case 211: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2645 case 212: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2646 case 213: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2647 case 214: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2648 case 215: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2649 case 216: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2650 case 217: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2651 case 218: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2652 case 219: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2653 case 220: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2654 case 221: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2655 case 222: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2656 case 223: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2658 case 224: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2659 case 225: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2660 case 226: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2661 case 227: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2662 case 228: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2663 case 229: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2664 case 230: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2665 case 231: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2666 case 232: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2667 case 233: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2668 case 234: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2669 case 235: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2670 case 236: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2671 case 237: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2672 case 238: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2673 case 239: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2675 case 240: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2676 case 241: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2677 case 242: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2678 case 243: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2679 case 244: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2680 case 245: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2681 case 246: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2682 case 247: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2683 case 248: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2684 case 249: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2685 case 250: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2686 case 251: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2687 case 252: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2688 case 253: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2689 case 254: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2690 case 255: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2693 case 256: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2694 case 257: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2695 case 258: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2696 case 259: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2697 case 260: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2698 case 261: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2699 case 262: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2700 case 263: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2701 case 264: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2702 case 265: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2703 case 266: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2704 case 267: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2705 case 268: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2706 case 269: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2707 case 270: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2708 case 271: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2711 case 272: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2712 case 273: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2713 case 274: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2714 case 275: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2715 case 276: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2716 case 277: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2717 case 278: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2718 case 279: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2719 case 280: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2720 case 281: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2721 case 282: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2722 case 283: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2723 case 284: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2724 case 285: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2725 case 286: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2726 case 287: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2728 case 288: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2729 case 289: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2730 case 290: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2731 case 291: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2732 case 292: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2733 case 293: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2734 case 294: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2735 case 295: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2736 case 296: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2737 case 297: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2738 case 298: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2739 case 299: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2740 case 300: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2741 case 301: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2742 case 302: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2743 case 303: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2746 case 304: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2747 case 305: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2748 case 306: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2749 case 307: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2750 case 308: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2751 case 309: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2752 case 310: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2753 case 311: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2754 case 312: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2755 case 313: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2756 case 314: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2757 case 315: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2758 case 316: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2759 case 317: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2760 case 318: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2761 case 319: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2764 case 320: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2765 case 321: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2766 case 322: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2767 case 323: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2768 case 324: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2769 case 325: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2770 case 326: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2771 case 327: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2772 case 328: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2773 case 329: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2774 case 330: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2775 case 331: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2776 case 332: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2777 case 333: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2778 case 334: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2779 case 335: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2782 case 336: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2783 case 337: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2784 case 338: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2785 case 339: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2786 case 340: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2787 case 341: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2788 case 342: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2789 case 343: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2790 case 344: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2791 case 345: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2792 case 346: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2793 case 347: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2794 case 348: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2795 case 349: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2796 case 350: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2797 case 351: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2800 case 352: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2801 case 353: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2802 case 354: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2803 case 355: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2804 case 356: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2805 case 357: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2806 case 358: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2807 case 359: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2808 case 360: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2809 case 361: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2810 case 362: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2811 case 363: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2812 case 364: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
2813 case 365: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
2814 case 366: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
2815 case 367: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
2817 case 368: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
2818 case 369: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
2819 case 370: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
2820 case 371: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
2821 case 372: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
2822 case 373: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
2823 case 374: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
2824 case 375: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
2825 case 376: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
2826 case 377: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
2827 case 378: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
2828 case 379: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
2829 case 380: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
2830 case 381: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
2831 case 382: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
2832 case 383: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
2835 case 384: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
2836 case 385: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
2837 case 386: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
2838 case 387: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
2839 case 388: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
2840 case 389: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
2841 case 390: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
2842 case 391: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
2843 case 392: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
2844 case 393: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
2845 case 394: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
2846 case 395: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
2847 case 396: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
2848 case 397: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
2849 case 398: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
2850 case 399: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
2853 case 400: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
2854 case 401: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
2855 case 402: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
2856 case 403: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
2857 case 404: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
2858 case 405: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
2859 case 406: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
2860 case 407: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
2861 case 408: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
2862 case 409: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
2863 case 410: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
2864 case 411: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
2865 case 412: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
2866 case 413: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
2867 case 414: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
2868 case 415: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
2870 case 416: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
2871 case 417: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
2872 case 418: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
2873 case 419: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
2874 case 420: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
2875 case 421: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
2876 case 422: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
2877 case 423: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
2878 case 424: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
2879 case 425: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
2880 case 426: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
2881 case 427: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
2882 case 428: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
2883 case 429: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
2884 case 430: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
2885 case 431: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
2888 case 432: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
2889 case 433: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
2890 case 434: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
2891 case 435: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
2892 case 436: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
2893 case 437: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
2894 case 438: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
2895 case 439: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
2896 case 440: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
2897 case 441: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
2898 case 442: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
2899 case 443: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
2900 case 444: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
2901 case 445: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
2902 case 446: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
2903 case 447: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
2906 case 448: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
2907 case 449: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
2908 case 450: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
2909 case 451: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
2910 case 452: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
2911 case 453: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
2912 case 454: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
2913 case 455: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
2914 case 456: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
2915 case 457: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
2916 case 458: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
2917 case 459: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
2918 case 460: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
2919 case 461: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
2920 case 462: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
2921 case 463: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
2924 case 464: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
2925 case 465: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
2926 case 466: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
2927 case 467: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
2928 case 468: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
2929 case 469: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
2930 case 470: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
2931 case 471: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
2932 case 472: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
2933 case 473: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
2934 case 474: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
2935 case 475: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
2936 case 476: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
2937 case 477: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
2938 case 478: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
2939 case 479: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
2942 case 480: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
2943 case 481: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
2944 case 482: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
2945 case 483: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
2946 case 484: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
2947 case 485: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
2948 case 486: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
2949 case 487: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
2950 case 488: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
2951 case 489: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
2952 case 490: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
2953 case 491: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
2954 case 492: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
2955 case 493: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
2956 case 494: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
2957 case 495: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
2959 case 496: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
2960 case 497: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
2961 case 498: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
2962 case 499: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
2963 case 500: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
2964 case 501: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
2965 case 502: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
2966 case 503: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
2967 case 504: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
2968 case 505: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
2969 case 506: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
2970 case 507: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
2971 case 508: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
2972 case 509: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
2973 case 510: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
2974 case 511: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
2977 * Haochuan: the calls starting from 512 to 1023 were generated by the following python script
2978 * #!/usr/bin/env python3
2979 * def gen_call(option: int):
2980 * doEnergy = option & 1
2981 * doVirial = (option >> 1) & 1
2982 * doSlow = (option >> 2) & 1
2983 * doPairlist = (option >> 3) & 1
2984 * doAlch = (option >> 4) & 1
2985 * doFEP = (option >> 5) & 1
2986 * doTI = (option >> 6) & 1
2987 * doStreaming = (option >> 7) & 1
2988 * doTable = (option >> 8) & 1
2989 * doAlchVdwForceSwitching = (option >> 9) & 1
2990 * incompatible = False
2991 * incompatible = incompatible | (doFEP and doTI)
2992 * incompatible = incompatible | (doAlch and ((not doFEP) and (not doTI)))
2993 * incompatible = incompatible | ((not doAlch) and (doFEP or doTI or doAlchVdwForceSwitching))
2994 * incompatible = incompatible | ((not doTable) and (doAlch or doTI or doFEP or doAlchVdwForceSwitching))
2997 * print(f' // case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
2999 * print(f' case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3002 * for i in range(512, 1024):
3006 // case 512: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3007 // case 513: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3008 // case 514: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3009 // case 515: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3010 // case 516: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3011 // case 517: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3012 // case 518: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3013 // case 519: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3014 // case 520: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3015 // case 521: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3016 // case 522: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3017 // case 523: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3018 // case 524: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3019 // case 525: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3020 // case 526: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3021 // case 527: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3022 // case 528: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3023 // case 529: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3024 // case 530: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3025 // case 531: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3026 // case 532: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3027 // case 533: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3028 // case 534: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3029 // case 535: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3030 // case 536: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3031 // case 537: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3032 // case 538: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3033 // case 539: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3034 // case 540: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3035 // case 541: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3036 // case 542: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3037 // case 543: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3038 // case 544: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3039 // case 545: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3040 // case 546: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3041 // case 547: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3042 // case 548: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3043 // case 549: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3044 // case 550: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3045 // case 551: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3046 // case 552: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3047 // case 553: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3048 // case 554: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3049 // case 555: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3050 // case 556: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3051 // case 557: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3052 // case 558: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3053 // case 559: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3054 // case 560: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3055 // case 561: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3056 // case 562: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3057 // case 563: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3058 // case 564: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3059 // case 565: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3060 // case 566: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3061 // case 567: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3062 // case 568: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3063 // case 569: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3064 // case 570: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3065 // case 571: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3066 // case 572: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3067 // case 573: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3068 // case 574: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3069 // case 575: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3070 // case 576: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3071 // case 577: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3072 // case 578: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3073 // case 579: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3074 // case 580: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3075 // case 581: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3076 // case 582: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3077 // case 583: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3078 // case 584: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3079 // case 585: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3080 // case 586: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3081 // case 587: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3082 // case 588: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3083 // case 589: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3084 // case 590: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3085 // case 591: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3086 // case 592: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3087 // case 593: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3088 // case 594: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3089 // case 595: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3090 // case 596: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3091 // case 597: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3092 // case 598: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3093 // case 599: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3094 // case 600: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3095 // case 601: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3096 // case 602: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3097 // case 603: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3098 // case 604: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3099 // case 605: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3100 // case 606: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3101 // case 607: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3102 // case 608: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3103 // case 609: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3104 // case 610: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3105 // case 611: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3106 // case 612: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3107 // case 613: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3108 // case 614: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3109 // case 615: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3110 // case 616: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3111 // case 617: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3112 // case 618: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3113 // case 619: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3114 // case 620: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3115 // case 621: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3116 // case 622: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3117 // case 623: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3118 // case 624: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3119 // case 625: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3120 // case 626: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3121 // case 627: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3122 // case 628: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3123 // case 629: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3124 // case 630: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3125 // case 631: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3126 // case 632: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3127 // case 633: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3128 // case 634: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3129 // case 635: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3130 // case 636: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3131 // case 637: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3132 // case 638: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3133 // case 639: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3134 // case 640: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3135 // case 641: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3136 // case 642: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3137 // case 643: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3138 // case 644: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3139 // case 645: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3140 // case 646: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3141 // case 647: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3142 // case 648: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3143 // case 649: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3144 // case 650: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3145 // case 651: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3146 // case 652: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3147 // case 653: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3148 // case 654: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3149 // case 655: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3150 // case 656: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3151 // case 657: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3152 // case 658: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3153 // case 659: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3154 // case 660: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3155 // case 661: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3156 // case 662: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3157 // case 663: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3158 // case 664: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3159 // case 665: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3160 // case 666: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3161 // case 667: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3162 // case 668: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3163 // case 669: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3164 // case 670: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3165 // case 671: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3166 // case 672: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3167 // case 673: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3168 // case 674: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3169 // case 675: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3170 // case 676: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3171 // case 677: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3172 // case 678: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3173 // case 679: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3174 // case 680: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3175 // case 681: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3176 // case 682: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3177 // case 683: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3178 // case 684: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3179 // case 685: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3180 // case 686: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3181 // case 687: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3182 // case 688: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3183 // case 689: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3184 // case 690: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3185 // case 691: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3186 // case 692: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3187 // case 693: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3188 // case 694: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3189 // case 695: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3190 // case 696: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3191 // case 697: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3192 // case 698: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3193 // case 699: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3194 // case 700: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3195 // case 701: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3196 // case 702: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3197 // case 703: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3198 // case 704: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3199 // case 705: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3200 // case 706: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3201 // case 707: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3202 // case 708: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3203 // case 709: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3204 // case 710: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3205 // case 711: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3206 // case 712: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3207 // case 713: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3208 // case 714: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3209 // case 715: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3210 // case 716: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3211 // case 717: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3212 // case 718: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3213 // case 719: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3214 // case 720: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3215 // case 721: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3216 // case 722: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3217 // case 723: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3218 // case 724: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3219 // case 725: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3220 // case 726: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3221 // case 727: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3222 // case 728: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3223 // case 729: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3224 // case 730: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3225 // case 731: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3226 // case 732: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3227 // case 733: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3228 // case 734: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3229 // case 735: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3230 // case 736: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3231 // case 737: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3232 // case 738: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3233 // case 739: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3234 // case 740: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3235 // case 741: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3236 // case 742: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3237 // case 743: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3238 // case 744: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3239 // case 745: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3240 // case 746: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3241 // case 747: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3242 // case 748: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3243 // case 749: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3244 // case 750: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3245 // case 751: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3246 // case 752: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3247 // case 753: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3248 // case 754: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3249 // case 755: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3250 // case 756: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3251 // case 757: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3252 // case 758: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3253 // case 759: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3254 // case 760: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3255 // case 761: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3256 // case 762: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3257 // case 763: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3258 // case 764: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3259 // case 765: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3260 // case 766: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3261 // case 767: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3262 // case 768: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3263 // case 769: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3264 // case 770: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3265 // case 771: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3266 // case 772: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3267 // case 773: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3268 // case 774: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3269 // case 775: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3270 // case 776: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3271 // case 777: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3272 // case 778: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3273 // case 779: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3274 // case 780: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3275 // case 781: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3276 // case 782: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3277 // case 783: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3278 // case 784: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3279 // case 785: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3280 // case 786: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3281 // case 787: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3282 // case 788: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3283 // case 789: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3284 // case 790: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3285 // case 791: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3286 // case 792: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3287 // case 793: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3288 // case 794: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3289 // case 795: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3290 // case 796: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3291 // case 797: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3292 // case 798: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3293 // case 799: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3294 // case 800: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3295 // case 801: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3296 // case 802: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3297 // case 803: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3298 // case 804: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3299 // case 805: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3300 // case 806: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3301 // case 807: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3302 // case 808: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3303 // case 809: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3304 // case 810: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3305 // case 811: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3306 // case 812: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3307 // case 813: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3308 // case 814: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3309 // case 815: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3310 case 816: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3311 case 817: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3312 case 818: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3313 case 819: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3314 case 820: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3315 case 821: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3316 case 822: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3317 case 823: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3318 case 824: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3319 case 825: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3320 case 826: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3321 case 827: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3322 case 828: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3323 case 829: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3324 case 830: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3325 case 831: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3326 // case 832: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3327 // case 833: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3328 // case 834: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3329 // case 835: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3330 // case 836: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3331 // case 837: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3332 // case 838: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3333 // case 839: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3334 // case 840: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3335 // case 841: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3336 // case 842: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3337 // case 843: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3338 // case 844: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3339 // case 845: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3340 // case 846: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3341 // case 847: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3342 case 848: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3343 case 849: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3344 case 850: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3345 case 851: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3346 case 852: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3347 case 853: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3348 case 854: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3349 case 855: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3350 case 856: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3351 case 857: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3352 case 858: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3353 case 859: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3354 case 860: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3355 case 861: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3356 case 862: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3357 case 863: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3358 // case 864: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3359 // case 865: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3360 // case 866: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3361 // case 867: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3362 // case 868: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3363 // case 869: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3364 // case 870: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3365 // case 871: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3366 // case 872: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3367 // case 873: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3368 // case 874: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3369 // case 875: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3370 // case 876: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3371 // case 877: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3372 // case 878: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3373 // case 879: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3374 // case 880: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3375 // case 881: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3376 // case 882: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3377 // case 883: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3378 // case 884: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3379 // case 885: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3380 // case 886: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3381 // case 887: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3382 // case 888: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3383 // case 889: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3384 // case 890: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3385 // case 891: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3386 // case 892: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3387 // case 893: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3388 // case 894: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3389 // case 895: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3390 // case 896: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3391 // case 897: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3392 // case 898: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3393 // case 899: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3394 // case 900: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3395 // case 901: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3396 // case 902: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3397 // case 903: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3398 // case 904: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3399 // case 905: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3400 // case 906: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3401 // case 907: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3402 // case 908: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3403 // case 909: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3404 // case 910: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3405 // case 911: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3406 // case 912: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3407 // case 913: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3408 // case 914: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3409 // case 915: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3410 // case 916: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3411 // case 917: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3412 // case 918: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3413 // case 919: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3414 // case 920: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3415 // case 921: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3416 // case 922: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3417 // case 923: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3418 // case 924: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3419 // case 925: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3420 // case 926: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3421 // case 927: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3422 // case 928: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3423 // case 929: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3424 // case 930: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3425 // case 931: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3426 // case 932: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3427 // case 933: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3428 // case 934: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3429 // case 935: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3430 // case 936: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3431 // case 937: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3432 // case 938: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3433 // case 939: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3434 // case 940: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3435 // case 941: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3436 // case 942: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3437 // case 943: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3438 case 944: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3439 case 945: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3440 case 946: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3441 case 947: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3442 case 948: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3443 case 949: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3444 case 950: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3445 case 951: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3446 case 952: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3447 case 953: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3448 case 954: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3449 case 955: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3450 case 956: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3451 case 957: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3452 case 958: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3453 case 959: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3454 // case 960: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3455 // case 961: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3456 // case 962: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3457 // case 963: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3458 // case 964: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3459 // case 965: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3460 // case 966: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3461 // case 967: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3462 // case 968: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3463 // case 969: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3464 // case 970: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3465 // case 971: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3466 // case 972: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3467 // case 973: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3468 // case 974: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3469 // case 975: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3470 case 976: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3471 case 977: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3472 case 978: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3473 case 979: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3474 case 980: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3475 case 981: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3476 case 982: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3477 case 983: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3478 case 984: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3479 case 985: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3480 case 986: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3481 case 987: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3482 case 988: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3483 case 989: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3484 case 990: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3485 case 991: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3486 // case 992: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3487 // case 993: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3488 // case 994: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3489 // case 995: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3490 // case 996: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3491 // case 997: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3492 // case 998: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3493 // case 999: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3494 // case 1000: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3495 // case 1001: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3496 // case 1002: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3497 // case 1003: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3498 // case 1004: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3499 // case 1005: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3500 // case 1006: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3501 // case 1007: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3502 // case 1008: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3503 // case 1009: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3504 // case 1010: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3505 // case 1011: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3506 // case 1012: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3507 // case 1013: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3508 // case 1014: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3509 // case 1015: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3510 // case 1016: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3511 // case 1017: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3512 // case 1018: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3513 // case 1019: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3514 // case 1020: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3515 // case 1021: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3516 // case 1022: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3517 // case 1023: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3522 "doEnergy=%d doVirial=%d doSlow=%d doPairlist=%d "
3523 "doAlch=%d doFEP=%d doTI=%d doStreaming=%d doTable=%d "
3525 doEnergy, doVirial, doSlow, doPairlist, doAlch, doFEP, doTI,
3526 doStreaming, doTable, options);
3529 std::string call_options;
3530 call_options += "doEnergy = " + std::to_string(int(doEnergy));
3531 call_options += ", doVirial = " + std::to_string(int(doVirial));
3532 call_options += ", doSlow = " + std::to_string(int(doSlow));
3533 call_options += ", doPairlist = " + std::to_string(int(doPairlist));
3534 call_options += ", doAlch = " + std::to_string(int(doAlch));
3535 call_options += ", doFEP = " + std::to_string(int(doFEP));
3536 call_options += ", doTI = " + std::to_string(int(doTI));
3537 call_options += ", doStreaming = " + std::to_string(int(doStreaming));
3538 call_options += ", doTable = " + std::to_string(int(doTable));
3539 call_options += ", doAlchVdwForceSwitching = " + std::to_string(int(doAlchVdwForceSwitching));
3540 const std::string error = "CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called. Options are:\n" + call_options;
3541 NAMD_bug(error.c_str());
3550 cudaCheck(cudaGetLastError());
3552 start += nblock*nwarp;
3554 if ( doVirial || ! doStreaming ){
3556 int grid = (atomStorageSize + block - 1)/block;
3558 transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3559 force_x, force_y, force_z, force_w,
3560 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3563 transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3564 force_x, force_y, force_z, force_w,
3565 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3569 cudaCheck(cudaStreamSynchronize(stream));
3571 // Haochuan: I add atomIndex to the output and also debug the slow forces.
3572 // XXX TODO: ERASE THIS AFTERWARDS
3573 // this is not numAtoms, this is something else
3574 // will print the force inside the compute and afterwards
3578 allocate_host<float4>(&h_f, atomStorageSize);
3579 allocate_host<float4>(&h_f_slow, atomStorageSize);
3580 allocate_host<int>(&h_index, atomStorageSize);
3581 copy_DtoH_sync<float4>(d_forces, h_f, atomStorageSize);
3582 copy_DtoH_sync<float4>(d_forcesSlow, h_f_slow, atomStorageSize);
3583 copy_DtoH_sync<int>(atomIndex, h_index, atomStorageSize);
3585 FILE* pos_nb_f = fopen("compute_nb_dforces.txt", "a+");
3586 fprintf(pos_nb_f, "forces after kernel\n");
3587 // I'm gonna copy back the forces and just print them
3589 for(int i = 0; i < atomStorageSize; i++){
3590 //for(int i = 83000; i < 85000; i++){
3591 fprintf(pos_nb_f, "%10d %10.5f %10.5f %10.5f\n", h_index[i], h_f[i].x,
3592 h_f[i].y, h_f[i].z);
3596 FILE* pos_slow_f = fopen("compute_slow_dforces.txt", "a+");
3597 for(int i = 0; i < atomStorageSize; i++){
3598 fprintf(pos_slow_f, "%10d %10.5f %10.5f %10.5f\n", h_index[i], h_f_slow[i].x,
3599 h_f_slow[i].y, h_f_slow[i].z);
3602 deallocate_host(&h_f_slow);
3605 deallocate_host<int>(&h_index);
3606 deallocate_host<float4>(&h_f);
3612 // Perform virial and energy reductions for non-bonded force calculation
3614 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
3615 const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
3616 float4* d_forces, float4* d_forcesSlow,
3617 VirialEnergy* d_virialEnergy, cudaStream_t stream) {
3619 if (doEnergy || doVirial) {
3620 clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
3625 int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
3626 int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
3627 reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
3628 (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
3629 cudaCheck(cudaGetLastError());
3632 if (doVirial || doEnergy)
3634 int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
3635 int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
3636 reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
3637 (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3638 cudaCheck(cudaGetLastError());
3641 if (doGBIS && doEnergy)
3643 int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
3644 int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
3645 reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
3646 (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3647 cudaCheck(cudaGetLastError());
3652 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
3653 int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
3654 cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
3656 reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
3657 copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
3661 void CudaComputeNonbondedKernel::setExclusionsByAtom(int2* h_data, const int num_atoms) {
3662 // Global data structure shouldn't be reallocated
3663 if (d_exclusionsByAtom == NULL) allocate_device<int2>(&d_exclusionsByAtom, num_atoms);
3664 copy_HtoD_sync<int2>(h_data, d_exclusionsByAtom, num_atoms);
3669 template<bool kDoAlch>
3670 __global__ void updateVdwTypesExclKernel(
3671 const int numPatches,
3672 const CudaLocalRecord* localRecords,
3673 const int* global_vdwTypes,
3674 const int* global_id,
3675 const int* patchSortOrder,
3676 const int2* exclusionsByAtom,
3677 const int* global_partition,
3683 __shared__ CudaLocalRecord s_record;
3684 using AccessType = int32_t;
3685 AccessType* s_record_buffer = (AccessType*) &s_record;
3687 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
3688 // Read in the CudaLocalRecord using multiple threads. This should
3690 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
3691 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
3695 const int numAtoms = s_record.numAtoms;
3696 const int offset = s_record.bufferOffset;
3697 const int offsetNB = s_record.bufferOffsetNBPad;
3699 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
3700 const int order = patchSortOrder[offset + i];
3701 const int id = global_id[offset + order];
3702 vdwTypes [offsetNB + i] = global_vdwTypes[offset + order];
3703 atomIndex [offsetNB + i] = id;
3704 exclusions[offsetNB + i].x = exclusionsByAtom[id].y;
3705 exclusions[offsetNB + i].y = exclusionsByAtom[id].x;
3707 part [offsetNB + i] = global_partition[offset + order];
3715 void CudaComputeNonbondedKernel::updateVdwTypesExclOnGPU(CudaTileListKernel& tlKernel,
3716 const int numPatches, const int atomStorageSize, const bool alchOn,
3717 CudaLocalRecord* localRecords,
3718 const int* d_vdwTypes, const int* d_id, const int* d_sortOrder,
3719 const int* d_partition,
3722 reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
3723 reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
3724 reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
3726 const int numBlocks = numPatches;
3727 const int numThreads = 512;
3730 updateVdwTypesExclKernel<true><<<numBlocks, numThreads, 0, stream>>>(
3731 numPatches, localRecords,
3732 d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3733 vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3736 updateVdwTypesExclKernel<false><<<numBlocks, numThreads, 0, stream>>>(
3737 numPatches, localRecords,
3738 d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3739 vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()