NAMD
CudaComputeNonbondedKernel.cu
Go to the documentation of this file.
1 #if defined(NAMD_CUDA)
2 #if __CUDACC_VER_MAJOR__ >= 11
3 #include <cub/cub.cuh>
4 #else
5 #include <namd_cub/cub.cuh>
6 #endif
7 #include <cuda.h>
8 #endif // NAMD_CUDA
9 
10 #include "CudaComputeNonbondedKernel.h"
11 #include "CudaTileListKernel.h"
12 #include "DeviceCUDA.h"
13 #include "CudaComputeNonbondedInteractions.h"
14 
15 #if defined(NAMD_CUDA)
16 
17 #ifdef WIN32
18 #define __thread __declspec(thread)
19 #endif
20 extern __thread DeviceCUDA *deviceCUDA;
21 
22 #define OVERALLOC 1.2f
23 
24 void NAMD_die(const char *);
25 void NAMD_bug(const char *);
26 
27 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
28 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
29 
30 //FEP parameters
31 __constant__ AlchData alchflags;
32 #define NONBONDKERNEL_NUM_WARP 4
33 
34 template<typename T>
35 __device__ __forceinline__
36 T make_zero();
37 
38 template<>
39 __device__ __forceinline__
40 float3 make_zero<float3>() {
41  return make_float3(0.0f, 0.0f, 0.0f);
42 }
43 
44 template<>
45 __device__ __forceinline__
46 float4 make_zero<float4>() {
47  return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
48 }
49 
50 template<bool doEnergy, bool doSlow, typename jForceType>
51 __device__ __forceinline__
52 void calcForceEnergyMath(const float r2, const float qi, const float qj,
53  const float dx, const float dy, const float dz,
54  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
55  cudaTextureObject_t vdwCoefTableTex,
56  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
57  float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
58  float& energyVdw, float& energyElec, float& energySlow,
59  const CudaNBConstants nbConstants) {
60 
61  int vdwIndex = vdwtypej + vdwtypei;
62 #if __CUDA_ARCH__ >= 350
63  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
64 #else
65  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
66 #endif
67 
68  float rinv = rsqrtf(r2);
69  float f, fSlow;
70  float charge = qi * qj;
71 
72  cudaNBForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy, doSlow>(
73  r2, rinv, charge, ljab, nbConstants,
74  f, fSlow, energyVdw, energyElec, energySlow);
75 
76  float fx = dx * f;
77  float fy = dy * f;
78  float fz = dz * f;
79  iforce.x += fx;
80  iforce.y += fy;
81  iforce.z += fz;
82  jforce.x -= fx;
83  jforce.y -= fy;
84  jforce.z -= fz;
85  if (doSlow) {
86  float fxSlow = dx * fSlow;
87  float fySlow = dy * fSlow;
88  float fzSlow = dz * fSlow;
89  iforceSlow.x += fxSlow;
90  iforceSlow.y += fySlow;
91  iforceSlow.z += fzSlow;
92  jforceSlow.x -= fxSlow;
93  jforceSlow.y -= fySlow;
94  jforceSlow.z -= fzSlow;
95  }
96 }
97 
98 
99 template<bool doEnergy, bool doSlow, typename jForceType>
100 __device__ __forceinline__
101 void calcForceEnergy(const float r2, const float qi, const float qj,
102  const float dx, const float dy, const float dz,
103  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
104  cudaTextureObject_t vdwCoefTableTex,
105  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
106  float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
107  float& energyVdw, float& energyElec, float& energySlow) {
108 
109  int vdwIndex = vdwtypej + vdwtypei;
110 #if __CUDA_ARCH__ >= 350
111  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
112 #else
113  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
114 #endif
115 
116  float rinv = rsqrtf(r2);
117  float4 ei, fi;
118  float f, fSlow;
119 
120  fi = tex1D<float4>(forceTableTex, rinv);
121  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
122 
123  fSlow = qi * qj;
124  f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
125 
126  if (doEnergy) {
127  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
128  energyElec += fSlow * ei.x;
129 
130  if (doSlow) {
131  energySlow += fSlow * ei.w;
132  }
133  }
134  if (doSlow) fSlow *= fi.w;
135 
136  float fx = dx * f;
137  float fy = dy * f;
138  float fz = dz * f;
139  iforce.x += fx;
140  iforce.y += fy;
141  iforce.z += fz;
142  jforce.x -= fx;
143  jforce.y -= fy;
144  jforce.z -= fz;
145 
146  if (doSlow) {
147  float fxSlow = dx * fSlow;
148  float fySlow = dy * fSlow;
149  float fzSlow = dz * fSlow;
150  iforceSlow.x += fxSlow;
151  iforceSlow.y += fySlow;
152  iforceSlow.z += fzSlow;
153  jforceSlow.x -= fxSlow;
154  jforceSlow.y -= fySlow;
155  jforceSlow.z -= fzSlow;
156  }
157 }
158 
159 
160 /* JM: Special __device__ function to compute VDW forces for alchemy.
161  * Partially swiped from ComputeNonbondedFEP.C
162  */
163 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch, typename jForceType>
164 __device__ __forceinline__
165 void calcForceEnergyFEP(const float r2, const float qi, const float qj,
166  const float dx, const float dy, const float dz,
167  const int vdwtypei, const int vdwtypej,
168  char p1, char p2,
169  /*const AlchData &alchflags, */
170  const float2* __restrict__ vdwCoefTable,
171  cudaTextureObject_t vdwCoefTableTex,
172  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
173  float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
174  float& energyVdw, float &energyVdw_s, float& energyElec, float& energySlow,
175  float& energyElec_s, float& energySlow_s) {
176 
177 
178  int vdwIndex = vdwtypej + vdwtypei;
179 #if __CUDA_ARCH__ >= 350
180  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
181 #else
182  float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
183 #endif
184 
185  float myVdwLambda = 0.0f;
186  float myVdwLambda2 = 0.0f;
187  float myElecLambda = 0.0f;
188  float myElecLambda2 = 0.0f;
189  float rinv = rsqrtf(r2);
190  float f;
191  float alch_vdw_energy = 0.0f;
192  float alch_vdw_energy_2 = 0.0f;
193  float alch_vdw_force = 0.0f;
194  float fSlow = qi * qj;
195  float4 ei;
196  float4 fi = tex1D<float4>(forceTableTex, rinv);
197  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
198 
199  //John said that there is a better way to avoid divergences here
200  //alch: true if => 1-0, 1-1, 2-0, 2-2
201  //dec: true if => 1-1, 2-2 && decouple
202  //up: true if => 1-0 && 1,1
203  //down: true if => 2-0, && 2,2
204  int ref = (p1 == 0 && p2 == 0);
205  int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
206  int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
207  int up = (alch && (p1 == 1 || p2 == 1) && !dec);
208  int down = (alch && (p1 == 2 || p2 == 2) && !dec);
209 
210  float r2_1, r2_2;
211  f = (fSlow * fi.x);
212 
213 /*--------------- VDW SPECIAL ALCH FORCES (Swiped from ComputeNonbondedFEP.C) ---------------*/
214 
215  myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
216  myVdwLambda2 = alchflags.vdwLambda2Up*(up) + alchflags.vdwLambda2Down*(down) + 1.f*(ref || dec);
217  myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
218  myElecLambda2 = alchflags.elecLambda2Up*(up) + alchflags.elecLambda2Down*(down) + 1.f*(ref || dec);
219 
220  if (alch) {
221  if (vdwForceSwitch) {
222  // force switching
223  float switchdist6_1, switchdist6_2;
224  const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
225  // const float
226  //Templated parameter. No control divergence here
227  if (shift) {
228  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
229  const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
230  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
231  r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
232  switchdist6_1 = alchflags.switchdist2 + myVdwShift;
233  switchdist6_1 = switchdist6_1 * switchdist6_1 * switchdist6_1;
234  switchdist6_2 = alchflags.switchdist2 + myVdwShift2;
235  switchdist6_2 = switchdist6_2 * switchdist6_2 * switchdist6_2;
236  } else {
237  r2_1 = rinv*rinv;
238  r2_2 = rinv*rinv;
239  switchdist6_1 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
240  switchdist6_2 = switchdist6_1;
241  }
242  const float r6_1 = r2_1*r2_1*r2_1;
243  const float r6_2 = r2_2*r2_2*r2_2;
244  if (r2 <= alchflags.switchdist2) {
245  const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
246  const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
247  // A == ljab.x, B == ljab.y
248  const float dU_1 = -ljab.x / (cutoff6 * switchdist6_1) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_1));
249  const float dU_2 = -ljab.x / (cutoff6 * switchdist6_2) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_2));
250  alch_vdw_energy = myVdwLambda * (U1 + dU_1);
251  alch_vdw_energy_2 = myVdwLambda2 * (U2 + dU_2);
252 
253  //Multiplied by -1.0 to match CPU values
254  alch_vdw_force =-1.f*myVdwLambda*((12.f*U1 + 6.f*ljab.y*r6_1)*r2_1);
255  } else {
256  const float r3_1 = sqrtf(r6_1);
257  const float r3_2 = sqrtf(r6_2);
258  const float inv_cutoff6 = 1.0f / cutoff6;
259  const float inv_cutoff3 = rsqrtf(cutoff6);
260  // A == ljab.x, B == ljab.y
261  const float k_vdwa_1 = ljab.x / (1.0f - switchdist6_1 * inv_cutoff6);
262  const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6_1 * inv_cutoff6));
263  const float k_vdwa_2 = ljab.x / (1.0f - switchdist6_2 * inv_cutoff6);
264  const float k_vdwb_2 = ljab.y / (1.0f - sqrtf(switchdist6_2 * inv_cutoff6));
265  const float tmpa_1 = r6_1 - inv_cutoff6;
266  const float tmpb_1 = r3_1 - inv_cutoff3;
267  const float tmpa_2 = r6_2 - inv_cutoff6;
268  const float tmpb_2 = r3_2 - inv_cutoff3;
269  alch_vdw_energy = myVdwLambda * (k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1);
270  alch_vdw_energy_2 = myVdwLambda2 * (k_vdwa_2 * tmpa_2 * tmpa_2 - k_vdwb_2 * tmpb_2 * tmpb_2);
271  //Multiplied by -1.0 to match CPU values
272  alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
273  } // r2 <= alchflags.switchdist2
274  } else {
275  // potential switching
276  const float diff = alchflags.cutoff2 - r2;
277 
278  const float switchmul = (alchflags.switchfactor*(diff)*(diff)*(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2))*(r2 > alchflags.switchdist2) + (1.f)*(r2 <= alchflags.switchdist2);
279  const float switchmul2 = (12.f*alchflags.switchfactor*(diff)*(r2 - alchflags.switchdist2))*(r2 > alchflags.switchdist2) + (0.f) * (r2 <= alchflags.switchdist2);
280 
281  //Templated parameter. No control divergence here
282  if(shift){
283  //This templated parameter lets me get away with not making 2 divisions. But for myVdwShift != 0, how do I do this?
284  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
285  const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
286  //r2_1 = 1.0/(r2 + myVdwShift);
287  //r2_2 = 1.0/(r2 + myVdwShift2);
288  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
289  r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
290  }else{
291  r2_1 = rinv*rinv;
292  r2_2 = rinv*rinv;
293  }
294 
295  const float r6_1 = r2_1*r2_1*r2_1;
296  const float r6_2 = r2_2*r2_2*r2_2;
297  const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
298  const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
299  alch_vdw_energy = myVdwLambda*switchmul*U1;
300  alch_vdw_energy_2 = myVdwLambda2*switchmul*U2;
301 
302  //Multiplied by -1.0 to match CPU values
303  alch_vdw_force =-1.f*myVdwLambda*((switchmul*(12.f*U1 + 6.f*ljab.y*r6_1)*r2_1+ switchmul2*U1));
304  } // vdwForceSwitch
305  }
306 
307 /*-----------------------------------------------------------*/
308 
309  if (doEnergy){
310  //All energies should be scaled by the corresponding lambda
311  energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
312  energyElec += (fSlow * ei.x)*myElecLambda;
313  energyVdw_s += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy_2*(alch && !dec);
314  energyElec_s += (fSlow * ei.x)*myElecLambda2;
315  if (doSlow){
316  energySlow += (fSlow * ei.w)*myElecLambda;
317  energySlow_s += (fSlow * ei.w)*myElecLambda2;
318  }
319  }
320 
321  if (doSlow) fSlow *= fi.w;
322 
323  //We should include the regular VDW forces if not dealing with alch pairs
324  f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(!alch || dec)))*myElecLambda
325  + alch_vdw_force*(alch && !dec);
326 
327  float fx = dx * f;
328  float fy = dy * f;
329  float fz = dz * f;
330 
331  iforce.x += fx;
332  iforce.y += fy;
333  iforce.z += fz;
334  jforce.x -= fx;
335  jforce.y -= fy;
336  jforce.z -= fz;
337 
338  if (doSlow) {
339  /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
340  fSlow = myElecLambda*fSlow;
341  float fxSlow = dx * fSlow;
342  float fySlow = dy * fSlow;
343  float fzSlow = dz * fSlow;
344  iforceSlow.x += fxSlow;
345  iforceSlow.y += fySlow;
346  iforceSlow.z += fzSlow;
347  jforceSlow.x -= fxSlow;
348  jforceSlow.y -= fySlow;
349  jforceSlow.z -= fzSlow;
350  }
351 }
352 
353 /* JM: Special __device__ function to compute VDW forces for TI.
354  */
355 
356 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch, typename jForceType>
357 __device__ __forceinline__
358 void calcForceEnergyTI(const float r2, const float qi, const float qj,
359  const float dx, const float dy, const float dz,
360  const int vdwtypei, const int vdwtypej,
361  char p1, char p2,
362  const float2* __restrict__ vdwCoefTable,
363  cudaTextureObject_t vdwCoefTableTex,
364  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
365  float3& iforce, float3& iforceSlow, jForceType& jforce, jForceType& jforceSlow,
366  float& energyVdw, float& energyVdw_ti_1, float& energyVdw_ti_2,
367  float& energyElec, float& energyElec_ti_1, float& energyElec_ti_2,
368  float& energySlow, float& energySlow_ti_1, float& energySlow_ti_2) {
369 
370  int vdwIndex = vdwtypej + vdwtypei;
371 #if __CUDA_ARCH__ >= 350
372  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
373 #else
374  float2 ljab = tex1D<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
375 #endif
376 
377  /* JM: For TI, we have to deal ALCH1 OR ALCH2 during ComputeNonbondedBase2
378  * ALCH1 for appearing terms;
379  * ALCH2 for dissapearing terms;
380  * Instead of the _s energy terms, we need the to calculate:
381  *
382  * vdwEnergy_ti_1 and _2 for VDW energies. For those we need to add the special terms calculated on
383  * ComputeNonbondedTI.C
384  *
385  * elecEnergy_ti_1 and _2 for electrostatic energy. No correction needed here though.
386  *
387  */
388 
389  float myVdwLambda = 0.0f;
390  float myElecLambda = 0.0f;
391  float rinv = rsqrtf(r2);
392  float f;
393  float alch_vdw_energy = 0.0f;
394  float alch_vdw_force = 0.0f;
395  float alch_vdw_dUdl = 0.0f;
396  float fSlow = qi * qj;
397  float4 ei;
398  float4 fi = tex1D<float4>(forceTableTex, rinv);
399  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
400 
401  //John said that there is a better way to avoid divergences here
402  //alch: true if => 1-0, 1-1, 2-0, 2-2
403  //dec: true if => 1-1, 2-2 && decouple
404  //up: true if => 1-0 && 1,1
405  //down: true if => 2-0, && 2,2
406  int ref = (p1 == 0 && p2 == 0);
407  int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
408  int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
409  int up = (alch && (p1 == 1 || p2 == 1) && !dec);
410  int down = (alch && (p1 == 2 || p2 == 2) && !dec);
411 
412  float r2_1;
413  f = (fSlow * fi.x);
414 
415 /*--------------- VDW SPECIAL ALCH STUFF (Swiped from ComputeNonbondedTI.C) ---------------*/
416  myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
417  myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
418  if(alch){
419  if (vdwForceSwitch) {
420  const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
421  float switchdist6;
422  if (shift) {
423  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
424  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
425  switchdist6 = alchflags.switchdist2 + myVdwShift;
426  switchdist6 = switchdist6 * switchdist6 * switchdist6;
427  } else {
428  r2_1 = rinv*rinv;
429  switchdist6 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
430  }
431  const float r6_1 = r2_1*r2_1*r2_1;
432  if (r2 <= alchflags.switchdist2) {
433  const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1;
434  const float dU = -ljab.x / (cutoff6 * switchdist6) - (-ljab.y * rsqrtf(cutoff6 * switchdist6));
435  alch_vdw_force = -1.f*(myVdwLambda*((12.f*U + 6.f*ljab.y*r6_1)*r2_1));
436  alch_vdw_energy = myVdwLambda * (U + dU);
437  alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (6.f*U + 3.f*ljab.y*r6_1)*r2_1 + dU;
438  } else {
439  const float r3_1 = sqrtf(r6_1);
440  const float inv_cutoff6 = 1.0f / cutoff6;
441  const float inv_cutoff3 = sqrtf(inv_cutoff6);
442  const float k_vdwa_1 = ljab.x / (1.0f - switchdist6 * inv_cutoff6);
443  const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6 * inv_cutoff6));
444  const float tmpa_1 = r6_1 - inv_cutoff6;
445  const float tmpb_1 = r3_1 - inv_cutoff3;
446  const float U = k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1;
447  alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
448  alch_vdw_energy = myVdwLambda * U;
449  alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (3.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
450  } // r2 <= alchflags.switchdist2
451  } else {
452  const float diff = alchflags.cutoff2 - r2;
453  const float switchmul = (r2 > alchflags.switchdist2 ? alchflags.switchfactor*(diff)*(diff) \
454  *(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2) : 1.f);
455 
456  const float switchmul2 = (r2 > alchflags.switchdist2 ? \
457  12.f*alchflags.switchfactor*(diff) \
458  *(r2 - alchflags.switchdist2) : 0.f);
459  //Templated parameter. No control divergence here
460  if(shift){
461  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
462  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
463  }else r2_1 = rinv*rinv;
464 
465  const float r6_1 = r2_1*r2_1*r2_1;
466  const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled! for shorthand only!
467  alch_vdw_energy = myVdwLambda*switchmul*U;
468  //Multiplied by -1.0 to match CPU values
469  alch_vdw_force = -1.f*(myVdwLambda*(switchmul*(12.f*U + 6.f*ljab.y*r6_1)*r2_1 \
470  + switchmul2*U));
471  alch_vdw_dUdl = (switchmul*(U + myVdwLambda*alchflags.alchVdwShiftCoeff \
472  *(6.f*U + 3.f*ljab.y*r6_1)*r2_1));
473  } // vdwForceSwitch
474  }
475  /*-------------------------------------------------------------------------*/
476 
477  if (doEnergy){
478  //All energies should be scaled by the corresponding lambda
479  energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
480  energyElec += (fSlow * ei.x)*myElecLambda;
481  if(alch){
482  energyVdw_ti_1 += alch_vdw_dUdl*up;
483  energyVdw_ti_2 += alch_vdw_dUdl*down;
484  energyElec_ti_1 += (fSlow * ei.x)*up;
485  energyElec_ti_2 += (fSlow * ei.x)*down;
486  }
487  if (doSlow){
488  energySlow += (fSlow * ei.w)*myElecLambda;
489  if(alch){
490  energySlow_ti_1 += (fSlow * ei.w)*up;
491  energySlow_ti_2 += (fSlow * ei.w)*down;
492  }
493  }
494  }
495 
496  if (doSlow) fSlow *= fi.w;
497  //We should include the regular VDW forces if not dealing with alch pairs
498  f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(ref || dec)))*myElecLambda
499  + alch_vdw_force*(alch && !dec);
500 
501  float fx = dx * f;
502  float fy = dy * f;
503  float fz = dz * f;
504 
505  iforce.x += fx;
506  iforce.y += fy;
507  iforce.z += fz;
508  jforce.x -= fx;
509  jforce.y -= fy;
510  jforce.z -= fz;
511 
512  if (doSlow) {
513  /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
514  fSlow = myElecLambda*fSlow; /* FAST(NOSHORT(+alch_vdw_force))*/ //Those should also be zeroed
515  float fxSlow = dx * fSlow;
516  float fySlow = dy * fSlow;
517  float fzSlow = dz * fSlow;
518  iforceSlow.x += fxSlow;
519  iforceSlow.y += fySlow;
520  iforceSlow.z += fzSlow;
521  jforceSlow.x -= fxSlow;
522  jforceSlow.y -= fySlow;
523  jforceSlow.z -= fzSlow;
524  }
525 }
526 
527 
528 template<bool doSlow, typename jForceType>
529 __device__ __forceinline__
530 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
531  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {
532  atomicAdd(&devForces[pos].x, force.x);
533  atomicAdd(&devForces[pos].y, force.y);
534  atomicAdd(&devForces[pos].z, force.z);
535  if (doSlow) {
536  atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
537  atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
538  atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
539  }
540 }
541 
542 template<bool doSlow, typename jForceType>
543 __device__ __forceinline__
544 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
545  float* __restrict__ devForces_x,
546  float* __restrict__ devForces_y,
547  float* __restrict__ devForces_z,
548  float* __restrict__ devForcesSlow_x,
549  float* __restrict__ devForcesSlow_y,
550  float* __restrict__ devForcesSlow_z)
551 {
552  atomicAdd(&devForces_x[pos], force.x);
553  atomicAdd(&devForces_y[pos], force.y);
554  atomicAdd(&devForces_z[pos], force.z);
555  if (doSlow) {
556  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
557  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
558  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
559  }
560 }
561 
562 template<bool doSlow, typename jForceType>
563 __device__ __forceinline__
564 void storeForces(const int pos, const jForceType force, const jForceType forceSlow,
565  float3* __restrict__ forces, float3* __restrict__ forcesSlow) {
566  atomicAdd(&forces[pos].x, force.x);
567  atomicAdd(&forces[pos].y, force.y);
568  atomicAdd(&forces[pos].z, force.z);
569  if (doSlow) {
570  atomicAdd(&forcesSlow[pos].x, forceSlow.x);
571  atomicAdd(&forcesSlow[pos].y, forceSlow.y);
572  atomicAdd(&forcesSlow[pos].z, forceSlow.z);
573  }
574 }
575 
576 template<bool doPairlist>
577 __device__ __forceinline__
578 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex) {
579  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
580  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
581  if (doPairlist) {
582  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
583  jexclIndex = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
584  jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
585  }
586 }
587 
588 template<bool doPairlist>
589 __device__ __forceinline__
590 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {
591  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
592  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
593  if (doPairlist) {
594  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
595  }
596 }
597 
598 template<bool doSlow, typename jForceType>
599 __device__ __forceinline__
600 void shuffleNext(jForceType& jforce, jForceType& jforceSlow) {
601  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
602  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
603  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
604  if (doSlow) {
605  jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
606  jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
607  jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
608  }
609 }
610 
611 //#define USE_NEW_EXCL_METHOD
612 
613 //
614 // Returns the lower estimate for the distance between a bounding box and a set of atoms
615 //
616 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
617  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
618  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
619  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
620  float r2 = dx*dx + dy*dy + dz*dz;
621  return r2;
622 }
623 
624 #define LARGE_FLOAT (float)(1.0e10)
625 
626 //
627 // Nonbonded force kernel
628 //
629 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doAlch, bool doFEP, bool doTI, bool doStreaming, bool doTable, bool doAlchVdwForceSwitching>
630 __global__ void
631 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
632  doPairlist ? (10) : (doEnergy ? (10) : (12) )
633  )
634 nonbondedForceKernel(
635  const int start, const int numTileLists,
636  const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
637  const int* __restrict__ tileJatomStart,
638  const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,
639  const float3 lata, const float3 latb, const float3 latc,
640  const float4* __restrict__ xyzq,
641  const float cutoff2, const CudaNBConstants nbConstants,
642  cudaTextureObject_t vdwCoefTableTex,
643  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
644  // ----------
645  // doPairlist
646  const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
647  const int* __restrict__ atomIndex,
648  const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
649  unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
650  int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
651  const BoundingBox* __restrict__ boundingBoxes,
652 #ifdef USE_NEW_EXCL_METHOD
653  const int* __restrict__ minmaxExclAtom,
654 #endif
655  // ----------
656  float4* __restrict__ devForces, float4* __restrict__ devForcesSlow,
657  float * __restrict__ devForce_x,
658  float * __restrict__ devForce_y,
659  float * __restrict__ devForce_z,
660  float * __restrict__ devForce_w,
661  float * __restrict__ devForceSlow_x,
662  float * __restrict__ devForceSlow_y,
663  float * __restrict__ devForceSlow_z,
664  float * __restrict__ devForceSlow_w,
665  // ---- USE_STREAMING_FORCES ----
666  const int numPatches,
667  unsigned int* __restrict__ patchNumCount,
668  const CudaPatchRecord* __restrict__ cudaPatches,
669  float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
670  int* __restrict__ mapPatchReadyQueue,
671  int* __restrict__ outputOrder,
672  // ------------------------------
673  TileListVirialEnergy* __restrict__ virialEnergy,
674  // ---- doAlch ----
675  char* __restrict__ p
676  ) {
677  using jForceType = typename std::conditional<doSlow, float3, float4>::type;
678  // Single warp takes care of one list of tiles
679  // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
680  const int itileListTemp = start + (threadIdx.x + blockDim.x*blockIdx.x) / WARPSIZE;
681  const int itileList = WARP_SHUFFLE(WARP_FULL_MASK, itileListTemp, 0, WARPSIZE);
682  if (itileList < numTileLists)
683  {
684 
685  float3 iforce;
686  float3 iforceSlow;
687  float energyVdw, energyElec, energySlow;
688  //FEP energies
689  float energyVdw_s, energyElec_s, energySlow_s;
690  //TI energies
691  float energyVdw_ti_1, energyVdw_ti_2, energyElec_ti_1, energyElec_ti_2, energySlow_ti_1, energySlow_ti_2;
692  int nexcluded;
693  unsigned int itileListLen;
694  int2 patchInd;
695  int2 patchNumList;
696  char part1, part2, p2;
697  bool doShift = (alchflags.alchVdwShiftCoeff != 0.0f);
698  __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][WARPSIZE];
699  __shared__ jForceType s_jforce[NONBONDKERNEL_NUM_WARP][WARPSIZE];
700  __shared__ jForceType s_jforceSlow[NONBONDKERNEL_NUM_WARP][WARPSIZE];
701  __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][WARPSIZE];
702  __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][WARPSIZE];
703 
704  __shared__ int s_iatomStart[NONBONDKERNEL_NUM_WARP];
705  __shared__ int s_jatomStart[NONBONDKERNEL_NUM_WARP];
706 
707  // Start computation
708  {
709  // Warp index (0...warpsize-1)
710  const int wid = threadIdx.x & (WARPSIZE-1);
711  const int iwarp = WARP_SHUFFLE(WARP_FULL_MASK, threadIdx.x / WARPSIZE, 0, WARPSIZE);
712 
713  TileList tmp = tileLists[itileList];
714  int iatomStart = tmp.iatomStart;
715  int jtileStart = tmp.jtileStart;
716  int jtileEnd = tmp.jtileEnd;
717  patchInd = tmp.patchInd;
718  patchNumList = tmp.patchNumList;
719 
720  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
721  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
722  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
723 
724  // DH - set zeroShift flag if magnitude of shift vector is zero
725  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
726 
727  int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
728  if (doPairlist) {
729  PatchPairRecord PPStmp = patchPairs[itileList];
730  iatomSize = PPStmp.iatomSize;
731  iatomFreeSize = PPStmp.iatomFreeSize;
732  jatomSize = PPStmp.jatomSize;
733  jatomFreeSize = PPStmp.jatomFreeSize;
734  }
735 
736  // Write to global memory here to avoid register spilling
737  if (doVirial) {
738  if (wid == 0) {
739  virialEnergy[itileList].shx = shx;
740  virialEnergy[itileList].shy = shy;
741  virialEnergy[itileList].shz = shz;
742  }
743  }
744 
745  // Load i-atom data (and shift coordinates)
746  float4 xyzq_i = xyzq[iatomStart + wid];
747  if (doAlch) part1 = p[iatomStart + wid];
748  xyzq_i.x += shx;
749  xyzq_i.y += shy;
750  xyzq_i.z += shz;
751  int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
752 
753  // Load i-atom data (and shift coordinates)
754  BoundingBox boundingBoxI;
755  if (doPairlist) {
756  boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
757  boundingBoxI.x += shx;
758  boundingBoxI.y += shy;
759  boundingBoxI.z += shz;
760  }
761 
762  // Get i-atom global index
763 #ifdef USE_NEW_EXCL_METHOD
764  int iatomIndex, minExclAtom, maxExclAtom;
765 #else
766  int iatomIndex;
767 #endif
768  if (doPairlist) {
769 #ifdef USE_NEW_EXCL_METHOD
770  iatomIndex = atomIndex[iatomStart + wid];
771  int2 tmp = minmaxExclAtom[iatomStart + wid];
772  minExclAtom = tmp.x;
773  maxExclAtom = tmp.y;
774 #else
775  iatomIndex = atomIndex[iatomStart + wid];
776 #endif
777  }
778 
779  // i-forces in registers
780  // float3 iforce;
781  iforce.x = 0.0f;
782  iforce.y = 0.0f;
783  iforce.z = 0.0f;
784 
785  // float3 iforceSlow;
786  if (doSlow) {
787  iforceSlow.x = 0.0f;
788  iforceSlow.y = 0.0f;
789  iforceSlow.z = 0.0f;
790  }
791 
792  // float energyVdw, energyElec, energySlow;
793  if (doEnergy) {
794  energyVdw = 0.0f;
795  energyVdw_s = 0.0f;
796  energyVdw_ti_1 = 0.0f;
797  energyVdw_ti_2 = 0.0f;
798  energyElec = 0.0f;
799  energyElec_ti_1 = 0.0f;
800  energyElec_ti_2 = 0.0f;
801  energyElec_s = 0.0f;
802  if (doSlow){
803  energySlow = 0.0f;
804  energySlow_s = 0.0f;
805  energySlow_ti_1 = 0.0f;
806  energySlow_ti_2 = 0.0f;
807  }
808  }
809 
810  // Number of exclusions
811  // NOTE: Lowest bit is used as indicator bit for tile pairs:
812  // bit 0 tile has no atoms within pairlist cutoff
813  // bit 1 tile has atoms within pairlist cutoff
814  // int nexcluded;
815  if (doPairlist) nexcluded = 0;
816 
817  // Number of i loops and free atoms
818  int nfreei;
819  if (doPairlist) {
820  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
821  nfreei = max(iatomFreeSize - iatomStart, 0);
822  if (wid >= nloopi) {
823  xyzq_i.x = -LARGE_FLOAT;
824  xyzq_i.y = -LARGE_FLOAT;
825  xyzq_i.z = -LARGE_FLOAT;
826  }
827  }
828 
829  // tile list stuff
830  // int itileListLen;
831  // int minJatomStart;
832  if (doPairlist) {
833  // minJatomStart = tileJatomStart[jtileStart];
834  itileListLen = 0;
835  }
836 
837  // Exclusion index and maxdiff
838  int iexclIndex, iexclMaxdiff;
839  if (doPairlist) {
840  int2 tmp = exclIndexMaxDiff[iatomStart + wid];
841  iexclIndex = tmp.x;
842  iexclMaxdiff = tmp.y;
843  }
844  s_iatomStart[iwarp] = iatomStart;
845 
846  // If the tile is within a patch, then the first jtile is a self tile
847  if (patchInd.x == patchInd.y & zeroShift) {
848  int jtile = jtileStart;
849  // Load j-atom starting index and exclusion mask
850  int jatomStart = tileJatomStart[jtile];
851 
852  float4 xyzq_j = xyzq[jatomStart + wid];
853  WARP_SYNC(WARP_FULL_MASK);
854  if (doAlch) p2 = p[jatomStart + wid];
855 
856  // Check for early bail
857  // No point of early bail for self
858 
859  unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
860  int vdwtypej = vdwTypes[jatomStart + wid];
861  s_vdwtypej[iwarp][wid] = vdwtypej;
862 
863  // Get i-atom global index
864  if (doPairlist) {
865  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
866  }
867 
868  // Number of j loops and free atoms
869  int nfreej;
870  if (doPairlist) {
871  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
872  nfreej = max(jatomFreeSize - jatomStart, 0);
873  //if (nfreei == 0 && nfreej == 0) continue;
874  if (wid >= nloopj) {
875  xyzq_j.x = LARGE_FLOAT;
876  xyzq_j.y = LARGE_FLOAT;
877  xyzq_j.z = LARGE_FLOAT;
878  }
879  }
880  s_xyzq[iwarp][wid] = xyzq_j;
881 
882  // DH - self requires that zeroShift is also set
883  // DC - In this case self is always true
884  const int modval = 2*WARPSIZE-1;
885 
886  s_jforce[iwarp][wid] = make_zero<jForceType>();
887  if (doSlow)
888  s_jforceSlow[iwarp][wid] = make_zero<jForceType>();
889  WARP_SYNC(WARP_FULL_MASK);
890 
891  if (doPairlist) {
892  // Build pair list
893  // NOTE: Pairlist update, we must also include the diagonal since this is used
894  // in GBIS phase 2.
895  // Clear the lowest (indicator) bit
896  nexcluded &= (~1);
897 
898  // For self tiles, do the diagonal term (t=0).
899  // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
900  int j = (0 + wid) & modval;
901  xyzq_j = s_xyzq[iwarp][j];
902  float dx = xyzq_j.x - xyzq_i.x;
903  float dy = xyzq_j.y - xyzq_i.y;
904  float dz = xyzq_j.z - xyzq_i.z;
905  float r2 = dx*dx + dy*dy + dz*dz;
906 
907  if (j < WARPSIZE && r2 < plcutoff2) {
908  // We have atom pair within the pairlist cutoff => Set indicator bit
909  nexcluded |= 1;
910  }
911  WARP_SYNC(WARP_FULL_MASK);
912 
913  // TODO this can be done in fewer iterations if we take advantage of Newtons's 3rd
914 #pragma unroll 4
915  for (int t = 1;t < WARPSIZE;t++) {
916  int j = (t + wid) & modval;
917 
918  // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
919  // However, if (j < WARPSIZE ..) below makes sure that these non-sense
920  // results are not used
921  if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
922 
923  excl >>= 1;
924  if (j < WARPSIZE) {
925  xyzq_j = s_xyzq[iwarp][j];
926  float dx = xyzq_j.x - xyzq_i.x;
927  float dy = xyzq_j.y - xyzq_i.y;
928  float dz = xyzq_j.z - xyzq_i.z;
929  float r2 = dx*dx + dy*dy + dz*dz;
930  if (r2 < plcutoff2) {
931  // We have atom pair within the pairlist cutoff => Set indicator bit
932  nexcluded |= 1;
933  if (j < nfreej || wid < nfreei) {
934  bool excluded = false;
935  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
936  if ( abs(indexdiff) <= iexclMaxdiff) {
937  indexdiff += iexclIndex;
938  int indexword = ((unsigned int) indexdiff) >> 5;
939 
940  if ( indexword < MAX_CONST_EXCLUSIONS ) {
941  indexword = constExclusions[indexword];
942  } else {
943  indexword = overflowExclusions[indexword];
944  }
945 
946  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
947  }
948  if (excluded) nexcluded += 2;
949  if (!excluded) excl |= 0x80000000;
950  if(doAlch){
951  if(!excluded && r2 < cutoff2){
952  if(doShift){
953  if(doFEP){
954  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
955  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
956  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
957  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
958  iforce, iforceSlow,
959  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
960  energyVdw, energyVdw_s,
961  energyElec, energySlow, energyElec_s, energySlow_s);
962  }else{
963  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
964  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
965  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
966  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
967  iforce, iforceSlow,
968  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
969  energyVdw, energyVdw_ti_1,
970  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
971  energySlow, energySlow_ti_1, energySlow_ti_2);
972  }//if doFEP
973  }else{
974  if(doFEP){
975  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
976  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
977  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
978  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
979  iforce, iforceSlow,
980  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
981  energyVdw, energyVdw_s,
982  energyElec, energySlow, energyElec_s, energySlow_s);
983  }else{
984  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
985  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
986  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
987  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
988  iforce, iforceSlow,
989  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
990  energyVdw, energyVdw_ti_1,
991  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
992  energySlow, energySlow_ti_1, energySlow_ti_2);
993  }
994  }//if doShift
995  }//if !excluded && r2 < cutoff2
996  }else{
997  if (!excluded && r2 < cutoff2) {
998  if (doTable) {
999  calcForceEnergy<doEnergy, doSlow>(
1000  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1001  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1002  vdwCoefTableTex, forceTableTex, energyTableTex,
1003  iforce, iforceSlow,
1004  s_jforce[iwarp][j],
1005  s_jforceSlow[iwarp][j],
1006  energyVdw, energyElec, energySlow);
1007  } else {
1008  calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1009  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1010  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1011  vdwCoefTableTex, forceTableTex, energyTableTex,
1012  iforce, iforceSlow,
1013  s_jforce[iwarp][j],
1014  s_jforceSlow[iwarp][j],
1015  energyVdw, energyElec, energySlow,
1016  nbConstants);
1017  }
1018  }
1019  }
1020  }
1021  }
1022  }
1023  WARP_SYNC(WARP_FULL_MASK);
1024  } // t
1025  } else {
1026  // Just compute forces
1027  excl >>= 1;
1028 #pragma unroll 4
1029  for (int t = 1;t < WARPSIZE;t++) {
1030  if (doAlch) {
1031  int j = (t + wid) & modval;
1032  part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1033  }
1034  if ((excl & 1)) {
1035  int j = ((t + wid) & (WARPSIZE-1));
1036  xyzq_j = s_xyzq[iwarp][j];
1037  float dx = xyzq_j.x - xyzq_i.x;
1038  float dy = xyzq_j.y - xyzq_i.y;
1039  float dz = xyzq_j.z - xyzq_i.z;
1040 
1041  float r2 = dx*dx + dy*dy + dz*dz;
1042  if(doAlch){
1043  if(r2 < cutoff2){ // (r2 < cutoff2)
1044  if(doShift){
1045  if (doFEP){
1046  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1047  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1048  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1049  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1050  iforce, iforceSlow,
1051  s_jforce[iwarp][j],
1052  s_jforceSlow[iwarp][j],
1053  energyVdw, energyVdw_s,
1054  energyElec, energySlow, energyElec_s, energySlow_s);
1055  }else{
1056  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1057  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1058  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1059  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1060  iforce, iforceSlow,
1061  s_jforce[iwarp][j],
1062  s_jforceSlow[iwarp][j],
1063  energyVdw, energyVdw_ti_1,
1064  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1065  energySlow, energySlow_ti_1, energySlow_ti_2);
1066  }//if doFEP
1067  }else{
1068  if(doFEP){
1069  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1070  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1071  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1072  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1073  iforce, iforceSlow,
1074  s_jforce[iwarp][j],
1075  s_jforceSlow[iwarp][j],
1076  energyVdw, energyVdw_s,
1077  energyElec, energySlow, energyElec_s, energySlow_s);
1078  }else{
1079  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1080  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1081  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1082  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1083  iforce, iforceSlow,
1084  s_jforce[iwarp][j],
1085  s_jforceSlow[iwarp][j],
1086  energyVdw, energyVdw_ti_1,
1087  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1088  energySlow, energySlow_ti_1, energySlow_ti_2);
1089  }//if doFEP
1090  }//doShift
1091  }//r2 < cutoff
1092  }else {
1093  if (r2 < cutoff2) {
1094  if (doTable) {
1095  calcForceEnergy<doEnergy, doSlow>(
1096  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1097  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1098  vdwCoefTableTex, forceTableTex, energyTableTex,
1099  iforce, iforceSlow,
1100  s_jforce[iwarp][j],
1101  s_jforceSlow[iwarp][j],
1102  energyVdw, energyElec, energySlow);
1103  } else {
1104  calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1105  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1106  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1107  vdwCoefTableTex, forceTableTex, energyTableTex,
1108  iforce, iforceSlow,
1109  s_jforce[iwarp][j],
1110  s_jforceSlow[iwarp][j],
1111  energyVdw, energyElec, energySlow,
1112  nbConstants);
1113  }
1114  }// (r2 < cutoff2)
1115  }//doAlch
1116  } // (excl & 1)
1117  excl >>= 1;
1118  WARP_SYNC(WARP_FULL_MASK);
1119  } // t
1120  }
1121  WARP_SYNC(WARP_FULL_MASK);
1122 
1123  // Write j-forces
1124  storeForces<doSlow, jForceType>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
1125  devForce_x, devForce_y, devForce_z,
1126  devForceSlow_x, devForceSlow_y, devForceSlow_z);
1127  // Write exclusions
1128  if (doPairlist) {
1129  const unsigned int warp_exclude = WARP_BALLOT(WARP_FULL_MASK, nexcluded & 1);
1130  const unsigned int warp_any_exclude = WARP_BALLOT(WARP_FULL_MASK, excl);
1131  if (warp_exclude) {
1132  int anyexcl = warp_any_exclude ? 1 : 0;
1133  anyexcl |= 65536;
1134  // Mark this jtile as non-empty:
1135  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1136  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1137  if (wid == 0 && anyexcl) jtiles[jtile] = anyexcl;
1138  // Store exclusions
1139  tileExcls[jtile].excl[wid] = excl;
1140  // itileListLen:
1141  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1142  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1143  itileListLen += anyexcl;
1144  // NOTE, this minJatomStart is only stored once for the first tile list entry
1145  // minJatomStart = min(minJatomStart, jatomStart);
1146  }
1147  }
1148  jtileStart++;
1149  }
1150 
1151  WARP_SYNC(WARP_FULL_MASK);
1152 
1153  for (int jtile=jtileStart; jtile <= jtileEnd; jtile++) {
1154  int jatomStart = 0;
1155  unsigned int excl = 0;
1156  int vdwtypej = 0;
1157  float4 xyzq_j;
1158 
1159  // Load j-atom starting index and exclusion mask
1160  jatomStart = tileJatomStart[jtile];
1161 
1162  xyzq_j = xyzq[jatomStart + wid];
1163  if (doAlch) p2 = p[jatomStart + wid];
1164 
1165  // Check for early bail
1166  // DC - I found this was slower
1167  //if (doPairlist) {
1168  // float r2bb = distsq(boundingBoxI, xyzq_j);
1169  // if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
1170  //}
1171 
1172  excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
1173  vdwtypej = vdwTypes[jatomStart + wid];
1174  s_vdwtypej[iwarp][wid] = vdwtypej;
1175 
1176  // Get i-atom global index
1177  if (doPairlist) {
1178  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
1179  }
1180 
1181  // Number of j loops and free atoms
1182  int nfreej;
1183  if (doPairlist) {
1184  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
1185  nfreej = max(jatomFreeSize - jatomStart, 0);
1186  //if (nfreei == 0 && nfreej == 0) continue;
1187  if (wid >= nloopj) {
1188  xyzq_j.x = LARGE_FLOAT;
1189  xyzq_j.y = LARGE_FLOAT;
1190  xyzq_j.z = LARGE_FLOAT;
1191  }
1192  }
1193  if (wid == 0) {
1194  s_jatomStart[iwarp] = jatomStart;
1195  }
1196  WARP_SYNC(WARP_FULL_MASK);
1197  s_xyzq[iwarp][wid] = xyzq_j;
1198 
1199  // DH - self requires that zeroShift is also set
1200  // DC - In this case self is always false
1201  const int modval = WARPSIZE-1;
1202 
1203  s_jforce[iwarp][wid] = make_zero<jForceType>();
1204  if (doSlow)
1205  s_jforceSlow[iwarp][wid] = make_zero<jForceType>();
1206  WARP_SYNC(WARP_FULL_MASK);
1207 
1208  if (doPairlist) {
1209  // Build pair list
1210  // NOTE: Pairlist update, we must also include the diagonal since this is used
1211  // in GBIS phase 2.
1212  // Clear the lowest (indicator) bit
1213  nexcluded &= (~1);
1214 
1215 #pragma unroll 4
1216  for (int t = 0;t < WARPSIZE;t++) {
1217  const int j = (t + wid) & modval;
1218 
1219  // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
1220  // However, if (j < WARPSIZE ..) below makes sure that these non-sense
1221  // results are not used
1222  if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1223 
1224  excl >>= 1;
1225  xyzq_j = s_xyzq[iwarp][j];
1226  float dx = xyzq_j.x - xyzq_i.x;
1227  float dy = xyzq_j.y - xyzq_i.y;
1228  float dz = xyzq_j.z - xyzq_i.z;
1229  float r2 = dx*dx + dy*dy + dz*dz;
1230  if (r2 < plcutoff2) {
1231  // We have atom pair within the pairlist cutoff => Set indicator bit
1232  nexcluded |= 1;
1233  if (j < nfreej || wid < nfreei) {
1234  bool excluded = false;
1235  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
1236  if ( abs(indexdiff) <= iexclMaxdiff) {
1237  indexdiff += iexclIndex;
1238  int indexword = ((unsigned int) indexdiff) >> 5;
1239 
1240  if ( indexword < MAX_CONST_EXCLUSIONS ) {
1241  indexword = constExclusions[indexword];
1242  } else {
1243  indexword = overflowExclusions[indexword];
1244  }
1245 
1246  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
1247  }
1248  if (excluded) nexcluded += 2;
1249  if (!excluded) excl |= 0x80000000;
1250  if(doAlch){
1251  if(!excluded && r2 < cutoff2){
1252  if(doShift){
1253  if(doFEP){
1254  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1255  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1256  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1257  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1258  iforce, iforceSlow,
1259  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1260  energyVdw, energyVdw_s,
1261  energyElec, energySlow, energyElec_s, energySlow_s);
1262  }else{
1263  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1264  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1265  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1266  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1267  iforce, iforceSlow,
1268  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1269  energyVdw, energyVdw_ti_1,
1270  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1271  energySlow, energySlow_ti_1, energySlow_ti_2);
1272  }//if doFEP
1273  }else{
1274  if(doFEP){
1275  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1276  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1277  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1278  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1279  iforce, iforceSlow,
1280  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1281  energyVdw, energyVdw_s,
1282  energyElec, energySlow, energyElec_s, energySlow_s);
1283  }else{
1284  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1285  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1286  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1287  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1288  iforce, iforceSlow,
1289  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
1290  energyVdw, energyVdw_ti_1,
1291  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1292  energySlow, energySlow_ti_1, energySlow_ti_2);
1293  }
1294  }//if doShift
1295  }//if !excluded && r2 < cutoff2
1296  }else{
1297  if (!excluded && r2 < cutoff2) {
1298  if (doTable) {
1299  calcForceEnergy<doEnergy, doSlow>(
1300  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1301  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1302  vdwCoefTableTex, forceTableTex, energyTableTex,
1303  iforce, iforceSlow,
1304  s_jforce[iwarp][j],
1305  s_jforceSlow[iwarp][j],
1306  energyVdw, energyElec, energySlow);
1307  } else {
1308  calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1309  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1310  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1311  vdwCoefTableTex, forceTableTex, energyTableTex,
1312  iforce, iforceSlow,
1313  s_jforce[iwarp][j],
1314  s_jforceSlow[iwarp][j],
1315  energyVdw, energyElec, energySlow,
1316  nbConstants);
1317  }
1318  }
1319  }
1320  }
1321  }
1322  WARP_SYNC(WARP_FULL_MASK);
1323  } // t
1324  } else {
1325  // Just compute forces
1326 #pragma unroll 4
1327  for (int t = 0; t < WARPSIZE; t++) {
1328  const int j = ((t + wid) & (WARPSIZE-1));
1329  if (doAlch) {
1330  part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1331  }
1332  if ((excl & 1)) {
1333  xyzq_j = s_xyzq[iwarp][j];
1334  float dx = xyzq_j.x - xyzq_i.x;
1335  float dy = xyzq_j.y - xyzq_i.y;
1336  float dz = xyzq_j.z - xyzq_i.z;
1337 
1338  float r2 = dx*dx + dy*dy + dz*dz;
1339  if(doAlch){
1340  if(r2 < cutoff2){ // (r2 < cutoff2)
1341  if(doShift){
1342  if (doFEP){
1343  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1344  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1345  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1346  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1347  iforce, iforceSlow,
1348  s_jforce[iwarp][j],
1349  s_jforceSlow[iwarp][j],
1350  energyVdw, energyVdw_s,
1351  energyElec, energySlow, energyElec_s, energySlow_s);
1352  }else{
1353  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching, jForceType>(
1354  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1355  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1356  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1357  iforce, iforceSlow,
1358  s_jforce[iwarp][j],
1359  s_jforceSlow[iwarp][j],
1360  energyVdw, energyVdw_ti_1,
1361  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1362  energySlow, energySlow_ti_1, energySlow_ti_2);
1363  }//if doFEP
1364  }else{
1365  if(doFEP){
1366  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1367  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1368  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1369  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1370  iforce, iforceSlow,
1371  s_jforce[iwarp][j],
1372  s_jforceSlow[iwarp][j],
1373  energyVdw, energyVdw_s,
1374  energyElec, energySlow, energyElec_s, energySlow_s);
1375  }else{
1376  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching, jForceType>(
1377  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1378  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1379  vdwCoefTable, vdwCoefTableTex, forceTableTex, energyTableTex,
1380  iforce, iforceSlow,
1381  s_jforce[iwarp][j],
1382  s_jforceSlow[iwarp][j],
1383  energyVdw, energyVdw_ti_1,
1384  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1385  energySlow, energySlow_ti_1, energySlow_ti_2);
1386  }//if doFEP
1387  }//doShift
1388  }//r2 < cutoff
1389  } else {
1390  if (r2 < cutoff2) {
1391  if (doTable) {
1392  calcForceEnergy<doEnergy, doSlow>(
1393  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1394  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1395  vdwCoefTableTex, forceTableTex, energyTableTex,
1396  iforce, iforceSlow,
1397  s_jforce[iwarp][j],
1398  s_jforceSlow[iwarp][j],
1399  energyVdw, energyElec, energySlow);
1400  } else {
1401  calcForceEnergyMath<doEnergy, doSlow, jForceType>(
1402  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1403  vdwtypei, s_vdwtypej[iwarp][j], vdwCoefTable,
1404  vdwCoefTableTex, forceTableTex, energyTableTex,
1405  iforce, iforceSlow,
1406  s_jforce[iwarp][j],
1407  s_jforceSlow[iwarp][j],
1408  energyVdw, energyElec, energySlow,
1409  nbConstants);
1410  }
1411  }// (r2 < cutoff2)
1412  }//doAlch
1413  } // (excl & 1)
1414  excl >>= 1;
1415  WARP_SYNC(WARP_FULL_MASK);
1416  } // t
1417  }
1418 
1419  // Write j-forces
1420  storeForces<doSlow, jForceType>(s_jatomStart[iwarp] + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
1421  devForce_x, devForce_y, devForce_z,
1422  devForceSlow_x, devForceSlow_y, devForceSlow_z);
1423  // Write exclusions
1424  if (doPairlist) {
1425  const unsigned int warp_exclude = WARP_BALLOT(WARP_FULL_MASK, nexcluded & 1);
1426  const unsigned int warp_any_exclude = WARP_BALLOT(WARP_FULL_MASK, excl);
1427  if (warp_exclude) {
1428  int anyexcl = warp_any_exclude ? 1 : 0;
1429  anyexcl |= 65536;
1430  // Mark this jtile as non-empty:
1431  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1432  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1433  if (wid == 0 && anyexcl) jtiles[jtile] = anyexcl;
1434  // Store exclusions
1435  tileExcls[jtile].excl[wid] = excl;
1436  // itileListLen:
1437  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1438  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1439  itileListLen += anyexcl;
1440  // NOTE, this minJatomStart is only stored once for the first tile list entry
1441  // minJatomStart = min(minJatomStart, jatomStart);
1442  }
1443  WARP_SYNC(WARP_FULL_MASK);
1444  }
1445  } // jtile
1446 
1447  // Write i-forces
1448  storeForces<doSlow, float3>(s_iatomStart[iwarp] + wid, iforce, iforceSlow,
1449  devForce_x, devForce_y, devForce_z,
1450  devForceSlow_x, devForceSlow_y, devForceSlow_z);
1451  }
1452  // Done with computation
1453 
1454  // Save pairlist stuff
1455  if (doPairlist) {
1456 
1457  // Warp index (0...warpsize-1)
1458  const int wid = threadIdx.x % WARPSIZE;
1459 
1460  if (wid == 0) {
1461  // minJatomStart is in range [0 ... atomStorageSize-1]
1462  //int atom0 = (minJatomStart)/WARPSIZE;
1463  // int atom0 = 0;
1464  // int storageOffset = atomStorageSize/WARPSIZE;
1465  // int itileListLen = 0;
1466  // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
1467  // Store 0 if itileListLen == 0
1468  // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
1469  tileListDepth[itileList] = itileListLen;
1470  tileListOrder[itileList] = itileList;
1471  // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
1472  if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
1473  // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1474  if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
1475  // NOTE: always numTileListsGBIS >= numTileLists
1476  }
1477 
1478  typedef cub::WarpReduce<int> WarpReduceInt;
1479  __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1480  const int warpId = threadIdx.x / WARPSIZE;
1481  // Remove indicator bit
1482  nexcluded >>= 1;
1483  volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
1484  if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
1485 
1486  }
1487 
1488  if (doVirial) {
1489  // Warp index (0...warpsize-1)
1490  const int wid = threadIdx.x % WARPSIZE;
1491 
1492  typedef cub::WarpReduce<float> WarpReduce;
1493  __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1494  const int warpId = threadIdx.x / WARPSIZE;
1495  volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
1496  WARP_SYNC(WARP_FULL_MASK);
1497  volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
1498  WARP_SYNC(WARP_FULL_MASK);
1499  volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
1500  WARP_SYNC(WARP_FULL_MASK);
1501  if (wid == 0) {
1502  virialEnergy[itileList].forcex = iforcexSum;
1503  virialEnergy[itileList].forcey = iforceySum;
1504  virialEnergy[itileList].forcez = iforcezSum;
1505  }
1506 
1507  if (doSlow) {
1508  iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
1509  WARP_SYNC(WARP_FULL_MASK);
1510  iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
1511  WARP_SYNC(WARP_FULL_MASK);
1512  iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
1513  WARP_SYNC(WARP_FULL_MASK);
1514  if (wid == 0) {
1515  virialEnergy[itileList].forceSlowx = iforcexSum;
1516  virialEnergy[itileList].forceSlowy = iforceySum;
1517  virialEnergy[itileList].forceSlowz = iforcezSum;
1518  }
1519  }
1520  }
1521 
1522  // Reduce energy
1523  if (doEnergy) {
1524  // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
1525  // (Why does CUB suck here?)
1526 #pragma unroll
1527  for (int i=16;i >= 1;i/=2) {
1528  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
1529  energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, 32);
1530  if(doFEP) energyVdw_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_s, i, 32);
1531  if(doFEP) energyElec_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_s, i, 32);
1532  if(doTI){
1533  energyVdw_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_1, i, 32);
1534  energyVdw_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_2, i, 32);
1535  energyElec_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_1, i, 32);
1536  energyElec_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_2, i, 32);
1537  }
1538  if (doSlow){
1539  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
1540  if(doFEP) energySlow_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_s, i, 32);
1541  if(doTI){
1542  energySlow_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_1, i, 32);
1543  energySlow_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_2, i, 32);
1544  }
1545  }
1546  }
1547 
1548  if (threadIdx.x % WARPSIZE == 0) {
1549  virialEnergy[itileList].energyVdw = energyVdw;
1550  virialEnergy[itileList].energyElec = energyElec;
1551  if (doFEP) virialEnergy[itileList].energyVdw_s = energyVdw_s;
1552  if (doFEP) virialEnergy[itileList].energyElec_s = energyElec_s;
1553  if(doTI){
1554  virialEnergy[itileList].energyVdw_ti_1 = energyVdw_ti_1;
1555  virialEnergy[itileList].energyVdw_ti_2 = energyVdw_ti_2;
1556  virialEnergy[itileList].energyElec_ti_1 = energyElec_ti_1;
1557  virialEnergy[itileList].energyElec_ti_2 = energyElec_ti_2;
1558  }
1559  if (doSlow) {
1560  virialEnergy[itileList].energySlow = energySlow;
1561  if(doFEP) virialEnergy[itileList].energySlow_s = energySlow_s;
1562  if (doTI){
1563  virialEnergy[itileList].energySlow_ti_1 = energySlow_ti_1;
1564  virialEnergy[itileList].energySlow_ti_2 = energySlow_ti_2;
1565  }
1566  }
1567  }
1568  }
1569  // XXX TODO: Disable streaming and see what happens
1570  // Let's try to set
1571  if (doStreaming) {
1572  // Make sure devForces and devForcesSlow have been written into device memory
1573  WARP_SYNC(WARP_FULL_MASK);
1574  __threadfence();
1575 
1576  int patchDone[2] = {false, false};
1577  const int wid = threadIdx.x % WARPSIZE;
1578  if (wid == 0) {
1579  int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
1580  patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
1581  if (patchInd.x != patchInd.y) {
1582  int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
1583  patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
1584  }
1585  }
1586 
1587  patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
1588  patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
1589 
1590  if (patchDone[0]) {
1591  // Patch 1 is done, write onto host-mapped memory
1592  CudaPatchRecord patch = cudaPatches[patchInd.x];
1593  int start = patch.atomStart;
1594  int end = start + patch.numAtoms;
1595  for (int i=start+wid;i < end;i+=WARPSIZE) {
1596  mapForces[i] = make_float4(devForce_x[i],
1597  devForce_y[i], devForce_z[i], devForce_w[i]);
1598  if (doSlow) {
1599  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1600  devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1601  }
1602  }
1603  }
1604  if (patchDone[1]) {
1605  // Patch 2 is done
1606  CudaPatchRecord patch = cudaPatches[patchInd.y];
1607  int start = patch.atomStart;
1608  int end = start + patch.numAtoms;
1609  for (int i=start+wid;i < end;i+=WARPSIZE) {
1610  mapForces[i] = make_float4(devForce_x[i],
1611  devForce_y[i], devForce_z[i], devForce_w[i]);
1612  if (doSlow) {
1613  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1614  devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1615  }
1616  }
1617  }
1618 
1619  if (patchDone[0] || patchDone[1]) {
1620  // Make sure mapForces and mapForcesSlow are up-to-date
1621  WARP_SYNC(WARP_FULL_MASK);
1622  __threadfence_system();
1623  // Add patch into "patchReadyQueue"
1624  if (wid == 0) {
1625  if (patchDone[0]) {
1626  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1627  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1628  mapPatchReadyQueue[ind] = patchInd.x;
1629  }
1630  if (patchDone[1]) {
1631  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1632  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1633  mapPatchReadyQueue[ind] = patchInd.y;
1634  }
1635  }
1636  }
1637  }
1638 
1639  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
1640  int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
1641  outputOrder[index] = itileList;
1642  }
1643  } // if (itileList < numTileLists)
1644 }
1645 
1646 //
1647 // Finish up - reduce virials from nonbonded kernel
1648 //
1649 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
1650 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
1651  const int atomStorageSize,
1652  const float4* __restrict__ xyzq,
1653  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
1654  VirialEnergy* __restrict__ virialEnergy) {
1655 
1656  for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
1657  {
1658  int i = ibase + threadIdx.x;
1659 
1660  // Set to zero to avoid nan*0
1661  float4 pos;
1662  pos.x = 0.0f;
1663  pos.y = 0.0f;
1664  pos.z = 0.0f;
1665  float4 force, forceSlow;
1666  force.x = 0.0f;
1667  force.y = 0.0f;
1668  force.z = 0.0f;
1669  forceSlow.x = 0.0f;
1670  forceSlow.y = 0.0f;
1671  forceSlow.z = 0.0f;
1672  if (i < atomStorageSize) {
1673  pos = xyzq[i];
1674  force = devForces[i];
1675  if (doSlow) forceSlow = devForcesSlow[i];
1676  }
1677  // Reduce across the entire thread block
1678  float vxxt = force.x*pos.x;
1679  float vxyt = force.x*pos.y;
1680  float vxzt = force.x*pos.z;
1681  float vyxt = force.y*pos.x;
1682  float vyyt = force.y*pos.y;
1683  float vyzt = force.y*pos.z;
1684  float vzxt = force.z*pos.x;
1685  float vzyt = force.z*pos.y;
1686  float vzzt = force.z*pos.z;
1687  // atomicAdd(&virialEnergy->virial[0], (double)vxx);
1688  // atomicAdd(&virialEnergy->virial[1], (double)vxy);
1689  // atomicAdd(&virialEnergy->virial[2], (double)vxz);
1690  // atomicAdd(&virialEnergy->virial[3], (double)vyx);
1691  // atomicAdd(&virialEnergy->virial[4], (double)vyy);
1692  // atomicAdd(&virialEnergy->virial[5], (double)vyz);
1693  // atomicAdd(&virialEnergy->virial[6], (double)vzx);
1694  // atomicAdd(&virialEnergy->virial[7], (double)vzy);
1695  // atomicAdd(&virialEnergy->virial[8], (double)vzz);
1696 
1697  typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1698  __shared__ typename BlockReduce::TempStorage tempStorage;
1699  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1700  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1701  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1702  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1703  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1704  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1705  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1706  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1707  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1708  if (threadIdx.x == 0) {
1709  atomicAdd(&virialEnergy->virial[0], (double)vxx);
1710  atomicAdd(&virialEnergy->virial[1], (double)vxy);
1711  atomicAdd(&virialEnergy->virial[2], (double)vxz);
1712  atomicAdd(&virialEnergy->virial[3], (double)vyx);
1713  atomicAdd(&virialEnergy->virial[4], (double)vyy);
1714  atomicAdd(&virialEnergy->virial[5], (double)vyz);
1715  atomicAdd(&virialEnergy->virial[6], (double)vzx);
1716  atomicAdd(&virialEnergy->virial[7], (double)vzy);
1717  atomicAdd(&virialEnergy->virial[8], (double)vzz);
1718  }
1719 
1720  if (doSlow) {
1721  // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
1722  float vxxSlowt = forceSlow.x*pos.x;
1723  float vxySlowt = forceSlow.x*pos.y;
1724  float vxzSlowt = forceSlow.x*pos.z;
1725  float vyxSlowt = forceSlow.y*pos.x;
1726  float vyySlowt = forceSlow.y*pos.y;
1727  float vyzSlowt = forceSlow.y*pos.z;
1728  float vzxSlowt = forceSlow.z*pos.x;
1729  float vzySlowt = forceSlow.z*pos.y;
1730  float vzzSlowt = forceSlow.z*pos.z;
1731  // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
1732  // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
1733  // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
1734  // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
1735  // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
1736  // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
1737  // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
1738  // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
1739  // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
1740  volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
1741  volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
1742  volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
1743  volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
1744  volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
1745  volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
1746  volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
1747  volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
1748  volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
1749  if (threadIdx.x == 0) {
1750  atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
1751  atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
1752  atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
1753  atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
1754  atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
1755  atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
1756  atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
1757  atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
1758  atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
1759  }
1760  }
1761 
1762  }
1763 }
1764 
1765 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
1766 __global__ void reduceVirialEnergyKernel(
1767  const bool doEnergy, const bool doVirial, const bool doSlow,
1768  const int numTileLists,
1769  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1770  VirialEnergy* __restrict__ virialEnergy) {
1771 
1772  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1773  {
1774  int itileList = ibase + threadIdx.x;
1775  TileListVirialEnergy ve;
1776  if (itileList < numTileLists) {
1777  ve = tileListVirialEnergy[itileList];
1778  } else {
1779  // Set to zero to avoid nan*0
1780  if (doVirial) {
1781  ve.shx = 0.0f;
1782  ve.shy = 0.0f;
1783  ve.shz = 0.0f;
1784  ve.forcex = 0.0f;
1785  ve.forcey = 0.0f;
1786  ve.forcez = 0.0f;
1787  ve.forceSlowx = 0.0f;
1788  ve.forceSlowy = 0.0f;
1789  ve.forceSlowz = 0.0f;
1790  }
1791  if (doEnergy) {
1792  ve.energyVdw = 0.0;
1793  ve.energyVdw_s = 0.0;
1794  ve.energyElec = 0.0;
1795  ve.energySlow = 0.0;
1796  ve.energyElec_s = 0.0;
1797  ve.energySlow_s = 0.0;
1798 
1799  /* TI stuff */
1800  ve.energyVdw_ti_1 = 0.0;
1801  ve.energyVdw_ti_2 = 0.0;
1802  ve.energyElec_ti_1 = 0.0;
1803  ve.energyElec_ti_2 = 0.0;
1804  ve.energySlow_ti_1 = 0.0;
1805  ve.energySlow_ti_2 = 0.0;
1806  // ve.energyGBIS = 0.0;
1807  }
1808  }
1809 
1810  if (doVirial) {
1811  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1812  __shared__ typename BlockReduce::TempStorage tempStorage;
1813  float vxxt = ve.forcex*ve.shx;
1814  float vxyt = ve.forcex*ve.shy;
1815  float vxzt = ve.forcex*ve.shz;
1816  float vyxt = ve.forcey*ve.shx;
1817  float vyyt = ve.forcey*ve.shy;
1818  float vyzt = ve.forcey*ve.shz;
1819  float vzxt = ve.forcez*ve.shx;
1820  float vzyt = ve.forcez*ve.shy;
1821  float vzzt = ve.forcez*ve.shz;
1822  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1823  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1824  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1825  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1826  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1827  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1828  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1829  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1830  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1831  if (threadIdx.x == 0) {
1832  atomicAdd(&virialEnergy->virial[0], (double)vxx);
1833  atomicAdd(&virialEnergy->virial[1], (double)vxy);
1834  atomicAdd(&virialEnergy->virial[2], (double)vxz);
1835  atomicAdd(&virialEnergy->virial[3], (double)vyx);
1836  atomicAdd(&virialEnergy->virial[4], (double)vyy);
1837  atomicAdd(&virialEnergy->virial[5], (double)vyz);
1838  atomicAdd(&virialEnergy->virial[6], (double)vzx);
1839  atomicAdd(&virialEnergy->virial[7], (double)vzy);
1840  atomicAdd(&virialEnergy->virial[8], (double)vzz);
1841  }
1842 
1843  if (doSlow) {
1844  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1845  __shared__ typename BlockReduce::TempStorage tempStorage;
1846  float vxxt = ve.forceSlowx*ve.shx;
1847  float vxyt = ve.forceSlowx*ve.shy;
1848  float vxzt = ve.forceSlowx*ve.shz;
1849  float vyxt = ve.forceSlowy*ve.shx;
1850  float vyyt = ve.forceSlowy*ve.shy;
1851  float vyzt = ve.forceSlowy*ve.shz;
1852  float vzxt = ve.forceSlowz*ve.shx;
1853  float vzyt = ve.forceSlowz*ve.shy;
1854  float vzzt = ve.forceSlowz*ve.shz;
1855  volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1856  volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1857  volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1858  volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1859  volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1860  volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1861  volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1862  volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1863  volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1864  if (threadIdx.x == 0) {
1865  atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
1866  atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
1867  atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
1868  atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
1869  atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
1870  atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
1871  atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
1872  atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
1873  atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
1874  }
1875  }
1876  }
1877 
1878  if (doEnergy) {
1879  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1880  /* Maybe we should guard the TI and FEP energies, since those are not to be calculated on regular MDs */
1881  __shared__ typename BlockReduce::TempStorage tempStorage;
1882  volatile double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1883  volatile double energyVdw_s = BlockReduce(tempStorage).Sum(ve.energyVdw_s); BLOCK_SYNC;
1884  volatile double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1885  volatile double energyElec_s = BlockReduce(tempStorage).Sum(ve.energyElec_s); BLOCK_SYNC;
1886  volatile double energyVdw_ti_1 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_1); BLOCK_SYNC;
1887  volatile double energyVdw_ti_2 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_2); BLOCK_SYNC;
1888  volatile double energyElec_ti_1= BlockReduce(tempStorage).Sum(ve.energyElec_ti_1); BLOCK_SYNC;
1889  volatile double energyElec_ti_2= BlockReduce(tempStorage).Sum(ve.energyElec_ti_2); BLOCK_SYNC;
1890  if (threadIdx.x == 0){
1891  atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
1892  atomicAdd(&virialEnergy->energyVdw_s, (double)energyVdw_s);
1893  atomicAdd(&virialEnergy->energyElec, (double)energyElec);
1894  atomicAdd(&virialEnergy->energyElec_s, (double)energyElec_s);
1895  atomicAdd(&virialEnergy->energyVdw_ti_1, (double)energyVdw_ti_1);
1896  atomicAdd(&virialEnergy->energyVdw_ti_2, (double)energyVdw_ti_2);
1897  atomicAdd(&virialEnergy->energyElec_ti_1, (double)energyElec_ti_1);
1898  atomicAdd(&virialEnergy->energyElec_ti_2, (double)energyElec_ti_2);
1899  }
1900  if (doSlow) {
1901  volatile double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1902  volatile double energySlow_s = BlockReduce(tempStorage).Sum(ve.energySlow_s); BLOCK_SYNC;
1903  volatile double energySlow_ti_1 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_1); BLOCK_SYNC;
1904  volatile double energySlow_ti_2 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_2); BLOCK_SYNC;
1905  if (threadIdx.x == 0) {
1906  atomicAdd(&virialEnergy->energySlow, (double)energySlow);
1907  atomicAdd(&virialEnergy->energySlow_s, (double)energySlow_s);
1908  atomicAdd(&virialEnergy->energySlow_ti_1,(double)energySlow_ti_1);
1909  atomicAdd(&virialEnergy->energySlow_ti_2,(double)energySlow_ti_2);
1910  }
1911  }
1912  // if (doGBIS) {
1913  // double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
1914  // if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1915  // }
1916  }
1917 
1918  }
1919 
1920 }
1921 
1922 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
1923 __global__ void reduceGBISEnergyKernel(const int numTileLists,
1924  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1925  VirialEnergy* __restrict__ virialEnergy) {
1926 
1927  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1928  {
1929  int itileList = ibase + threadIdx.x;
1930  double energyGBISt = 0.0;
1931  if (itileList < numTileLists) {
1932  energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
1933  }
1934 
1935  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1936  __shared__ typename BlockReduce::TempStorage tempStorage;
1937  volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
1938  if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1939  }
1940 
1941 }
1942 
1943 // ##############################################################################################
1944 // ##############################################################################################
1945 // ##############################################################################################
1946 
1947 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
1948  bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
1949 
1950  cudaCheck(cudaSetDevice(deviceID));
1951 
1952  d_exclusionsByAtom = NULL;
1953 
1954  overflowExclusions = NULL;
1955  overflowExclusionsSize = 0;
1956 
1957  exclIndexMaxDiff = NULL;
1958  exclIndexMaxDiffSize = 0;
1959 
1960  atomIndex = NULL;
1961  atomIndexSize = 0;
1962 
1963  vdwTypes = NULL;
1964  vdwTypesSize = 0;
1965 
1966  patchNumCount = NULL;
1967  patchNumCountSize = 0;
1968 
1969  patchReadyQueue = NULL;
1970  patchReadyQueueSize = 0;
1971 
1972  force_x = force_y = force_z = force_w = NULL;
1973  forceSize = 0;
1974  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1975  forceSlowSize = 0;
1976 }
1977 
1978 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
1979 {
1980 #if 0
1981  size_t forceSizeCurrent;
1982 
1983  // reallocate_device will update forceSizeCurrent, so we need to reset it to the current
1984  // value for each array
1985  forceSizeCurrent = forceSize;
1986  reallocate_device<float>(&force_x, &forceSizeCurrent, atomStorageSize, 1.4f);
1987  forceSizeCurrent = forceSize;
1988  reallocate_device<float>(&force_y, &forceSizeCurrent, atomStorageSize, 1.4f);
1989  forceSizeCurrent = forceSize;
1990  reallocate_device<float>(&force_z, &forceSizeCurrent, atomStorageSize, 1.4f);
1991  forceSizeCurrent = forceSize;
1992  reallocate_device<float>(&force_w, &forceSizeCurrent, atomStorageSize, 1.4f);
1993 
1994 
1995  size_t forceSlowSizeCurrent;
1996  forceSlowSizeCurrent = forceSlowSize;
1997  reallocate_device<float>(&forceSlow_x, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
1998  forceSlowSizeCurrent = forceSlowSize;
1999  reallocate_device<float>(&forceSlow_y, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2000  forceSlowSizeCurrent = forceSlowSize;
2001  reallocate_device<float>(&forceSlow_z, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2002  forceSlowSizeCurrent = forceSlowSize;
2003  reallocate_device<float>(&forceSlow_w, &forceSlowSizeCurrent, atomStorageSize, 1.4f);
2004 #else
2005  reallocate_device<float>(&force_x, &forceSize, atomStorageSize*8, 1.4f);
2006  force_y = force_x + atomStorageSize;
2007  force_z = force_y + atomStorageSize;
2008  force_w = force_z + atomStorageSize;
2009  forceSlow_x = force_w + atomStorageSize;
2010  forceSlow_y = forceSlow_x + atomStorageSize;
2011  forceSlow_z = forceSlow_y + atomStorageSize;
2012  forceSlow_w = forceSlow_z + atomStorageSize;
2013 #endif
2014 }
2015 
2016 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
2017  cudaCheck(cudaSetDevice(deviceID));
2018  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
2019  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
2020  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
2021  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
2022  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
2023  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
2024 #if 0
2025  if (force_x != NULL) deallocate_device<float>(&force_x);
2026  if (force_y != NULL) deallocate_device<float>(&force_y);
2027  if (force_z != NULL) deallocate_device<float>(&force_z);
2028  if (force_w != NULL) deallocate_device<float>(&force_w);
2029  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
2030  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
2031  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
2032  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
2033 #else
2034  if (force_x != NULL) deallocate_device<float>(&force_x);
2035 #endif
2036 }
2037 
2038 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
2039  const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
2040 
2041  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
2042  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
2043  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
2044 
2045  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
2046  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
2047  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
2048 }
2049 
2050 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
2051  if (!doStreaming) {
2052  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
2053  }
2054  return patchReadyQueue;
2055 }
2056 
2057 template <int doSlow>
2058 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
2059  float *fx, float *fy, float *fz, float *fw,
2060  float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
2061  int n)
2062 {
2063  int tid = blockIdx.x*blockDim.x + threadIdx.x;
2064  if (tid < n) {
2065  f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
2066  fx[tid] = 0.f; fy[tid] = 0.f; fz[tid] = 0.f; fw[tid] = 0.f;
2067  if (doSlow) {
2068  fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
2069  fSlowx[tid] = 0.f; fSlowy[tid] = 0.f; fSlowz[tid] = 0.f; fSloww[tid] = 0.f;
2070  }
2071  }
2072 }
2073 
2074 
2075 
2076 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
2077  const int atomStorageSize, const bool atomsChanged, const bool doMinimize,
2078  const bool doPairlist, const bool doEnergy, const bool doVirial,
2079  const bool doSlow, const bool doAlch, const bool doAlchVdwForceSwitching,
2080  const bool doFEP, const bool doTI, const bool doTable,
2081  const float3 lata, const float3 latb, const float3 latc,
2082  const float4* h_xyzq, const float cutoff2,
2083  const CudaNBConstants nbConstants,
2084  float4* d_forces, float4* d_forcesSlow,
2085  float4* h_forces, float4* h_forcesSlow, AlchData *srcFlags,
2086  bool lambdaWindowUpdated, char *part,
2087  bool CUDASOAintegrator, bool useDeviceMigration,
2088  cudaStream_t stream) {
2089 
2090 #ifdef NODEGROUP_FORCE_REGISTER
2091  if (!atomsChanged && !CUDASOAintegrator) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2092 #else
2093  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2094 #endif
2095 
2096  if (doAlch){
2097  // Copy partition to device. This is not necessary if both CUDASOAintegrator and useDeviceMigration
2098  // are true.
2099  if (doPairlist && (!CUDASOAintegrator || !useDeviceMigration)) {
2100  copy_HtoD< char>(part, tlKernel.get_part(), atomStorageSize, stream);
2101  }
2102  //Copies flags to constant memory
2103  if(lambdaWindowUpdated) cudaCheck(cudaMemcpyToSymbol(alchflags, srcFlags, sizeof(AlchData)));
2104  }
2105 
2106  // XXX TODO: Get rid of the clears
2107  if(1){
2108  // clear_device_array<float4>(d_forces, atomStorageSize, stream);
2109  // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
2110  // two clears
2111  // This needs to go.
2112  if (doStreaming) tlKernel.clearTileListStat(stream);
2113  if(atomsChanged || doMinimize){
2114  clear_device_array<float>(force_x, atomStorageSize*4, stream);
2115  if(doSlow) clear_device_array<float>(forceSlow_x, atomStorageSize*4, stream);
2116  }
2117  }
2118 
2119  // --- streaming ----
2120  float4* m_forces = NULL;
2121  float4* m_forcesSlow = NULL;
2122  int* m_patchReadyQueue = NULL;
2123  int numPatches = 0;
2124  unsigned int* patchNumCountPtr = NULL;
2125  if (doStreaming) {
2126  numPatches = tlKernel.getNumPatches();
2127  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
2128  // If re-allocated, clear array
2129  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
2130  }
2131  patchNumCountPtr = patchNumCount;
2132  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
2133  if (re) {
2134  // If re-allocated, re-set to "-1"
2135  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
2136  }
2137  cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
2138  cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
2139  cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
2140  }
2141  // -----------------
2142 
2143  if (doVirial || doEnergy) {
2144  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
2145  }
2146 
2147  int shMemSize = 0;
2148 
2149  int* outputOrderPtr = tlKernel.getOutputOrder();
2150 
2151  int nwarp = NONBONDKERNEL_NUM_WARP;
2152  int nthread = WARPSIZE*nwarp;
2153  int start = 0;
2154 
2155 #define APVERSION
2156 #undef APVERSION
2157 
2158 #ifdef APVERSION
2159 #else
2160  int options = doEnergy + (doVirial << 1) + (doSlow << 2) +
2161  (doPairlist << 3) + (doAlch << 4) + (doFEP << 5) + (doTI << 6) + (doStreaming << 7) + (doTable << 8) + (doAlchVdwForceSwitching << 9);
2162 #endif
2163 
2164  while (start < tlKernel.getNumTileLists()) {
2165 
2166  int nleft = tlKernel.getNumTileLists() - start;
2167  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
2168 #ifdef APVERSION
2169 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING) \
2170  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING> \
2171  <<< nblock, nthread, shMemSize, stream >>> \
2172  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2173  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
2174  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2175  cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
2176  atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2177  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2178  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2179  force_x, force_y, force_z, force_w, \
2180  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2181  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2182  outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part()); called=true
2183 
2184  bool called = false;
2185  if (doStreaming) {
2186  if(!doAlch){
2187  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 1, 0);
2188  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 1, 0);
2189  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 1, 0);
2190  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 1, 0);
2191  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 1, 0);
2192  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 1, 0);
2193  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 1, 0);
2194  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 1, 0);
2195 
2196 
2197  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 1, 0);
2198  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 1, 0);
2199  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 1, 0);
2200  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 1, 0);
2201  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 1, 0);
2202  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 1, 0);
2203  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 1, 0);
2204  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 1, 0);
2205  }else{
2206  if(doFEP){
2207  if (doAlchVdwForceSwitching) {
2208  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 1);
2209  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 1);
2210  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 1);
2211  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 1);
2212  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 1);
2213  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 1);
2214  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 1);
2215  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 1);
2216 
2217  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 1);
2218  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 1);
2219  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 1);
2220  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 1);
2221  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 1);
2222  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 1);
2223  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 1);
2224  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 1);
2225  } else {
2226  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 0);
2227  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 0);
2228  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 0);
2229  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 0);
2230  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 0);
2231  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 0);
2232  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 0);
2233  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 0);
2234 
2235  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 0);
2236  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 0);
2237  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 0);
2238  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 0);
2239  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 0);
2240  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 0);
2241  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 0);
2242  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 0);
2243  } // doAlchVdwForceSwitching
2244  }else{
2245  // TI
2246  if (doAlchVdwForceSwitching) {
2247  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 1);
2248  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 1);
2249  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 1);
2250  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 1);
2251  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 1);
2252  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 1);
2253  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 1);
2254  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 1);
2255 
2256  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 1);
2257  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 1);
2258  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 1);
2259  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 1);
2260  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 1);
2261  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 1);
2262  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 1);
2263  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 1);
2264  } else {
2265  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 0);
2266  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 0);
2267  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 0);
2268  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 0);
2269  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 0);
2270  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 0);
2271  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 0);
2272  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 0);
2273 
2274  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 0);
2275  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 0);
2276  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 0);
2277  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 0);
2278  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 0);
2279  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 0);
2280  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 0);
2281  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 0);
2282  } // doAlchVdwForceSwitching
2283  } // doFEP
2284  } // doAlch
2285  }
2286  else {
2287  // no streaming
2288  if(!doAlch){
2289  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 0, 0);
2290  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 0, 0);
2291  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 0, 0);
2292  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 0, 0);
2293  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 0, 0);
2294  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 0, 0);
2295  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 0, 0);
2296  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 0, 0);
2297 
2298 
2299  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 0, 0);
2300  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 0, 0);
2301  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 0, 0);
2302  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 0, 0);
2303  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 0, 0);
2304  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 0, 0);
2305  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 0, 0);
2306  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 0, 0);
2307  }else{
2308  if(doFEP){
2309  if (doAlchVdwForceSwitching) {
2310  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 1);
2311  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 1);
2312  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 1);
2313  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 1);
2314  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 1);
2315  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 1);
2316  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 1);
2317  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 1);
2318 
2319  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 1);
2320  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 1);
2321  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 1);
2322  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 1);
2323  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 1);
2324  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 1);
2325  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 1);
2326  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 1);
2327  } else {
2328  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 0);
2329  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 0);
2330  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 0);
2331  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 0);
2332  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 0);
2333  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 0);
2334  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 0);
2335  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 0);
2336 
2337  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 0);
2338  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 0);
2339  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 0);
2340  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 0);
2341  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 0);
2342  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 0);
2343  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 0);
2344  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 0);
2345  }
2346  }else{
2347  // TI
2348  if (doAlchVdwForceSwitching) {
2349  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 1);
2350  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 1);
2351  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 1);
2352  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 1);
2353  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 1);
2354  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 1);
2355  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 1);
2356  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 1);
2357 
2358  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 1);
2359  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 1);
2360  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 1);
2361  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 1);
2362  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 1);
2363  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 1);
2364  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 1);
2365  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 1);
2366  } else {
2367  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 0);
2368  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 0);
2369  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 0);
2370  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 0);
2371  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 0);
2372  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 0);
2373  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 0);
2374  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 0);
2375 
2376  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 0);
2377  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 0);
2378  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 0);
2379  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 0);
2380  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 0);
2381  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 0);
2382  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 0);
2383  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 0);
2384  }
2385  }//if doFEP
2386  }//if doAlch
2387  }//if doStreaming
2388 
2389  if (!called) {
2390  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
2391  }
2392 
2393 #else
2394 
2395 
2396 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING) \
2397  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING> \
2398  <<< nblock, nthread, shMemSize, stream >>> \
2399  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2400  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
2401  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2402  cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
2403  atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2404  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2405  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2406  force_x, force_y, force_z, force_w, \
2407  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2408  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2409  outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part())
2410 
2411 #ifdef DEBUG
2412  char errmsg[256];
2413 #endif
2414 
2415 
2416  switch (options) {
2417  case 0: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2418  case 1: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2419  case 2: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2420  case 3: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2421  case 4: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2422  case 5: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2423  case 6: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2424  case 7: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2425  case 8: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2426  case 9: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2427  case 10: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2428  case 11: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2429  case 12: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2430  case 13: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2431  case 14: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2432  case 15: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2433 
2434 #if 0
2435  case 16: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2436  case 17: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2437  case 18: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2438  case 19: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2439  case 20: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2440  case 21: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2441  case 22: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2442  case 23: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2443  case 24: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2444  case 25: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2445  case 26: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2446  case 27: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2447  case 28: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2448  case 29: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2449  case 30: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2450  case 31: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2451 
2452  case 32: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2453  case 33: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2454  case 34: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2455  case 35: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2456  case 36: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2457  case 37: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2458  case 38: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2459  case 39: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2460  case 40: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2461  case 41: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2462  case 42: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2463  case 43: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2464  case 44: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2465  case 45: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2466  case 46: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2467  case 47: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2468 
2469  case 48: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2470  case 49: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2471  case 50: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2472  case 51: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2473  case 52: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2474  case 53: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2475  case 54: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2476  case 55: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2477  case 56: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2478  case 57: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2479  case 58: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2480  case 59: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2481  case 60: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2482  case 61: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2483  case 62: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2484  case 63: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2485 
2486  case 64: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2487  case 65: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2488  case 66: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2489  case 67: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2490  case 68: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2491  case 69: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2492  case 70: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2493  case 71: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2494  case 72: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2495  case 73: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2496  case 74: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2497  case 75: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2498  case 76: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2499  case 77: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2500  case 78: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2501  case 79: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2502 
2503  case 80: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2504  case 81: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2505  case 82: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2506  case 83: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2507  case 84: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2508  case 85: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2509  case 86: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2510  case 87: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2511  case 88: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2512  case 89: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2513  case 90: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2514  case 91: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2515  case 92: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2516  case 93: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2517  case 94: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2518  case 95: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2519 
2520  case 96: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2521  case 97: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2522  case 98: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2523  case 99: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2524  case 100: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2525  case 101: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2526  case 102: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2527  case 103: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2528  case 104: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2529  case 105: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2530  case 106: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2531  case 107: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2532  case 108: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2533  case 109: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2534  case 110: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2535  case 111: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2536 
2537  case 112: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2538  case 113: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2539  case 114: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2540  case 115: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2541  case 116: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2542  case 117: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2543  case 118: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2544  case 119: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2545  case 120: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2546  case 121: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2547  case 122: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2548  case 123: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2549  case 124: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2550  case 125: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2551  case 126: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2552  case 127: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2553 
2554 #endif
2555  case 128: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2556  case 129: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2557  case 130: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2558  case 131: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2559  case 132: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2560  case 133: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2561  case 134: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2562  case 135: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2563  case 136: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2564  case 137: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2565  case 138: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2566  case 139: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2567  case 140: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2568  case 141: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2569  case 142: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2570  case 143: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2571 
2572 #if 0
2573  case 144: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2574  case 145: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2575  case 146: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2576  case 147: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2577  case 148: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2578  case 149: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2579  case 150: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2580  case 151: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2581  case 152: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2582  case 153: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2583  case 154: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2584  case 155: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2585  case 156: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2586  case 157: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2587  case 158: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2588  case 159: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2589 
2590  case 160: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2591  case 161: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2592  case 162: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2593  case 163: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2594  case 164: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2595  case 165: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2596  case 166: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2597  case 167: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2598  case 168: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2599  case 169: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2600  case 170: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2601  case 171: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2602  case 172: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2603  case 173: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2604  case 174: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2605  case 175: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2606 
2607  case 176: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2608  case 177: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2609  case 178: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2610  case 179: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2611  case 180: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2612  case 181: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2613  case 182: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2614  case 183: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2615  case 184: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2616  case 185: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2617  case 186: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2618  case 187: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2619  case 188: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2620  case 189: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2621  case 190: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2622  case 191: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2623 
2624  case 192: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2625  case 193: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2626  case 194: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2627  case 195: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2628  case 196: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2629  case 197: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2630  case 198: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2631  case 199: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2632  case 200: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2633  case 201: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2634  case 202: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2635  case 203: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2636  case 204: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2637  case 205: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2638  case 206: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2639  case 207: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2640 
2641  case 208: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2642  case 209: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2643  case 210: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2644  case 211: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2645  case 212: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2646  case 213: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2647  case 214: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2648  case 215: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2649  case 216: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2650  case 217: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2651  case 218: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2652  case 219: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2653  case 220: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2654  case 221: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2655  case 222: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2656  case 223: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2657 
2658  case 224: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2659  case 225: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2660  case 226: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2661  case 227: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2662  case 228: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2663  case 229: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2664  case 230: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2665  case 231: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2666  case 232: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2667  case 233: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2668  case 234: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2669  case 235: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2670  case 236: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2671  case 237: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2672  case 238: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2673  case 239: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2674 
2675  case 240: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2676  case 241: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2677  case 242: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2678  case 243: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2679  case 244: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2680  case 245: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2681  case 246: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2682  case 247: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2683  case 248: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2684  case 249: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2685  case 250: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2686  case 251: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2687  case 252: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2688  case 253: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2689  case 254: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2690  case 255: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2691 
2692 #endif
2693  case 256: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2694  case 257: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2695  case 258: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2696  case 259: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2697  case 260: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2698  case 261: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2699  case 262: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2700  case 263: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2701  case 264: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2702  case 265: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2703  case 266: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2704  case 267: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2705  case 268: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2706  case 269: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2707  case 270: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2708  case 271: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2709 
2710 #if 0
2711  case 272: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2712  case 273: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2713  case 274: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2714  case 275: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2715  case 276: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2716  case 277: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2717  case 278: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2718  case 279: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2719  case 280: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2720  case 281: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2721  case 282: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2722  case 283: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2723  case 284: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2724  case 285: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2725  case 286: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2726  case 287: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2727 
2728  case 288: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2729  case 289: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2730  case 290: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2731  case 291: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2732  case 292: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2733  case 293: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2734  case 294: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2735  case 295: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2736  case 296: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2737  case 297: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2738  case 298: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2739  case 299: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2740  case 300: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2741  case 301: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2742  case 302: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2743  case 303: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2744 
2745 #endif
2746  case 304: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2747  case 305: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2748  case 306: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2749  case 307: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2750  case 308: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2751  case 309: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2752  case 310: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2753  case 311: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2754  case 312: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2755  case 313: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2756  case 314: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2757  case 315: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2758  case 316: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2759  case 317: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2760  case 318: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2761  case 319: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2762 
2763 #if 0
2764  case 320: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2765  case 321: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2766  case 322: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2767  case 323: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2768  case 324: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2769  case 325: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2770  case 326: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2771  case 327: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2772  case 328: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2773  case 329: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2774  case 330: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2775  case 331: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2776  case 332: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2777  case 333: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2778  case 334: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2779  case 335: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2780 
2781 #endif
2782  case 336: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2783  case 337: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2784  case 338: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2785  case 339: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2786  case 340: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2787  case 341: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2788  case 342: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2789  case 343: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2790  case 344: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2791  case 345: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2792  case 346: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2793  case 347: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2794  case 348: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2795  case 349: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2796  case 350: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2797  case 351: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2798 
2799 #if 0
2800  case 352: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2801  case 353: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2802  case 354: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2803  case 355: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2804  case 356: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2805  case 357: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2806  case 358: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2807  case 359: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2808  case 360: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2809  case 361: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2810  case 362: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2811  case 363: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2812  case 364: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
2813  case 365: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
2814  case 366: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
2815  case 367: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
2816 
2817  case 368: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
2818  case 369: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
2819  case 370: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
2820  case 371: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
2821  case 372: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
2822  case 373: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
2823  case 374: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
2824  case 375: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
2825  case 376: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
2826  case 377: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
2827  case 378: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
2828  case 379: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
2829  case 380: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
2830  case 381: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
2831  case 382: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
2832  case 383: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
2833 
2834 #endif
2835  case 384: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
2836  case 385: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
2837  case 386: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
2838  case 387: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
2839  case 388: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
2840  case 389: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
2841  case 390: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
2842  case 391: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
2843  case 392: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
2844  case 393: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
2845  case 394: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
2846  case 395: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
2847  case 396: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
2848  case 397: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
2849  case 398: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
2850  case 399: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
2851 
2852 #if 0
2853  case 400: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
2854  case 401: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
2855  case 402: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
2856  case 403: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
2857  case 404: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
2858  case 405: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
2859  case 406: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
2860  case 407: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
2861  case 408: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
2862  case 409: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
2863  case 410: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
2864  case 411: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
2865  case 412: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
2866  case 413: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
2867  case 414: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
2868  case 415: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
2869 
2870  case 416: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
2871  case 417: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
2872  case 418: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
2873  case 419: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
2874  case 420: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
2875  case 421: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
2876  case 422: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
2877  case 423: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
2878  case 424: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
2879  case 425: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
2880  case 426: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
2881  case 427: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
2882  case 428: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
2883  case 429: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
2884  case 430: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
2885  case 431: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
2886 
2887 #endif
2888  case 432: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
2889  case 433: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
2890  case 434: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
2891  case 435: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
2892  case 436: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
2893  case 437: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
2894  case 438: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
2895  case 439: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
2896  case 440: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
2897  case 441: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
2898  case 442: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
2899  case 443: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
2900  case 444: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
2901  case 445: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
2902  case 446: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
2903  case 447: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
2904 
2905 #if 0
2906  case 448: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
2907  case 449: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
2908  case 450: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
2909  case 451: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
2910  case 452: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
2911  case 453: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
2912  case 454: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
2913  case 455: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
2914  case 456: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
2915  case 457: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
2916  case 458: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
2917  case 459: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
2918  case 460: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
2919  case 461: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
2920  case 462: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
2921  case 463: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
2922 
2923 #endif
2924  case 464: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
2925  case 465: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
2926  case 466: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
2927  case 467: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
2928  case 468: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
2929  case 469: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
2930  case 470: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
2931  case 471: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
2932  case 472: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
2933  case 473: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
2934  case 474: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
2935  case 475: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
2936  case 476: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
2937  case 477: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
2938  case 478: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
2939  case 479: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
2940 
2941 #if 0
2942  case 480: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
2943  case 481: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
2944  case 482: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
2945  case 483: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
2946  case 484: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
2947  case 485: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
2948  case 486: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
2949  case 487: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
2950  case 488: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
2951  case 489: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
2952  case 490: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
2953  case 491: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
2954  case 492: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
2955  case 493: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
2956  case 494: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
2957  case 495: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
2958 
2959  case 496: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
2960  case 497: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
2961  case 498: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
2962  case 499: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
2963  case 500: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
2964  case 501: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
2965  case 502: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
2966  case 503: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
2967  case 504: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
2968  case 505: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
2969  case 506: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
2970  case 507: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
2971  case 508: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
2972  case 509: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
2973  case 510: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
2974  case 511: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
2975 #endif
2976  /*
2977  * Haochuan: the calls starting from 512 to 1023 were generated by the following python script
2978  * #!/usr/bin/env python3
2979  * def gen_call(option: int):
2980  * doEnergy = option & 1
2981  * doVirial = (option >> 1) & 1
2982  * doSlow = (option >> 2) & 1
2983  * doPairlist = (option >> 3) & 1
2984  * doAlch = (option >> 4) & 1
2985  * doFEP = (option >> 5) & 1
2986  * doTI = (option >> 6) & 1
2987  * doStreaming = (option >> 7) & 1
2988  * doTable = (option >> 8) & 1
2989  * doAlchVdwForceSwitching = (option >> 9) & 1
2990  * incompatible = False
2991  * incompatible = incompatible | (doFEP and doTI)
2992  * incompatible = incompatible | (doAlch and ((not doFEP) and (not doTI)))
2993  * incompatible = incompatible | ((not doAlch) and (doFEP or doTI or doAlchVdwForceSwitching))
2994  * incompatible = incompatible | ((not doTable) and (doAlch or doTI or doFEP or doAlchVdwForceSwitching))
2995  * if incompatible:
2996  * pass
2997  * print(f' // case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
2998  * else:
2999  * print(f' case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3000  * return
3001  *
3002  * for i in range(512, 1024):
3003  * gen_call(i)
3004  *
3005  */
3006  // case 512: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3007  // case 513: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3008  // case 514: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3009  // case 515: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3010  // case 516: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3011  // case 517: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3012  // case 518: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3013  // case 519: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3014  // case 520: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3015  // case 521: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3016  // case 522: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3017  // case 523: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3018  // case 524: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3019  // case 525: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3020  // case 526: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3021  // case 527: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3022  // case 528: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3023  // case 529: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3024  // case 530: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3025  // case 531: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3026  // case 532: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3027  // case 533: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3028  // case 534: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3029  // case 535: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3030  // case 536: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3031  // case 537: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3032  // case 538: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3033  // case 539: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3034  // case 540: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3035  // case 541: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3036  // case 542: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3037  // case 543: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3038  // case 544: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3039  // case 545: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3040  // case 546: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3041  // case 547: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3042  // case 548: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3043  // case 549: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3044  // case 550: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3045  // case 551: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3046  // case 552: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3047  // case 553: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3048  // case 554: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3049  // case 555: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3050  // case 556: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3051  // case 557: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3052  // case 558: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3053  // case 559: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3054  // case 560: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3055  // case 561: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3056  // case 562: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3057  // case 563: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3058  // case 564: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3059  // case 565: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3060  // case 566: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3061  // case 567: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3062  // case 568: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3063  // case 569: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3064  // case 570: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3065  // case 571: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3066  // case 572: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3067  // case 573: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3068  // case 574: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3069  // case 575: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3070  // case 576: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3071  // case 577: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3072  // case 578: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3073  // case 579: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3074  // case 580: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3075  // case 581: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3076  // case 582: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3077  // case 583: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3078  // case 584: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3079  // case 585: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3080  // case 586: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3081  // case 587: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3082  // case 588: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3083  // case 589: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3084  // case 590: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3085  // case 591: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3086  // case 592: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3087  // case 593: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3088  // case 594: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3089  // case 595: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3090  // case 596: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3091  // case 597: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3092  // case 598: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3093  // case 599: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3094  // case 600: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3095  // case 601: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3096  // case 602: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3097  // case 603: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3098  // case 604: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3099  // case 605: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3100  // case 606: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3101  // case 607: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3102  // case 608: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3103  // case 609: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3104  // case 610: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3105  // case 611: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3106  // case 612: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3107  // case 613: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3108  // case 614: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3109  // case 615: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3110  // case 616: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3111  // case 617: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3112  // case 618: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3113  // case 619: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3114  // case 620: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3115  // case 621: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3116  // case 622: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3117  // case 623: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3118  // case 624: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3119  // case 625: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3120  // case 626: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3121  // case 627: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3122  // case 628: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3123  // case 629: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3124  // case 630: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3125  // case 631: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3126  // case 632: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3127  // case 633: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3128  // case 634: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3129  // case 635: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3130  // case 636: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3131  // case 637: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3132  // case 638: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3133  // case 639: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3134  // case 640: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3135  // case 641: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3136  // case 642: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3137  // case 643: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3138  // case 644: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3139  // case 645: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3140  // case 646: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3141  // case 647: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3142  // case 648: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3143  // case 649: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3144  // case 650: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3145  // case 651: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3146  // case 652: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3147  // case 653: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3148  // case 654: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3149  // case 655: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3150  // case 656: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3151  // case 657: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3152  // case 658: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3153  // case 659: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3154  // case 660: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3155  // case 661: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3156  // case 662: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3157  // case 663: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3158  // case 664: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3159  // case 665: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3160  // case 666: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3161  // case 667: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3162  // case 668: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3163  // case 669: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3164  // case 670: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3165  // case 671: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3166  // case 672: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3167  // case 673: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3168  // case 674: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3169  // case 675: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3170  // case 676: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3171  // case 677: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3172  // case 678: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3173  // case 679: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3174  // case 680: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3175  // case 681: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3176  // case 682: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3177  // case 683: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3178  // case 684: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3179  // case 685: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3180  // case 686: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3181  // case 687: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3182  // case 688: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3183  // case 689: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3184  // case 690: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3185  // case 691: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3186  // case 692: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3187  // case 693: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3188  // case 694: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3189  // case 695: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3190  // case 696: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3191  // case 697: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3192  // case 698: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3193  // case 699: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3194  // case 700: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3195  // case 701: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3196  // case 702: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3197  // case 703: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3198  // case 704: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3199  // case 705: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3200  // case 706: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3201  // case 707: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3202  // case 708: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3203  // case 709: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3204  // case 710: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3205  // case 711: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3206  // case 712: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3207  // case 713: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3208  // case 714: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3209  // case 715: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3210  // case 716: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3211  // case 717: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3212  // case 718: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3213  // case 719: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3214  // case 720: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3215  // case 721: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3216  // case 722: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3217  // case 723: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3218  // case 724: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3219  // case 725: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3220  // case 726: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3221  // case 727: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3222  // case 728: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3223  // case 729: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3224  // case 730: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3225  // case 731: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3226  // case 732: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3227  // case 733: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3228  // case 734: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3229  // case 735: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3230  // case 736: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3231  // case 737: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3232  // case 738: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3233  // case 739: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3234  // case 740: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3235  // case 741: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3236  // case 742: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3237  // case 743: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3238  // case 744: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3239  // case 745: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3240  // case 746: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3241  // case 747: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3242  // case 748: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3243  // case 749: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3244  // case 750: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3245  // case 751: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3246  // case 752: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3247  // case 753: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3248  // case 754: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3249  // case 755: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3250  // case 756: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3251  // case 757: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3252  // case 758: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3253  // case 759: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3254  // case 760: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3255  // case 761: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3256  // case 762: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3257  // case 763: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3258  // case 764: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3259  // case 765: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3260  // case 766: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3261  // case 767: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3262  // case 768: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3263  // case 769: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3264  // case 770: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3265  // case 771: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3266  // case 772: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3267  // case 773: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3268  // case 774: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3269  // case 775: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3270  // case 776: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3271  // case 777: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3272  // case 778: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3273  // case 779: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3274  // case 780: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3275  // case 781: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3276  // case 782: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3277  // case 783: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3278  // case 784: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3279  // case 785: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3280  // case 786: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3281  // case 787: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3282  // case 788: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3283  // case 789: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3284  // case 790: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3285  // case 791: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3286  // case 792: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3287  // case 793: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3288  // case 794: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3289  // case 795: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3290  // case 796: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3291  // case 797: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3292  // case 798: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3293  // case 799: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3294  // case 800: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3295  // case 801: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3296  // case 802: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3297  // case 803: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3298  // case 804: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3299  // case 805: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3300  // case 806: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3301  // case 807: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3302  // case 808: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3303  // case 809: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3304  // case 810: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3305  // case 811: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3306  // case 812: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3307  // case 813: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3308  // case 814: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3309  // case 815: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3310  case 816: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3311  case 817: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3312  case 818: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3313  case 819: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3314  case 820: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3315  case 821: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3316  case 822: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3317  case 823: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3318  case 824: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3319  case 825: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3320  case 826: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3321  case 827: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3322  case 828: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3323  case 829: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3324  case 830: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3325  case 831: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3326  // case 832: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3327  // case 833: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3328  // case 834: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3329  // case 835: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3330  // case 836: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3331  // case 837: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3332  // case 838: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3333  // case 839: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3334  // case 840: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3335  // case 841: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3336  // case 842: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3337  // case 843: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3338  // case 844: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3339  // case 845: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3340  // case 846: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3341  // case 847: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3342  case 848: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3343  case 849: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3344  case 850: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3345  case 851: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3346  case 852: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3347  case 853: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3348  case 854: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3349  case 855: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3350  case 856: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3351  case 857: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3352  case 858: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3353  case 859: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3354  case 860: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3355  case 861: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3356  case 862: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3357  case 863: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3358  // case 864: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3359  // case 865: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3360  // case 866: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3361  // case 867: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3362  // case 868: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3363  // case 869: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3364  // case 870: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3365  // case 871: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3366  // case 872: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3367  // case 873: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3368  // case 874: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3369  // case 875: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3370  // case 876: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3371  // case 877: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3372  // case 878: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3373  // case 879: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3374  // case 880: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3375  // case 881: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3376  // case 882: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3377  // case 883: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3378  // case 884: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3379  // case 885: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3380  // case 886: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3381  // case 887: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3382  // case 888: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3383  // case 889: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3384  // case 890: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3385  // case 891: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3386  // case 892: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3387  // case 893: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3388  // case 894: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3389  // case 895: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3390  // case 896: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3391  // case 897: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3392  // case 898: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3393  // case 899: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3394  // case 900: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3395  // case 901: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3396  // case 902: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3397  // case 903: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3398  // case 904: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3399  // case 905: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3400  // case 906: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3401  // case 907: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3402  // case 908: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3403  // case 909: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3404  // case 910: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3405  // case 911: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3406  // case 912: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3407  // case 913: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3408  // case 914: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3409  // case 915: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3410  // case 916: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3411  // case 917: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3412  // case 918: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3413  // case 919: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3414  // case 920: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3415  // case 921: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3416  // case 922: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3417  // case 923: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3418  // case 924: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3419  // case 925: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3420  // case 926: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3421  // case 927: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3422  // case 928: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3423  // case 929: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3424  // case 930: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3425  // case 931: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3426  // case 932: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3427  // case 933: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3428  // case 934: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3429  // case 935: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3430  // case 936: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3431  // case 937: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3432  // case 938: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3433  // case 939: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3434  // case 940: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3435  // case 941: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3436  // case 942: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3437  // case 943: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3438  case 944: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3439  case 945: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3440  case 946: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3441  case 947: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3442  case 948: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3443  case 949: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3444  case 950: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3445  case 951: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3446  case 952: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3447  case 953: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3448  case 954: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3449  case 955: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3450  case 956: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3451  case 957: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3452  case 958: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3453  case 959: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3454  // case 960: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3455  // case 961: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3456  // case 962: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3457  // case 963: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3458  // case 964: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3459  // case 965: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3460  // case 966: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3461  // case 967: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3462  // case 968: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3463  // case 969: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3464  // case 970: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3465  // case 971: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3466  // case 972: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3467  // case 973: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3468  // case 974: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3469  // case 975: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3470  case 976: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3471  case 977: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3472  case 978: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3473  case 979: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3474  case 980: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3475  case 981: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3476  case 982: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3477  case 983: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3478  case 984: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3479  case 985: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3480  case 986: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3481  case 987: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3482  case 988: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3483  case 989: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3484  case 990: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3485  case 991: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3486  // case 992: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3487  // case 993: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3488  // case 994: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3489  // case 995: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3490  // case 996: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3491  // case 997: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3492  // case 998: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3493  // case 999: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3494  // case 1000: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3495  // case 1001: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3496  // case 1002: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3497  // case 1003: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3498  // case 1004: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3499  // case 1005: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3500  // case 1006: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3501  // case 1007: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3502  // case 1008: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3503  // case 1009: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3504  // case 1010: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3505  // case 1011: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3506  // case 1012: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3507  // case 1013: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3508  // case 1014: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3509  // case 1015: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3510  // case 1016: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3511  // case 1017: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3512  // case 1018: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3513  // case 1019: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3514  // case 1020: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3515  // case 1021: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3516  // case 1022: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3517  // case 1023: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3518 
3519  default: {
3520 #ifdef DEBUG
3521  sprintf(errmsg,
3522  "doEnergy=%d doVirial=%d doSlow=%d doPairlist=%d "
3523  "doAlch=%d doFEP=%d doTI=%d doStreaming=%d doTable=%d "
3524  "\noptions = %d\n",
3525  doEnergy, doVirial, doSlow, doPairlist, doAlch, doFEP, doTI,
3526  doStreaming, doTable, options);
3527  NAMD_die(errmsg);
3528 #else
3529  std::string call_options;
3530  call_options += "doEnergy = " + std::to_string(int(doEnergy));
3531  call_options += ", doVirial = " + std::to_string(int(doVirial));
3532  call_options += ", doSlow = " + std::to_string(int(doSlow));
3533  call_options += ", doPairlist = " + std::to_string(int(doPairlist));
3534  call_options += ", doAlch = " + std::to_string(int(doAlch));
3535  call_options += ", doFEP = " + std::to_string(int(doFEP));
3536  call_options += ", doTI = " + std::to_string(int(doTI));
3537  call_options += ", doStreaming = " + std::to_string(int(doStreaming));
3538  call_options += ", doTable = " + std::to_string(int(doTable));
3539  call_options += ", doAlchVdwForceSwitching = " + std::to_string(int(doAlchVdwForceSwitching));
3540  const std::string error = "CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called. Options are:\n" + call_options;
3541  NAMD_bug(error.c_str());
3542 #endif
3543  }
3544 
3545  }
3546 
3547 #endif
3548 
3549 #undef CALL
3550  cudaCheck(cudaGetLastError());
3551 
3552  start += nblock*nwarp;
3553  }
3554  if ( doVirial || ! doStreaming ){
3555  int block = 128;
3556  int grid = (atomStorageSize + block - 1)/block;
3557  if (doSlow)
3558  transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3559  force_x, force_y, force_z, force_w,
3560  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3561  atomStorageSize);
3562  else
3563  transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3564  force_x, force_y, force_z, force_w,
3565  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3566  atomStorageSize);
3567  }
3568 #if 0
3569  cudaCheck(cudaStreamSynchronize(stream));
3570 
3571  // Haochuan: I add atomIndex to the output and also debug the slow forces.
3572  // XXX TODO: ERASE THIS AFTERWARDS
3573  // this is not numAtoms, this is something else
3574  // will print the force inside the compute and afterwards
3575  float4* h_f;
3576  float4* h_f_slow;
3577  int* h_index;
3578  allocate_host<float4>(&h_f, atomStorageSize);
3579  allocate_host<float4>(&h_f_slow, atomStorageSize);
3580  allocate_host<int>(&h_index, atomStorageSize);
3581  copy_DtoH_sync<float4>(d_forces, h_f, atomStorageSize);
3582  copy_DtoH_sync<float4>(d_forcesSlow, h_f_slow, atomStorageSize);
3583  copy_DtoH_sync<int>(atomIndex, h_index, atomStorageSize);
3584 
3585  FILE* pos_nb_f = fopen("compute_nb_dforces.txt", "a+");
3586  fprintf(pos_nb_f, "forces after kernel\n");
3587  // I'm gonna copy back the forces and just print them
3588 
3589  for(int i = 0; i < atomStorageSize; i++){
3590  //for(int i = 83000; i < 85000; i++){
3591  fprintf(pos_nb_f, "%10d %10.5f %10.5f %10.5f\n", h_index[i], h_f[i].x,
3592  h_f[i].y, h_f[i].z);
3593  }
3594 
3595  if (doSlow) {
3596  FILE* pos_slow_f = fopen("compute_slow_dforces.txt", "a+");
3597  for(int i = 0; i < atomStorageSize; i++){
3598  fprintf(pos_slow_f, "%10d %10.5f %10.5f %10.5f\n", h_index[i], h_f_slow[i].x,
3599  h_f_slow[i].y, h_f_slow[i].z);
3600  }
3601  fclose(pos_slow_f);
3602  deallocate_host(&h_f_slow);
3603  }
3604 
3605  deallocate_host<int>(&h_index);
3606  deallocate_host<float4>(&h_f);
3607  fclose(pos_nb_f);
3608 #endif
3609 }
3610 
3611 //
3612 // Perform virial and energy reductions for non-bonded force calculation
3613 //
3614 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
3615  const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
3616  float4* d_forces, float4* d_forcesSlow,
3617  VirialEnergy* d_virialEnergy, cudaStream_t stream) {
3618 
3619  if (doEnergy || doVirial) {
3620  clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
3621  }
3622 
3623  if (doVirial)
3624  {
3625  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
3626  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
3627  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
3628  (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
3629  cudaCheck(cudaGetLastError());
3630  }
3631 
3632  if (doVirial || doEnergy)
3633  {
3634  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
3635  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
3636  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
3637  (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3638  cudaCheck(cudaGetLastError());
3639  }
3640 
3641  if (doGBIS && doEnergy)
3642  {
3643  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
3644  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
3645  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
3646  (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3647  cudaCheck(cudaGetLastError());
3648  }
3649 
3650 }
3651 
3652 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
3653  int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
3654  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
3655 
3656  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
3657  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
3658 }
3659 
3660 
3661 void CudaComputeNonbondedKernel::setExclusionsByAtom(int2* h_data, const int num_atoms) {
3662  // Global data structure shouldn't be reallocated
3663  if (d_exclusionsByAtom == NULL) allocate_device<int2>(&d_exclusionsByAtom, num_atoms);
3664  copy_HtoD_sync<int2>(h_data, d_exclusionsByAtom, num_atoms);
3665 
3666 }
3667 
3668 
3669 template<bool kDoAlch>
3670 __global__ void updateVdwTypesExclKernel(
3671  const int numPatches,
3672  const CudaLocalRecord* localRecords,
3673  const int* global_vdwTypes,
3674  const int* global_id,
3675  const int* patchSortOrder,
3676  const int2* exclusionsByAtom,
3677  const int* global_partition,
3678  int* vdwTypes,
3679  int* atomIndex,
3680  int2* exclusions,
3681  char* part
3682 ) {
3683  __shared__ CudaLocalRecord s_record;
3684  using AccessType = int32_t;
3685  AccessType* s_record_buffer = (AccessType*) &s_record;
3686 
3687  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
3688  // Read in the CudaLocalRecord using multiple threads. This should
3689  #pragma unroll 1
3690  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
3691  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
3692  }
3693  __syncthreads();
3694 
3695  const int numAtoms = s_record.numAtoms;
3696  const int offset = s_record.bufferOffset;
3697  const int offsetNB = s_record.bufferOffsetNBPad;
3698 
3699  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
3700  const int order = patchSortOrder[offset + i];
3701  const int id = global_id[offset + order];
3702  vdwTypes [offsetNB + i] = global_vdwTypes[offset + order];
3703  atomIndex [offsetNB + i] = id;
3704  exclusions[offsetNB + i].x = exclusionsByAtom[id].y;
3705  exclusions[offsetNB + i].y = exclusionsByAtom[id].x;
3706  if (kDoAlch) {
3707  part [offsetNB + i] = global_partition[offset + order];
3708  }
3709  }
3710  __syncthreads();
3711  }
3712 }
3713 
3714 
3715 void CudaComputeNonbondedKernel::updateVdwTypesExclOnGPU(CudaTileListKernel& tlKernel,
3716  const int numPatches, const int atomStorageSize, const bool alchOn,
3717  CudaLocalRecord* localRecords,
3718  const int* d_vdwTypes, const int* d_id, const int* d_sortOrder,
3719  const int* d_partition,
3720  cudaStream_t stream
3721 ) {
3722  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
3723  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
3724  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
3725 
3726  const int numBlocks = numPatches;
3727  const int numThreads = 512;
3728 
3729  if (alchOn) {
3730  updateVdwTypesExclKernel<true><<<numBlocks, numThreads, 0, stream>>>(
3731  numPatches, localRecords,
3732  d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3733  vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3734  );
3735  } else {
3736  updateVdwTypesExclKernel<false><<<numBlocks, numThreads, 0, stream>>>(
3737  numPatches, localRecords,
3738  d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3739  vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3740  );
3741  }
3742 
3743 }
3744 
3745 #endif // NAMD_CUDA