NAMD
CudaComputeNonbondedKernelHip.cu
Go to the documentation of this file.
1 #if defined(NAMD_HIP)
2 #include <hip/hip_runtime.h>
3 #include <hipcub/hipcub.hpp>
4 #define cub hipcub
5 #endif // NAMD_HIP
6 
7 #include "HipDefines.h"
8 #include "CudaComputeNonbondedKernel.hip.h"
9 #include "CudaTileListKernel.hip.h"
10 #include "DeviceCUDA.h"
11 #include "CudaComputeNonbondedInteractions.h"
12 
13 #if defined(NAMD_HIP)
14 
15 #ifdef WIN32
16 #define __thread __declspec(thread)
17 #endif
18 extern __thread DeviceCUDA *deviceCUDA;
19 
20 #define NONBONDKERNEL_NUM_WARP 1
21 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 4
22 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 4
23 #define REDUCEGBISENERGYKERNEL_NUM_WARP 4
24 #define __ldg *
25 #define NONBONDKERNEL_SWEEPS_PER_TILE WARPSIZE/BOUNDINGBOXSIZE
26 
27 #define OVERALLOC 1.2f
28 
29 void NAMD_die(const char *);
30 void NAMD_bug(const char *);
31 
32 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
33 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
34 
35 //FEP parameters
36 __constant__ AlchData alchflags;
37 
38 struct fast_float3
39 {
40  typedef float __attribute__((ext_vector_type(2))) Native_float2_;
41 
42  union
43  {
44  struct __attribute__((packed)) { Native_float2_ dxy; float dz; };
45  struct { float x, y, z; };
46  };
47 
48  __host__ __device__
49  fast_float3() = default;
50 
51  __host__ __device__
52  fast_float3(float x_, float y_, float z_) : dxy{ x_, y_ }, dz{ z_ } {}
53 
54  __host__ __device__
55  fast_float3(Native_float2_ xy_, float z_) : dxy{ xy_ }, dz{ z_ } {}
56 
57  __host__ __device__
58  operator float3() const
59  {
60  return float3{ x, y, z };
61  }
62 
63  __forceinline__ __host__ __device__
64  fast_float3& operator=(float4 y)
65  {
66  this->x = y.x;
67  this->y = y.y;
68  this->z = y.z;
69  return *this;
70  }
71 };
72 static_assert(sizeof(fast_float3) == 12);
73 
74 __forceinline__ __host__ __device__
75 fast_float3 operator*(fast_float3 x, fast_float3 y)
76 {
77  return fast_float3{ x.dxy * y.dxy, x.dz * y.dz };
78 }
79 
80 __forceinline__ __host__ __device__
81 fast_float3 operator*(fast_float3 x, float y)
82 {
83  return fast_float3{ x.dxy * y, x.dz * y };
84 }
85 
86 __forceinline__ __host__ __device__
87 fast_float3 operator*(float x, fast_float3 y)
88 {
89  return fast_float3{ x * y.dxy, x * y.dz };
90 }
91 
92 __forceinline__ __host__ __device__
93 fast_float3 operator+(fast_float3 x, fast_float3 y)
94 {
95  return fast_float3{ x.dxy + y.dxy, x.dz + y.dz };
96 }
97 
98 __forceinline__ __host__ __device__
99 fast_float3 operator-(fast_float3 x, fast_float3 y)
100 {
101  return fast_float3{ x.dxy - y.dxy, x.dz - y.dz };
102 }
103 
104 
105 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float x)
106 {
107  return fast_float3{ x, x, x };
108 }
109 
110 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float3 x)
111 {
112  return fast_float3{ x.x, x.y, x.z };
113 }
114 
115 static __forceinline__ __host__ __device__ fast_float3 make_fast_float3(const float4 x)
116 {
117  return fast_float3{ x.x, x.y, x.z };
118 }
119 
120 static __forceinline__ __host__ __device__ float norm2(fast_float3 a)
121 {
122  fast_float3 b = a * a;
123  return (b.x + b.y + b.z);
124 }
125 
126 static __forceinline__ __host__ __device__ float4 make_float4(fast_float3 a)
127 {
128  return make_float4(a.x, a.y, a.z, 0.f);
129 }
130 
131 
132 #ifndef USE_TABLE_ARRAYS
133 __device__ __forceinline__
134 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
135  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
136  const float x = k * (float)tableSize - 0.5f;
137  const float f = floorf(x);
138  const float a = x - f;
139  const unsigned int i = (unsigned int)f;
140  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
141  const int i1 = i0 + 1;
142  const float4 t0 = tex1Dfetch<float4>(tex, i0);
143  const float4 t1 = tex1Dfetch<float4>(tex, i1);
144  return make_float4(
145  a * (t1.x - t0.x) + t0.x,
146  a * (t1.y - t0.y) + t0.y,
147  a * (t1.z - t0.z) + t0.z,
148  a * (t1.w - t0.w) + t0.w);
149 }
150 
151 __device__ __forceinline__
152 float4 sampleTableTexInv(cudaTextureObject_t tex, float k) {
153  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
154  const float x = k * (float)tableSize - 0.5f;
155  const float f = floorf(x);
156  const float a = x - f;
157  const unsigned int i = (unsigned int)f;
158  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
159  const int i1 = i0 + 1;
160  const float4 t0 = tex1Dfetch<float4>(tex, i0);
161  const float4 t1 = tex1Dfetch<float4>(tex, i1);
162  float4 t2 = (t1 - t0) * a + t0;
163  return make_float4(t2.z, t2.y, t2.x, t2.w);
164 }
165 
166 #endif
167 
168 
169 
170 // HIP implementation of tex1D has lower performance than this custom implementation of
171 // linear filtering
172 
173 //#define USE_TABLE_ARRAYS
174 
175 // Fast loads here
176 template<typename T>
177 static __forceinline__ __device__ const T& fast_load(const T* buffer, unsigned int idx, unsigned int offset = 0)
178 {
179  return *reinterpret_cast<const T*>(reinterpret_cast<const char*>(buffer) + idx * static_cast<unsigned int>(sizeof(T)) + offset * static_cast<unsigned int>(sizeof(T)));
180 }
181 
182 __device__ __forceinline__
183 float4 tableLookup(const float4* table, const float k)
184 {
185  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
186  const float x = k * static_cast<float>(tableSize) - 0.5f;
187  const float f = floorf(x);
188  const float a = x - f;
189  const int i = static_cast<int>(f);
190  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
191  const int i1 = i0 + 1;
192  // const float4 t0 = __ldg(&table[i0]);
193  // const float4 t1 = __ldg(&table[i1]);
194  const float4 t0 = fast_load(table, i0);
195  const float4 t1 = fast_load(table, i1);
196  return make_float4(
197  a * (t1.x - t0.x) + t0.x,
198  a * (t1.y - t0.y) + t0.y,
199  a * (t1.z - t0.z) + t0.z,
200  a * (t1.w - t0.w) + t0.w);
201 }
202 
203 __device__ __forceinline__
204 float4 tableLookupInv(const float4* table, const float k)
205 {
206  constexpr int tableSize = FORCE_ENERGY_TABLE_SIZE;
207  const float x = k * static_cast<float>(tableSize) - 0.5f;
208  const float f = floorf(x);
209  const float a = x - f;
210  const int i = static_cast<int>(f);
211  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
212  const int i1 = i0 + 1;
213  // const float4 t0 = __ldg(&table[i0]);
214  // const float4 t1 = __ldg(&table[i1]);
215  const float4 t0 = fast_load(table, i0);
216  const float4 t1 = fast_load(table, i1);
217  float4 t2 = (t1 - t0) * a + t0;
218  return make_float4(t2.z, t2.y, t2.x, t2.w);
219 }
220 
221 
222 template<bool doEnergy, bool doSlow>
223 __device__ __forceinline__
224 void calcForceEnergy(const float r2, const float qi, const float qj,
225  const float dx, const float dy, const float dz,
226  const int vdwtypei, const int vdwtypej,
227 #ifdef USE_TABLE_ARRAYS
228  const float2* __restrict__ vdwCoefTable,
229  const float4* __restrict__ forceTable,
230  const float4* __restrict__ energyTable,
231 #else
232  cudaTextureObject_t vdwCoefTableTex,
233  cudaTextureObject_t forceTableTex,
234  cudaTextureObject_t energyTableTex,
235 #endif
236  fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
237  float& energyVdw, float& energyElec, float& energySlow) {
238 
239  int vdwIndex = vdwtypej + vdwtypei;
240 
241 #if defined(USE_TABLE_ARRAYS)
242  // float2 ljab = vdwCoefTable[vdwIndex];
243  float2 ljab = fast_load(vdwCoefTable, vdwIndex);
244 #else
245  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
246 #endif
247  float rinv = __frsqrt_rn(r2);
248  float4 ei;
249 
250 #if defined(USE_TABLE_ARRAYS)
251  float4 fi = tableLookupInv(forceTable, rinv);
252  if (doEnergy) ei = tableLookup(energyTable, rinv);
253 #else
254  float4 fi = sampleTableTexInv(forceTableTex, rinv);
255  if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
256 #endif
257 
258  float fSlow = qi * qj;
259  fast_float3 f3(fi.x, fi.y, fi.z); // no waster registers, we already allocated stuff
260  fast_float3 lj(ljab.x, ljab.y, fSlow);
261  // we want to operate on reverse data here, the order of the float4 needs to be
262  // fast_float3 fi = f.z, f.y, f.x;
263  // fast_float3 lj = ljab.x, ljab.y, fSlow
264  // so if we do fi = fi * lj;
265  // f is the norm(fi) // x y z
266  // fSlow = fi.w
267  // float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
268  f3 = f3 * lj;
269  float f = f3.x + f3.y + f3.z;
270 
271  if (doEnergy) {
272  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
273  energyElec += fSlow * ei.x;
274  if (doSlow) energySlow += fSlow * ei.w;
275  }
276  if (doSlow) fSlow *= fi.w;
277 
278  // dx, dy, and dz are already store in contiguous registers -> wrap them in a fast_float3 and operate
279  fast_float3 dr(dx, dy, dz);
280  fast_float3 fr = dr *f;
281  // float fx = dx * f;
282  // float fy = dy * f;
283  // float fz = dz * f;
284  // iforce.x += fr.x;
285  // iforce.y += fr.y;
286  // iforce.z += fr.z;
287  // jforce.x -= fr.x;
288  // jforce.y -= fr.y;
289  // jforce.z -= fr.z;
290  iforce = iforce + fr;
291  jforce = jforce - fr;
292 
293  if (doSlow) {
294  fast_float3 sr = dr*fSlow;
295  // float fxSlow = dx * fSlow;
296  // float fySlow = dy * fSlow;
297  // float fzSlow = dz * fSlow;
298  // iforceSlow.x += sr.x;
299  // iforceSlow.y += sr.y;
300  // iforceSlow.z += sr.z;
301  // jforceSlow.x -= sr.x;
302  // jforceSlow.y -= sr.y;
303  // jforceSlow.z -= sr.z;
304  iforceSlow = iforceSlow + sr;
305  jforceSlow = jforceSlow - sr;
306  }
307 }
308 
309 template<bool doEnergy, bool doSlow>
310 __device__ __forceinline__
311 void calcForceEnergyMath(const float r2, const float qi, const float qj,
312  const float dx, const float dy, const float dz,
313  const int vdwtypei, const int vdwtypej,
314 #if defined(USE_TABLE_ARRAYS)
315  const float2* __restrict__ vdwCoefTable,
316 #else
317  cudaTextureObject_t vdwCoefTableTex,
318 #endif
319  fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
320  float& energyVdw, float& energyElec, float& energySlow,
321  const CudaNBConstants nbConstants) {
322 
323  int vdwIndex = vdwtypej + vdwtypei;
324 
325 #if defined(USE_TABLE_ARRAYS)
326  float2 ljab = vdwCoefTable[vdwIndex];
327 #else
328  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
329 #endif
330 
331 
332  float rinv = rsqrtf(r2);
333  float f, fSlow;
334  float charge = qi * qj;
335 
336  cudaNBForceMagCalc_VdwEnergySwitch_PMEC1<doEnergy, doSlow>(
337  r2, rinv, charge, ljab, nbConstants,
338  f, fSlow, energyVdw, energyElec, energySlow);
339  // dx, dy, and dz are already store in contiguous registers -> wrap them in a fast_float3 and operate
340  fast_float3 ff(dx, dy, dz);
341 
342  ff = ff * f;
343  // float fx = dx * f;
344  // float fy = dy * f;
345  // float fz = dz * f;
346  fast_float3 iff = make_fast_float3(iforce);
347  fast_float3 jff = make_fast_float3(jforce);
348  iff = iff + ff;
349  iforce.x = iff.x;
350  iforce.y = iff.y;
351  iforce.z = iff.z;
352  // iforce.x += fx;
353  // iforce.y += fy;
354  // iforce.z += fz;
355  jff = jff - ff;
356  jforce.x = jff.x;
357  jforce.y = jff.y;
358  jforce.z = jff.z;
359  // jforce.x -= fx;
360  // jforce.y -= fy;
361  // jforce.z -= fz;
362  if (doSlow) {
363  float fxSlow = dx * fSlow;
364  float fySlow = dy * fSlow;
365  float fzSlow = dz * fSlow;
366  iforceSlow.x += fxSlow;
367  iforceSlow.y += fySlow;
368  iforceSlow.z += fzSlow;
369  jforceSlow.x -= fxSlow;
370  jforceSlow.y -= fySlow;
371  jforceSlow.z -= fzSlow;
372  }
373 }
374 
375 /* JM: Special __device__ function to compute VDW forces for alchemy.
376  * Partially swiped from ComputeNonbondedFEP.C
377  */
378 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch>
379 __device__ __forceinline__
380 void calcForceEnergyFEP(const float r2, const float qi, const float qj,
381  const float dx, const float dy, const float dz,
382  const int vdwtypei, const int vdwtypej,
383  char p1, char p2,
384  /*const AlchData &alchflags, */
385  #ifdef USE_TABLE_ARRAYS
386  const float2* __restrict__ vdwCoefTable,
387  const float4* __restrict__ forceTable,
388  const float4* __restrict__ energyTable,
389 #else
390  cudaTextureObject_t vdwCoefTableTex,
391  cudaTextureObject_t forceTableTex,
392  cudaTextureObject_t energyTableTex,
393 #endif
394  fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
395  float& energyVdw, float &energyVdw_s, float& energyElec, float& energySlow,
396  float& energyElec_s, float& energySlow_s) {
397 
398 
399  int vdwIndex = vdwtypej + vdwtypei;
400 #if defined(USE_TABLE_ARRAYS)
401  float2 ljab = fast_load(vdwCoefTable, vdwIndex);
402 #else
403  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
404 #endif
405 
406  float myVdwLambda = 0.0f;
407  float myVdwLambda2 = 0.0f;
408  float myElecLambda = 0.0f;
409  float myElecLambda2 = 0.0f;
410  float rinv = __frsqrt_rn(r2);
411  float f;
412  float alch_vdw_energy = 0.0f;
413  float alch_vdw_energy_2 = 0.0f;
414  float alch_vdw_force = 0.0f;
415  float fSlow = qi * qj;
416  float4 ei;
417 
418 #ifdef USE_TABLE_ARRAYS
419  float4 fi = tableLookup(forceTable, rinv);
420  if (doEnergy) ei = tableLookup(energyTable, rinv);
421 #else
422  float4 fi = sampleTableTex(forceTableTex, rinv);
423  if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
424 #endif
425 
426 
427  //John said that there is a better way to avoid divergences here
428  //alch: true if => 1-0, 1-1, 2-0, 2-2
429  //dec: true if => 1-1, 2-2 && decouple
430  //up: true if => 1-0 && 1,1
431  //down: true if => 2-0, && 2,2
432  int ref = (p1 == 0 && p2 == 0);
433  int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
434  int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
435  int up = (alch && (p1 == 1 || p2 == 1) && !dec);
436  int down = (alch && (p1 == 2 || p2 == 2) && !dec);
437 
438  float r2_1, r2_2;
439  f = (fSlow * fi.x);
440 
441 /*--------------- VDW SPECIAL ALCH FORCES (Swiped from ComputeNonbondedFEP.C) ---------------*/
442 
443  myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
444  myVdwLambda2 = alchflags.vdwLambda2Up*(up) + alchflags.vdwLambda2Down*(down) + 1.f*(ref || dec);
445  myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
446  myElecLambda2 = alchflags.elecLambda2Up*(up) + alchflags.elecLambda2Down*(down) + 1.f*(ref || dec);
447 
448  if (alch) {
449  if (vdwForceSwitch) {
450  // force switching
451  float switchdist6_1, switchdist6_2;
452  const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
453  // const float
454  //Templated parameter. No control divergence here
455  if (shift) {
456  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
457  const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
458  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
459  r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
460  switchdist6_1 = alchflags.switchdist2 + myVdwShift;
461  switchdist6_1 = switchdist6_1 * switchdist6_1 * switchdist6_1;
462  switchdist6_2 = alchflags.switchdist2 + myVdwShift2;
463  switchdist6_2 = switchdist6_2 * switchdist6_2 * switchdist6_2;
464  } else {
465  r2_1 = rinv*rinv;
466  r2_2 = rinv*rinv;
467  switchdist6_1 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
468  switchdist6_2 = switchdist6_1;
469  }
470  const float r6_1 = r2_1*r2_1*r2_1;
471  const float r6_2 = r2_2*r2_2*r2_2;
472  if (r2 <= alchflags.switchdist2) {
473  const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
474  const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
475  // A == ljab.x, B == ljab.y
476  const float dU_1 = -ljab.x / (cutoff6 * switchdist6_1) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_1));
477  const float dU_2 = -ljab.x / (cutoff6 * switchdist6_2) - (-ljab.y * rsqrtf(cutoff6 * switchdist6_2));
478  alch_vdw_energy = myVdwLambda * (U1 + dU_1);
479  alch_vdw_energy_2 = myVdwLambda2 * (U2 + dU_2);
480 
481  //Multiplied by -1.0 to match CPU values
482  alch_vdw_force =-1.f*myVdwLambda*((12.f*U1 + 6.f*ljab.y*r6_1)*r2_1);
483  } else {
484  const float r3_1 = sqrtf(r6_1);
485  const float r3_2 = sqrtf(r6_2);
486  const float inv_cutoff6 = 1.0f / cutoff6;
487  const float inv_cutoff3 = rsqrtf(cutoff6);
488  // A == ljab.x, B == ljab.y
489  const float k_vdwa_1 = ljab.x / (1.0f - switchdist6_1 * inv_cutoff6);
490  const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6_1 * inv_cutoff6));
491  const float k_vdwa_2 = ljab.x / (1.0f - switchdist6_2 * inv_cutoff6);
492  const float k_vdwb_2 = ljab.y / (1.0f - sqrtf(switchdist6_2 * inv_cutoff6));
493  const float tmpa_1 = r6_1 - inv_cutoff6;
494  const float tmpb_1 = r3_1 - inv_cutoff3;
495  const float tmpa_2 = r6_2 - inv_cutoff6;
496  const float tmpb_2 = r3_2 - inv_cutoff3;
497  alch_vdw_energy = myVdwLambda * (k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1);
498  alch_vdw_energy_2 = myVdwLambda2 * (k_vdwa_2 * tmpa_2 * tmpa_2 - k_vdwb_2 * tmpb_2 * tmpb_2);
499  //Multiplied by -1.0 to match CPU values
500  alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
501  } // r2 <= alchflags.switchdist2
502  } else {
503  // potential switching
504  const float diff = alchflags.cutoff2 - r2;
505 
506  const float switchmul = (alchflags.switchfactor*(diff)*(diff)*(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2))*(r2 > alchflags.switchdist2) + (1.f)*(r2 <= alchflags.switchdist2);
507  const float switchmul2 = (12.f*alchflags.switchfactor*(diff)*(r2 - alchflags.switchdist2))*(r2 > alchflags.switchdist2) + (0.f) * (r2 <= alchflags.switchdist2);
508 
509  //Templated parameter. No control divergence here
510  if(shift){
511  //This templated parameter lets me get away with not making 2 divisions. But for myVdwShift != 0, how do I do this?
512  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
513  const float myVdwShift2 = alchflags.vdwShift2Up*up + alchflags.vdwShift2Down*(!up);
514  //r2_1 = 1.0/(r2 + myVdwShift);
515  //r2_2 = 1.0/(r2 + myVdwShift2);
516  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
517  r2_2 = __fdividef(1.f,(r2 + myVdwShift2));
518  }else{
519  r2_1 = rinv*rinv;
520  r2_2 = rinv*rinv;
521  }
522 
523  const float r6_1 = r2_1*r2_1*r2_1;
524  const float r6_2 = r2_2*r2_2*r2_2;
525  const float U1 = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled, shorthand only!
526  const float U2 = ljab.x*r6_2*r6_2 - ljab.y*r6_2;
527  alch_vdw_energy = myVdwLambda*switchmul*U1;
528  alch_vdw_energy_2 = myVdwLambda2*switchmul*U2;
529 
530  //Multiplied by -1.0 to match CPU values
531  alch_vdw_force =-1.f*myVdwLambda*((switchmul*(12.f*U1 + 6.f*ljab.y*r6_1)*r2_1+ switchmul2*U1));
532  } // vdwForceSwitch
533  }
534 
535 /*-----------------------------------------------------------*/
536 
537  if (doEnergy){
538  //All energies should be scaled by the corresponding lambda
539  energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
540  energyElec += (fSlow * ei.x)*myElecLambda;
541  energyVdw_s += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy_2*(alch && !dec);
542  energyElec_s += (fSlow * ei.x)*myElecLambda2;
543  if (doSlow){
544  energySlow += (fSlow * ei.w)*myElecLambda;
545  energySlow_s += (fSlow * ei.w)*myElecLambda2;
546  }
547  }
548 
549  if (doSlow) fSlow *= fi.w;
550 
551  //We should include the regular VDW forces if not dealing with alch pairs
552  f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(!alch || dec)))*myElecLambda
553  + alch_vdw_force*(alch && !dec);
554 
555  float fx = dx * f;
556  float fy = dy * f;
557  float fz = dz * f;
558 
559  iforce.x += fx;
560  iforce.y += fy;
561  iforce.z += fz;
562  jforce.x -= fx;
563  jforce.y -= fy;
564  jforce.z -= fz;
565 
566  if (doSlow) {
567  /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
568  fSlow = myElecLambda*fSlow;
569  float fxSlow = dx * fSlow;
570  float fySlow = dy * fSlow;
571  float fzSlow = dz * fSlow;
572  iforceSlow.x += fxSlow;
573  iforceSlow.y += fySlow;
574  iforceSlow.z += fzSlow;
575  jforceSlow.x -= fxSlow;
576  jforceSlow.y -= fySlow;
577  jforceSlow.z -= fzSlow;
578  }
579 }
580 
581 /* JM: Special __device__ function to compute VDW forces for TI.
582  */
583 
584 template<bool doEnergy, bool doSlow, bool shift, bool vdwForceSwitch>
585 __device__ __forceinline__
586 void calcForceEnergyTI(const float r2, const float qi, const float qj,
587  const float dx, const float dy, const float dz,
588  const int vdwtypei, const int vdwtypej,
589  char p1, char p2,
590 #ifdef USE_TABLE_ARRAYS
591  const float2* __restrict__ vdwCoefTable,
592  const float4* __restrict__ forceTable,
593  const float4* __restrict__ energyTable,
594 #else
595  cudaTextureObject_t vdwCoefTableTex,
596  cudaTextureObject_t forceTableTex,
597  cudaTextureObject_t energyTableTex,
598 #endif
599  fast_float3& iforce, fast_float3& iforceSlow, fast_float3& jforce, fast_float3& jforceSlow,
600  float& energyVdw, float& energyVdw_ti_1, float& energyVdw_ti_2,
601  float& energyElec, float& energyElec_ti_1, float& energyElec_ti_2,
602  float& energySlow, float& energySlow_ti_1, float& energySlow_ti_2) {
603 
604  int vdwIndex = vdwtypej + vdwtypei;
605 #if defined(USE_TABLE_ARRAYS)
606  float2 ljab = fast_load(vdwCoefTable, vdwIndex);
607 #else
608  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex); //ljab.x is A and ljab.y is B
609 #endif
610 
611  /* JM: For TI, we have to deal ALCH1 OR ALCH2 during ComputeNonbondedBase2
612  * ALCH1 for appearing terms;
613  * ALCH2 for dissapearing terms;
614  * Instead of the _s energy terms, we need the to calculate:
615  *
616  * vdwEnergy_ti_1 and _2 for VDW energies. For those we need to add the special terms calculated on
617  * ComputeNonbondedTI.C
618  *
619  * elecEnergy_ti_1 and _2 for electrostatic energy. No correction needed here though.
620  *
621  */
622 
623  float myVdwLambda = 0.0f;
624  float myElecLambda = 0.0f;
625  float rinv = __frsqrt_rn(r2);
626  float f;
627  float alch_vdw_energy = 0.0f;
628  float alch_vdw_force = 0.0f;
629  float alch_vdw_dUdl = 0.0f;
630  float fSlow = qi * qj;
631  float4 ei;
632 #if defined(USE_TABLE_ARRAYS)
633  float4 fi = tableLookup(forceTable, rinv);
634  if (doEnergy) ei = tableLookup(energyTable, rinv);
635 #else
636  float4 fi = sampleTableTex(forceTableTex, rinv);
637  if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
638 #endif
639 
640  //John said that there is a better way to avoid divergences here
641  //alch: true if => 1-0, 1-1, 2-0, 2-2
642  //dec: true if => 1-1, 2-2 && decouple
643  //up: true if => 1-0 && 1,1
644  //down: true if => 2-0, && 2,2
645  int ref = (p1 == 0 && p2 == 0);
646  int alch = (!ref && !(p1 == 1 && p2 ==2) && !(p1 == 2 && p2 == 1));
647  int dec = (alch && (p1 == p2) && alchflags.alchDecouple);
648  int up = (alch && (p1 == 1 || p2 == 1) && !dec);
649  int down = (alch && (p1 == 2 || p2 == 2) && !dec);
650 
651  float r2_1;
652  f = (fSlow * fi.x);
653 
654 /*--------------- VDW SPECIAL ALCH STUFF (Swiped from ComputeNonbondedTI.C) ---------------*/
655  myVdwLambda = alchflags.vdwLambdaUp*(up) + alchflags.vdwLambdaDown*(down) + 1.f*(ref || dec);
656  myElecLambda = alchflags.elecLambdaUp*(up) + alchflags.elecLambdaDown*(down) + 1.f*(ref || dec);
657  if(alch){
658  if (vdwForceSwitch) {
659  const float cutoff6 = alchflags.cutoff2 * alchflags.cutoff2 * alchflags.cutoff2;
660  float switchdist6;
661  if (shift) {
662  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
663  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
664  switchdist6 = alchflags.switchdist2 + myVdwShift;
665  switchdist6 = switchdist6 * switchdist6 * switchdist6;
666  } else {
667  r2_1 = rinv*rinv;
668  switchdist6 = alchflags.switchdist2 * alchflags.switchdist2 * alchflags.switchdist2;
669  }
670  const float r6_1 = r2_1*r2_1*r2_1;
671  if (r2 <= alchflags.switchdist2) {
672  const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1;
673  const float dU = -ljab.x / (cutoff6 * switchdist6) - (-ljab.y * rsqrtf(cutoff6 * switchdist6));
674  alch_vdw_force = -1.f*(myVdwLambda*((12.f*U + 6.f*ljab.y*r6_1)*r2_1));
675  alch_vdw_energy = myVdwLambda * (U + dU);
676  alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (6.f*U + 3.f*ljab.y*r6_1)*r2_1 + dU;
677  } else {
678  const float r3_1 = sqrtf(r6_1);
679  const float inv_cutoff6 = 1.0f / cutoff6;
680  const float inv_cutoff3 = sqrtf(inv_cutoff6);
681  const float k_vdwa_1 = ljab.x / (1.0f - switchdist6 * inv_cutoff6);
682  const float k_vdwb_1 = ljab.y / (1.0f - sqrtf(switchdist6 * inv_cutoff6));
683  const float tmpa_1 = r6_1 - inv_cutoff6;
684  const float tmpb_1 = r3_1 - inv_cutoff3;
685  const float U = k_vdwa_1 * tmpa_1 * tmpa_1 - k_vdwb_1 * tmpb_1 * tmpb_1;
686  alch_vdw_force = -1.0f * myVdwLambda * (6.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
687  alch_vdw_energy = myVdwLambda * U;
688  alch_vdw_dUdl = U + myVdwLambda * alchflags.alchVdwShiftCoeff * (3.0f * r2_1 * (2.0f * k_vdwa_1 * tmpa_1 * r6_1 - k_vdwb_1 * tmpb_1 * r3_1));
689  } // r2 <= alchflags.switchdist2
690  } else {
691  // potential switching
692  const float diff = alchflags.cutoff2 - r2;
693  const float switchmul = (r2 > alchflags.switchdist2 ? alchflags.switchfactor*(diff)*(diff) \
694  *(alchflags.cutoff2 - 3.f*alchflags.switchdist2 + 2.f*r2) : 1.f);
695 
696  const float switchmul2 = (r2 > alchflags.switchdist2 ? \
697  12.f*alchflags.switchfactor*(diff) \
698  *(r2 - alchflags.switchdist2) : 0.f);
699  //Templated parameter. No control divergence here
700  if(shift){
701  const float myVdwShift = alchflags.vdwShiftUp*up + alchflags.vdwShiftDown*(!up);
702  r2_1 = __fdividef(1.f,(r2 + myVdwShift));
703  }else r2_1 = rinv*rinv;
704 
705  const float r6_1 = r2_1*r2_1*r2_1;
706  const float U = ljab.x*r6_1*r6_1 - ljab.y*r6_1; // NB: unscaled! for shorthand only!
707  alch_vdw_energy = myVdwLambda*switchmul*U;
708  //Multiplied by -1.0 to match CPU values
709  alch_vdw_force = -1.f*(myVdwLambda*(switchmul*(12.f*U + 6.f*ljab.y*r6_1)*r2_1 \
710  + switchmul2*U));
711  alch_vdw_dUdl = (switchmul*(U + myVdwLambda*alchflags.alchVdwShiftCoeff \
712  *(6.f*U + 3.f*ljab.y*r6_1)*r2_1));
713  } // vdwForceSwitch
714  }
715  /*-------------------------------------------------------------------------*/
716 
717  if (doEnergy){
718  //All energies should be scaled by the corresponding lambda
719  energyVdw += (ljab.x * ei.z + ljab.y * ei.y)*(ref || dec) + alch_vdw_energy*(alch && !dec);
720  energyElec += (fSlow * ei.x)*myElecLambda;
721  if(alch){
722  energyVdw_ti_1 += alch_vdw_dUdl*up;
723  energyVdw_ti_2 += alch_vdw_dUdl*down;
724  energyElec_ti_1 += (fSlow * ei.x)*up;
725  energyElec_ti_2 += (fSlow * ei.x)*down;
726  }
727  if (doSlow){
728  energySlow += (fSlow * ei.w)*myElecLambda;
729  if(alch){
730  energySlow_ti_1 += (fSlow * ei.w)*up;
731  energySlow_ti_2 += (fSlow * ei.w)*down;
732  }
733  }
734  }
735 
736  if (doSlow) fSlow *= fi.w;
737  //We should include the regular VDW forces if not dealing with alch pairs
738  f = (f + ((ljab.x * fi.z + ljab.y * fi.y)*(ref || dec)))*myElecLambda
739  + alch_vdw_force*(alch && !dec);
740 
741  float fx = dx * f;
742  float fy = dy * f;
743  float fz = dz * f;
744 
745  iforce.x += fx;
746  iforce.y += fy;
747  iforce.z += fz;
748  jforce.x -= fx;
749  jforce.y -= fy;
750  jforce.z -= fz;
751 
752  if (doSlow) {
753  /*There's stuff that needs to be added here, when FAST AND NOSHORT macros are on*/
754  fSlow = myElecLambda*fSlow; /* FAST(NOSHORT(+alch_vdw_force))*/ //Those should also be zeroed
755  float fxSlow = dx * fSlow;
756  float fySlow = dy * fSlow;
757  float fzSlow = dz * fSlow;
758  iforceSlow.x += fxSlow;
759  iforceSlow.y += fySlow;
760  iforceSlow.z += fzSlow;
761  jforceSlow.x -= fxSlow;
762  jforceSlow.y -= fySlow;
763  jforceSlow.z -= fzSlow;
764  }
765 }
766 
767 template<bool doSlow, typename T>
768 __device__ __forceinline__
769 void storeForces(const int pos, const T force, const T forceSlow,
770  float* __restrict__ devForces_x,
771  float* __restrict__ devForces_y,
772  float* __restrict__ devForces_z,
773  float* __restrict__ devForcesSlow_x,
774  float* __restrict__ devForcesSlow_y,
775  float* __restrict__ devForcesSlow_z)
776 {
777 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
778  if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
779  atomicAdd(&devForces_x[pos], force.x);
780  atomicAdd(&devForces_y[pos], force.y);
781  atomicAdd(&devForces_z[pos], force.z);
782  }
783  if (doSlow) {
784  if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
785  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
786  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
787  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
788  }
789  }
790 #else
791  atomicAdd(&devForces_x[pos], force.x);
792  atomicAdd(&devForces_y[pos], force.y);
793  atomicAdd(&devForces_z[pos], force.z);
794  if (doSlow) {
795  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
796  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
797  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
798  }
799 #endif
800 }
801 
802 template<bool doSlow>
803 __device__ __forceinline__
804 void storeForces(const int pos, const float4 force, const float4 forceSlow,
805  float* __restrict__ devForces_x,
806  float* __restrict__ devForces_y,
807  float* __restrict__ devForces_z,
808  float* __restrict__ devForcesSlow_x,
809  float* __restrict__ devForcesSlow_y,
810  float* __restrict__ devForcesSlow_z)
811 {
812 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
813  if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
814  atomicAdd(&devForces_x[pos], force.x);
815  atomicAdd(&devForces_y[pos], force.y);
816  atomicAdd(&devForces_z[pos], force.z);
817  }
818  if (doSlow) {
819  if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
820  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
821  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
822  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
823  }
824  }
825 #else
826  atomicAdd(&devForces_x[pos], force.x);
827  atomicAdd(&devForces_y[pos], force.y);
828  atomicAdd(&devForces_z[pos], force.z);
829  if (doSlow) {
830  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
831  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
832  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
833  }
834 #endif
835 }
836 
837 //#define USE_NEW_EXCL_METHOD
838 
839 //
840 // Returns the lower estimate for the distance between a bounding box and a set of atoms
841 //
842 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
843  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
844  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
845  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
846  float r2 = dx*dx + dy*dy + dz*dz;
847  return r2;
848 }
849 
850 #define LARGE_FLOAT (float)(1.0e10)
851 
852 //
853 // Nonbonded force kernel
854 //
855 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doAlch, bool doFEP, bool doTI, bool doStreaming, bool doTable, bool doAlchVdwForceSwitching>
856 __global__ void
857 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
858  doPairlist ? (10) : (doEnergy ? (10) : (10) ))
859 nonbondedForceKernel(
860  const int start, const int numTileLists,
861  const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
862  const int* __restrict__ tileJatomStart,
863  const int vdwCoefTableWidth,
864 #if defined(USE_TABLE_ARRAYS)
865  const float2* __restrict__ vdwCoefTable,
866 #else
867  cudaTextureObject_t vdwCoefTableTex,
868 #endif
869  const int* __restrict__ vdwTypes,
870  const float3 lata, const float3 latb, const float3 latc,
871  const float4* __restrict__ xyzq, const float cutoff2, const CudaNBConstants nbConstants,
872 #ifdef USE_TABLE_ARRAYS
873  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
874 #else
875  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
876 #endif
877  // ----------
878  // doPairlist
879  float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
880  const int* __restrict__ atomIndex,
881  const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
882  unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
883  int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
884  const BoundingBox* __restrict__ boundingBoxes,
885 #ifdef USE_NEW_EXCL_METHOD
886  const int* __restrict__ minmaxExclAtom,
887 #endif
888  // ----------
889  float * __restrict__ devForce_x,
890  float * __restrict__ devForce_y,
891  float * __restrict__ devForce_z,
892  float * __restrict__ devForce_w,
893  float * __restrict__ devForceSlow_x,
894  float * __restrict__ devForceSlow_y,
895  float * __restrict__ devForceSlow_z,
896  float * __restrict__ devForceSlow_w,
897  // ---- USE_STREAMING_FORCES ----
898  const int numPatches,
899  unsigned int* __restrict__ patchNumCount,
900  const CudaPatchRecord* __restrict__ cudaPatches,
901  float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
902  int* __restrict__ mapPatchReadyQueue,
903  int* __restrict__ outputOrder,
904  // ------------------------------
905  TileListVirialEnergy* __restrict__ virialEnergy,
906  // ---- doAlch ----
907  char* __restrict__ p
908  ) {
909 
910  // Single warp takes care of one list of tiles
911  // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
912  // int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;
913  // The line above is the CUDA-exclusive version. HIPification in this case is selecting b const CudaNBConstants nbConstants, ased on specific parameters
914  // In the hip scheme: Each warp gets an itile and 2 jtiles
915  // if NONBONDEDKERNEL_NUM_WARP == 1, each warp does 2 tiles, so we do 2* blockIdx
916  // int itileList = start + (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x);
917  // fetches 1 tile
918  int itileList = start + (WARPSIZE == 64 ? blockIdx.x : (threadIdx.x/BOUNDINGBOXSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x));
919  if (itileList < numTileLists)
920  {
921  fast_float3 iforce, iforceSlow;
922  fast_float3 jforce, jforceSlow;
923  float energyVdw, energyElec, energySlow;
924  //FEP energies
925  float energyVdw_s, energyElec_s, energySlow_s;
926  //TI energies
927  float energyVdw_ti_1, energyVdw_ti_2, energyElec_ti_1, energyElec_ti_2, energySlow_ti_1, energySlow_ti_2;
928  int nexcluded;
929  unsigned int itileListLen;
930  int2 patchInd;
931  int2 patchNumList;
932  char part1, part2, p2;
933  bool doShift = doAlch && (alchflags.alchVdwShiftCoeff != 0.0f);
934  __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
935  __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
936  __shared__ float4 s_jforce[NONBONDKERNEL_SWEEPS_PER_TILE][BOUNDINGBOXSIZE];
937  __shared__ float4 s_jforceSlow[NONBONDKERNEL_SWEEPS_PER_TILE][BOUNDINGBOXSIZE];
938  __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][BOUNDINGBOXSIZE];
939 
940  // Warp index (0...WARPSIZE-1)
941  // JM: For BoundingBoxSize32 and WARPSIZE == 64, each warp gets two tile lists!
942 
943  const int wid = threadIdx.x % BOUNDINGBOXSIZE;
944  const int iwarp = 0; // one workgroup does two sweeps
945  const int isweep = threadIdx.x / BOUNDINGBOXSIZE; // 2 sweeps for 32-sized bounding boxes
946  constexpr int nsweep = NONBONDKERNEL_SWEEPS_PER_TILE;
947 
948  // Start computation
949  {
950  TileList tmp = tileLists[itileList];
951  int iatomStart = tmp.iatomStart;
952  int jtileStart = tmp.jtileStart;
953  int jtileEnd = tmp.jtileEnd;
954  patchInd = tmp.patchInd;
955  patchNumList = tmp.patchNumList;
956 
957  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
958  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
959  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
960 
961  // DH - set zeroShift flag if magnitude of shift vector is zero
962  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
963 
964  int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
965  if (doPairlist) {
966  PatchPairRecord PPStmp = patchPairs[itileList];
967  iatomSize = PPStmp.iatomSize;
968  iatomFreeSize = PPStmp.iatomFreeSize;
969  jatomSize = PPStmp.jatomSize;
970  jatomFreeSize = PPStmp.jatomFreeSize;
971  }
972 
973  // Write to global memory here to avoid register spilling
974  if (doVirial) {
975  if (wid == 0) {
976  virialEnergy[itileList].shx = shx;
977  virialEnergy[itileList].shy = shy;
978  virialEnergy[itileList].shz = shz;
979  }
980  }
981 
982  // Load i-atom data (and shift coordinates)
983  fast_float3 xyz_i;
984  float4 xyzq_i = xyzq[iatomStart + wid];
985  if (doAlch) part1 = p[iatomStart + wid];
986  xyzq_i.x += shx;
987  xyzq_i.y += shy;
988  xyzq_i.z += shz;
989  xyz_i.x = xyzq_i.x; // no wasted registers here, just aliasing as float4s are vectorized
990  xyz_i.y = xyzq_i.y;
991  xyz_i.z = xyzq_i.z;
992  int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
993 
994  // Load i-atom data (and shift coordinates)
995  BoundingBox boundingBoxI;
996  if (doPairlist) {
997  boundingBoxI = boundingBoxes[iatomStart/BOUNDINGBOXSIZE];
998  boundingBoxI.x += shx;
999  boundingBoxI.y += shy;
1000  boundingBoxI.z += shz;
1001  }
1002 
1003  // Get i-atom global index
1004 #ifdef USE_NEW_EXCL_METHOD
1005  int iatomIndex, minExclAtom, maxExclAtom;
1006 #else
1007  int iatomIndex;
1008 #endif
1009  if (doPairlist) {
1010 #ifdef USE_NEW_EXCL_METHOD
1011  iatomIndex = atomIndex[iatomStart + wid];
1012  int2 tmp = minmaxExclAtom[iatomStart + wid];
1013  minExclAtom = tmp.x;
1014  maxExclAtom = tmp.y;
1015 #else
1016  iatomIndex = atomIndex[iatomStart + wid];
1017 #endif
1018  }
1019 
1020  // i-forces in registers
1021  // float3 iforce;
1022  iforce.x = 0.0f;
1023  iforce.y = 0.0f;
1024  iforce.z = 0.0f;
1025 
1026  // float3 iforceSlow;
1027  if (doSlow) {
1028  iforceSlow.x = 0.0f;
1029  iforceSlow.y = 0.0f;
1030  iforceSlow.z = 0.0f;
1031  }
1032 
1033  // float energyVdw, energyElec, energySlow;
1034  if (doEnergy) {
1035  energyVdw = 0.0f;
1036  energyVdw_s = 0.0f;
1037  energyVdw_ti_1 = 0.0f;
1038  energyVdw_ti_2 = 0.0f;
1039  energyElec = 0.0f;
1040  energyElec_ti_1 = 0.0f;
1041  energyElec_ti_2 = 0.0f;
1042  energyElec_s = 0.0f;
1043  if (doSlow){
1044  energySlow = 0.0f;
1045  energySlow_s = 0.0f;
1046  energySlow_ti_1 = 0.0f;
1047  energySlow_ti_2 = 0.0f;
1048  }
1049  }
1050 
1051  // Number of exclusions
1052  // NOTE: Lowest bit is used as indicator bit for tile pairs:
1053  // bit 0 tile has no atoms within pairlist cutoff
1054  // bit 1 tile has atoms within pairlist cutoff
1055  // int nexcluded;
1056  if (doPairlist) nexcluded = 0;
1057 
1058  // Number of i loops and free atoms
1059  int nfreei;
1060  if (doPairlist) {
1061  int nloopi = min(iatomSize - iatomStart, BOUNDINGBOXSIZE);
1062  nfreei = max(iatomFreeSize - iatomStart, 0);
1063  if (wid >= nloopi) {
1064  xyzq_i.x = -LARGE_FLOAT;
1065  xyzq_i.y = -LARGE_FLOAT;
1066  xyzq_i.z = -LARGE_FLOAT;
1067  }
1068  }
1069 
1070  // tile list stuff
1071  // int itileListLen;
1072  // int minJatomStart;
1073  if (doPairlist) {
1074  // minJatomStart = tileJatomStart[jtileStart];
1075  itileListLen = 0;
1076  }
1077 
1078  // Exclusion index and maxdiff
1079  int iexclIndex, iexclMaxdiff;
1080  if (doPairlist) {
1081  int2 tmp = exclIndexMaxDiff[iatomStart + wid];
1082  iexclIndex = tmp.x;
1083  iexclMaxdiff = tmp.y;
1084  }
1085  // fetch two tiles at once here
1086  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
1087  // Load j-atom starting index and exclusion mask
1088  int jatomStart = tileJatomStart[jtile];
1089 
1090  float4 xyzq_j = xyzq[jatomStart + wid];
1091  NAMD_WARP_SYNC(WARP_FULL_MASK);
1092  if (doAlch) p2 = p[jatomStart + wid];
1093 
1094  // Check for early bail
1095  if (doPairlist) {
1096  float r2bb = distsq(boundingBoxI, xyzq_j);
1097  if (NAMD_WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
1098  }
1099  WarpMask excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
1100  int vdwtypej = vdwTypes[jatomStart + wid];
1101  s_vdwtypej[iwarp][wid] = vdwtypej;
1102 
1103  // Get i-atom global index
1104  if (doPairlist) {
1105  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
1106  }
1107 
1108  // Number of j loops and free atoms
1109  int nfreej;
1110  if (doPairlist) {
1111  int nloopj = min(jatomSize - jatomStart, BOUNDINGBOXSIZE);
1112  nfreej = max(jatomFreeSize - jatomStart, 0);
1113  //if (nfreei == 0 && nfreej == 0) continue;
1114  if (wid >= nloopj) {
1115  xyzq_j.x = LARGE_FLOAT;
1116  xyzq_j.y = LARGE_FLOAT;
1117  xyzq_j.z = LARGE_FLOAT;
1118  }
1119  }
1120  s_xyzq[iwarp][wid] = xyzq_j;
1121 
1122  // DH - self requires that zeroShift is also set
1123  const bool self = zeroShift && (iatomStart == jatomStart);
1124  const int modval = (self) ? 2*BOUNDINGBOXSIZE-1 : BOUNDINGBOXSIZE-1;
1125 
1126  s_jforce[isweep][wid] = make_float4(0.0f, 0.0f, 0.0f, 0.f);
1127  if (doSlow)
1128  s_jforceSlow[isweep][wid] = make_float4(0.0f, 0.0f, 0.0f, 0.f);
1129  NAMD_WARP_SYNC(WARP_FULL_MASK);
1130 
1131  if (doPairlist) {
1132  // Build pair list
1133  // NOTE: Pairlist update, we must also include the diagonal since this is used
1134  // in GBIS phase 2.
1135  // Clear the lowest (indicator) bit
1136  nexcluded &= (~1);
1137 
1138  // For self tiles, do the diagonal term (t=0).
1139  // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
1140  if (self && !isweep) {
1141  int j = (0 + wid) & modval;
1142  xyzq_j = s_xyzq[iwarp][j];
1143  float dx = xyzq_j.x - xyzq_i.x;
1144  float dy = xyzq_j.y - xyzq_i.y;
1145  float dz = xyzq_j.z - xyzq_i.z;
1146  float r2 = dx*dx + dy*dy + dz*dz;
1147 
1148  if (j < BOUNDINGBOXSIZE && r2 < plcutoff2) {
1149  // We have atom pair within the pairlist cutoff => Set indicator bit
1150  nexcluded |= 1;
1151  }
1152  }
1153  for (int t = (self && !isweep) ? isweep+nsweep : isweep;t < BOUNDINGBOXSIZE; t+=nsweep) {
1154  int j = (t + wid) & modval;
1155 
1156  // NOTE: __shfl() operation can give non-sense here because j may be >= WARPSIZE.
1157  // However, if (j < WARPSIZE ..) below makes sure that these non-sense
1158  // results are not used
1159  if (doAlch) part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1160 
1161  excl >>= nsweep;// moves it two sweeps
1162  if (j < BOUNDINGBOXSIZE) {
1163  xyzq_j = s_xyzq[iwarp][j];
1164  float dx = xyzq_j.x - xyzq_i.x;
1165  float dy = xyzq_j.y - xyzq_i.y;
1166  float dz = xyzq_j.z - xyzq_i.z;
1167  float r2 = dx*dx + dy*dy + dz*dz;
1168  if (r2 < plcutoff2) {
1169  // We have atom pair within the pairlist cutoff => Set indicator bit
1170  nexcluded |= 1;
1171  if (j < nfreej || wid < nfreei) {
1172  bool excluded = false;
1173  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
1174  if ( abs(indexdiff) <= iexclMaxdiff) {
1175  indexdiff += iexclIndex;
1176  int indexword = ((unsigned int) indexdiff) >> 5;
1177  {
1178  indexword = overflowExclusions[indexword];
1179  }
1180 
1181  excluded = ((indexword & (1<<(indexdiff&31))) != 0);
1182  }
1183  if (excluded) nexcluded += 2;
1184  if (!excluded) excl |= (WarpMask)1 << (BOUNDINGBOXSIZE-(nsweep - isweep)); // good luck understanding this
1185  if(doAlch){
1186  if(!excluded && r2 < cutoff2){
1187  jforce = float4(s_jforce[isweep][j]);
1188  if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1189  if(doShift){
1190  if(doFEP){
1191  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1192  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1193  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1194 #ifdef USE_TABLE_ARRAYS
1195  vdwCoefTable,
1196 #else
1197  vdwCoefTableTex,
1198 #endif
1199 #ifdef USE_TABLE_ARRAYS
1200  forceTable, energyTable,
1201 #else
1202  forceTableTex, energyTableTex,
1203 #endif
1204  iforce, iforceSlow,
1205  jforce, jforceSlow,
1206  energyVdw, energyVdw_s,
1207  energyElec, energySlow, energyElec_s, energySlow_s);
1208  }else{
1209  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1210  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1211  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1212 #ifdef USE_TABLE_ARRAYS
1213  vdwCoefTable,
1214 #else
1215  vdwCoefTableTex,
1216 #endif
1217 #ifdef USE_TABLE_ARRAYS
1218  forceTable, energyTable,
1219 #else
1220  forceTableTex, energyTableTex,
1221 #endif
1222  iforce, iforceSlow,
1223  jforce, jforceSlow,
1224  energyVdw, energyVdw_ti_1,
1225  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1226  energySlow, energySlow_ti_1, energySlow_ti_2);
1227  }//if doFEP
1228  }else{
1229  if(doFEP){
1230  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1231  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1232  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1233 #ifdef USE_TABLE_ARRAYS
1234  vdwCoefTable,
1235 #else
1236  vdwCoefTableTex,
1237 #endif
1238 #ifdef USE_TABLE_ARRAYS
1239  forceTable, energyTable,
1240 #else
1241  forceTableTex, energyTableTex,
1242 #endif
1243  iforce, iforceSlow,
1244  jforce, jforceSlow,
1245  energyVdw, energyVdw_s,
1246  energyElec, energySlow, energyElec_s, energySlow_s);
1247  }else{
1248  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1249  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1250  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1251 #ifdef USE_TABLE_ARRAYS
1252  vdwCoefTable,
1253 #else
1254  vdwCoefTableTex,
1255 #endif
1256 #ifdef USE_TABLE_ARRAYS
1257  forceTable, energyTable,
1258 #else
1259  forceTableTex, energyTableTex,
1260 #endif
1261  iforce, iforceSlow,
1262  jforce, jforceSlow,
1263  energyVdw, energyVdw_ti_1,
1264  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1265  energySlow, energySlow_ti_1, energySlow_ti_2);
1266  }
1267  }//if doShift
1268  s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1269  if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1270  }//if !excluded && r2 < cutoff2
1271  }else{
1272  if (!excluded && r2 < cutoff2) {
1273  jforce = float4(s_jforce[isweep][j]);
1274  if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1275  if(doTable){
1276  calcForceEnergy<doEnergy, doSlow>(
1277  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1278  vdwtypei, s_vdwtypej[iwarp][j],
1279 #ifdef USE_TABLE_ARRAYS
1280  vdwCoefTable,
1281 #else
1282  vdwCoefTableTex,
1283 #endif
1284 #ifdef USE_TABLE_ARRAYS
1285  forceTable, energyTable,
1286 #else
1287  forceTableTex, energyTableTex,
1288 #endif
1289  iforce, iforceSlow,
1290  jforce, jforceSlow,
1291  energyVdw,
1292  energyElec, energySlow);
1293  } else {
1294  calcForceEnergyMath<doEnergy, doSlow>(
1295  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1296  vdwtypei, s_vdwtypej[iwarp][j],
1297 #ifdef USE_TABLE_ARRAYS
1298  vdwCoefTable,
1299 #else
1300  vdwCoefTableTex,
1301 #endif
1302  iforce, iforceSlow,
1303  jforce, jforceSlow,
1304  energyVdw,
1305  energyElec, energySlow, nbConstants);
1306 
1307  }
1308  s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1309  if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1310  }
1311  }
1312  }
1313  }
1314  }
1315  NAMD_WARP_SYNC(WARP_FULL_MASK);
1316  } // t
1317  } else {
1318  // Just compute forces
1319  if (self) {
1320  // Clear the first bit
1321  excl = excl & (~(WarpMask)1);
1322  }
1323  excl >>= isweep;
1324  int ind_sweep = isweep + wid;
1325 #pragma unroll 4
1326  for (int t = 0; t < BOUNDINGBOXSIZE;t+= nsweep) {
1327  if (doAlch) {
1328  int j = (ind_sweep + t) & (BOUNDINGBOXSIZE-1);
1329  part2 = WARP_SHUFFLE(WARP_FULL_MASK, p2, j, WARPSIZE);
1330  }
1331  if ((excl & 1)) {
1332  fast_float3 xyz_j;
1333  int j = (ind_sweep + t) & (BOUNDINGBOXSIZE-1);
1334  xyzq_j = (float4)s_xyzq[iwarp][j];
1335  xyz_j.x = xyzq_j.x;
1336  xyz_j.y = xyzq_j.y;
1337  xyz_j.z = xyzq_j.z;
1338  fast_float3 dr = xyz_j - xyz_i;
1339  // float dx = xyzq_j.x - xyzq_i.x;
1340  // float dy = xyzq_j.y - xyzq_i.y;
1341  // float dz = xyzq_j.z - xyzq_i.z;
1342  // float r2 = dx*dx + dy*dy + dz*dz;
1343  float dx = dr.x;
1344  float dy = dr.y;
1345  float dz = dr.z;
1346  float r2 = norm2(dr);
1347  if(doAlch){
1348  if(r2 < cutoff2){ // (r2 < cutoff2)
1349  jforce = float4(s_jforce[isweep][j]);
1350  if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1351  if(doShift){
1352  if (doFEP){
1353  calcForceEnergyFEP<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1354  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1355  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1356 #ifdef USE_TABLE_ARRAYS
1357  vdwCoefTable,
1358 #else
1359  vdwCoefTableTex,
1360 #endif
1361 #ifdef USE_TABLE_ARRAYS
1362  forceTable, energyTable,
1363 #else
1364  forceTableTex, energyTableTex,
1365 #endif
1366  iforce, iforceSlow,
1367  jforce, jforceSlow,
1368  energyVdw, energyVdw_s,
1369  energyElec, energySlow, energyElec_s, energySlow_s);
1370  }else{
1371  calcForceEnergyTI<doEnergy, doSlow, true, doAlchVdwForceSwitching>(
1372  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1373  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1374 #ifdef USE_TABLE_ARRAYS
1375  vdwCoefTable,
1376 #else
1377  vdwCoefTableTex,
1378 #endif
1379 #ifdef USE_TABLE_ARRAYS
1380  forceTable, energyTable,
1381 #else
1382  forceTableTex, energyTableTex,
1383 #endif
1384  iforce, iforceSlow,
1385  jforce, jforceSlow,
1386  energyVdw, energyVdw_ti_1,
1387  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1388  energySlow, energySlow_ti_1, energySlow_ti_2);
1389  }//if doFEP
1390  }else{
1391  if(doFEP){
1392  calcForceEnergyFEP<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1393  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1394  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1395 #ifdef USE_TABLE_ARRAYS
1396  vdwCoefTable,
1397 #else
1398  vdwCoefTableTex,
1399 #endif
1400 #ifdef USE_TABLE_ARRAYS
1401  forceTable, energyTable,
1402 #else
1403  forceTableTex, energyTableTex,
1404 #endif
1405  iforce, iforceSlow,
1406  jforce, jforceSlow,
1407  energyVdw, energyVdw_s,
1408  energyElec, energySlow, energyElec_s, energySlow_s);
1409  }else{
1410  calcForceEnergyTI<doEnergy, doSlow, false, doAlchVdwForceSwitching>(
1411  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1412  vdwtypei, s_vdwtypej[iwarp][j], part1, part2,
1413 #ifdef USE_TABLE_ARRAYS
1414  vdwCoefTable,
1415 #else
1416  vdwCoefTableTex,
1417 #endif
1418 #ifdef USE_TABLE_ARRAYS
1419  forceTable, energyTable,
1420 #else
1421  forceTableTex, energyTableTex,
1422 #endif
1423  iforce, iforceSlow,
1424  jforce, jforceSlow,
1425  energyVdw, energyVdw_ti_1,
1426  energyVdw_ti_2, energyElec, energyElec_ti_1, energyElec_ti_2,
1427  energySlow, energySlow_ti_1, energySlow_ti_2);
1428  }//if doFEP
1429  }//doShift
1430  s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1431  if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1432  }//r2 < cutoff
1433  }else {
1434  if (r2 < cutoff2) {
1435  jforce = float4(s_jforce[isweep][j]);
1436  if(doSlow) jforceSlow = float4(s_jforceSlow[isweep][j]);
1437  if (doTable) {
1438  calcForceEnergy<doEnergy, doSlow>(
1439  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1440  vdwtypei, s_vdwtypej[iwarp][j],
1441 #ifdef USE_TABLE_ARRAYS
1442  vdwCoefTable,
1443 #else
1444  vdwCoefTableTex,
1445 #endif
1446 #ifdef USE_TABLE_ARRAYS
1447  forceTable, energyTable,
1448 #else
1449  forceTableTex, energyTableTex,
1450 #endif
1451  iforce, iforceSlow,
1452  jforce, jforceSlow,
1453  energyVdw, energyElec, energySlow);
1454  }
1455  else{
1456  calcForceEnergyMath<doEnergy, doSlow>(
1457  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
1458  vdwtypei, s_vdwtypej[iwarp][j],
1459 #ifdef USE_TABLE_ARRAYS
1460  vdwCoefTable,
1461 #else
1462  vdwCoefTableTex,
1463 #endif
1464  iforce, iforceSlow,
1465  jforce, jforceSlow,
1466  energyVdw,
1467  energyElec, energySlow, nbConstants);
1468  }
1469  s_jforce[isweep][j] = make_float4(jforce.x, jforce.y, jforce.z, 0.f);
1470  if(doSlow) s_jforceSlow[isweep][j] = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 0.f);
1471  }
1472  }//doAlch
1473  }
1474  excl >>= nsweep;
1475  NAMD_WARP_SYNC(WARP_FULL_MASK);
1476  } // t
1477  }
1478 
1479  // Write j-forces - shuffle them before storing to global memory
1480  jforce = s_jforce[isweep][wid];
1481  if(doSlow) jforceSlow = s_jforceSlow[isweep][wid];
1482  float4 jforceAccum = make_float4(jforce.x, jforce.y, jforce.z, 1.f);
1483  float4 jforceSlowAccum;
1484  if (doSlow) jforceSlowAccum = make_float4(jforceSlow.x, jforceSlow.y, jforceSlow.z, 1.f);
1485 #if 0
1486  jforce.x += WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, wid + BOUNDINGBOXSIZE, WARPSIZE);
1487  jforce.y += WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, wid + BOUNDINGBOXSIZE, WARPSIZE);
1488  jforce.z += WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, wid + BOUNDINGBOXSIZE, WARPSIZE);
1489  if(doSlow){
1490  jforceSlow.x += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, wid + BOUNDINGBOXSIZE, WARPSIZE);
1491  jforceSlow.y += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, wid + BOUNDINGBOXSIZE, WARPSIZE);
1492  jforceSlow.z += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, wid + BOUNDINGBOXSIZE, WARPSIZE);
1493  }
1494 #else
1495  for(int k = 1 ; k < nsweep; k++){
1496  jforceAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1497  jforceAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1498  jforceAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1499  if(doSlow){
1500  jforceSlowAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1501  jforceSlowAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1502  jforceSlowAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1503  }
1504  }
1505 #endif
1506  if(!isweep){
1507  storeForces<doSlow, float4>(jatomStart + wid, jforceAccum, jforceSlowAccum,
1508  devForce_x, devForce_y, devForce_z,
1509  devForceSlow_x, devForceSlow_y, devForceSlow_z);
1510  }
1511  // Write exclusions
1512  if (doPairlist) {
1513 
1514  // We have to calculate the global flags of this part here, which is
1515  WarpMask res_excl = excl;
1516  for(int k = 1 ; k < nsweep; k++){
1517  // shuffle the integers to make sure we have complete flags
1518  res_excl |= WARP_SHUFFLE(WARP_FULL_MASK, excl, wid + BOUNDINGBOXSIZE*k, WARPSIZE);
1519  }
1520  // after that, everyone grabs the value from the lower warp
1521  int anyexcl = (65536 | NAMD_WARP_ANY(WARP_FULL_MASK, res_excl != 0));
1522  // Mark this jtile as non-empty:
1523  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
1524  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
1525  if (!isweep && anyexcl){ // lower threads in warp updates the values
1526  if (wid) jtiles[jtile] = anyexcl;
1527  // Store exclusions
1528  tileExcls[jtile].excl[wid] = res_excl;
1529  // itileListLen:
1530  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
1531  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1532  // low sweep has the correct value for this, so all good
1533  itileListLen += anyexcl;
1534  // NOTE, this minJatomStart is only stored once for the first tile list entry
1535  // minJatomStart = min(minJatomStart, jatomStart);
1536  }
1537  }
1538 
1539  } // jtile
1540 
1541 #if 1
1542  // Write i-forces
1543  // shfl before writing - lower tile gets high tile value
1544  float3 iforceAccum = iforce;
1545  float3 iforceSlowAccum = iforceSlow;
1546  for(int k = 1 ; k < nsweep; k++){
1547  iforceAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, iforce.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1548  iforceAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, iforce.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1549  iforceAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, iforce.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1550  if(doSlow){
1551  iforceSlowAccum.x += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.x, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1552  iforceSlowAccum.y += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.y, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1553  iforceSlowAccum.z += WARP_SHUFFLE(WARP_FULL_MASK, iforceSlow.z, wid+BOUNDINGBOXSIZE*k, WARPSIZE);
1554  }
1555  }
1556  if(!isweep){
1557  storeForces<doSlow, float3>(iatomStart + wid, iforceAccum, iforceSlowAccum,
1558  devForce_x, devForce_y, devForce_z,
1559  devForceSlow_x, devForceSlow_y, devForceSlow_z);
1560  }
1561 #endif
1562  }
1563  // Done with computation
1564 
1565  // Save pairlist stuff
1566  if (doPairlist) {
1567  if (wid == 0 && isweep == 0){
1568  // minJatomStart is in range [0 ... atomStorageSize-1]
1569  //int atom0 = (minJatomStart)/WARPSIZE;
1570  // int atom0 = 0;
1571  // int storageOffset = atomStorageSize/WARPSIZE;
1572  // int itileListLen = 0;
1573  // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
1574  // Store 0 if itileListLen == 0
1575  // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
1576  // race condition here if we have the warp fetching two tiles from the same list
1577  tileListDepth[itileList] = itileListLen;
1578  tileListOrder[itileList] = itileList;
1579  // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
1580  if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
1581  // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
1582  if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
1583  // NOTE: always numTileListsGBIS >= numTileLists
1584  }
1585 
1586  typedef cub::WarpReduce<int> WarpReduceInt;
1587  __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1588  const int warpId = threadIdx.x / WARPSIZE;
1589  // Remove indicator bit
1590  nexcluded >>= 1;
1591  volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
1592  if (threadIdx.x % WARPSIZE == 0){
1593  atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
1594  }
1595  }
1596 
1597  if (doVirial) {
1598  typedef cub::WarpReduce<float> WarpReduce;
1599  __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
1600  const int warpId = threadIdx.x / WARPSIZE;
1601  volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
1602  NAMD_WARP_SYNC(WARP_FULL_MASK);
1603  volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
1604  NAMD_WARP_SYNC(WARP_FULL_MASK);
1605  volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
1606  NAMD_WARP_SYNC(WARP_FULL_MASK);
1607  if (wid == 0) {
1608  virialEnergy[itileList].forcex = iforcexSum;
1609  virialEnergy[itileList].forcey = iforceySum;
1610  virialEnergy[itileList].forcez = iforcezSum;
1611  }
1612 
1613  if (doSlow) {
1614  iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
1615  NAMD_WARP_SYNC(WARP_FULL_MASK);
1616  iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
1617  NAMD_WARP_SYNC(WARP_FULL_MASK);
1618  iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
1619  NAMD_WARP_SYNC(WARP_FULL_MASK);
1620  if (wid == 0) {
1621  virialEnergy[itileList].forceSlowx = iforcexSum;
1622  virialEnergy[itileList].forceSlowy = iforceySum;
1623  virialEnergy[itileList].forceSlowz = iforcezSum;
1624  }
1625  }
1626  }
1627 
1628  // Reduce energy
1629  if (doEnergy) {
1630  // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
1631  // (Why does CUB suck here?)
1632 #pragma unroll
1633  for (int i=WARPSIZE/2;i >= 1;i/=2) {
1634  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
1635  energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, WARPSIZE);
1636  if(doFEP) energyVdw_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_s, i, WARPSIZE);
1637  if(doFEP) energyElec_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_s, i, WARPSIZE);
1638  if(doTI){
1639  energyVdw_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_1, i, WARPSIZE);
1640  energyVdw_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw_ti_2, i, WARPSIZE);
1641  energyElec_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_1, i, WARPSIZE);
1642  energyElec_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec_ti_2, i, WARPSIZE);
1643  }
1644  if (doSlow){
1645  energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
1646  if(doFEP) energySlow_s += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_s, i, WARPSIZE);
1647  if(doTI){
1648  energySlow_ti_1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_1, i, WARPSIZE);
1649  energySlow_ti_2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow_ti_2, i, WARPSIZE);
1650  }
1651  }
1652  }
1653 
1654  if (threadIdx.x % WARPSIZE == 0) {
1655  virialEnergy[itileList].energyVdw = energyVdw;
1656  virialEnergy[itileList].energyElec = energyElec;
1657  if (doFEP) virialEnergy[itileList].energyVdw_s = energyVdw_s;
1658  if (doFEP) virialEnergy[itileList].energyElec_s = energyElec_s;
1659  if(doTI){
1660  virialEnergy[itileList].energyVdw_ti_1 = energyVdw_ti_1;
1661  virialEnergy[itileList].energyVdw_ti_2 = energyVdw_ti_2;
1662  virialEnergy[itileList].energyElec_ti_1 = energyElec_ti_1;
1663  virialEnergy[itileList].energyElec_ti_2 = energyElec_ti_2;
1664  }
1665  if (doSlow) {
1666  virialEnergy[itileList].energySlow = energySlow;
1667  if(doFEP) virialEnergy[itileList].energySlow_s = energySlow_s;
1668  if (doTI){
1669  virialEnergy[itileList].energySlow_ti_1 = energySlow_ti_1;
1670  virialEnergy[itileList].energySlow_ti_2 = energySlow_ti_2;
1671  }
1672  }
1673  }
1674  }
1675  // XXX TODO: Disable streaming and see what happens
1676  // Let's try to set
1677  if (doStreaming) {
1678  // Make sure devForces and devForcesSlow have been written into device memory
1679  NAMD_WARP_SYNC(WARP_FULL_MASK);
1680  __threadfence();
1681 
1682  int patchDone[2] = {false, false};
1683  const int wid = threadIdx.x % WARPSIZE;
1684  if (wid == 0) {
1685  int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
1686  patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
1687  if (patchInd.x != patchInd.y) {
1688  int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
1689  patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
1690  }
1691  }
1692 
1693  patchDone[0] = NAMD_WARP_ANY(WARP_FULL_MASK, patchDone[0]);
1694  patchDone[1] = NAMD_WARP_ANY(WARP_FULL_MASK, patchDone[1]);
1695 
1696  if (patchDone[0]) {
1697  // Patch 1 is done, write onto host-mapped memory
1698  CudaPatchRecord patch = cudaPatches[patchInd.x];
1699  int start = patch.atomStart;
1700  int end = start + patch.numAtoms;
1701  for (int i=start+wid;i < end;i+=WARPSIZE) {
1702  mapForces[i] = make_float4(devForce_x[i],
1703  devForce_y[i], devForce_z[i], devForce_w[i]);
1704  if (doSlow) {
1705  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1706  devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1707  }
1708  }
1709  }
1710  if (patchDone[1]) {
1711  // Patch 2 is done
1712  CudaPatchRecord patch = cudaPatches[patchInd.y];
1713  int start = patch.atomStart;
1714  int end = start + patch.numAtoms;
1715  for (int i=start+wid;i < end;i+=WARPSIZE) {
1716  mapForces[i] = make_float4(devForce_x[i],
1717  devForce_y[i], devForce_z[i], devForce_w[i]);
1718  if (doSlow) {
1719  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
1720  devForceSlow_y[i], devForceSlow_z[i], devForceSlow_w[i]);
1721  }
1722  }
1723  }
1724 
1725  if (patchDone[0] || patchDone[1]) {
1726  // Make sure mapForces and mapForcesSlow are up-to-date
1727  NAMD_WARP_SYNC(WARP_FULL_MASK);
1728  __threadfence_system();
1729  // Add patch into "patchReadyQueue"
1730  if (wid == 0) {
1731  if (patchDone[0]) {
1732  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1733  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1734  mapPatchReadyQueue[ind] = patchInd.x;
1735  }
1736  if (patchDone[1]) {
1737  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
1738  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
1739  mapPatchReadyQueue[ind] = patchInd.y;
1740  }
1741  }
1742  }
1743  }
1744 
1745  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
1746  int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
1747  outputOrder[index] = itileList;
1748  }
1749  } // if (itileList < numTileLists)
1750 }
1751 
1752 //
1753 // Finish up - reduce virials from nonbonded kernel
1754 //
1755 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
1756  const int atomStorageSize,
1757  const float4* __restrict__ xyzq,
1758  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
1759  VirialEnergy* __restrict__ virialEnergy) {
1760 
1761  for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
1762  {
1763  int i = ibase + threadIdx.x;
1764 
1765  // Set to zero to avoid nan*0
1766  float4 pos;
1767  pos.x = 0.0f;
1768  pos.y = 0.0f;
1769  pos.z = 0.0f;
1770  float4 force, forceSlow;
1771  force.x = 0.0f;
1772  force.y = 0.0f;
1773  force.z = 0.0f;
1774  forceSlow.x = 0.0f;
1775  forceSlow.y = 0.0f;
1776  forceSlow.z = 0.0f;
1777  if (i < atomStorageSize) {
1778  pos = xyzq[i];
1779  force = devForces[i];
1780  if (doSlow) forceSlow = devForcesSlow[i];
1781  }
1782  // Reduce across the entire thread block
1783  float vxxt = force.x*pos.x;
1784  float vxyt = force.x*pos.y;
1785  float vxzt = force.x*pos.z;
1786  float vyxt = force.y*pos.x;
1787  float vyyt = force.y*pos.y;
1788  float vyzt = force.y*pos.z;
1789  float vzxt = force.z*pos.x;
1790  float vzyt = force.z*pos.y;
1791  float vzzt = force.z*pos.z;
1792 
1793  const int bin = blockIdx.x % ATOMIC_BINS;
1794 
1795  typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1796  __shared__ typename BlockReduce::TempStorage tempStorage;
1797  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1798  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1799  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1800  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1801  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1802  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1803  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1804  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1805  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1806  if (threadIdx.x == 0) {
1807  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
1808  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
1809  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
1810  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
1811  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
1812  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
1813  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
1814  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
1815  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
1816  }
1817 
1818  if (doSlow) {
1819  // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
1820  float vxxSlowt = forceSlow.x*pos.x;
1821  float vxySlowt = forceSlow.x*pos.y;
1822  float vxzSlowt = forceSlow.x*pos.z;
1823  float vyxSlowt = forceSlow.y*pos.x;
1824  float vyySlowt = forceSlow.y*pos.y;
1825  float vyzSlowt = forceSlow.y*pos.z;
1826  float vzxSlowt = forceSlow.z*pos.x;
1827  float vzySlowt = forceSlow.z*pos.y;
1828  float vzzSlowt = forceSlow.z*pos.z;
1829  float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
1830  float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
1831  float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
1832  float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
1833  float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
1834  float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
1835  float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
1836  float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
1837  float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
1838  if (threadIdx.x == 0) {
1839  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxxSlow);
1840  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxySlow);
1841  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxzSlow);
1842  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyxSlow);
1843  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyySlow);
1844  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyzSlow);
1845  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzxSlow);
1846  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzySlow);
1847  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzzSlow);
1848  }
1849  }
1850 
1851  }
1852 }
1853 
1854 __global__ void reduceVirialEnergyKernel(
1855  const bool doEnergy, const bool doVirial, const bool doSlow,
1856  const int numTileLists,
1857  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1858  VirialEnergy* __restrict__ virialEnergy) {
1859 
1860  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1861  {
1862  int itileList = ibase + threadIdx.x;
1863  TileListVirialEnergy ve;
1864  if (itileList < numTileLists) {
1865  ve = tileListVirialEnergy[itileList];
1866  } else {
1867  // Set to zero to avoid nan*0
1868  if (doVirial) {
1869  ve.shx = 0.0f;
1870  ve.shy = 0.0f;
1871  ve.shz = 0.0f;
1872  ve.forcex = 0.0f;
1873  ve.forcey = 0.0f;
1874  ve.forcez = 0.0f;
1875  ve.forceSlowx = 0.0f;
1876  ve.forceSlowy = 0.0f;
1877  ve.forceSlowz = 0.0f;
1878  }
1879  if (doEnergy) {
1880  ve.energyVdw = 0.0;
1881  ve.energyVdw_s = 0.0;
1882  ve.energyElec = 0.0;
1883  ve.energySlow = 0.0;
1884  ve.energyElec_s = 0.0;
1885  ve.energySlow_s = 0.0;
1886 
1887  /* TI stuff */
1888  ve.energyVdw_ti_1 = 0.0;
1889  ve.energyVdw_ti_2 = 0.0;
1890  ve.energyElec_ti_1 = 0.0;
1891  ve.energyElec_ti_2 = 0.0;
1892  ve.energySlow_ti_1 = 0.0;
1893  ve.energySlow_ti_2 = 0.0;
1894  // ve.energyGBIS = 0.0;
1895  }
1896  }
1897 
1898  const int bin = blockIdx.x % ATOMIC_BINS;
1899 
1900  if (doVirial) {
1901  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1902  __shared__ typename BlockReduce::TempStorage tempStorage;
1903  float vxxt = ve.forcex*ve.shx;
1904  float vxyt = ve.forcex*ve.shy;
1905  float vxzt = ve.forcex*ve.shz;
1906  float vyxt = ve.forcey*ve.shx;
1907  float vyyt = ve.forcey*ve.shy;
1908  float vyzt = ve.forcey*ve.shz;
1909  float vzxt = ve.forcez*ve.shx;
1910  float vzyt = ve.forcez*ve.shy;
1911  float vzzt = ve.forcez*ve.shz;
1912  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1913  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1914  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1915  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1916  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1917  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1918  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1919  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1920  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1921  if (threadIdx.x == 0) {
1922  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
1923  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
1924  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
1925  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
1926  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
1927  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
1928  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
1929  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
1930  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
1931  }
1932 
1933  if (doSlow) {
1934  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1935  __shared__ typename BlockReduce::TempStorage tempStorage;
1936  float vxxt = ve.forceSlowx*ve.shx;
1937  float vxyt = ve.forceSlowx*ve.shy;
1938  float vxzt = ve.forceSlowx*ve.shz;
1939  float vyxt = ve.forceSlowy*ve.shx;
1940  float vyyt = ve.forceSlowy*ve.shy;
1941  float vyzt = ve.forceSlowy*ve.shz;
1942  float vzxt = ve.forceSlowz*ve.shx;
1943  float vzyt = ve.forceSlowz*ve.shy;
1944  float vzzt = ve.forceSlowz*ve.shz;
1945  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1946  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1947  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1948  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1949  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1950  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1951  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1952  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1953  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1954  if (threadIdx.x == 0) {
1955  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxx);
1956  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxy);
1957  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxz);
1958  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyx);
1959  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyy);
1960  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyz);
1961  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzx);
1962  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzy);
1963  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzz);
1964  }
1965  }
1966  }
1967 
1968  if (doEnergy) {
1969  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1970  /* Maybe we should guard the TI and FEP energies, since those are not to be calculated on regular MDs */
1971  __shared__ typename BlockReduce::TempStorage tempStorage;
1972  double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1973  double energyVdw_s = BlockReduce(tempStorage).Sum(ve.energyVdw_s); BLOCK_SYNC;
1974  double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1975  double energyElec_s = BlockReduce(tempStorage).Sum(ve.energyElec_s); BLOCK_SYNC;
1976  double energyVdw_ti_1 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_1); BLOCK_SYNC;
1977  double energyVdw_ti_2 = BlockReduce(tempStorage).Sum(ve.energyVdw_ti_2); BLOCK_SYNC;
1978  double energyElec_ti_1 = BlockReduce(tempStorage).Sum(ve.energyElec_ti_1); BLOCK_SYNC;
1979  double energyElec_ti_2 = BlockReduce(tempStorage).Sum(ve.energyElec_ti_2); BLOCK_SYNC;
1980  if (threadIdx.x == 0){
1981  atomicAdd(&virialEnergy[bin].energyVdw, energyVdw);
1982  atomicAdd(&virialEnergy[bin].energyVdw_s, energyVdw_s);
1983  atomicAdd(&virialEnergy[bin].energyElec, energyElec);
1984  atomicAdd(&virialEnergy[bin].energyElec_s, energyElec_s);
1985  atomicAdd(&virialEnergy[bin].energyVdw_ti_1, energyVdw_ti_1);
1986  atomicAdd(&virialEnergy[bin].energyVdw_ti_2, energyVdw_ti_2);
1987  atomicAdd(&virialEnergy[bin].energyElec_ti_1, energyElec_ti_1);
1988  atomicAdd(&virialEnergy[bin].energyElec_ti_2, energyElec_ti_2);
1989  }
1990  if (doSlow) {
1991  double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1992  double energySlow_s = BlockReduce(tempStorage).Sum(ve.energySlow_s); BLOCK_SYNC;
1993  double energySlow_ti_1 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_1); BLOCK_SYNC;
1994  double energySlow_ti_2 = BlockReduce(tempStorage).Sum(ve.energySlow_ti_2); BLOCK_SYNC;
1995  if (threadIdx.x == 0) {
1996  atomicAdd(&virialEnergy[bin].energySlow, energySlow);
1997  atomicAdd(&virialEnergy[bin].energySlow_s, energySlow_s);
1998  atomicAdd(&virialEnergy[bin].energySlow_ti_1, energySlow_ti_1);
1999  atomicAdd(&virialEnergy[bin].energySlow_ti_2, energySlow_ti_2);
2000  }
2001  }
2002  }
2003 
2004  }
2005 
2006 }
2007 
2008 __global__ void reduceGBISEnergyKernel(const int numTileLists,
2009  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
2010  VirialEnergy* __restrict__ virialEnergy) {
2011 
2012  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
2013  {
2014  int itileList = ibase + threadIdx.x;
2015  double energyGBISt = 0.0;
2016  if (itileList < numTileLists) {
2017  energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
2018  }
2019 
2020  const int bin = blockIdx.x % ATOMIC_BINS;
2021 
2022  typedef cub::BlockReduce<double, REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
2023  __shared__ typename BlockReduce::TempStorage tempStorage;
2024  double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
2025  if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energyGBIS, energyGBIS);
2026  }
2027 }
2028 
2029 __global__ void reduceNonbondedBinsKernel(
2030  const bool doVirial,
2031  const bool doEnergy,
2032  const bool doSlow,
2033  const bool doGBIS,
2034  VirialEnergy* __restrict__ virialEnergy) {
2035 
2036  const int bin = threadIdx.x;
2037 
2038  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2039  __shared__ typename WarpReduce::TempStorage tempStorage;
2040 
2041  if (doVirial) {
2042  double vxx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[0]);
2043  double vxy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[1]);
2044  double vxz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[2]);
2045  double vyx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[3]);
2046  double vyy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[4]);
2047  double vyz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[5]);
2048  double vzx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[6]);
2049  double vzy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[7]);
2050  double vzz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[8]);
2051  if (threadIdx.x == 0) {
2052  virialEnergy->virial[0] = vxx;
2053  virialEnergy->virial[1] = vxy;
2054  virialEnergy->virial[2] = vxz;
2055  virialEnergy->virial[3] = vyx;
2056  virialEnergy->virial[4] = vyy;
2057  virialEnergy->virial[5] = vyz;
2058  virialEnergy->virial[6] = vzx;
2059  virialEnergy->virial[7] = vzy;
2060  virialEnergy->virial[8] = vzz;
2061  }
2062 
2063  if (doSlow) {
2064  double vxxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[0]);
2065  double vxySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[1]);
2066  double vxzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[2]);
2067  double vyxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[3]);
2068  double vyySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[4]);
2069  double vyzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[5]);
2070  double vzxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[6]);
2071  double vzySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[7]);
2072  double vzzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[8]);
2073  if (threadIdx.x == 0) {
2074  virialEnergy->virialSlow[0] = vxxSlow;
2075  virialEnergy->virialSlow[1] = vxySlow;
2076  virialEnergy->virialSlow[2] = vxzSlow;
2077  virialEnergy->virialSlow[3] = vyxSlow;
2078  virialEnergy->virialSlow[4] = vyySlow;
2079  virialEnergy->virialSlow[5] = vyzSlow;
2080  virialEnergy->virialSlow[6] = vzxSlow;
2081  virialEnergy->virialSlow[7] = vzySlow;
2082  virialEnergy->virialSlow[8] = vzzSlow;
2083  }
2084  }
2085  }
2086 
2087  if (doEnergy) {
2088  double energyVdw = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw);
2089  double energyVdw_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_s);
2090  double energyElec = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec);
2091  double energyElec_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_s);
2092  double energyVdw_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_ti_1);
2093  double energyVdw_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw_ti_2);
2094  double energyElec_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_ti_1);
2095  double energyElec_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec_ti_2);
2096  if (threadIdx.x == 0) {
2097  virialEnergy->energyVdw = energyVdw;
2098  virialEnergy->energyVdw_s = energyVdw_s;
2099  virialEnergy->energyElec = energyElec;
2100  virialEnergy->energyElec_s = energyElec_s;
2101  virialEnergy->energyVdw_ti_1 = energyVdw_ti_1;
2102  virialEnergy->energyVdw_ti_2 = energyVdw_ti_2;
2103  virialEnergy->energyElec_ti_1 = energyElec_ti_1;
2104  virialEnergy->energyElec_ti_2 = energyElec_ti_2;
2105  }
2106  if (doSlow) {
2107  double energySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow);
2108  double energySlow_s = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_s);
2109  double energySlow_ti_1 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_ti_1);
2110  double energySlow_ti_2 = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow_ti_2);
2111  if (threadIdx.x == 0) {
2112  virialEnergy->energySlow = energySlow;
2113  virialEnergy->energySlow_s = energySlow_s;
2114  virialEnergy->energySlow_ti_1 = energySlow_ti_1;
2115  virialEnergy->energySlow_ti_2 = energySlow_ti_2;
2116  }
2117  }
2118  if (doGBIS) {
2119  double energyGBIS = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyGBIS);
2120  if (threadIdx.x == 0) {
2121  virialEnergy->energyGBIS = energyGBIS;
2122  }
2123  }
2124  }
2125 }
2126 
2127 // ##############################################################################################
2128 // ##############################################################################################
2129 // ##############################################################################################
2130 
2131 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
2132  bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
2133 
2134  cudaCheck(cudaSetDevice(deviceID));
2135 
2136  overflowExclusions = NULL;
2137  overflowExclusionsSize = 0;
2138 
2139  exclIndexMaxDiff = NULL;
2140  exclIndexMaxDiffSize = 0;
2141 
2142  atomIndex = NULL;
2143  atomIndexSize = 0;
2144 
2145  vdwTypes = NULL;
2146  vdwTypesSize = 0;
2147 
2148  patchNumCount = NULL;
2149  patchNumCountSize = 0;
2150 
2151  patchReadyQueue = NULL;
2152  patchReadyQueueSize = 0;
2153 
2154  force_x = force_y = force_z = force_w = NULL;
2155  forceSize = 0;
2156  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
2157  forceSlowSize = 0;
2158 }
2159 
2160 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
2161 {
2162 #if 0
2163  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
2164  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
2165  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
2166  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
2167  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
2168  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
2169  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
2170  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
2171 #else
2172  reallocate_device<float>(&force_x, &forceSize, atomStorageSize*8, 1.4f);
2173  force_y = force_x + atomStorageSize;
2174  force_z = force_y + atomStorageSize;
2175  force_w = force_z + atomStorageSize;
2176  forceSlow_x = force_w + atomStorageSize;
2177  forceSlow_y = forceSlow_x + atomStorageSize;
2178  forceSlow_z = forceSlow_y + atomStorageSize;
2179  forceSlow_w = forceSlow_z + atomStorageSize;
2180 #endif
2181 }
2182 
2183 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
2184  cudaCheck(cudaSetDevice(deviceID));
2185  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
2186  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
2187  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
2188  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
2189  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
2190  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
2191 #if 0
2192  if (force_x != NULL) deallocate_device<float>(&force_x);
2193  if (force_y != NULL) deallocate_device<float>(&force_y);
2194  if (force_z != NULL) deallocate_device<float>(&force_z);
2195  if (force_w != NULL) deallocate_device<float>(&force_w);
2196  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
2197  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
2198  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
2199  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
2200 #else
2201  if (force_x != NULL) deallocate_device<float>(&force_x);
2202 #endif
2203 }
2204 
2205 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
2206  const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
2207 
2208  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
2209  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
2210  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
2211 
2212  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
2213  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
2214  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
2215 }
2216 
2217 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
2218  if (!doStreaming) {
2219  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
2220  }
2221  return patchReadyQueue;
2222 }
2223 
2224 template <int doSlow>
2225 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
2226  float *fx, float *fy, float *fz, float *fw,
2227  float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
2228  int n)
2229 {
2230  int tid = blockIdx.x*blockDim.x + threadIdx.x;
2231  if (tid < n) {
2232  f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
2233  fx[tid] = 0.f; fy[tid] = 0.f; fz[tid] = 0.f; fw[tid] = 0.f;
2234  if (doSlow) {
2235  fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
2236  fSlowx[tid] = 0.f; fSlowy[tid] = 0.f; fSlowz[tid] = 0.f; fSloww[tid] = 0.f;
2237  }
2238  }
2239 }
2240 
2241 
2242 
2243 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
2244  const int atomStorageSize, const bool atomsChanged, const bool doMinimize,
2245  const bool doPairlist,
2246  const bool doEnergy, const bool doVirial, const bool doSlow, const bool doAlch,
2247  const bool doAlchVdwForceSwitching,
2248  const bool doFEP, const bool doTI, const bool doTable,
2249  const float3 lata, const float3 latb, const float3 latc,
2250  const float4* h_xyzq, const float cutoff2,
2251  const CudaNBConstants nbConstants,
2252  float4* d_forces, float4* d_forcesSlow,
2253  float4* h_forces, float4* h_forcesSlow, AlchData *srcFlags,
2254  bool lambdaWindowUpdated, char *part,
2255  bool CUDASOAintegrator, bool useDeviceMigration,
2256  cudaStream_t stream) {
2257 
2258 #ifdef NODEGROUP_FORCE_REGISTER
2259  if (!atomsChanged && !CUDASOAintegrator) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2260 #else
2261  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
2262 #endif
2263 
2264  if (doAlch){
2265  // Copy partition to device. This is not necessary if both CUDASOAintegrator and useDeviceMigration
2266  // are true.
2267  if (doPairlist && (!CUDASOAintegrator || !useDeviceMigration)) {
2268  copy_HtoD< char>(part, tlKernel.get_part(), atomStorageSize, stream);
2269  }
2270  //Copies flags to constant memory
2271  if(lambdaWindowUpdated) cudaCheck(cudaMemcpyToSymbol(alchflags, srcFlags, sizeof(AlchData)));
2272  }
2273 
2274  // XXX TODO: Get rid of the clears
2275  if(1){
2276  if (doStreaming) tlKernel.clearTileListStat(stream);
2277  if(atomsChanged || doMinimize){
2278  clear_device_array<float>(force_x, atomStorageSize*4, stream);
2279  if(doSlow) clear_device_array<float>(forceSlow_x, atomStorageSize*4, stream);
2280  }
2281  }
2282 
2283  // --- streaming ----
2284  float4* m_forces = NULL;
2285  float4* m_forcesSlow = NULL;
2286  int* m_patchReadyQueue = NULL;
2287  int numPatches = 0;
2288  unsigned long long *calculatedPairs;
2289  unsigned long long *skippedPairs;
2290  unsigned int* patchNumCountPtr = NULL;
2291 
2292  if (doStreaming) {
2293  numPatches = tlKernel.getNumPatches();
2294  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
2295  // If re-allocated, clear array
2296  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
2297  }
2298  patchNumCountPtr = patchNumCount;
2299  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
2300  if (re) {
2301  // If re-allocated, re-set to "-1"
2302  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
2303  }
2304  cudaCheck(cudaHostGetDevicePointer((void**)&m_patchReadyQueue, patchReadyQueue, 0));
2305  cudaCheck(cudaHostGetDevicePointer((void**)&m_forces, h_forces, 0));
2306  cudaCheck(cudaHostGetDevicePointer((void**)&m_forcesSlow, h_forcesSlow, 0));
2307  }
2308  // -----------------
2309 
2310  if (doVirial || doEnergy) {
2311  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
2312  }
2313 
2314  int shMemSize = 0;
2315 
2316  int* outputOrderPtr = tlKernel.getOutputOrder();
2317 
2318  int nwarp = NONBONDKERNEL_NUM_WARP;
2319  int nthread = WARPSIZE*nwarp;
2320  int start = 0;
2321 
2322 #define APVERSION
2323 #undef APVERSION
2324 
2325 #ifdef APVERSION
2326 #else
2327  int options = doEnergy + (doVirial << 1) + (doSlow << 2) +
2328  (doPairlist << 3) + (doAlch << 4) + (doFEP << 5) + (doTI << 6) + (doStreaming << 7) + (doTable << 8) + (doAlchVdwForceSwitching << 9);
2329 #endif
2330 
2331  while (start < tlKernel.getNumTileLists()) {
2332 
2333  int nleft = tlKernel.getNumTileLists() - start;
2334  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
2335 
2336 #ifdef APVERSION
2337 
2338 #ifdef USE_TABLE_ARRAYS
2339  #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTable()
2340  #define TABLE_PARAMS \
2341  cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
2342 #else
2343  #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTableTex()
2344  #define TABLE_PARAMS \
2345  cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
2346 #endif
2347 
2348 
2349 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING) \
2350  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOALCHWDWFORCESWITCHING > \
2351  <<< nblock, nthread, shMemSize, stream >>> \
2352  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2353  cudaNonbondedTables.getVdwCoefTableWidth(), \
2354  VDW_TABLE_PARAMS, \
2355  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2356  TABLE_PARAMS, \
2357  tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2358  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2359  tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
2360  force_x, force_y, force_z, force_w, \
2361  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2362  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2363  outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part(), calculatedPairs, skippedPairs); called=true
2364 
2365  bool called = false;
2366  if (doStreaming) {
2367  if(!doAlch){
2368  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 1, 0);
2369  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 1, 0);
2370  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 1, 0);
2371  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 1, 0);
2372  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 1, 0);
2373  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 1, 0);
2374  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 1, 0);
2375  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 1, 0);
2376 
2377 
2378  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 1, 0);
2379  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 1, 0);
2380  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 1, 0);
2381  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 1, 0);
2382  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 1, 0);
2383  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 1, 0);
2384  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 1, 0);
2385  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 1, 0);
2386  }else{
2387  if(doFEP){
2388  if (doAlchVdwForceSwitching) {
2389  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 1);
2390  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 1);
2391  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 1);
2392  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 1);
2393  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 1);
2394  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 1);
2395  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 1);
2396  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 1);
2397 
2398  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 1);
2399  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 1);
2400  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 1);
2401  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 1);
2402  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 1);
2403  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 1);
2404  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 1);
2405  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 1);
2406  } else {
2407  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 1, 0);
2408  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 1, 0);
2409  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 1, 0);
2410  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 1, 0);
2411  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 1, 0);
2412  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 1, 0);
2413  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 1, 0);
2414  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 1, 0);
2415 
2416  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 1, 0);
2417  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 1, 0);
2418  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 1, 0);
2419  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 1, 0);
2420  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 1, 0);
2421  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 1, 0);
2422  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 1, 0);
2423  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 1, 0);
2424  }
2425  }else{
2426  if (doAlchVdwForceSwitching) {
2427  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 1);
2428  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 1);
2429  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 1);
2430  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 1);
2431  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 1);
2432  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 1);
2433  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 1);
2434  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 1);
2435 
2436  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 1);
2437  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 1);
2438  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 1);
2439  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 1);
2440  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 1);
2441  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 1);
2442  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 1);
2443  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 1);
2444  } else {
2445  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 1, 0);
2446  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 1, 0);
2447  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 1, 0);
2448  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 1, 0);
2449  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 1, 0);
2450  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 1, 0);
2451  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 1, 0);
2452  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 1, 0);
2453 
2454  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 1, 0);
2455  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 1, 0);
2456  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 1, 0);
2457  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 1, 0);
2458  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 1, 0);
2459  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 1, 0);
2460  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 1, 0);
2461  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 1, 0);
2462  }
2463  }
2464  }
2465  }
2466  else {
2467  if(!doAlch){
2468  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0, 0, 0, 0, 0);
2469  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0, 0, 0, 0, 0);
2470  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0, 0, 0, 0, 0);
2471  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0, 0, 0, 0, 0);
2472  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0, 0, 0, 0, 0);
2473  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0, 0, 0, 0, 0);
2474  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0, 0, 0, 0, 0);
2475  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0, 0, 0, 0, 0);
2476 
2477 
2478  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0, 0, 0, 0, 0);
2479  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0, 0, 0, 0, 0);
2480  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0, 0, 0, 0, 0);
2481  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0, 0, 0, 0, 0);
2482  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0, 0, 0, 0, 0);
2483  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0, 0, 0, 0, 0);
2484  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0, 0, 0, 0, 0);
2485  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0, 0, 0, 0, 0);
2486  }else{
2487  if(doFEP){
2488  if (doAlchVdwForceSwitching) {
2489  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 1);
2490  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 1);
2491  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 1);
2492  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 1);
2493  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 1);
2494  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 1);
2495  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 1);
2496  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 1);
2497 
2498  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 1);
2499  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 1);
2500  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 1);
2501  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 1);
2502  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 1);
2503  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 1);
2504  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 1);
2505  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 1);
2506  } else {
2507  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 1, 0, 0, 0);
2508  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 1, 0, 0, 0);
2509  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 1, 0, 0, 0);
2510  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 1, 0, 0, 0);
2511  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 1, 0, 0, 0);
2512  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 1, 0, 0, 0);
2513  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 1, 0, 0, 0);
2514  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 1, 0, 0, 0);
2515 
2516  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 1, 0, 0, 0);
2517  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 1, 0, 0, 0);
2518  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 1, 0, 0, 0);
2519  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 1, 0, 0, 0);
2520  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 1, 0, 0, 0);
2521  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 1, 0, 0, 0);
2522  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 1, 0, 0, 0);
2523  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 1, 0, 0, 0);
2524  }
2525  }else{
2526  if (doAlchVdwForceSwitching) {
2527  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 1);
2528  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 1);
2529  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 1);
2530  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 1);
2531  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 1);
2532  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 1);
2533  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 1);
2534  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 1);
2535 
2536  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 1);
2537  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 1);
2538  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 1);
2539  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 1);
2540  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 1);
2541  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 1);
2542  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 1);
2543  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 1);
2544  } else {
2545  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1, 0, 1, 0, 0);
2546  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1, 0, 1, 0, 0);
2547  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1, 0, 1, 0, 0);
2548  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1, 0, 1, 0, 0);
2549  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1, 0, 1, 0, 0);
2550  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1, 0, 1, 0, 0);
2551  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1, 0, 1, 0, 0);
2552  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1, 0, 1, 0, 0);
2553 
2554  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1, 0, 1, 0, 0);
2555  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1, 0, 1, 0, 0);
2556  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1, 0, 1, 0, 0);
2557  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1, 0, 1, 0, 0);
2558  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1, 0, 1, 0, 0);
2559  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1, 0, 1, 0, 0);
2560  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1, 0, 1, 0, 0);
2561  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1, 0, 1, 0, 0);
2562  }
2563  }//if doFEP
2564  }//if doAlch
2565  }//if doStreaming
2566 
2567  if (!called) {
2568  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
2569  }
2570 
2571 #else
2572 
2573 #ifdef USE_TABLE_ARRAYS
2574  #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTable()
2575  #define TABLE_PARAMS \
2576  cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
2577 #else
2578  #define VDW_TABLE_PARAMS cudaNonbondedTables.getVdwCoefTableTex()
2579  #define TABLE_PARAMS \
2580  cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
2581 #endif
2582 
2583 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING) \
2584  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOALCH, DOFEP, DOTI, DOSTREAMING, DOTABLE, DOALCHWDWFORCESWITCHING> \
2585  <<< nblock, nthread, shMemSize, stream >>> \
2586  (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
2587  cudaNonbondedTables.getVdwCoefTableWidth(), \
2588  VDW_TABLE_PARAMS, \
2589  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, nbConstants, \
2590  TABLE_PARAMS,\
2591  tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
2592  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
2593  tlKernel.getBoundingBoxes(), \
2594  force_x, force_y, force_z, force_w, \
2595  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
2596  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
2597  outputOrderPtr, tlKernel.getTileListVirialEnergy(), tlKernel.get_part())
2598 
2599 #ifdef DEBUG
2600  char errmsg[256];
2601 #endif
2602 
2603  switch (options) {
2604  case 0: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2605  case 1: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 0); break;
2606  case 2: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2607  case 3: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 0); break;
2608  case 4: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2609  case 5: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 0); break;
2610  case 6: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2611  case 7: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 0); break;
2612  case 8: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2613  case 9: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 0); break;
2614  case 10: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2615  case 11: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 0); break;
2616  case 12: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2617  case 13: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 0); break;
2618  case 14: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2619  case 15: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 0); break;
2620 
2621 #if 0
2622  case 16: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2623  case 17: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 0); break;
2624  case 18: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2625  case 19: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 0); break;
2626  case 20: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2627  case 21: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 0); break;
2628  case 22: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2629  case 23: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 0); break;
2630  case 24: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2631  case 25: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 0); break;
2632  case 26: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2633  case 27: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 0); break;
2634  case 28: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2635  case 29: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 0); break;
2636  case 30: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2637  case 31: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 0); break;
2638 
2639  case 32: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2640  case 33: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 0); break;
2641  case 34: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2642  case 35: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 0); break;
2643  case 36: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2644  case 37: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 0); break;
2645  case 38: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2646  case 39: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 0); break;
2647  case 40: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2648  case 41: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 0); break;
2649  case 42: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2650  case 43: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 0); break;
2651  case 44: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2652  case 45: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 0); break;
2653  case 46: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2654  case 47: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 0); break;
2655 
2656  case 48: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2657  case 49: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 0); break;
2658  case 50: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2659  case 51: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 0); break;
2660  case 52: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2661  case 53: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 0); break;
2662  case 54: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2663  case 55: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 0); break;
2664  case 56: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2665  case 57: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 0); break;
2666  case 58: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2667  case 59: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 0); break;
2668  case 60: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2669  case 61: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 0); break;
2670  case 62: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2671  case 63: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 0); break;
2672 
2673  case 64: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2674  case 65: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 0); break;
2675  case 66: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2676  case 67: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 0); break;
2677  case 68: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2678  case 69: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 0); break;
2679  case 70: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2680  case 71: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 0); break;
2681  case 72: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2682  case 73: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 0); break;
2683  case 74: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2684  case 75: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 0); break;
2685  case 76: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2686  case 77: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 0); break;
2687  case 78: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2688  case 79: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 0); break;
2689 
2690  case 80: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2691  case 81: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 0); break;
2692  case 82: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2693  case 83: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 0); break;
2694  case 84: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2695  case 85: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 0); break;
2696  case 86: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2697  case 87: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 0); break;
2698  case 88: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2699  case 89: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 0); break;
2700  case 90: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2701  case 91: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 0); break;
2702  case 92: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2703  case 93: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 0); break;
2704  case 94: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2705  case 95: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 0); break;
2706 
2707  case 96: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2708  case 97: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 0); break;
2709  case 98: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2710  case 99: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 0); break;
2711  case 100: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2712  case 101: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 0); break;
2713  case 102: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2714  case 103: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 0); break;
2715  case 104: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2716  case 105: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 0); break;
2717  case 106: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2718  case 107: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 0); break;
2719  case 108: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2720  case 109: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 0); break;
2721  case 110: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2722  case 111: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 0); break;
2723 
2724  case 112: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2725  case 113: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 0); break;
2726  case 114: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2727  case 115: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 0); break;
2728  case 116: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2729  case 117: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 0); break;
2730  case 118: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2731  case 119: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 0); break;
2732  case 120: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2733  case 121: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 0); break;
2734  case 122: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2735  case 123: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 0); break;
2736  case 124: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2737  case 125: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 0); break;
2738  case 126: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2739  case 127: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 0); break;
2740 
2741 #endif
2742  case 128: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2743  case 129: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 0); break;
2744  case 130: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2745  case 131: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 0); break;
2746  case 132: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2747  case 133: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 0); break;
2748  case 134: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2749  case 135: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 0); break;
2750  case 136: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2751  case 137: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 0); break;
2752  case 138: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2753  case 139: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 0); break;
2754  case 140: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2755  case 141: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 0); break;
2756  case 142: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2757  case 143: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 0); break;
2758 
2759 #if 0
2760  case 144: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2761  case 145: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 0); break;
2762  case 146: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2763  case 147: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 0); break;
2764  case 148: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2765  case 149: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 0); break;
2766  case 150: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2767  case 151: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 0); break;
2768  case 152: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2769  case 153: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 0); break;
2770  case 154: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2771  case 155: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 0); break;
2772  case 156: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2773  case 157: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 0); break;
2774  case 158: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2775  case 159: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 0); break;
2776 
2777  case 160: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2778  case 161: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 0); break;
2779  case 162: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2780  case 163: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 0); break;
2781  case 164: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2782  case 165: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 0); break;
2783  case 166: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2784  case 167: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 0); break;
2785  case 168: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2786  case 169: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 0); break;
2787  case 170: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2788  case 171: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 0); break;
2789  case 172: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2790  case 173: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 0); break;
2791  case 174: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2792  case 175: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 0); break;
2793 
2794  case 176: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2795  case 177: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 0); break;
2796  case 178: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2797  case 179: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 0); break;
2798  case 180: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2799  case 181: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 0); break;
2800  case 182: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2801  case 183: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 0); break;
2802  case 184: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2803  case 185: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 0); break;
2804  case 186: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2805  case 187: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 0); break;
2806  case 188: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2807  case 189: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 0); break;
2808  case 190: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2809  case 191: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 0); break;
2810 
2811  case 192: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2812  case 193: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 0); break;
2813  case 194: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2814  case 195: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 0); break;
2815  case 196: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2816  case 197: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 0); break;
2817  case 198: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2818  case 199: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 0); break;
2819  case 200: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2820  case 201: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 0); break;
2821  case 202: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2822  case 203: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 0); break;
2823  case 204: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2824  case 205: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 0); break;
2825  case 206: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2826  case 207: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 0); break;
2827 
2828  case 208: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2829  case 209: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 0); break;
2830  case 210: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2831  case 211: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 0); break;
2832  case 212: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2833  case 213: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 0); break;
2834  case 214: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2835  case 215: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 0); break;
2836  case 216: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2837  case 217: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 0); break;
2838  case 218: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2839  case 219: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 0); break;
2840  case 220: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2841  case 221: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 0); break;
2842  case 222: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2843  case 223: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 0); break;
2844 
2845  case 224: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2846  case 225: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 0); break;
2847  case 226: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2848  case 227: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 0); break;
2849  case 228: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2850  case 229: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 0); break;
2851  case 230: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2852  case 231: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 0); break;
2853  case 232: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2854  case 233: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 0); break;
2855  case 234: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2856  case 235: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 0); break;
2857  case 236: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2858  case 237: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 0); break;
2859  case 238: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2860  case 239: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 0); break;
2861 
2862  case 240: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2863  case 241: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 0); break;
2864  case 242: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2865  case 243: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 0); break;
2866  case 244: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2867  case 245: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 0); break;
2868  case 246: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2869  case 247: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 0); break;
2870  case 248: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2871  case 249: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 0); break;
2872  case 250: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2873  case 251: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 0); break;
2874  case 252: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2875  case 253: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 0); break;
2876  case 254: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2877  case 255: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 0); break;
2878 
2879 #endif
2880  case 256: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2881  case 257: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 0); break;
2882  case 258: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2883  case 259: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 0); break;
2884  case 260: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2885  case 261: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 0); break;
2886  case 262: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2887  case 263: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 0); break;
2888  case 264: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2889  case 265: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 0); break;
2890  case 266: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2891  case 267: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 0); break;
2892  case 268: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2893  case 269: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 0); break;
2894  case 270: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2895  case 271: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 0); break;
2896 
2897 #if 0
2898  case 272: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2899  case 273: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 0); break;
2900  case 274: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2901  case 275: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 0); break;
2902  case 276: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2903  case 277: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 0); break;
2904  case 278: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2905  case 279: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 0); break;
2906  case 280: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2907  case 281: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 0); break;
2908  case 282: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2909  case 283: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 0); break;
2910  case 284: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2911  case 285: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 0); break;
2912  case 286: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2913  case 287: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 0); break;
2914 
2915  case 288: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2916  case 289: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 0); break;
2917  case 290: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2918  case 291: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 0); break;
2919  case 292: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2920  case 293: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 0); break;
2921  case 294: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2922  case 295: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 0); break;
2923  case 296: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2924  case 297: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 0); break;
2925  case 298: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2926  case 299: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 0); break;
2927  case 300: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2928  case 301: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 0); break;
2929  case 302: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2930  case 303: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 0); break;
2931 
2932 #endif
2933  case 304: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2934  case 305: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 0); break;
2935  case 306: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2936  case 307: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 0); break;
2937  case 308: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2938  case 309: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 0); break;
2939  case 310: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2940  case 311: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 0); break;
2941  case 312: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2942  case 313: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 0); break;
2943  case 314: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2944  case 315: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 0); break;
2945  case 316: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2946  case 317: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 0); break;
2947  case 318: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2948  case 319: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 0); break;
2949 
2950 #if 0
2951  case 320: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2952  case 321: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 0); break;
2953  case 322: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2954  case 323: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 0); break;
2955  case 324: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2956  case 325: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 0); break;
2957  case 326: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2958  case 327: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 0); break;
2959  case 328: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2960  case 329: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 0); break;
2961  case 330: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2962  case 331: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 0); break;
2963  case 332: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2964  case 333: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 0); break;
2965  case 334: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2966  case 335: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 0); break;
2967 
2968 #endif
2969  case 336: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2970  case 337: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 0); break;
2971  case 338: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2972  case 339: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 0); break;
2973  case 340: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2974  case 341: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 0); break;
2975  case 342: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2976  case 343: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 0); break;
2977  case 344: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2978  case 345: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 0); break;
2979  case 346: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2980  case 347: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 0); break;
2981  case 348: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2982  case 349: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 0); break;
2983  case 350: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2984  case 351: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 0); break;
2985 
2986 #if 0
2987  case 352: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2988  case 353: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 0); break;
2989  case 354: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2990  case 355: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 0); break;
2991  case 356: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2992  case 357: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 0); break;
2993  case 358: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2994  case 359: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 0); break;
2995  case 360: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2996  case 361: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 0); break;
2997  case 362: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2998  case 363: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 0); break;
2999  case 364: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
3000  case 365: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 0); break;
3001  case 366: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
3002  case 367: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 0); break;
3003 
3004  case 368: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
3005  case 369: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 0); break;
3006  case 370: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
3007  case 371: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 0); break;
3008  case 372: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
3009  case 373: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 0); break;
3010  case 374: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
3011  case 375: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 0); break;
3012  case 376: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
3013  case 377: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 0); break;
3014  case 378: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
3015  case 379: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 0); break;
3016  case 380: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
3017  case 381: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 0); break;
3018  case 382: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
3019  case 383: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 0); break;
3020 
3021 #endif
3022  case 384: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
3023  case 385: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 0); break;
3024  case 386: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
3025  case 387: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 0); break;
3026  case 388: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
3027  case 389: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 0); break;
3028  case 390: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
3029  case 391: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 0); break;
3030  case 392: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
3031  case 393: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 0); break;
3032  case 394: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
3033  case 395: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 0); break;
3034  case 396: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
3035  case 397: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 0); break;
3036  case 398: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
3037  case 399: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 0); break;
3038 
3039 #if 0
3040  case 400: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
3041  case 401: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 0); break;
3042  case 402: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
3043  case 403: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 0); break;
3044  case 404: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
3045  case 405: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 0); break;
3046  case 406: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
3047  case 407: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 0); break;
3048  case 408: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
3049  case 409: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 0); break;
3050  case 410: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
3051  case 411: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 0); break;
3052  case 412: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
3053  case 413: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 0); break;
3054  case 414: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
3055  case 415: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 0); break;
3056 
3057  case 416: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
3058  case 417: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 0); break;
3059  case 418: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
3060  case 419: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 0); break;
3061  case 420: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
3062  case 421: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 0); break;
3063  case 422: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
3064  case 423: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 0); break;
3065  case 424: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
3066  case 425: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 0); break;
3067  case 426: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
3068  case 427: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 0); break;
3069  case 428: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
3070  case 429: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 0); break;
3071  case 430: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
3072  case 431: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 0); break;
3073 
3074 #endif
3075  case 432: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
3076  case 433: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 0); break;
3077  case 434: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
3078  case 435: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 0); break;
3079  case 436: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
3080  case 437: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 0); break;
3081  case 438: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
3082  case 439: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 0); break;
3083  case 440: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
3084  case 441: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 0); break;
3085  case 442: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
3086  case 443: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 0); break;
3087  case 444: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
3088  case 445: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 0); break;
3089  case 446: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
3090  case 447: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 0); break;
3091 
3092 #if 0
3093  case 448: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
3094  case 449: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 0); break;
3095  case 450: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
3096  case 451: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 0); break;
3097  case 452: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
3098  case 453: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 0); break;
3099  case 454: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
3100  case 455: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 0); break;
3101  case 456: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
3102  case 457: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 0); break;
3103  case 458: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
3104  case 459: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 0); break;
3105  case 460: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
3106  case 461: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 0); break;
3107  case 462: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
3108  case 463: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 0); break;
3109 
3110 #endif
3111  case 464: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
3112  case 465: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 0); break;
3113  case 466: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
3114  case 467: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 0); break;
3115  case 468: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
3116  case 469: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 0); break;
3117  case 470: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
3118  case 471: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 0); break;
3119  case 472: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
3120  case 473: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 0); break;
3121  case 474: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
3122  case 475: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 0); break;
3123  case 476: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
3124  case 477: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 0); break;
3125  case 478: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
3126  case 479: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 0); break;
3127 
3128 #if 0
3129  case 480: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
3130  case 481: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 0); break;
3131  case 482: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
3132  case 483: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 0); break;
3133  case 484: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
3134  case 485: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 0); break;
3135  case 486: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
3136  case 487: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 0); break;
3137  case 488: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
3138  case 489: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 0); break;
3139  case 490: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
3140  case 491: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 0); break;
3141  case 492: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
3142  case 493: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 0); break;
3143  case 494: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
3144  case 495: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 0); break;
3145 
3146  case 496: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
3147  case 497: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 0); break;
3148  case 498: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
3149  case 499: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 0); break;
3150  case 500: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
3151  case 501: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 0); break;
3152  case 502: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
3153  case 503: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 0); break;
3154  case 504: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
3155  case 505: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 0); break;
3156  case 506: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
3157  case 507: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 0); break;
3158  case 508: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
3159  case 509: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 0); break;
3160  case 510: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
3161  case 511: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 0); break;
3162 #endif
3163  /*
3164  * Haochuan: the calls starting from 512 to 1023 were generated by the following python script
3165  * #!/usr/bin/env python3
3166  * def gen_call(option: int):
3167  * doEnergy = option & 1
3168  * doVirial = (option >> 1) & 1
3169  * doSlow = (option >> 2) & 1
3170  * doPairlist = (option >> 3) & 1
3171  * doAlch = (option >> 4) & 1
3172  * doFEP = (option >> 5) & 1
3173  * doTI = (option >> 6) & 1
3174  * doStreaming = (option >> 7) & 1
3175  * doTable = (option >> 8) & 1
3176  * doAlchVdwForceSwitching = (option >> 9) & 1
3177  * incompatible = False
3178  * incompatible = incompatible | (doFEP and doTI)
3179  * incompatible = incompatible | (doAlch and ((not doFEP) and (not doTI)))
3180  * incompatible = incompatible | ((not doAlch) and (doFEP or doTI or doAlchVdwForceSwitching))
3181  * incompatible = incompatible | ((not doTable) and (doAlch or doTI or doFEP or doAlchVdwForceSwitching))
3182  * if incompatible:
3183  * pass
3184  * print(f' // case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3185  * else:
3186  * print(f' case {option}: CALL({doEnergy}, {doVirial}, {doSlow}, {doPairlist}, {doAlch}, {doFEP}, {doTI}, {doStreaming}, {doTable}, {doAlchVdwForceSwitching}); break;')
3187  * return
3188  *
3189  * for i in range(512, 1024):
3190  * gen_call(i)
3191  *
3192  */
3193  // case 512: CALL(0, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3194  // case 513: CALL(1, 0, 0, 0, 0, 0, 0, 0, 0, 1); break;
3195  // case 514: CALL(0, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3196  // case 515: CALL(1, 1, 0, 0, 0, 0, 0, 0, 0, 1); break;
3197  // case 516: CALL(0, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3198  // case 517: CALL(1, 0, 1, 0, 0, 0, 0, 0, 0, 1); break;
3199  // case 518: CALL(0, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3200  // case 519: CALL(1, 1, 1, 0, 0, 0, 0, 0, 0, 1); break;
3201  // case 520: CALL(0, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3202  // case 521: CALL(1, 0, 0, 1, 0, 0, 0, 0, 0, 1); break;
3203  // case 522: CALL(0, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3204  // case 523: CALL(1, 1, 0, 1, 0, 0, 0, 0, 0, 1); break;
3205  // case 524: CALL(0, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3206  // case 525: CALL(1, 0, 1, 1, 0, 0, 0, 0, 0, 1); break;
3207  // case 526: CALL(0, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3208  // case 527: CALL(1, 1, 1, 1, 0, 0, 0, 0, 0, 1); break;
3209  // case 528: CALL(0, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3210  // case 529: CALL(1, 0, 0, 0, 1, 0, 0, 0, 0, 1); break;
3211  // case 530: CALL(0, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3212  // case 531: CALL(1, 1, 0, 0, 1, 0, 0, 0, 0, 1); break;
3213  // case 532: CALL(0, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3214  // case 533: CALL(1, 0, 1, 0, 1, 0, 0, 0, 0, 1); break;
3215  // case 534: CALL(0, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3216  // case 535: CALL(1, 1, 1, 0, 1, 0, 0, 0, 0, 1); break;
3217  // case 536: CALL(0, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3218  // case 537: CALL(1, 0, 0, 1, 1, 0, 0, 0, 0, 1); break;
3219  // case 538: CALL(0, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3220  // case 539: CALL(1, 1, 0, 1, 1, 0, 0, 0, 0, 1); break;
3221  // case 540: CALL(0, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3222  // case 541: CALL(1, 0, 1, 1, 1, 0, 0, 0, 0, 1); break;
3223  // case 542: CALL(0, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3224  // case 543: CALL(1, 1, 1, 1, 1, 0, 0, 0, 0, 1); break;
3225  // case 544: CALL(0, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3226  // case 545: CALL(1, 0, 0, 0, 0, 1, 0, 0, 0, 1); break;
3227  // case 546: CALL(0, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3228  // case 547: CALL(1, 1, 0, 0, 0, 1, 0, 0, 0, 1); break;
3229  // case 548: CALL(0, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3230  // case 549: CALL(1, 0, 1, 0, 0, 1, 0, 0, 0, 1); break;
3231  // case 550: CALL(0, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3232  // case 551: CALL(1, 1, 1, 0, 0, 1, 0, 0, 0, 1); break;
3233  // case 552: CALL(0, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3234  // case 553: CALL(1, 0, 0, 1, 0, 1, 0, 0, 0, 1); break;
3235  // case 554: CALL(0, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3236  // case 555: CALL(1, 1, 0, 1, 0, 1, 0, 0, 0, 1); break;
3237  // case 556: CALL(0, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3238  // case 557: CALL(1, 0, 1, 1, 0, 1, 0, 0, 0, 1); break;
3239  // case 558: CALL(0, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3240  // case 559: CALL(1, 1, 1, 1, 0, 1, 0, 0, 0, 1); break;
3241  // case 560: CALL(0, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3242  // case 561: CALL(1, 0, 0, 0, 1, 1, 0, 0, 0, 1); break;
3243  // case 562: CALL(0, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3244  // case 563: CALL(1, 1, 0, 0, 1, 1, 0, 0, 0, 1); break;
3245  // case 564: CALL(0, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3246  // case 565: CALL(1, 0, 1, 0, 1, 1, 0, 0, 0, 1); break;
3247  // case 566: CALL(0, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3248  // case 567: CALL(1, 1, 1, 0, 1, 1, 0, 0, 0, 1); break;
3249  // case 568: CALL(0, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3250  // case 569: CALL(1, 0, 0, 1, 1, 1, 0, 0, 0, 1); break;
3251  // case 570: CALL(0, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3252  // case 571: CALL(1, 1, 0, 1, 1, 1, 0, 0, 0, 1); break;
3253  // case 572: CALL(0, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3254  // case 573: CALL(1, 0, 1, 1, 1, 1, 0, 0, 0, 1); break;
3255  // case 574: CALL(0, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3256  // case 575: CALL(1, 1, 1, 1, 1, 1, 0, 0, 0, 1); break;
3257  // case 576: CALL(0, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3258  // case 577: CALL(1, 0, 0, 0, 0, 0, 1, 0, 0, 1); break;
3259  // case 578: CALL(0, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3260  // case 579: CALL(1, 1, 0, 0, 0, 0, 1, 0, 0, 1); break;
3261  // case 580: CALL(0, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3262  // case 581: CALL(1, 0, 1, 0, 0, 0, 1, 0, 0, 1); break;
3263  // case 582: CALL(0, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3264  // case 583: CALL(1, 1, 1, 0, 0, 0, 1, 0, 0, 1); break;
3265  // case 584: CALL(0, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3266  // case 585: CALL(1, 0, 0, 1, 0, 0, 1, 0, 0, 1); break;
3267  // case 586: CALL(0, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3268  // case 587: CALL(1, 1, 0, 1, 0, 0, 1, 0, 0, 1); break;
3269  // case 588: CALL(0, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3270  // case 589: CALL(1, 0, 1, 1, 0, 0, 1, 0, 0, 1); break;
3271  // case 590: CALL(0, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3272  // case 591: CALL(1, 1, 1, 1, 0, 0, 1, 0, 0, 1); break;
3273  // case 592: CALL(0, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3274  // case 593: CALL(1, 0, 0, 0, 1, 0, 1, 0, 0, 1); break;
3275  // case 594: CALL(0, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3276  // case 595: CALL(1, 1, 0, 0, 1, 0, 1, 0, 0, 1); break;
3277  // case 596: CALL(0, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3278  // case 597: CALL(1, 0, 1, 0, 1, 0, 1, 0, 0, 1); break;
3279  // case 598: CALL(0, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3280  // case 599: CALL(1, 1, 1, 0, 1, 0, 1, 0, 0, 1); break;
3281  // case 600: CALL(0, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3282  // case 601: CALL(1, 0, 0, 1, 1, 0, 1, 0, 0, 1); break;
3283  // case 602: CALL(0, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3284  // case 603: CALL(1, 1, 0, 1, 1, 0, 1, 0, 0, 1); break;
3285  // case 604: CALL(0, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3286  // case 605: CALL(1, 0, 1, 1, 1, 0, 1, 0, 0, 1); break;
3287  // case 606: CALL(0, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3288  // case 607: CALL(1, 1, 1, 1, 1, 0, 1, 0, 0, 1); break;
3289  // case 608: CALL(0, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3290  // case 609: CALL(1, 0, 0, 0, 0, 1, 1, 0, 0, 1); break;
3291  // case 610: CALL(0, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3292  // case 611: CALL(1, 1, 0, 0, 0, 1, 1, 0, 0, 1); break;
3293  // case 612: CALL(0, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3294  // case 613: CALL(1, 0, 1, 0, 0, 1, 1, 0, 0, 1); break;
3295  // case 614: CALL(0, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3296  // case 615: CALL(1, 1, 1, 0, 0, 1, 1, 0, 0, 1); break;
3297  // case 616: CALL(0, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3298  // case 617: CALL(1, 0, 0, 1, 0, 1, 1, 0, 0, 1); break;
3299  // case 618: CALL(0, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3300  // case 619: CALL(1, 1, 0, 1, 0, 1, 1, 0, 0, 1); break;
3301  // case 620: CALL(0, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3302  // case 621: CALL(1, 0, 1, 1, 0, 1, 1, 0, 0, 1); break;
3303  // case 622: CALL(0, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3304  // case 623: CALL(1, 1, 1, 1, 0, 1, 1, 0, 0, 1); break;
3305  // case 624: CALL(0, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3306  // case 625: CALL(1, 0, 0, 0, 1, 1, 1, 0, 0, 1); break;
3307  // case 626: CALL(0, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3308  // case 627: CALL(1, 1, 0, 0, 1, 1, 1, 0, 0, 1); break;
3309  // case 628: CALL(0, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3310  // case 629: CALL(1, 0, 1, 0, 1, 1, 1, 0, 0, 1); break;
3311  // case 630: CALL(0, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3312  // case 631: CALL(1, 1, 1, 0, 1, 1, 1, 0, 0, 1); break;
3313  // case 632: CALL(0, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3314  // case 633: CALL(1, 0, 0, 1, 1, 1, 1, 0, 0, 1); break;
3315  // case 634: CALL(0, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3316  // case 635: CALL(1, 1, 0, 1, 1, 1, 1, 0, 0, 1); break;
3317  // case 636: CALL(0, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3318  // case 637: CALL(1, 0, 1, 1, 1, 1, 1, 0, 0, 1); break;
3319  // case 638: CALL(0, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3320  // case 639: CALL(1, 1, 1, 1, 1, 1, 1, 0, 0, 1); break;
3321  // case 640: CALL(0, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3322  // case 641: CALL(1, 0, 0, 0, 0, 0, 0, 1, 0, 1); break;
3323  // case 642: CALL(0, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3324  // case 643: CALL(1, 1, 0, 0, 0, 0, 0, 1, 0, 1); break;
3325  // case 644: CALL(0, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3326  // case 645: CALL(1, 0, 1, 0, 0, 0, 0, 1, 0, 1); break;
3327  // case 646: CALL(0, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3328  // case 647: CALL(1, 1, 1, 0, 0, 0, 0, 1, 0, 1); break;
3329  // case 648: CALL(0, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3330  // case 649: CALL(1, 0, 0, 1, 0, 0, 0, 1, 0, 1); break;
3331  // case 650: CALL(0, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3332  // case 651: CALL(1, 1, 0, 1, 0, 0, 0, 1, 0, 1); break;
3333  // case 652: CALL(0, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3334  // case 653: CALL(1, 0, 1, 1, 0, 0, 0, 1, 0, 1); break;
3335  // case 654: CALL(0, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3336  // case 655: CALL(1, 1, 1, 1, 0, 0, 0, 1, 0, 1); break;
3337  // case 656: CALL(0, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3338  // case 657: CALL(1, 0, 0, 0, 1, 0, 0, 1, 0, 1); break;
3339  // case 658: CALL(0, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3340  // case 659: CALL(1, 1, 0, 0, 1, 0, 0, 1, 0, 1); break;
3341  // case 660: CALL(0, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3342  // case 661: CALL(1, 0, 1, 0, 1, 0, 0, 1, 0, 1); break;
3343  // case 662: CALL(0, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3344  // case 663: CALL(1, 1, 1, 0, 1, 0, 0, 1, 0, 1); break;
3345  // case 664: CALL(0, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3346  // case 665: CALL(1, 0, 0, 1, 1, 0, 0, 1, 0, 1); break;
3347  // case 666: CALL(0, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3348  // case 667: CALL(1, 1, 0, 1, 1, 0, 0, 1, 0, 1); break;
3349  // case 668: CALL(0, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3350  // case 669: CALL(1, 0, 1, 1, 1, 0, 0, 1, 0, 1); break;
3351  // case 670: CALL(0, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3352  // case 671: CALL(1, 1, 1, 1, 1, 0, 0, 1, 0, 1); break;
3353  // case 672: CALL(0, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3354  // case 673: CALL(1, 0, 0, 0, 0, 1, 0, 1, 0, 1); break;
3355  // case 674: CALL(0, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3356  // case 675: CALL(1, 1, 0, 0, 0, 1, 0, 1, 0, 1); break;
3357  // case 676: CALL(0, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3358  // case 677: CALL(1, 0, 1, 0, 0, 1, 0, 1, 0, 1); break;
3359  // case 678: CALL(0, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3360  // case 679: CALL(1, 1, 1, 0, 0, 1, 0, 1, 0, 1); break;
3361  // case 680: CALL(0, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3362  // case 681: CALL(1, 0, 0, 1, 0, 1, 0, 1, 0, 1); break;
3363  // case 682: CALL(0, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3364  // case 683: CALL(1, 1, 0, 1, 0, 1, 0, 1, 0, 1); break;
3365  // case 684: CALL(0, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3366  // case 685: CALL(1, 0, 1, 1, 0, 1, 0, 1, 0, 1); break;
3367  // case 686: CALL(0, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3368  // case 687: CALL(1, 1, 1, 1, 0, 1, 0, 1, 0, 1); break;
3369  // case 688: CALL(0, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3370  // case 689: CALL(1, 0, 0, 0, 1, 1, 0, 1, 0, 1); break;
3371  // case 690: CALL(0, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3372  // case 691: CALL(1, 1, 0, 0, 1, 1, 0, 1, 0, 1); break;
3373  // case 692: CALL(0, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3374  // case 693: CALL(1, 0, 1, 0, 1, 1, 0, 1, 0, 1); break;
3375  // case 694: CALL(0, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3376  // case 695: CALL(1, 1, 1, 0, 1, 1, 0, 1, 0, 1); break;
3377  // case 696: CALL(0, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3378  // case 697: CALL(1, 0, 0, 1, 1, 1, 0, 1, 0, 1); break;
3379  // case 698: CALL(0, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3380  // case 699: CALL(1, 1, 0, 1, 1, 1, 0, 1, 0, 1); break;
3381  // case 700: CALL(0, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3382  // case 701: CALL(1, 0, 1, 1, 1, 1, 0, 1, 0, 1); break;
3383  // case 702: CALL(0, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3384  // case 703: CALL(1, 1, 1, 1, 1, 1, 0, 1, 0, 1); break;
3385  // case 704: CALL(0, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3386  // case 705: CALL(1, 0, 0, 0, 0, 0, 1, 1, 0, 1); break;
3387  // case 706: CALL(0, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3388  // case 707: CALL(1, 1, 0, 0, 0, 0, 1, 1, 0, 1); break;
3389  // case 708: CALL(0, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3390  // case 709: CALL(1, 0, 1, 0, 0, 0, 1, 1, 0, 1); break;
3391  // case 710: CALL(0, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3392  // case 711: CALL(1, 1, 1, 0, 0, 0, 1, 1, 0, 1); break;
3393  // case 712: CALL(0, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3394  // case 713: CALL(1, 0, 0, 1, 0, 0, 1, 1, 0, 1); break;
3395  // case 714: CALL(0, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3396  // case 715: CALL(1, 1, 0, 1, 0, 0, 1, 1, 0, 1); break;
3397  // case 716: CALL(0, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3398  // case 717: CALL(1, 0, 1, 1, 0, 0, 1, 1, 0, 1); break;
3399  // case 718: CALL(0, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3400  // case 719: CALL(1, 1, 1, 1, 0, 0, 1, 1, 0, 1); break;
3401  // case 720: CALL(0, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3402  // case 721: CALL(1, 0, 0, 0, 1, 0, 1, 1, 0, 1); break;
3403  // case 722: CALL(0, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3404  // case 723: CALL(1, 1, 0, 0, 1, 0, 1, 1, 0, 1); break;
3405  // case 724: CALL(0, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3406  // case 725: CALL(1, 0, 1, 0, 1, 0, 1, 1, 0, 1); break;
3407  // case 726: CALL(0, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3408  // case 727: CALL(1, 1, 1, 0, 1, 0, 1, 1, 0, 1); break;
3409  // case 728: CALL(0, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3410  // case 729: CALL(1, 0, 0, 1, 1, 0, 1, 1, 0, 1); break;
3411  // case 730: CALL(0, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3412  // case 731: CALL(1, 1, 0, 1, 1, 0, 1, 1, 0, 1); break;
3413  // case 732: CALL(0, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3414  // case 733: CALL(1, 0, 1, 1, 1, 0, 1, 1, 0, 1); break;
3415  // case 734: CALL(0, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3416  // case 735: CALL(1, 1, 1, 1, 1, 0, 1, 1, 0, 1); break;
3417  // case 736: CALL(0, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3418  // case 737: CALL(1, 0, 0, 0, 0, 1, 1, 1, 0, 1); break;
3419  // case 738: CALL(0, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3420  // case 739: CALL(1, 1, 0, 0, 0, 1, 1, 1, 0, 1); break;
3421  // case 740: CALL(0, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3422  // case 741: CALL(1, 0, 1, 0, 0, 1, 1, 1, 0, 1); break;
3423  // case 742: CALL(0, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3424  // case 743: CALL(1, 1, 1, 0, 0, 1, 1, 1, 0, 1); break;
3425  // case 744: CALL(0, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3426  // case 745: CALL(1, 0, 0, 1, 0, 1, 1, 1, 0, 1); break;
3427  // case 746: CALL(0, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3428  // case 747: CALL(1, 1, 0, 1, 0, 1, 1, 1, 0, 1); break;
3429  // case 748: CALL(0, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3430  // case 749: CALL(1, 0, 1, 1, 0, 1, 1, 1, 0, 1); break;
3431  // case 750: CALL(0, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3432  // case 751: CALL(1, 1, 1, 1, 0, 1, 1, 1, 0, 1); break;
3433  // case 752: CALL(0, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3434  // case 753: CALL(1, 0, 0, 0, 1, 1, 1, 1, 0, 1); break;
3435  // case 754: CALL(0, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3436  // case 755: CALL(1, 1, 0, 0, 1, 1, 1, 1, 0, 1); break;
3437  // case 756: CALL(0, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3438  // case 757: CALL(1, 0, 1, 0, 1, 1, 1, 1, 0, 1); break;
3439  // case 758: CALL(0, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3440  // case 759: CALL(1, 1, 1, 0, 1, 1, 1, 1, 0, 1); break;
3441  // case 760: CALL(0, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3442  // case 761: CALL(1, 0, 0, 1, 1, 1, 1, 1, 0, 1); break;
3443  // case 762: CALL(0, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3444  // case 763: CALL(1, 1, 0, 1, 1, 1, 1, 1, 0, 1); break;
3445  // case 764: CALL(0, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3446  // case 765: CALL(1, 0, 1, 1, 1, 1, 1, 1, 0, 1); break;
3447  // case 766: CALL(0, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3448  // case 767: CALL(1, 1, 1, 1, 1, 1, 1, 1, 0, 1); break;
3449  // case 768: CALL(0, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3450  // case 769: CALL(1, 0, 0, 0, 0, 0, 0, 0, 1, 1); break;
3451  // case 770: CALL(0, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3452  // case 771: CALL(1, 1, 0, 0, 0, 0, 0, 0, 1, 1); break;
3453  // case 772: CALL(0, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3454  // case 773: CALL(1, 0, 1, 0, 0, 0, 0, 0, 1, 1); break;
3455  // case 774: CALL(0, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3456  // case 775: CALL(1, 1, 1, 0, 0, 0, 0, 0, 1, 1); break;
3457  // case 776: CALL(0, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3458  // case 777: CALL(1, 0, 0, 1, 0, 0, 0, 0, 1, 1); break;
3459  // case 778: CALL(0, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3460  // case 779: CALL(1, 1, 0, 1, 0, 0, 0, 0, 1, 1); break;
3461  // case 780: CALL(0, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3462  // case 781: CALL(1, 0, 1, 1, 0, 0, 0, 0, 1, 1); break;
3463  // case 782: CALL(0, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3464  // case 783: CALL(1, 1, 1, 1, 0, 0, 0, 0, 1, 1); break;
3465  // case 784: CALL(0, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3466  // case 785: CALL(1, 0, 0, 0, 1, 0, 0, 0, 1, 1); break;
3467  // case 786: CALL(0, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3468  // case 787: CALL(1, 1, 0, 0, 1, 0, 0, 0, 1, 1); break;
3469  // case 788: CALL(0, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3470  // case 789: CALL(1, 0, 1, 0, 1, 0, 0, 0, 1, 1); break;
3471  // case 790: CALL(0, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3472  // case 791: CALL(1, 1, 1, 0, 1, 0, 0, 0, 1, 1); break;
3473  // case 792: CALL(0, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3474  // case 793: CALL(1, 0, 0, 1, 1, 0, 0, 0, 1, 1); break;
3475  // case 794: CALL(0, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3476  // case 795: CALL(1, 1, 0, 1, 1, 0, 0, 0, 1, 1); break;
3477  // case 796: CALL(0, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3478  // case 797: CALL(1, 0, 1, 1, 1, 0, 0, 0, 1, 1); break;
3479  // case 798: CALL(0, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3480  // case 799: CALL(1, 1, 1, 1, 1, 0, 0, 0, 1, 1); break;
3481  // case 800: CALL(0, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3482  // case 801: CALL(1, 0, 0, 0, 0, 1, 0, 0, 1, 1); break;
3483  // case 802: CALL(0, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3484  // case 803: CALL(1, 1, 0, 0, 0, 1, 0, 0, 1, 1); break;
3485  // case 804: CALL(0, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3486  // case 805: CALL(1, 0, 1, 0, 0, 1, 0, 0, 1, 1); break;
3487  // case 806: CALL(0, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3488  // case 807: CALL(1, 1, 1, 0, 0, 1, 0, 0, 1, 1); break;
3489  // case 808: CALL(0, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3490  // case 809: CALL(1, 0, 0, 1, 0, 1, 0, 0, 1, 1); break;
3491  // case 810: CALL(0, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3492  // case 811: CALL(1, 1, 0, 1, 0, 1, 0, 0, 1, 1); break;
3493  // case 812: CALL(0, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3494  // case 813: CALL(1, 0, 1, 1, 0, 1, 0, 0, 1, 1); break;
3495  // case 814: CALL(0, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3496  // case 815: CALL(1, 1, 1, 1, 0, 1, 0, 0, 1, 1); break;
3497  case 816: CALL(0, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3498  case 817: CALL(1, 0, 0, 0, 1, 1, 0, 0, 1, 1); break;
3499  case 818: CALL(0, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3500  case 819: CALL(1, 1, 0, 0, 1, 1, 0, 0, 1, 1); break;
3501  case 820: CALL(0, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3502  case 821: CALL(1, 0, 1, 0, 1, 1, 0, 0, 1, 1); break;
3503  case 822: CALL(0, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3504  case 823: CALL(1, 1, 1, 0, 1, 1, 0, 0, 1, 1); break;
3505  case 824: CALL(0, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3506  case 825: CALL(1, 0, 0, 1, 1, 1, 0, 0, 1, 1); break;
3507  case 826: CALL(0, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3508  case 827: CALL(1, 1, 0, 1, 1, 1, 0, 0, 1, 1); break;
3509  case 828: CALL(0, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3510  case 829: CALL(1, 0, 1, 1, 1, 1, 0, 0, 1, 1); break;
3511  case 830: CALL(0, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3512  case 831: CALL(1, 1, 1, 1, 1, 1, 0, 0, 1, 1); break;
3513  // case 832: CALL(0, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3514  // case 833: CALL(1, 0, 0, 0, 0, 0, 1, 0, 1, 1); break;
3515  // case 834: CALL(0, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3516  // case 835: CALL(1, 1, 0, 0, 0, 0, 1, 0, 1, 1); break;
3517  // case 836: CALL(0, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3518  // case 837: CALL(1, 0, 1, 0, 0, 0, 1, 0, 1, 1); break;
3519  // case 838: CALL(0, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3520  // case 839: CALL(1, 1, 1, 0, 0, 0, 1, 0, 1, 1); break;
3521  // case 840: CALL(0, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3522  // case 841: CALL(1, 0, 0, 1, 0, 0, 1, 0, 1, 1); break;
3523  // case 842: CALL(0, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3524  // case 843: CALL(1, 1, 0, 1, 0, 0, 1, 0, 1, 1); break;
3525  // case 844: CALL(0, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3526  // case 845: CALL(1, 0, 1, 1, 0, 0, 1, 0, 1, 1); break;
3527  // case 846: CALL(0, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3528  // case 847: CALL(1, 1, 1, 1, 0, 0, 1, 0, 1, 1); break;
3529  case 848: CALL(0, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3530  case 849: CALL(1, 0, 0, 0, 1, 0, 1, 0, 1, 1); break;
3531  case 850: CALL(0, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3532  case 851: CALL(1, 1, 0, 0, 1, 0, 1, 0, 1, 1); break;
3533  case 852: CALL(0, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3534  case 853: CALL(1, 0, 1, 0, 1, 0, 1, 0, 1, 1); break;
3535  case 854: CALL(0, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3536  case 855: CALL(1, 1, 1, 0, 1, 0, 1, 0, 1, 1); break;
3537  case 856: CALL(0, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3538  case 857: CALL(1, 0, 0, 1, 1, 0, 1, 0, 1, 1); break;
3539  case 858: CALL(0, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3540  case 859: CALL(1, 1, 0, 1, 1, 0, 1, 0, 1, 1); break;
3541  case 860: CALL(0, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3542  case 861: CALL(1, 0, 1, 1, 1, 0, 1, 0, 1, 1); break;
3543  case 862: CALL(0, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3544  case 863: CALL(1, 1, 1, 1, 1, 0, 1, 0, 1, 1); break;
3545  // case 864: CALL(0, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3546  // case 865: CALL(1, 0, 0, 0, 0, 1, 1, 0, 1, 1); break;
3547  // case 866: CALL(0, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3548  // case 867: CALL(1, 1, 0, 0, 0, 1, 1, 0, 1, 1); break;
3549  // case 868: CALL(0, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3550  // case 869: CALL(1, 0, 1, 0, 0, 1, 1, 0, 1, 1); break;
3551  // case 870: CALL(0, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3552  // case 871: CALL(1, 1, 1, 0, 0, 1, 1, 0, 1, 1); break;
3553  // case 872: CALL(0, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3554  // case 873: CALL(1, 0, 0, 1, 0, 1, 1, 0, 1, 1); break;
3555  // case 874: CALL(0, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3556  // case 875: CALL(1, 1, 0, 1, 0, 1, 1, 0, 1, 1); break;
3557  // case 876: CALL(0, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3558  // case 877: CALL(1, 0, 1, 1, 0, 1, 1, 0, 1, 1); break;
3559  // case 878: CALL(0, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3560  // case 879: CALL(1, 1, 1, 1, 0, 1, 1, 0, 1, 1); break;
3561  // case 880: CALL(0, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3562  // case 881: CALL(1, 0, 0, 0, 1, 1, 1, 0, 1, 1); break;
3563  // case 882: CALL(0, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3564  // case 883: CALL(1, 1, 0, 0, 1, 1, 1, 0, 1, 1); break;
3565  // case 884: CALL(0, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3566  // case 885: CALL(1, 0, 1, 0, 1, 1, 1, 0, 1, 1); break;
3567  // case 886: CALL(0, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3568  // case 887: CALL(1, 1, 1, 0, 1, 1, 1, 0, 1, 1); break;
3569  // case 888: CALL(0, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3570  // case 889: CALL(1, 0, 0, 1, 1, 1, 1, 0, 1, 1); break;
3571  // case 890: CALL(0, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3572  // case 891: CALL(1, 1, 0, 1, 1, 1, 1, 0, 1, 1); break;
3573  // case 892: CALL(0, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3574  // case 893: CALL(1, 0, 1, 1, 1, 1, 1, 0, 1, 1); break;
3575  // case 894: CALL(0, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3576  // case 895: CALL(1, 1, 1, 1, 1, 1, 1, 0, 1, 1); break;
3577  // case 896: CALL(0, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3578  // case 897: CALL(1, 0, 0, 0, 0, 0, 0, 1, 1, 1); break;
3579  // case 898: CALL(0, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3580  // case 899: CALL(1, 1, 0, 0, 0, 0, 0, 1, 1, 1); break;
3581  // case 900: CALL(0, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3582  // case 901: CALL(1, 0, 1, 0, 0, 0, 0, 1, 1, 1); break;
3583  // case 902: CALL(0, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3584  // case 903: CALL(1, 1, 1, 0, 0, 0, 0, 1, 1, 1); break;
3585  // case 904: CALL(0, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3586  // case 905: CALL(1, 0, 0, 1, 0, 0, 0, 1, 1, 1); break;
3587  // case 906: CALL(0, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3588  // case 907: CALL(1, 1, 0, 1, 0, 0, 0, 1, 1, 1); break;
3589  // case 908: CALL(0, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3590  // case 909: CALL(1, 0, 1, 1, 0, 0, 0, 1, 1, 1); break;
3591  // case 910: CALL(0, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3592  // case 911: CALL(1, 1, 1, 1, 0, 0, 0, 1, 1, 1); break;
3593  // case 912: CALL(0, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3594  // case 913: CALL(1, 0, 0, 0, 1, 0, 0, 1, 1, 1); break;
3595  // case 914: CALL(0, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3596  // case 915: CALL(1, 1, 0, 0, 1, 0, 0, 1, 1, 1); break;
3597  // case 916: CALL(0, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3598  // case 917: CALL(1, 0, 1, 0, 1, 0, 0, 1, 1, 1); break;
3599  // case 918: CALL(0, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3600  // case 919: CALL(1, 1, 1, 0, 1, 0, 0, 1, 1, 1); break;
3601  // case 920: CALL(0, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3602  // case 921: CALL(1, 0, 0, 1, 1, 0, 0, 1, 1, 1); break;
3603  // case 922: CALL(0, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3604  // case 923: CALL(1, 1, 0, 1, 1, 0, 0, 1, 1, 1); break;
3605  // case 924: CALL(0, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3606  // case 925: CALL(1, 0, 1, 1, 1, 0, 0, 1, 1, 1); break;
3607  // case 926: CALL(0, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3608  // case 927: CALL(1, 1, 1, 1, 1, 0, 0, 1, 1, 1); break;
3609  // case 928: CALL(0, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3610  // case 929: CALL(1, 0, 0, 0, 0, 1, 0, 1, 1, 1); break;
3611  // case 930: CALL(0, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3612  // case 931: CALL(1, 1, 0, 0, 0, 1, 0, 1, 1, 1); break;
3613  // case 932: CALL(0, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3614  // case 933: CALL(1, 0, 1, 0, 0, 1, 0, 1, 1, 1); break;
3615  // case 934: CALL(0, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3616  // case 935: CALL(1, 1, 1, 0, 0, 1, 0, 1, 1, 1); break;
3617  // case 936: CALL(0, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3618  // case 937: CALL(1, 0, 0, 1, 0, 1, 0, 1, 1, 1); break;
3619  // case 938: CALL(0, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3620  // case 939: CALL(1, 1, 0, 1, 0, 1, 0, 1, 1, 1); break;
3621  // case 940: CALL(0, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3622  // case 941: CALL(1, 0, 1, 1, 0, 1, 0, 1, 1, 1); break;
3623  // case 942: CALL(0, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3624  // case 943: CALL(1, 1, 1, 1, 0, 1, 0, 1, 1, 1); break;
3625  case 944: CALL(0, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3626  case 945: CALL(1, 0, 0, 0, 1, 1, 0, 1, 1, 1); break;
3627  case 946: CALL(0, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3628  case 947: CALL(1, 1, 0, 0, 1, 1, 0, 1, 1, 1); break;
3629  case 948: CALL(0, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3630  case 949: CALL(1, 0, 1, 0, 1, 1, 0, 1, 1, 1); break;
3631  case 950: CALL(0, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3632  case 951: CALL(1, 1, 1, 0, 1, 1, 0, 1, 1, 1); break;
3633  case 952: CALL(0, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3634  case 953: CALL(1, 0, 0, 1, 1, 1, 0, 1, 1, 1); break;
3635  case 954: CALL(0, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3636  case 955: CALL(1, 1, 0, 1, 1, 1, 0, 1, 1, 1); break;
3637  case 956: CALL(0, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3638  case 957: CALL(1, 0, 1, 1, 1, 1, 0, 1, 1, 1); break;
3639  case 958: CALL(0, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3640  case 959: CALL(1, 1, 1, 1, 1, 1, 0, 1, 1, 1); break;
3641  // case 960: CALL(0, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3642  // case 961: CALL(1, 0, 0, 0, 0, 0, 1, 1, 1, 1); break;
3643  // case 962: CALL(0, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3644  // case 963: CALL(1, 1, 0, 0, 0, 0, 1, 1, 1, 1); break;
3645  // case 964: CALL(0, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3646  // case 965: CALL(1, 0, 1, 0, 0, 0, 1, 1, 1, 1); break;
3647  // case 966: CALL(0, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3648  // case 967: CALL(1, 1, 1, 0, 0, 0, 1, 1, 1, 1); break;
3649  // case 968: CALL(0, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3650  // case 969: CALL(1, 0, 0, 1, 0, 0, 1, 1, 1, 1); break;
3651  // case 970: CALL(0, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3652  // case 971: CALL(1, 1, 0, 1, 0, 0, 1, 1, 1, 1); break;
3653  // case 972: CALL(0, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3654  // case 973: CALL(1, 0, 1, 1, 0, 0, 1, 1, 1, 1); break;
3655  // case 974: CALL(0, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3656  // case 975: CALL(1, 1, 1, 1, 0, 0, 1, 1, 1, 1); break;
3657  case 976: CALL(0, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3658  case 977: CALL(1, 0, 0, 0, 1, 0, 1, 1, 1, 1); break;
3659  case 978: CALL(0, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3660  case 979: CALL(1, 1, 0, 0, 1, 0, 1, 1, 1, 1); break;
3661  case 980: CALL(0, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3662  case 981: CALL(1, 0, 1, 0, 1, 0, 1, 1, 1, 1); break;
3663  case 982: CALL(0, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3664  case 983: CALL(1, 1, 1, 0, 1, 0, 1, 1, 1, 1); break;
3665  case 984: CALL(0, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3666  case 985: CALL(1, 0, 0, 1, 1, 0, 1, 1, 1, 1); break;
3667  case 986: CALL(0, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3668  case 987: CALL(1, 1, 0, 1, 1, 0, 1, 1, 1, 1); break;
3669  case 988: CALL(0, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3670  case 989: CALL(1, 0, 1, 1, 1, 0, 1, 1, 1, 1); break;
3671  case 990: CALL(0, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3672  case 991: CALL(1, 1, 1, 1, 1, 0, 1, 1, 1, 1); break;
3673  // case 992: CALL(0, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3674  // case 993: CALL(1, 0, 0, 0, 0, 1, 1, 1, 1, 1); break;
3675  // case 994: CALL(0, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3676  // case 995: CALL(1, 1, 0, 0, 0, 1, 1, 1, 1, 1); break;
3677  // case 996: CALL(0, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3678  // case 997: CALL(1, 0, 1, 0, 0, 1, 1, 1, 1, 1); break;
3679  // case 998: CALL(0, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3680  // case 999: CALL(1, 1, 1, 0, 0, 1, 1, 1, 1, 1); break;
3681  // case 1000: CALL(0, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3682  // case 1001: CALL(1, 0, 0, 1, 0, 1, 1, 1, 1, 1); break;
3683  // case 1002: CALL(0, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3684  // case 1003: CALL(1, 1, 0, 1, 0, 1, 1, 1, 1, 1); break;
3685  // case 1004: CALL(0, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3686  // case 1005: CALL(1, 0, 1, 1, 0, 1, 1, 1, 1, 1); break;
3687  // case 1006: CALL(0, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3688  // case 1007: CALL(1, 1, 1, 1, 0, 1, 1, 1, 1, 1); break;
3689  // case 1008: CALL(0, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3690  // case 1009: CALL(1, 0, 0, 0, 1, 1, 1, 1, 1, 1); break;
3691  // case 1010: CALL(0, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3692  // case 1011: CALL(1, 1, 0, 0, 1, 1, 1, 1, 1, 1); break;
3693  // case 1012: CALL(0, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3694  // case 1013: CALL(1, 0, 1, 0, 1, 1, 1, 1, 1, 1); break;
3695  // case 1014: CALL(0, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3696  // case 1015: CALL(1, 1, 1, 0, 1, 1, 1, 1, 1, 1); break;
3697  // case 1016: CALL(0, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3698  // case 1017: CALL(1, 0, 0, 1, 1, 1, 1, 1, 1, 1); break;
3699  // case 1018: CALL(0, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3700  // case 1019: CALL(1, 1, 0, 1, 1, 1, 1, 1, 1, 1); break;
3701  // case 1020: CALL(0, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3702  // case 1021: CALL(1, 0, 1, 1, 1, 1, 1, 1, 1, 1); break;
3703  // case 1022: CALL(0, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3704  // case 1023: CALL(1, 1, 1, 1, 1, 1, 1, 1, 1, 1); break;
3705  default: {
3706 #ifdef DEBUG
3707  sprintf(errmsg,
3708  "doEnergy=%d doVirial=%d doSlow=%d doPairlist=%d "
3709  "doAlch=%d doFEP=%d doTI=%d doStreaming=%d doTable=%d"
3710  "\noptions = %d\n",
3711  doEnergy, doVirial, doSlow, doPairlist, doAlch, doFEP, doTI,
3712  doStreaming, doTable, options);
3713  NAMD_die(errmsg);
3714 #else
3715  std::string call_options;
3716  call_options += "doEnergy = " + std::to_string(int(doEnergy));
3717  call_options += ", doVirial = " + std::to_string(int(doVirial));
3718  call_options += ", doSlow = " + std::to_string(int(doSlow));
3719  call_options += ", doPairlist = " + std::to_string(int(doPairlist));
3720  call_options += ", doAlch = " + std::to_string(int(doAlch));
3721  call_options += ", doFEP = " + std::to_string(int(doFEP));
3722  call_options += ", doTI = " + std::to_string(int(doTI));
3723  call_options += ", doStreaming = " + std::to_string(int(doStreaming));
3724  // call_options += ", doTable = " + std::to_string(int(doTable));
3725  call_options += ", doAlchVdwForceSwitching = " + std::to_string(int(doAlchVdwForceSwitching));
3726  const std::string error = "CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called. Options are:\n" + call_options;
3727  NAMD_bug(error.c_str());
3728 #endif
3729  }
3730  }
3731 
3732 #endif
3733 
3734 #undef CALL
3735 #undef TABLE_PARAMS
3736  cudaCheck(cudaGetLastError());
3737 
3738  start += nblock*nwarp;
3739  }
3740 
3741  if ( doVirial || ! doStreaming ){
3742  int block = 128;
3743  int grid = (atomStorageSize + block - 1)/block;
3744  if (doSlow)
3745  transposeForcesKernel<1><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3746  force_x, force_y, force_z, force_w,
3747  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3748  atomStorageSize);
3749  else
3750  transposeForcesKernel<0><<<grid, block, 0, stream>>>(d_forces, d_forcesSlow,
3751  force_x, force_y, force_z, force_w,
3752  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
3753  atomStorageSize);
3754  }
3755 }
3756 
3757 //
3758 // Perform virial and energy reductions for non-bonded force calculation
3759 //
3760 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
3761  const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
3762  float4* d_forces, float4* d_forcesSlow,
3763  VirialEnergy* d_virialEnergy, cudaStream_t stream) {
3764 
3765  if (doEnergy || doVirial) {
3766  clear_device_array<VirialEnergy>(d_virialEnergy, ATOMIC_BINS, stream);
3767  }
3768 
3769  if (doVirial)
3770  {
3771  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
3772  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
3773  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
3774  (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
3775  cudaCheck(cudaGetLastError());
3776  }
3777 
3778  if (doVirial || doEnergy)
3779  {
3780  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
3781  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
3782  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
3783  (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3784  cudaCheck(cudaGetLastError());
3785  }
3786 
3787  if (doGBIS && doEnergy)
3788  {
3789  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
3790  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
3791  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
3792  (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
3793  cudaCheck(cudaGetLastError());
3794  }
3795 
3796  if (ATOMIC_BINS > 1)
3797  {
3798  // Reduce d_virialEnergy[ATOMIC_BINS] in-place (results are in d_virialEnergy[0])
3799  reduceNonbondedBinsKernel<<<1, ATOMIC_BINS, 0, stream>>>(doVirial, doEnergy, doSlow, doGBIS, d_virialEnergy);
3800  }
3801 }
3802 
3803 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
3804  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
3805  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
3806 }
3807 
3808 
3809 void CudaComputeNonbondedKernel::setExclusionsByAtom(int2* h_data, const int num_atoms) {
3810  // Global data structure shouldn't be reallocated
3811  if (d_exclusionsByAtom == NULL) allocate_device<int2>(&d_exclusionsByAtom, num_atoms);
3812  copy_HtoD_sync<int2>(h_data, d_exclusionsByAtom, num_atoms);
3813 
3814 }
3815 
3816 
3817 template<bool kDoAlch>
3818 __global__ void updateVdwTypesExclKernel(
3819  const int numPatches,
3820  const CudaLocalRecord* localRecords,
3821  const int* global_vdwTypes,
3822  const int* global_id,
3823  const int* patchSortOrder,
3824  const int2* exclusionsByAtom,
3825  const int* global_partition,
3826  int* vdwTypes,
3827  int* atomIndex,
3828  int2* exclusions,
3829  char* part
3830 ) {
3831  __shared__ CudaLocalRecord s_record;
3832  using AccessType = int32_t;
3833  AccessType* s_record_buffer = (AccessType*) &s_record;
3834 
3835  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
3836  // Read in the CudaLocalRecord using multiple threads. This should
3837  #pragma unroll 1
3838  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
3839  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
3840  }
3841  __syncthreads();
3842 
3843  const int numAtoms = s_record.numAtoms;
3844  const int offset = s_record.bufferOffset;
3845  const int offsetNB = s_record.bufferOffsetNBPad;
3846 
3847  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
3848  const int order = patchSortOrder[offset + i];
3849  const int id = global_id[offset + order];
3850  vdwTypes [offsetNB + i] = global_vdwTypes[offset + order];
3851  atomIndex [offsetNB + i] = id;
3852  exclusions[offsetNB + i].x = exclusionsByAtom[id].y;
3853  exclusions[offsetNB + i].y = exclusionsByAtom[id].x;
3854  if (kDoAlch) {
3855  part [offsetNB + i] = global_partition[offset + order];
3856  }
3857  }
3858  __syncthreads();
3859  }
3860 }
3861 
3862 
3863 void CudaComputeNonbondedKernel::updateVdwTypesExclOnGPU(CudaTileListKernel& tlKernel,
3864  const int numPatches, const int atomStorageSize, const bool alchOn,
3865  CudaLocalRecord* localRecords,
3866  const int* d_vdwTypes, const int* d_id, const int* d_sortOrder,
3867  const int* d_partition,
3868  cudaStream_t stream
3869 ) {
3870  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
3871  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
3872  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
3873 
3874  const int numBlocks = numPatches;
3875  const int numThreads = 512;
3876 
3877  if (alchOn) {
3878  updateVdwTypesExclKernel<true><<<numBlocks, numThreads, 0, stream>>>(
3879  numPatches, localRecords,
3880  d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3881  vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3882  );
3883  } else {
3884  updateVdwTypesExclKernel<false><<<numBlocks, numThreads, 0, stream>>>(
3885  numPatches, localRecords,
3886  d_vdwTypes, d_id, d_sortOrder, d_exclusionsByAtom, d_partition,
3887  vdwTypes, atomIndex, exclIndexMaxDiff, tlKernel.get_part()
3888  );
3889  }
3890 }
3891 
3892 #endif // NAMD_HIP