NAMD
CudaPmeSolverUtil.C
1 #include <stdio.h>
2 #include <algorithm>
3 #include <cstdlib>
4 #ifdef NAMD_CUDA
5 #include <cuda_runtime.h>
6 #endif
7 #ifdef NAMD_HIP
8 #include <hip/hip_runtime.h>
9 #endif
10 #include "HipDefines.h"
11 #include "ComputeNonbondedUtil.h"
12 #include "ComputePmeCUDAMgr.h"
13 #include "CudaPmeSolver.h"
14 #include "CudaPmeSolverUtil.h"
15 #include "Node.h"
16 #include "PatchData.h"
17 
18 #include "NamdEventsProfiling.h"
19 #include "TestArray.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
23 extern __thread DeviceCUDA *deviceCUDA;
24 
25 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
26 
27 void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
28  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
29  float2* h_data = new float2[size];
30  copy_DtoH<float2>(d_data, h_data, size, stream);
31  cudaCheck(cudaStreamSynchronize(stream));
32  FILE *handle = fopen(filename, "w");
33  for (int i=0;i < size;i++)
34  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
35  fclose(handle);
36  delete [] h_data;
37 }
38 
39 void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
40  FILE *handle = fopen(filename, "w");
41  for (int i=0;i < size;i++)
42  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
43  fclose(handle);
44 }
45 
46 void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
47  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
48  float* h_data = new float[size];
49  copy_DtoH<float>(d_data, h_data, size, stream);
50  cudaCheck(cudaStreamSynchronize(stream));
51  FILE *handle = fopen(filename, "w");
52  for (int i=0;i < size;i++)
53  fprintf(handle, "%f\n", h_data[i]);
54  fclose(handle);
55  delete [] h_data;
56 }
57 
58  CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream)
59  : deviceID(deviceID), stream(stream) {
60  }
61 
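// Note: cuFFT expects transform dimensions ordered from slowest- to fastest-varying,
// which is why plan3D/plan2D below pass the grid sizes in reverse order (n[2], n[1], n[0]).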
62 void CudaFFTCompute::plan3D(int *n, int flags) {
63  cudaCheck(cudaSetDevice(deviceID));
64  forwardType = CUFFT_R2C;
65  backwardType = CUFFT_C2R;
66  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
67  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
68  setStream();
69  // plantype = 3;
70 }
71 
72 void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
73  cudaCheck(cudaSetDevice(deviceID));
74  forwardType = CUFFT_R2C;
75  backwardType = CUFFT_C2R;
76  int nt[2] = {n[1], n[0]};
77  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
78  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
79  setStream();
80  // plantype = 2;
81 }
82 
83 void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
84  cudaCheck(cudaSetDevice(deviceID));
85  forwardType = CUFFT_R2C;
86  backwardType = CUFFT_C2R;
87  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
88  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
89  setStream();
90  // plantype = 1;
91 }
92 
93 void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
94  cudaCheck(cudaSetDevice(deviceID));
95  forwardType = CUFFT_C2C;
96  backwardType = CUFFT_C2C;
97  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
98  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
99  setStream();
100  // plantype = 1;
101 }
102 
103 void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
104  cudaCheck(cudaSetDevice(deviceID));
105  forwardType = CUFFT_C2C;
106  backwardType = CUFFT_C2C;
107  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
108  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
109  setStream();
110  // plantype = 1;
111 }
112 
113 CudaFFTCompute::~CudaFFTCompute() {
114  cudaCheck(cudaSetDevice(deviceID));
115  cufftCheck(cufftDestroy(forwardPlan));
116  cufftCheck(cufftDestroy(backwardPlan));
117  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
118  if (dataDstAllocated) deallocate_device<float>(&dataDst);
119 }
120 
121 float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
122  cudaCheck(cudaSetDevice(deviceID));
123  float* tmp = NULL;
124  allocate_device<float>(&tmp, dataSizeRequired);
125  return tmp;
126 }
127 
128 // int ncall = 0;
129 
130 void CudaFFTCompute::forward() {
131  cudaCheck(cudaSetDevice(deviceID));
132  // ncall++;
133  if (forwardType == CUFFT_R2C) {
134  cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));
135 #ifdef TESTPID
136  if (1) {
137  cudaCheck(cudaStreamSynchronize(stream));
138  fprintf(stderr, "AP FORWARD FFT\n");
139  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
140  int m = dataDstSize;
141  float *tran = 0;
142  allocate_host<float>(&tran, m);
143  copy_DtoH<float>(dataDst, tran, m, stream);
144  cudaCheck(cudaStreamSynchronize(stream));
145  TestArray_write<float>("tran_charge_grid_good.bin",
146  "transformed charge grid good", tran, m);
147  deallocate_host<float>(&tran);
148  }
149 #endif
150 
151  // if (ncall == 1) {
152  // writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
153  // }
154 
155  // if (ncall == 1 && plantype == 2) {
156  // writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
157  // }
158 
159  } else if (forwardType == CUFFT_C2C) {
160  // nc2cf++;
161  // if (ncall == 1 && nc2cf == 1)
162  // writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
163  // else if (ncall == 1 && nc2cf == 2)
164  // writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
165  cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
166  // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
167  // if (ncall == 1 && plantype == 1 && isize == 62) {
168  // writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
169  // }
170  // if (ncall == 1 && nc2cf == 1)
171  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
172  // else if (ncall == 1 && nc2cf == 2)
173  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
174  } else {
175  cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
176  }
177 }
178 
179 void CudaFFTCompute::backward() {
180  cudaCheck(cudaSetDevice(deviceID));
181  if (backwardType == CUFFT_C2R) {
182  // if (ncall == 1) {
183  // if (plantype == 1)
184  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
185  // else
186  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
187  // }
188  cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));
189 #ifdef TESTPID
190  if (1) {
191  cudaCheck(cudaStreamSynchronize(stream));
192  fprintf(stderr, "AP BACKWARD FFT\n");
193  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
194  float *grid;
195  int gridsize = dataSrcSize;
196  allocate_host<float>(&grid, gridsize);
197  copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
198  cudaCheck(cudaStreamSynchronize(stream));
199  TestArray_write<float>("potential_grid_good.bin",
200  "potential grid good", grid, gridsize);
201  deallocate_host<float>(&grid);
202  }
203 #endif
204 
205  // if (ncall == 1)
206  // if (plantype == 1)
207  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
208  // else
209  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
210  } else if (backwardType == CUFFT_C2C) {
211  // nc2cb++;
212  // if (ncall == 1 && nc2cb == 1)
213  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
214  // else if (ncall == 1 && nc2cb == 2)
215  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
216  cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
217  // if (ncall == 1 && nc2cb == 1)
218  // writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
219  // else if (ncall == 1 && nc2cb == 2)
220  // writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
221  } else {
222  cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
223  }
224 }
225 
226 void CudaFFTCompute::setStream() {
227  cudaCheck(cudaSetDevice(deviceID));
228  cufftCheck(cufftSetStream(forwardPlan, stream));
229  cufftCheck(cufftSetStream(backwardPlan, stream));
230 }
231 
232 
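// CudaPmeKSpaceCompute performs the reciprocal-space (k-space) part of PME on the GPU:
// solve() applies the Ewald factor and the B-spline moduli (bm1/bm2/bm3) to the
// transformed charge grid and accumulates the energy and virial.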
233 CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
234  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid) :
235  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa, iGrid),
236  deviceID(deviceID), stream(stream) {
237 
238  cudaCheck(cudaSetDevice(deviceID));
239 
240  // Convert the bm1/bm2/bm3 prefactors to single precision and copy them to GPU memory
241  float *bm1f = new float[pmeGrid.K1];
242  float *bm2f = new float[pmeGrid.K2];
243  float *bm3f = new float[pmeGrid.K3];
244  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
245  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
246  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
247  allocate_device<float>(&d_bm1, pmeGrid.K1);
248  allocate_device<float>(&d_bm2, pmeGrid.K2);
249  allocate_device<float>(&d_bm3, pmeGrid.K3);
250  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
251  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
252  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
253  delete [] bm1f;
254  delete [] bm2f;
255  delete [] bm3f;
256  allocate_device<EnergyVirial>(&d_energyVirial, 1);
257  allocate_host<EnergyVirial>(&h_energyVirial, 1);
258  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
259  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
260  // ncall = 0;
261 }
262 
263 CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
264  cudaCheck(cudaSetDevice(deviceID));
265  deallocate_device<float>(&d_bm1);
266  deallocate_device<float>(&d_bm2);
267  deallocate_device<float>(&d_bm3);
268  deallocate_device<EnergyVirial>(&d_energyVirial);
269  deallocate_host<EnergyVirial>(&h_energyVirial);
270  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
271 }
272 
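// Run the k-space scalar sum for the current lattice. Grid dimensions, prefactors
// and reciprocal vectors are reordered below to match the data permutation
// (Perm_cX_Y_Z or Perm_Z_cX_Y) selected at construction.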
273 void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
274 #if 0
275  // Check lattice to make sure it is updating for constant pressure
276  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
277  lattice.a().x, lattice.a().y, lattice.a().z,
278  lattice.b().x, lattice.b().y, lattice.b().z,
279  lattice.c().x, lattice.c().y, lattice.c().z);
280 #endif
281  cudaCheck(cudaSetDevice(deviceID));
282 
283  const bool doEnergyVirial = (doEnergy || doVirial);
284 
285  int nfft1, nfft2, nfft3;
286  float *prefac1, *prefac2, *prefac3;
287 
288  BigReal volume = lattice.volume();
289  Vector a_r = lattice.a_r();
290  Vector b_r = lattice.b_r();
291  Vector c_r = lattice.c_r();
292  float recip1x, recip1y, recip1z;
293  float recip2x, recip2y, recip2z;
294  float recip3x, recip3y, recip3z;
295 
296  if (permutation == Perm_Z_cX_Y) {
297  // Z, X, Y
298  nfft1 = pmeGrid.K3;
299  nfft2 = pmeGrid.K1;
300  nfft3 = pmeGrid.K2;
301  prefac1 = d_bm3;
302  prefac2 = d_bm1;
303  prefac3 = d_bm2;
304  recip1x = c_r.z;
305  recip1y = c_r.x;
306  recip1z = c_r.y;
307  recip2x = a_r.z;
308  recip2y = a_r.x;
309  recip2z = a_r.y;
310  recip3x = b_r.z;
311  recip3y = b_r.x;
312  recip3z = b_r.y;
313  } else if (permutation == Perm_cX_Y_Z) {
314  // X, Y, Z
315  nfft1 = pmeGrid.K1;
316  nfft2 = pmeGrid.K2;
317  nfft3 = pmeGrid.K3;
318  prefac1 = d_bm1;
319  prefac2 = d_bm2;
320  prefac3 = d_bm3;
321  recip1x = a_r.x;
322  recip1y = a_r.y;
323  recip1z = a_r.z;
324  recip2x = b_r.x;
325  recip2y = b_r.y;
326  recip2z = b_r.z;
327  recip3x = c_r.x;
328  recip3y = c_r.y;
329  recip3z = c_r.z;
330  } else {
331  NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
332  }
333 
334  // ncall++;
335  // if (ncall == 1) {
336  // char filename[256];
337  // sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
338  // writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
339  // }
340 
341  // if (ncall == 1) {
342  // float2* h_data = new float2[size1*size2*size3];
343  // float2* d_data = (float2*)data;
344  // copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
345  // cudaCheck(cudaStreamSynchronize(stream));
346  // FILE *handle = fopen("dataf.txt", "w");
347  // for (int z=0;z < pmeGrid.K3;z++) {
348  // for (int y=0;y < pmeGrid.K2;y++) {
349  // for (int x=0;x < pmeGrid.K1/2+1;x++) {
350  // int i;
351  // if (permutation == Perm_cX_Y_Z) {
352  // i = x + y*size1 + z*size1*size2;
353  // } else {
354  // i = z + x*size1 + y*size1*size2;
355  // }
356  // fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
357  // }
358  // }
359  // }
360  // fclose(handle);
361  // delete [] h_data;
362  // }
363 
364  // Clear energy and virial array if needed
365  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
366 
367 #ifdef TESTPID
368  if (1) {
369  cudaCheck(cudaStreamSynchronize(stream));
370  fprintf(stderr, "AP calling scalar sum\n");
371  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
372  (permutation == Perm_cX_Y_Z ? "true" : "false"));
373  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
374  fprintf(stderr, "size1=%d size2=%d size3=%d\n", size1, size2, size3);
375  fprintf(stderr, "kappa=%g\n", kappa);
376  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
377  (double)recip1x, (double)recip1y, (double)recip1z);
378  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
379  (double)recip2x, (double)recip2y, (double)recip2z);
380  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
381  (double)recip3x, (double)recip3y, (double)recip3z);
382  fprintf(stderr, "volume=%g\n", volume);
383  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
384  float *bm1, *bm2, *bm3;
385  allocate_host<float>(&bm1, nfft1);
386  allocate_host<float>(&bm2, nfft2);
387  allocate_host<float>(&bm3, nfft3);
388  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
389  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
390  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
391  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good",
392  bm1, nfft1);
393  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good",
394  bm2, nfft2);
395  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good",
396  bm3, nfft3);
397  deallocate_host<float>(&bm1);
398  deallocate_host<float>(&bm2);
399  deallocate_host<float>(&bm3);
400  }
401 #endif
402 
403  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
404  recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
405  volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
406  &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
407  stream);
408 #ifdef TESTPID
409  if (1) {
410  cudaCheck(cudaStreamSynchronize(stream));
411  fprintf(stderr, "AP SCALAR SUM\n");
412  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
413  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
414  float *tran = 0;
415  allocate_host<float>(&tran, m);
416  copy_DtoH<float>((float*)data, tran, m, stream);
417  cudaCheck(cudaStreamSynchronize(stream));
418  TestArray_write<float>("tran_potential_grid_good.bin",
419  "transformed potential grid good", tran, m);
420  deallocate_host<float>(&tran);
421  }
422 #endif
423 
424  // Copy energy and virial to host if needed
425  if (doEnergyVirial) {
426  copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
427  cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
428  // cudaCheck(cudaStreamSynchronize(stream));
429  }
430 
431 }
432 
433 // void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
434 // cudaCheck(cudaSetDevice(deviceID));
435 // cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
436 // }
437 
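// Polling callback driven by the Charm++ scheduler: it checks whether the
// energy/virial copy event has completed and, if not, re-registers itself
// every 0.1 ms; on completion it notifies the owning pencil object.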
438 void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
439  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;
440 
441  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
442  if (err == cudaSuccess) {
443  // Event has occurred
444  c->checkCount = 0;
445  if (c->pencilXYZPtr != NULL)
446  c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
447  else if (c->pencilZPtr != NULL)
448  c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
449  else
450  NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
451  return;
452  } else if (err == cudaErrorNotReady) {
453  // Event has not occurred
454  c->checkCount++;
455  if (c->checkCount >= 1000000) {
456  char errmsg[256];
457  sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
458  c->checkCount);
459  cudaDie(errmsg,err);
460  }
461  } else {
462  // Anything else is an error
463  char errmsg[256];
464  sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
465  c->checkCount);
466  cudaDie(errmsg,err);
467  }
468 
469  // Call again
470  CcdCallBacksReset(0, walltime);
471  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
472 }
473 
474 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
475  cudaCheck(cudaSetDevice(deviceID));
476  pencilXYZPtr = pencilPtr;
477  pencilZPtr = NULL;
478  checkCount = 0;
479  CcdCallBacksReset(0, CmiWallTimer());
480  // Set the call back at 0.1ms
481  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
482 }
483 
484 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
485  cudaCheck(cudaSetDevice(deviceID));
486  pencilXYZPtr = NULL;
487  pencilZPtr = pencilPtr;
488  checkCount = 0;
489  CcdCallBacksReset(0, CmiWallTimer());
490  // Set the call back at 0.1ms
491  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
492 }
493 
494 double CudaPmeKSpaceCompute::getEnergy() {
495  return h_energyVirial->energy;
496 }
497 
498 void CudaPmeKSpaceCompute::getVirial(double *virial) {
499  if (permutation == Perm_Z_cX_Y) {
500  // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
501  virial[0] = h_energyVirial->virial[3];
502  virial[1] = h_energyVirial->virial[4];
503  virial[2] = h_energyVirial->virial[1];
504 
505  virial[3] = h_energyVirial->virial[4];
506  virial[4] = h_energyVirial->virial[5];
507  virial[5] = h_energyVirial->virial[2];
508 
509  virial[6] = h_energyVirial->virial[1];
510  virial[7] = h_energyVirial->virial[2];
511  virial[8] = h_energyVirial->virial[0];
512  } else if (permutation == Perm_cX_Y_Z) {
513  // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
514  virial[0] = h_energyVirial->virial[0];
515  virial[1] = h_energyVirial->virial[1];
516  virial[2] = h_energyVirial->virial[2];
517 
518  virial[3] = h_energyVirial->virial[1];
519  virial[4] = h_energyVirial->virial[3];
520  virial[5] = h_energyVirial->virial[4];
521 
522  virial[6] = h_energyVirial->virial[2];
523  virial[7] = h_energyVirial->virial[4];
524  virial[8] = h_energyVirial->virial[5];
525  }
526 #if 0
527  fprintf(stderr, "AP PME VIRIAL =\n"
528  " %g %g %g\n %g %g %g\n %g %g %g\n",
529  virial[0], virial[1], virial[2], virial[3], virial[4],
530  virial[5], virial[6], virial[7], virial[8]);
531 #endif
532 }
533 
534 
535 //###########################################################################
536 //###########################################################################
537 //###########################################################################
538 
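// CudaPmeRealSpaceCompute handles the real-space side of PME on the GPU:
// spreading atom charges onto the grid before the forward FFT and gathering
// forces back from the grid after the inverse FFT.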
539 //
540 // Class constructor
541 //
542 CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
543  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
544  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
545  if (dataSize < xsize*ysize*zsize)
546  NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
547  cudaCheck(cudaSetDevice(deviceID));
548  d_atomsCapacity = 0;
549  d_atoms = NULL;
550  d_forceCapacity = 0;
551  d_force = NULL;
552  #ifdef NAMD_CUDA
553  tex_data = NULL;
554  tex_data_len = 0;
555  #else
556  grid_data = NULL;
557  grid_data_len = 0;
558  #endif
559  allocate_device<float>(&data, dataSize);
560  setupGridData(data, xsize*ysize*zsize);
561  cudaCheck(cudaEventCreate(&gatherForceEvent));
562 }
563 
564 //
565 // Class destructor
566 //
567 CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
568  cudaCheck(cudaSetDevice(deviceID));
569  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
570  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
571  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
572  // deallocate_device<double>(&d_selfEnergy);
573  deallocate_device<float>(&data);
574  cudaCheck(cudaEventDestroy(gatherForceEvent));
575 }
576 
577 // //
578 // // Copy patches and atoms to device memory
579 // //
580 // void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
581 // const int numAtoms, const CudaAtom* atoms) {
582 
583 // this->numPatches = numPatches;
584 // this->numAtoms = numAtoms;
585 
586 // // Reallocate device arrays as necessary
587 // reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
588 // reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);
589 
590 // // Copy atom and patch data to device
591 // copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
592 // copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
593 // }
594 
595 //
596 // Copy atoms to device memory
597 //
598 void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
599  cudaCheck(cudaSetDevice(deviceID));
600  this->numAtoms = numAtoms;
601 
602  // Reallocate device arrays as necessary
603  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
604 
605  // Copy atom data to device
606  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
607 }
608 
609 //
610 // Spread charges on grid
611 //
612 void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
613  cudaCheck(cudaSetDevice(deviceID));
614 #if 0
615  if (1) {
616  static int step = 0;
617  float *xyzq;
618  int natoms = numAtoms;
619  allocate_host<float>(&xyzq, 4*natoms);
620  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
621  cudaCheck(cudaStreamSynchronize(stream));
622  char fname[64], remark[64];
623  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
624  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
625  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
626  deallocate_host<float>(&xyzq);
627  step += 2;
628  }
629 #endif
630 
631  NAMD_EVENT_START(1, NamdProfileEvent::SPREAD_CHARGE);
632 
633  // Clear grid
634  clear_device_array<float>(data, xsize*ysize*zsize, stream);
635 
636 #if defined(TESTPID)
637  fprintf(stderr, "Calling spread_charge with parameters:\n");
638  fprintf(stderr, "numAtoms = %d\n", numAtoms);
639  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
640  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
641  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
642  fprintf(stderr, "xsize = %d\n", xsize);
643  fprintf(stderr, "ysize = %d\n", ysize);
644  fprintf(stderr, "zsize = %d\n", zsize);
645  fprintf(stderr, "y0 = %d\n", y0);
646  fprintf(stderr, "z0 = %d\n", z0);
647  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
648  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
649  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
650 #endif
651  spread_charge((const float4*)d_atoms, numAtoms,
652  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
653  xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
654  data, pmeGrid.order, stream);
655 #ifdef TESTPID
656  if (1) {
657  cudaCheck(cudaStreamSynchronize(stream));
658  fprintf(stderr, "AP SPREAD CHARGES\n");
659  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
660  float *xyzq;
661  allocate_host<float>(&xyzq, 4*numAtoms);
662  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
663  int gridlen = pmeGrid.K1 * pmeGrid.K2 * pmeGrid.K3;
664  float *grid;
665  allocate_host<float>(&grid, gridlen);
666  copy_DtoH<float>(data, grid, gridlen, stream);
667  cudaCheck(cudaStreamSynchronize(stream));
668  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
669  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
670  grid, gridlen);
671  deallocate_host<float>(&xyzq);
672  deallocate_host<float>(&grid);
673  }
674 #endif
675 
676  // ncall++;
677 
678  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
679  NAMD_EVENT_STOP(1, NamdProfileEvent::SPREAD_CHARGE);
680 }
681 
682 void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
683  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
684  cudaCheck(cudaSetDevice(c->deviceID));
685 
686  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
687  if (err == cudaSuccess) {
688  // Event has occurred
689  c->checkCount = 0;
690 // c->deviceProxy[CkMyNode()].gatherForceDone();
691  c->devicePtr->gatherForceDone(c->multipleGridIndex);
692  return;
693  } else if (err == cudaErrorNotReady) {
694  // Event has not occurred
695  c->checkCount++;
696  if (c->checkCount >= 1000000) {
697  char errmsg[256];
698  sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
699  c->checkCount);
700  cudaDie(errmsg,err);
701  }
702  } else {
703  // Anything else is an error
704  char errmsg[256];
705  sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
706  c->checkCount);
707  cudaDie(errmsg,err);
708  }
709 
710  // Call again
711  CcdCallBacksReset(0, walltime);
712  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
713 }
714 
715 void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
716  cudaCheck(cudaSetDevice(deviceID));
717  devicePtr = devicePtr_in;
718  checkCount = 0;
719  CcdCallBacksReset(0, CmiWallTimer());
720  // Set the call back at 0.1ms
721  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
722 }
723 
724 void CudaPmeRealSpaceCompute::waitGatherForceDone() {
725  cudaCheck(cudaSetDevice(deviceID));
726  cudaCheck(cudaEventSynchronize(gatherForceEvent));
727 }
728 
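// Bind the charge/potential grid to the access path used by gather_force:
// a linear texture object on CUDA, or a plain device pointer on HIP.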
729 void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
730  #ifdef NAMD_CUDA
731  // HIP fails at runtime in hipCreateTextureObject, and the texture object is no longer needed there, so the HIP build skips it. Observed error:
732  /*
733 
734  FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
735  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
736 ------------- Processor 11 Exiting: Called CmiAbort ------------
737 Reason: FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
738  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
739 
740  */
741  if (tex_data == data && tex_data_len == data_len) return;
742  tex_data = data;
743  tex_data_len = data_len;
744  // Use texture objects
745  cudaResourceDesc resDesc;
746  memset(&resDesc, 0, sizeof(resDesc));
747  resDesc.resType = cudaResourceTypeLinear;
748  resDesc.res.linear.devPtr = data;
749  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
750  resDesc.res.linear.desc.x = sizeof(float)*8;
751  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
752  cudaTextureDesc texDesc;
753  memset(&texDesc, 0, sizeof(texDesc));
754  texDesc.readMode = cudaReadModeElementType;
755  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
756  #else
757  if (grid_data == data && grid_data_len == data_len) return;
758  grid_data = data;
759  grid_data_len = data_len;
760  #endif
761 }
762 
763 void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
764  cudaCheck(cudaSetDevice(deviceID));
765 
766  NAMD_EVENT_START(1, NamdProfileEvent::GATHER_FORCE);
767 
768  // Re-allocate force array if needed
769  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
770 
771 #ifdef TESTPID
772  if (1) {
773  fprintf(stderr, "AP gather force arguments\n");
774  fprintf(stderr, "numAtoms = %d\n", numAtoms);
775  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
776  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
777  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
778  fprintf(stderr, "xsize = %d\n", xsize);
779  fprintf(stderr, "ysize = %d\n", ysize);
780  fprintf(stderr, "zsize = %d\n", zsize);
781  fprintf(stderr, "y0 = %d\n", y0);
782  fprintf(stderr, "z0 = %d\n", z0);
783  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
784  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
785  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
786  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
787  }
788 #endif
789  // The patch-level PME kernels are only used for the GPU-resident code path. The default constructor
790  // of PatchLevelPmeData will initialize the compatibility variables to false, so the patch-level kernels
791  // won't be used here.
792  PatchLevelPmeData patchLevelPmeData;
793  gather_force(patchLevelPmeData,
794  (const float4*)d_atoms, numAtoms,
795  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
796  xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
797  data, pmeGrid.order, (float3*)d_force,
798 #ifdef NAMD_CUDA
799  gridTexObj,
800 #endif
801  stream);
802 #ifdef TESTPID
803  if (1) {
804  cudaCheck(cudaStreamSynchronize(stream));
805  fprintf(stderr, "AP GATHER FORCE\n");
806  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
807  float *xyz;
808  int natoms = numAtoms;
809  allocate_host<float>(&xyz, 3*natoms);
810  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
811  cudaCheck(cudaStreamSynchronize(stream));
812  TestArray_write<float>("gather_force_good.bin",
813  "gather force good", xyz, 3*natoms);
814  deallocate_host<float>(&xyz);
815  }
816 #endif
817 
818  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
819 
820  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
821 
822  NAMD_EVENT_STOP(1, NamdProfileEvent::GATHER_FORCE);
823 }
824 
825 /*
826 double CudaPmeRealSpaceCompute::calcSelfEnergy() {
827  double h_selfEnergy;
828  clear_device_array<double>(d_selfEnergy, 1);
829  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
830  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
831  cudaCheck(cudaStreamSynchronize(stream));
832  // 1.7724538509055160273 = sqrt(pi)
833  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
834  return h_selfEnergy;
835 }
836 */
837 
838 //###########################################################################
839 //###########################################################################
840 //###########################################################################
841 
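// CudaPmeTranspose reorders pencil-decomposed FFT data between the XYZ, YZX and
// ZXY layouts. The per-block transposes are batched (TransposeBatch) so that all
// blocks of a given permutation are handled together.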
842 CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
843  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
844  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
845  cudaCheck(cudaSetDevice(deviceID));
846 
847  allocate_device<float2>(&d_data, dataSize);
848 #ifndef P2P_ENABLE_3D
849  allocate_device<float2>(&d_buffer, dataSize);
850 #endif
851 
852  // Set data pointers to NULL; these can be overridden later by calling setDataPtrs()
853  dataPtrsYZX.resize(nblock, NULL);
854  dataPtrsZXY.resize(nblock, NULL);
855 
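 // One TransposeBatch entry per (permutation, block) pair, hence 3*nblock;
 // the entries are filled in setDataPtrsYZX() / setDataPtrsZXY().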
856  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
857  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
858 }
859 
860 CudaPmeTranspose::~CudaPmeTranspose() {
861  cudaCheck(cudaSetDevice(deviceID));
862  deallocate_device<float2>(&d_data);
863 #ifndef P2P_ENABLE_3D
864  deallocate_device<float2>(&d_buffer);
865 #endif
866  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
867  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
868 }
869 
870 //
871 // Set dataPtrsYZX
872 //
873 void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
874  if (dataPtrsYZX.size() != dataPtrsNew.size())
875  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
876  for (int iblock=0;iblock < nblock;iblock++) {
877  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
878  }
879  // Build batched data structures
880  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
881 
882  for (int iperm=0;iperm < 3;iperm++) {
883  int isize_out;
884  if (iperm == 0) {
885  // Perm_Z_cX_Y:
886  // ZXY -> XYZ
887  isize_out = pmeGrid.K1/2+1;
888  } else if (iperm == 1) {
889  // Perm_cX_Y_Z:
890  // XYZ -> YZX
891  isize_out = pmeGrid.K2;
892  } else {
893  // Perm_Y_Z_cX:
894  // YZX -> ZXY
895  isize_out = pmeGrid.K3;
896  }
897 
898  int max_nx = 0;
899  for (int iblock=0;iblock < nblock;iblock++) {
900 
901  int x0 = pos[iblock];
902  int nx = pos[iblock+1] - x0;
903  max_nx = std::max(max_nx, nx);
904 
905  int width_out;
906  float2* data_out;
907  if (dataPtrsYZX[iblock] == NULL) {
908  // Local transpose, use internal buffer
909  data_out = d_data + jsize*ksize*x0;
910  width_out = jsize;
911  } else {
912  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
913  data_out = dataPtrsYZX[iblock];
914  width_out = isize_out;
915  }
916 
917  TransposeBatch<float2> batch;
918  batch.nx = nx;
919  batch.ysize_out = width_out;
920  batch.zsize_out = ksize;
921  batch.data_in = data+x0;
922  batch.data_out = data_out;
923 
924  h_batchesYZX[iperm*nblock + iblock] = batch;
925 
926  // transpose_xyz_yzx(
927  // nx, jsize, ksize,
928  // isize, jsize,
929  // width_out, ksize,
930  // data+x0, data_out, stream);
931  }
932 
933  max_nx_YZX[iperm] = max_nx;
934  }
935 
936  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
937  cudaCheck(cudaStreamSynchronize(stream));
938  delete [] h_batchesYZX;
939 }
940 
941 //
942 // Set dataPtrsZXY
943 //
944 void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
945  if (dataPtrsZXY.size() != dataPtrsNew.size())
946  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
947  for (int iblock=0;iblock < nblock;iblock++) {
948  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
949  }
950 
951  // Build batched data structures
952  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
953 
954  for (int iperm=0;iperm < 3;iperm++) {
955  int isize_out;
956  if (iperm == 0) {
957  // Perm_cX_Y_Z:
958  // XYZ -> ZXY
959  isize_out = pmeGrid.K3;
960  } else if (iperm == 1) {
961  // Perm_Z_cX_Y:
962  // ZXY -> YZX
963  isize_out = pmeGrid.K2;
964  } else {
965  // Perm_Y_Z_cX:
966  // YZX -> XYZ
967  isize_out = pmeGrid.K1/2+1;
968  }
969 
970  int max_nx = 0;
971  for (int iblock=0;iblock < nblock;iblock++) {
972 
973  int x0 = pos[iblock];
974  int nx = pos[iblock+1] - x0;
975  max_nx = std::max(max_nx, nx);
976 
977  int width_out;
978  float2* data_out;
979  if (dataPtrsZXY[iblock] == NULL) {
980  // Local transpose, use internal buffer
981  data_out = d_data + jsize*ksize*x0;
982  width_out = ksize;
983  } else {
984  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
985  data_out = dataPtrsZXY[iblock];
986  width_out = isize_out;
987  }
988 
989  TransposeBatch<float2> batch;
990  batch.nx = nx;
991  batch.zsize_out = width_out;
992  batch.xsize_out = nx;
993  batch.data_in = data+x0;
994  batch.data_out = data_out;
995  h_batchesZXY[iperm*nblock + iblock] = batch;
996  }
997 
998  max_nx_ZXY[iperm] = max_nx;
999  }
1000 
1001  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1002  cudaCheck(cudaStreamSynchronize(stream));
1003  delete [] h_batchesZXY;
1004 }
1005 
1006 void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
1007  cudaCheck(cudaSetDevice(deviceID));
1008 
1009  int iperm;
1010  switch(permutation) {
1011  case Perm_Z_cX_Y:
1012  // ZXY -> XYZ
1013  iperm = 0;
1014  break;
1015  case Perm_cX_Y_Z:
1016  // XYZ -> YZX
1017  iperm = 1;
1018  break;
1019  case Perm_Y_Z_cX:
1020  // YZX -> ZXY
1021  iperm = 2;
1022  break;
1023  default:
1024  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1025  break;
1026  }
1027 
1028  batchTranspose_xyz_yzx(
1029  nblock, batchesYZX + iperm*nblock,
1030  max_nx_YZX[iperm], jsize, ksize,
1031  isize, jsize, stream);
1032 
1033 
1034 /*
1035  int isize_out;
1036  switch(permutation) {
1037  case Perm_Z_cX_Y:
1038  // ZXY -> XYZ
1039  isize_out = pmeGrid.K1/2+1;
1040  break;
1041  case Perm_cX_Y_Z:
1042  // XYZ -> YZX
1043  isize_out = pmeGrid.K2;
1044  break;
1045  case Perm_Y_Z_cX:
1046  // YZX -> ZXY
1047  isize_out = pmeGrid.K3;
1048  break;
1049  default:
1050  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1051  break;
1052  }
1053 
1054  for (int iblock=0;iblock < nblock;iblock++) {
1055 
1056  int x0 = pos[iblock];
1057  int nx = pos[iblock+1] - x0;
1058 
1059  int width_out;
1060  float2* data_out;
1061  if (dataPtrsYZX[iblock] == NULL) {
1062  // Local transpose, use internal buffer
1063  data_out = d_data + jsize*ksize*x0;
1064  width_out = jsize;
1065  } else {
1066  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
1067  data_out = dataPtrsYZX[iblock];
1068  width_out = isize_out;
1069  }
1070 
1071  transpose_xyz_yzx(
1072  nx, jsize, ksize,
1073  isize, jsize,
1074  width_out, ksize,
1075  data+x0, data_out, stream);
1076  }
1077 */
1078 }
1079 
1080 void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
1081  cudaCheck(cudaSetDevice(deviceID));
1082 
1083  int iperm;
1084  switch(permutation) {
1085  case Perm_cX_Y_Z:
1086  // XYZ -> ZXY
1087  iperm = 0;
1088  break;
1089  case Perm_Z_cX_Y:
1090  // ZXY -> YZX
1091  iperm = 1;
1092  break;
1093  case Perm_Y_Z_cX:
1094  // YZX -> XYZ
1095  iperm = 2;
1096  break;
1097  default:
1098  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1099  break;
1100  }
1101 
1102  batchTranspose_xyz_zxy(
1103  nblock, batchesZXY + iperm*nblock,
1104  max_nx_ZXY[iperm], jsize, ksize,
1105  isize, jsize, stream);
1106 
1107 /*
1108  int isize_out;
1109  switch(permutation) {
1110  case Perm_cX_Y_Z:
1111  // XYZ -> ZXY
1112  isize_out = pmeGrid.K3;
1113  break;
1114  case Perm_Z_cX_Y:
1115  // ZXY -> YZX
1116  isize_out = pmeGrid.K2;
1117  break;
1118  case Perm_Y_Z_cX:
1119  // YZX -> XYZ
1120  isize_out = pmeGrid.K1/2+1;
1121  break;
1122  default:
1123  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1124  break;
1125  }
1126 
1127  for (int iblock=0;iblock < nblock;iblock++) {
1128 
1129  int x0 = pos[iblock];
1130  int nx = pos[iblock+1] - x0;
1131 
1132  int width_out;
1133  float2* data_out;
1134  if (dataPtrsZXY[iblock] == NULL) {
1135  // Local transpose, use internal buffer
1136  data_out = d_data + jsize*ksize*x0;
1137  width_out = ksize;
1138  } else {
1139  // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
1140  data_out = dataPtrsZXY[iblock];
1141  width_out = isize_out;
1142  }
1143 
1144  transpose_xyz_zxy(
1145  nx, jsize, ksize,
1146  isize, jsize,
1147  width_out, nx,
1148  data+x0, data_out, stream);
1149  }
1150 */
1151 }
1152 
1153 void CudaPmeTranspose::waitStreamSynchronize() {
1154  cudaCheck(cudaSetDevice(deviceID));
1155  cudaCheck(cudaStreamSynchronize(stream));
1156 }
1157 
1158 void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
1159  cudaCheck(cudaSetDevice(deviceID));
1160 
1161  if (iblock >= nblock)
1162  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1163 
1164  int x0 = pos[iblock];
1165  int nx = pos[iblock+1] - x0;
1166 
1167  int copySize = jsize*ksize*nx;
1168  int copyStart = jsize*ksize*x0;
1169 
1170  if (copyStart + copySize > dataSize)
1171  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1172 
1173  if (copySize > h_dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1175 
1176  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1177 }
1178 
1179 void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
1180  cudaCheck(cudaSetDevice(deviceID));
1181 
1182  if (iblock >= nblock)
1183  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1184 
1185  // Determine block size = how much we're copying
1186  int i0, i1, j0, j1, k0, k1;
1187  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1188  int ni = i1-i0+1;
1189  int nj = j1-j0+1;
1190  int nk = k1-k0+1;
1191 
1192  copy3D_HtoD<float2>(data_in, data_out,
1193  0, 0, 0,
1194  ni, nj,
1195  i0, 0, 0,
1196  isize, jsize,
1197  ni, nj, nk, stream);
1198 }
1199 
1200 #ifndef P2P_ENABLE_3D
1201 //
1202 // Copy from temporary buffer to final buffer
1203 //
1204 void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
1205  cudaCheck(cudaSetDevice(deviceID));
1206 
1207  if (iblock >= nblock)
1208  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1209 
1210  // Determine block size = how much we're copying
1211  int i0, i1, j0, j1, k0, k1;
1212  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1213  int ni = i1-i0+1;
1214  int nj = j1-j0+1;
1215  int nk = k1-k0+1;
1216 
1217  float2* data_in = d_buffer + i0*nj*nk;
1218 
1219  copy3D_DtoD<float2>(data_in, data_out,
1220  0, 0, 0,
1221  ni, nj,
1222  i0, 0, 0,
1223  isize, jsize,
1224  ni, nj, nk, stream);
1225 }
1226 
1227 //
1228 // Return temporary buffer for block "iblock"
1229 //
1230 float2* CudaPmeTranspose::getBuffer(const int iblock) {
1231  if (iblock >= nblock)
1232  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1233 
1234  // Determine block size = how much we're copying
1235  int i0, i1, j0, j1, k0, k1;
1236  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1237  int ni = i1-i0+1;
1238  int nj = j1-j0+1;
1239  int nk = k1-k0+1;
1240 
1241  return d_buffer + i0*nj*nk;
1242 }
1243 #endif
1244 
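// Peer-to-peer copies to the device that owns the neighboring pencil. The block
// indices are rotated into the receiving pencil's permutation; without
// P2P_ENABLE_3D the data is first staged in the receiver's temporary buffer and
// later unpacked by copyDataDeviceToDevice().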
1245 void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
1246  float2* data_out) {
1247 
1248  int iblock_out = jblock;
1249  int jblock_out = kblock;
1250  int kblock_out = iblock;
1251 
1252  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1253 }
1254 
1255 void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
1256  float2* data_out) {
1257 
1258  int iblock_out = kblock;
1259  int jblock_out = iblock;
1260  int kblock_out = jblock;
1261 
1262  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1263 }
1264 
1265 void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
1266  const int iblock_out, const int jblock_out, const int kblock_out,
1267  int deviceID_out, int permutation_out, float2* data_out) {
1268 
1269  cudaCheck(cudaSetDevice(deviceID));
1270 
1271  // Determine block size = how much we're copying
1272  int i0, i1, j0, j1, k0, k1;
1273  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1274  int ni = i1-i0+1;
1275  int nj = j1-j0+1;
1276  int nk = k1-k0+1;
1277 
1278  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1279  int isize_out = i1-i0+1;
1280  int jsize_out = j1-j0+1;
1281 
1282  int x0 = pos[iblock];
1283  float2* data_in = d_data + jsize*ksize*x0;
1284 
1285 #ifndef P2P_ENABLE_3D
1286  // Copy into temporary peer device buffer
1287  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
1288 #else
1289  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
1290  0, 0, 0,
1291  ni, nj,
1292  0, 0, 0,
1293  isize_out, jsize_out,
1294  ni, nj, nk, stream);
1295 #endif
1296 
1297 }
1298 
1299 
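// CudaPmeOneDevice runs the whole PME reciprocal-space calculation on a single
// GPU: it owns the atom/force buffers, the grids, the FFT plans and the
// energy/virial buffers, and keeps a separate grid per alchemical PME grid when
// alchemy is enabled (num_used_grids).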
1300 CudaPmeOneDevice::CudaPmeOneDevice(
1301  PmeGrid pmeGrid_,
1302  int deviceID_,
1303  int deviceIndex_
1304  ) :
1305  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
1306  natoms(0), d_atoms(0), d_forces(0),
1307  d_grids(0), gridsize(0),
1308  d_trans(0), transize(0),
1309  d_bm1(0), d_bm2(0), d_bm3(0),
1310  kappa(ComputeNonbondedUtil::ewaldcof),
1311  self_energy_alch_first_time(true),
1312  force_scaling_alch_first_time(true),
1313  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
1314 {
1315 // fprintf(stderr, "CudaPmeOneDevice constructor START ******************************************\n");
1316  const SimParameters& sim_params = *(Node::Object()->simParameters);
1318  // Determine how many grids we need for the alchemical route
1319  if (sim_params.alchOn) {
1320  num_used_grids = sim_params.alchGetNumOfPMEGrids();
1321  } else {
1322  num_used_grids = 1;
1323  }
1324  cudaCheck(cudaSetDevice(deviceID));
1325 
1326  // Check whether the simulation and the device are compatible with the patch-level kernels.
1327  // The results are stored in the patchLevelPmeData field.
1328  checkPatchLevelSimParamCompatibility(pmeGrid.order, true /* periodic Y */, true /* periodic Z */);
1330 
1331  // create our own CUDA stream
1332 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1333  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1334  int leastPriority, greatestPriority;
1335  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1336  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1337 #else
1338  cudaCheck(cudaStreamCreate(&stream));
1339 #endif
1340 
1341  allocate_host<EnergyVirial>(&h_energyVirials, num_used_grids);
1342  allocate_device<EnergyVirial>(&d_energyVirials, num_used_grids);
1343  allocate_device<float>(&d_scaling_factors, num_used_grids);
1344  allocate_device<double>(&d_selfEnergy, 1);
1345  if (sim_params.alchFepOn) {
1346  allocate_device<double>(&d_selfEnergy_FEP, 1);
1347  } else {
1348  d_selfEnergy_FEP = NULL;
1349  }
1350  if (sim_params.alchThermIntOn) {
1351  allocate_device<double>(&d_selfEnergy_TI_1, 1);
1352  allocate_device<double>(&d_selfEnergy_TI_2, 1);
1353  } else {
1354  d_selfEnergy_TI_1 = NULL;
1355  d_selfEnergy_TI_2 = NULL;
1356  }
1357 
1358  // create device buffer space for atom positions and forces
1359  // to be accessed externally through PatchData
1360  allocate_device<float4>(&d_atoms, num_used_grids * natoms);
1361  allocate_device<float3>(&d_forces, num_used_grids * natoms);
1362  if (sim_params.alchOn) {
1363  allocate_device<int>(&d_partition, natoms);
1364  } else {
1365  d_partition = NULL;
1366  }
1367 #ifdef NODEGROUP_FORCE_REGISTER
1368  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
1369  devData.s_datoms = (CudaAtom *) (d_atoms);
1370  devData.f_slow = (CudaForce *) (d_forces);
1371  devData.f_slow_size = natoms;
1372  devData.s_datoms_partition = d_partition;
1373 #endif
1374  int k1 = pmeGrid.K1;
1375  int k2 = pmeGrid.K2;
1376  int k3 = pmeGrid.K3;
1377  int order = pmeGrid.order;
1378  gridsize = k1 * k2 * k3;
1379  transize = (k1/2 + 1) * k2 * k3;
1380 
1381 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1382 
1383  // set up cufft
1384  forwardPlans = new cufftHandle[num_used_grids];
1385  backwardPlans = new cufftHandle[num_used_grids];
1386  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1387  cufftCheck(cufftPlan3d(&(forwardPlans[iGrid]), k3, k2, k1, CUFFT_R2C));
1388  cufftCheck(cufftPlan3d(&(backwardPlans[iGrid]), k3, k2, k1, CUFFT_C2R));
1389  cufftCheck(cufftSetStream(forwardPlans[iGrid], stream));
1390  cufftCheck(cufftSetStream(backwardPlans[iGrid], stream));
1391  }
1392 #endif
1393 
1394 #ifdef NAMD_CUDA
1395  cudaDeviceProp deviceProp;
1396  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
1397  const int texture_alignment = int(deviceProp.textureAlignment);
1398  // d_grids and d_grids + N * gridsize will be used as device pointers for ::cudaResourceDesc::res::linear::devPtr
1399  // check if (d_grids + N * gridsize) is an address aligned to ::cudaDeviceProp::textureAlignment
1400  // which is required by cudaCreateTextureObject()
1401  // or maybe I should use cudaMallocPitch()?
1402  if ((gridsize % texture_alignment) != 0) {
1403  // if it is not aligned, padding is required
1404  gridsize = (int(gridsize / texture_alignment) + 1) * texture_alignment;
1405  }
1406  // Is it necessary to align transize too?
1407 // if ((transize % texture_alignment) != 0) {
1408 // // if it is not aligned, padding is required
1409 // transize = (int(transize / texture_alignment) + 1) * texture_alignment;
1410 // }
1411  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1412  allocate_device<float2>(&d_trans, num_used_grids * transize);
1413  gridTexObjArrays = new cudaTextureObject_t[num_used_grids];
1414  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1415  // set up texture object
1416  cudaResourceDesc resDesc;
1417  memset(&resDesc, 0, sizeof(resDesc));
1418  resDesc.resType = cudaResourceTypeLinear;
1419  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
1420  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
1421  resDesc.res.linear.desc.x = sizeof(float)*8;
1422  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
1423  cudaTextureDesc texDesc;
1424  memset(&texDesc, 0, sizeof(texDesc));
1425  texDesc.readMode = cudaReadModeElementType;
1426  cudaCheck(cudaCreateTextureObject(&(gridTexObjArrays[iGrid]), &resDesc, &texDesc, NULL));
1427  }
1428 #else
1429  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1430  allocate_device<float2>(&d_trans, num_used_grids * transize);
1431 #endif
1432  // calculate prefactors
1433  double *bm1 = new double[k1];
1434  double *bm2 = new double[k2];
1435  double *bm3 = new double[k3];
1436  // Use compute_b_moduli from PmeKSpace.C
1437  extern void compute_b_moduli(double *bm, int k, int order);
1438  compute_b_moduli(bm1, k1, order);
1439  compute_b_moduli(bm2, k2, order);
1440  compute_b_moduli(bm3, k3, order);
1441 
1442  // allocate space for and copy prefactors onto GPU
1443  float *bm1f = new float[k1];
1444  float *bm2f = new float[k2];
1445  float *bm3f = new float[k3];
1446  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
1447  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
1448  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
1449  allocate_device<float>(&d_bm1, k1);
1450  allocate_device<float>(&d_bm2, k2);
1451  allocate_device<float>(&d_bm3, k3);
1452  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
1453  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
1454  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
1455  delete [] bm1f;
1456  delete [] bm2f;
1457  delete [] bm3f;
1458  delete [] bm1;
1459  delete [] bm2;
1460  delete [] bm3;
1461 
1462  cudaCheck(cudaStreamSynchronize(stream));
1463 
1464 // fprintf(stderr, "CudaPmeOneDevice constructor END ********************************************\n");
1465 }
1466 
1467 CudaPmeOneDevice::~CudaPmeOneDevice() {
1468  deallocate_device<float4>(&d_atoms);
1469  deallocate_device<float3>(&d_forces);
1470  deallocate_device<float2>(&d_trans);
1471  deallocate_device<float>(&d_grids);
1472  deallocate_host<EnergyVirial>(&h_energyVirials);
1473  deallocate_device<EnergyVirial>(&d_energyVirials);
1474  deallocate_device<float>(&d_scaling_factors);
1475 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1476  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1477  cufftCheck(cufftDestroy(forwardPlans[iGrid]));
1478  cufftCheck(cufftDestroy(backwardPlans[iGrid]));
1479 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1480  cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
1481 #endif
1482  }
1483 
1484  if (patchLevelPmeData.h_patchGridOffsets != nullptr) {
1485  deallocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets);
1486  }
1487  if (patchLevelPmeData.d_patchGridOffsets != nullptr) {
1488  deallocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets);
1489  }
1490 
1491  delete[] forwardPlans;
1492  delete[] backwardPlans;
1493 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1494  delete[] gridTexObjArrays;
1495 #endif
1496 
1497 
1498 #endif
1499  deallocate_device<double>(&d_selfEnergy);
1500  if (d_partition != NULL) deallocate_device<int>(&d_partition);
1501  if (d_selfEnergy_FEP != NULL) deallocate_device<double>(&d_selfEnergy_FEP);
1502  if (d_selfEnergy_TI_1 != NULL) deallocate_device<double>(&d_selfEnergy_TI_1);
1503  if (d_selfEnergy_TI_2 != NULL) deallocate_device<double>(&d_selfEnergy_TI_2);
1504  deallocate_device<float>(&d_bm1);
1505  deallocate_device<float>(&d_bm2);
1506  deallocate_device<float>(&d_bm3);
1507  cudaCheck(cudaStreamDestroy(stream));
1508 }
1509 
1510 void CudaPmeOneDevice::compute(
1511  const Lattice &lattice,
1512 // const CudaAtom *d_atoms,
1513 // CudaForce *d_force,
1514 // int natoms,
1515  int doEnergyVirial,
1516  int step
1517 #if 0
1518  double d_energy[1],
1519  double d_virial[6]
1520  NodeReduction& reduction
1521 #endif
1522  ) {
1523 // fprintf(stderr, "CudaPmeOneDevice compute ****************************************************\n");
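  // One PME evaluation on this device: clear buffers, spread charges onto the
  // grid(s), forward FFT, k-space scalar sum, inverse FFT, gather forces, and
  // optionally copy energy/virial back to the host.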
1524  int k1 = pmeGrid.K1;
1525  int k2 = pmeGrid.K2;
1526  int k3 = pmeGrid.K3;
1527  int order = pmeGrid.order;
1528  double volume = lattice.volume();
1529  Vector a_r = lattice.a_r();
1530  Vector b_r = lattice.b_r();
1531  Vector c_r = lattice.c_r();
1532  float arx = a_r.x;
1533  float ary = a_r.y;
1534  float arz = a_r.z;
1535  float brx = b_r.x;
1536  float bry = b_r.y;
1537  float brz = b_r.z;
1538  float crx = c_r.x;
1539  float cry = c_r.y;
1540  float crz = c_r.z;
1541  m_step = step;
1542 
1543  //JM: actually necessary if you reserve a PME device!
1544  cudaCheck(cudaSetDevice(deviceID));
1545  const SimParameters& sim_params = *(Node::Object()->simParameters);
1546 
1547  // clear force array
1548  //fprintf(stderr, "Calling clear_device_array on d_force\n");
1549  clear_device_array<float3>(d_forces, num_used_grids * natoms, stream);
1550  // clear grid
1551  //fprintf(stderr, "Calling clear_device_array on d_grid\n");
1552  clear_device_array<float>(d_grids, num_used_grids * gridsize, stream);
1553  clear_device_array<float2>(d_trans, num_used_grids * transize, stream);
1554 
1555  // Clear energy and virial array if needed
1556  if (doEnergyVirial) {
1557  // clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
1558  clear_device_array<EnergyVirial>(d_energyVirials, num_used_grids * 1, stream);
1559  const bool updateSelfEnergy = (step == sim_params.firstTimestep) || (selfEnergy == 0);
1560  if (updateSelfEnergy && (sim_params.alchOn == false)) {
1561  clear_device_array<double>(d_selfEnergy, 1, stream);
1562  // calculate self energy term if not yet done
1564  kappa, stream);
1565  //fprintf(stderr, "selfEnergy = %12.8f\n", selfEnergy);
1566  }
1567  /* The self energy depends on the scaling factor (lambda).
1568  * Cases in which the self energy changes:
1569  * 1. If alchLambdaFreq > 0, lambda is scaled linearly and changes EVERY STEP.
1570  * 2. Otherwise, simulations may still enter another lambda window through TCL scripts.
1571  * In summary, the self energy does not change unless lambda changes,
1572  * so calcSelfEnergyAlch() compares the lambda of the current step with that of the last step.
1573  * Only if lambda has changed are calcSelfEnergyFEPKernel or calcSelfEnergyTIKernel executed again.
1574  */
1575  if (sim_params.alchOn) calcSelfEnergyAlch(m_step);
1576  }
1577 
1578 #if 0
1579 
1580  spread_charge(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1581  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1582  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1583  d_grid, order, stream);
1584 #else
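  // order3 is order^3 (the number of grid points influenced by one atom) rounded
  // up to a multiple of WARPSIZE, as used by the charge-spreading kernel.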
1585  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1586  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1588  d_atoms + iGrid * natoms, natoms, k1, k2, k3,
1589  float(k1), (float)k2, (float)k3, order3,
1590  k1, k2, k3,
1591  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1592  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1593  d_grids + iGrid * gridsize, order, stream);
1594  }
1595 
1596 #endif
1597  //cudaCheck(cudaStreamSynchronize(stream));
1598 
1599  // forward FFT
1600  //fprintf(stderr, "Calling cufftExecR2C\n");
1601  //cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)d_grid,
1602  // (cufftComplex *)d_tran));
1603 
1604  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1605  cufftCheck(cufftExecR2C(forwardPlans[iGrid],
1606  (cufftReal *)(d_grids + iGrid * gridsize),
1607  (cufftComplex *)(d_trans + iGrid * transize)));
1608  }
1609 
1610  //cudaCheck(cudaStreamSynchronize(stream));
1611 
1612  // reciprocal space calculation
1613  //fprintf(stderr, "Calling scalar_sum\n");
1614  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1615  scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1616  kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1617  d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1618  (bool) doEnergyVirial, &(d_energyVirials[iGrid].energy),
1619  d_energyVirials[iGrid].virial, d_trans + iGrid * transize, stream);
1620  }
1621  //scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1622  // kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1623  // d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1624  // (bool) doEnergyVirial, &(d_energyVirial->energy),
1625  // d_energyVirial->virial, d_tran, stream);
1626  //cudaCheck(cudaStreamSynchronize(stream));
1627 
1628  // backward FFT
1629  //fprintf(stderr, "Calling cufftExecC2R\n");
1630  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1631  cufftCheck(cufftExecC2R(backwardPlans[iGrid],
1632  (cufftComplex *)(d_trans + iGrid * transize),
1633  (cufftReal *)(d_grids + iGrid * gridsize)));
1634  }
1635 
1636  //cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)d_tran,
1637  // (cufftReal *)d_grid));
1638  //cudaCheck(cudaStreamSynchronize(stream));
1639 
1640  // gather force from grid to atoms
1641  // omitting the cudaTextureObject_t argument below works for __CUDA_ARCH__ >= 350
1642  //fprintf(stderr, "Calling gather_force\n");
1643  for (unsigned int iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1644  gather_force(patchLevelPmeData,
1645  &(d_atoms[iGrid * natoms]), natoms, k1, k2, k3, k1, k2, k3,
1646  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1647  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1648  d_grids + iGrid * gridsize, order, d_forces + iGrid * natoms,
1649 #ifdef NAMD_CUDA
1650  gridTexObjArrays[iGrid] /* cudaTextureObject_t */,
1651 #endif
1652  stream);
1653  }
1654 
1655  //gather_force(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1656  // k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1657  // true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1658  // d_grid, order, d_force, gridTexObj /* cudaTextureObject_t */,
1659  // stream);
1660  //cudaCheck(cudaStreamSynchronize(stream));
1661 
1662  // Copy energy and virial to host if needed
1663  if (doEnergyVirial) {
1664  //fprintf(stderr, "Calling copy_DtoH on d_energyVirial\n");
1665  copy_DtoH<EnergyVirial>(d_energyVirials, h_energyVirials,
1666  num_used_grids, stream);
1667  //cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
1668  //cudaCheck(cudaStreamSynchronize(stream));
1669  }
1670 
1671  // XXX debugging, quick test for borked forces
1672  //clear_device_array<float3>(d_force, natoms, stream);
1673  if (sim_params.alchOn) {
1674  scaleAndMergeForce(m_step);
1675  }
1676 }
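
The compute() entry point above runs one PME evaluation per grid: clear the buffers, spread charges onto the real-space grid, transform forward with cuFFT (R2C), apply the reciprocal-space scalar sum, transform backward (C2R), and gather forces back onto the atoms. The standalone sketch below (not part of CudaPmeSolverUtil.C) illustrates just the forward/backward cuFFT round trip on a stream; buffer names and grid sizes are illustrative, and the reciprocal-space scaling is only indicated by a comment. Note that cuFFT's C2R transform is unnormalized, so the recovered real grid carries a factor of K1*K2*K3.

    // Standalone sketch: one R2C forward and one C2R backward 3-D FFT on a CUDA stream.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cufft.h>

    int main() {
      const int K1 = 64, K2 = 64, K3 = 64;
      const size_t nReal    = (size_t)K1 * K2 * K3;
      const size_t nComplex = (size_t)(K1 / 2 + 1) * K2 * K3;  // R2C half-spectrum size

      float *d_grid = nullptr;          // real-space charge grid
      cufftComplex *d_tran = nullptr;   // reciprocal-space data
      cudaMalloc((void **)&d_grid, nReal * sizeof(float));
      cudaMalloc((void **)&d_tran, nComplex * sizeof(cufftComplex));
      cudaMemset(d_grid, 0, nReal * sizeof(float));

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // cufftPlan3d expects the slowest-varying dimension first.
      cufftHandle fwd, bwd;
      cufftPlan3d(&fwd, K3, K2, K1, CUFFT_R2C);
      cufftPlan3d(&bwd, K3, K2, K1, CUFFT_C2R);
      cufftSetStream(fwd, stream);
      cufftSetStream(bwd, stream);

      cufftExecR2C(fwd, d_grid, d_tran);  // forward: real grid -> complex half-spectrum
      // ... reciprocal-space scaling (the role of scalar_sum) would modify d_tran here ...
      cufftExecC2R(bwd, d_tran, d_grid);  // backward: unnormalized, scaled by K1*K2*K3

      cudaStreamSynchronize(stream);
      printf("FFT round trip complete\n");

      cufftDestroy(fwd);
      cufftDestroy(bwd);
      cudaFree(d_grid);
      cudaFree(d_tran);
      cudaStreamDestroy(stream);
      return 0;
    }
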
1677 
1678 
1679 // call this after device-host memory transfer has completed
1680 void CudaPmeOneDevice::finishReduction(
1681  bool doEnergyVirial
1682  ) {
1683  cudaCheck(cudaStreamSynchronize(stream));
1684  if(doEnergyVirial){
1685  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1686  PatchData* patchData = cpdata.ckLocalBranch();
1687  NodeReduction *reduction = patchData->reduction;
1688  cudaCheck(cudaSetDevice(deviceID));
1689  double virial[9];
1690  double energy, energy_F, energy_TI_1, energy_TI_2;
1691  const SimParameters& sim_params = *(Node::Object()->simParameters);
1692  if (sim_params.alchOn) {
1693  if (sim_params.alchFepOn) {
1694  scaleAndComputeFEPEnergyVirials(h_energyVirials, m_step, energy, energy_F, virial);
1695  energy += selfEnergy;
1696  energy_F += selfEnergy_FEP;
1697  }
1698  if (sim_params.alchThermIntOn) {
1699  scaleAndComputeTIEnergyVirials(h_energyVirials, m_step, energy, energy_TI_1, energy_TI_2, virial);
1700  energy += selfEnergy;
1701  energy_TI_1 += selfEnergy_TI_1;
1702  energy_TI_2 += selfEnergy_TI_2;
1703  }
1704  } else {
1705  virial[0] = h_energyVirials[0].virial[0];
1706  virial[1] = h_energyVirials[0].virial[1];
1707  virial[2] = h_energyVirials[0].virial[2];
1708  virial[3] = h_energyVirials[0].virial[1];
1709  virial[4] = h_energyVirials[0].virial[3];
1710  virial[5] = h_energyVirials[0].virial[4];
1711  virial[6] = h_energyVirials[0].virial[2];
1712  virial[7] = h_energyVirials[0].virial[4];
1713  virial[8] = h_energyVirials[0].virial[5];
1714  energy = h_energyVirials[0].energy + selfEnergy;
1715  }
1716  #if 0
1717  fprintf(stderr, "PME ENERGY = %g %g\n", h_energyVirials[0].energy, selfEnergy );
1718  fprintf(stderr, "PME VIRIAL =\n"
1719  " %g %g %g\n %g %g %g\n %g %g %g\n",
1720  virial[0], virial[1], virial[2], virial[3], virial[4],
1721  virial[5], virial[6], virial[7], virial[8]);
1722  #endif
1723  (*reduction)[REDUCTION_VIRIAL_SLOW_XX] += virial[0];
1724  (*reduction)[REDUCTION_VIRIAL_SLOW_XY] += virial[1];
1725  (*reduction)[REDUCTION_VIRIAL_SLOW_XZ] += virial[2];
1726  (*reduction)[REDUCTION_VIRIAL_SLOW_YX] += virial[3];
1727  (*reduction)[REDUCTION_VIRIAL_SLOW_YY] += virial[4];
1728  (*reduction)[REDUCTION_VIRIAL_SLOW_YZ] += virial[5];
1729  (*reduction)[REDUCTION_VIRIAL_SLOW_ZX] += virial[6];
1730  (*reduction)[REDUCTION_VIRIAL_SLOW_ZY] += virial[7];
1731  (*reduction)[REDUCTION_VIRIAL_SLOW_ZZ] += virial[8];
1732  (*reduction)[REDUCTION_ELECT_ENERGY_SLOW] += energy;
1733  if (sim_params.alchFepOn) {
1734  (*reduction)[REDUCTION_ELECT_ENERGY_SLOW_F] += energy_F;
1735  }
1736  if (sim_params.alchThermIntOn) {
1737  (*reduction)[REDUCTION_ELECT_ENERGY_SLOW_TI_1] += energy_TI_1;
1738  (*reduction)[REDUCTION_ELECT_ENERGY_SLOW_TI_2] += energy_TI_2;
1739  }
1740  }
1741 }
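
finishReduction() above expands the six independent virial components produced on the device (xx, xy, xz, yy, yz, zz) into the full symmetric 3x3 tensor before accumulating into the node reduction. A minimal helper that makes that index mapping explicit is sketched below; the function name is hypothetical and not part of this file.

    // Expands a packed symmetric virial (xx, xy, xz, yy, yz, zz) into a
    // row-major 3x3 tensor, mirroring the index pattern used in finishReduction().
    #include <cstdio>

    static void expandVirial(const double packed[6], double full[9]) {
      full[0] = packed[0];  // xx
      full[1] = packed[1];  // xy
      full[2] = packed[2];  // xz
      full[3] = packed[1];  // yx = xy
      full[4] = packed[3];  // yy
      full[5] = packed[4];  // yz
      full[6] = packed[2];  // zx = xz
      full[7] = packed[4];  // zy = yz
      full[8] = packed[5];  // zz
    }

    int main() {
      const double packed[6] = {1.0, 0.1, 0.2, 2.0, 0.3, 3.0};
      double full[9];
      expandVirial(packed, full);
      for (int r = 0; r < 3; r++)
        printf("%g %g %g\n", full[3*r], full[3*r+1], full[3*r+2]);
      return 0;
    }
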
1742 
1743 void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
1744  SimParameters& sim_params = *(Node::Object()->simParameters);
1745  if (sim_params.alchFepOn) {
1746  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1747  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1748  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1749  static BigReal lambda2Up = sim_params.getElecLambda(alchLambda2);
1750  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1751  static BigReal lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1752  // compute self energy at the first call
1753  // only compute self energy if factors are changed
1754  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1755  (lambda2Up != sim_params.getElecLambda(alchLambda2)) ||
1756  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1757  (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
1758  (step == sim_params.firstTimestep)) {
1759  lambda1Up = sim_params.getElecLambda(alchLambda1);
1760  lambda2Up = sim_params.getElecLambda(alchLambda2);
1761  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1762  lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1763  selfEnergy = 0.0; // self energy for λ_1
1764  selfEnergy_FEP = 0.0; // self energy for λ_2
1765  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1766  cudaCheck(cudaMemsetAsync(d_selfEnergy_FEP, 0, sizeof(double), stream));
1767  calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP, d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);
1769  }
1770  }
1771  if (sim_params.alchThermIntOn) {
1772  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1773  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1774  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1775  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1776  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1777  (step == sim_params.firstTimestep)) {
1778  lambda1Up = sim_params.getElecLambda(alchLambda1);
1779  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1780  selfEnergy = 0.0;
1781  selfEnergy_TI_1 = 0.0;
1782  selfEnergy_TI_2 = 0.0;
1783  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1784  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_1, 0, sizeof(double), stream));
1785  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_2, 0, sizeof(double), stream));
1786  calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2,
1787  selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2, d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda1Down, stream);
1788  }
1789  }
1790 }
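
The self-energy handled by calcSelfEnergyAlch() is the usual Ewald correction for each charge interacting with its own screening Gaussian, E_self = -(kappa / sqrt(pi)) * sum_i q_i^2 in units where the Coulomb constant is 1; in the alchemical case each partition's contribution is additionally weighted by its electrostatic lambda factor, which is why the cached factors above trigger a recomputation only when they change. A host-side sketch of the unscaled sum follows; unit conversion and the per-partition lambda weighting performed by the GPU kernels are omitted.

    // Host-side sketch of the Ewald self-energy sum, assuming charges in units
    // where the Coulomb constant is 1. The GPU path (calcSelfEnergyFEPWrapper /
    // calcSelfEnergyTIWrapper) additionally applies per-partition lambda weights.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static double ewaldSelfEnergy(const std::vector<double> &q, double kappa) {
      const double pi = 3.14159265358979323846;
      double sumQ2 = 0.0;
      for (double qi : q) sumQ2 += qi * qi;   // sum of squared charges
      return -kappa / std::sqrt(pi) * sumQ2;  // constant term, independent of positions
    }

    int main() {
      std::vector<double> q = {0.417, -0.834, 0.417};  // a water-like charge group
      printf("self energy = %g (charge^2 / length units)\n", ewaldSelfEnergy(q, 0.32));
      return 0;
    }
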
1791 
1792 void CudaPmeOneDevice::scaleAndMergeForce(int step) {
1793  SimParameters& sim_params = *(Node::Object()->simParameters);
1794  const double alchLambda1 = sim_params.getCurrentLambda(step);
1795  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1796  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1797  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1798  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1799  (step == sim_params.firstTimestep)) {
1800  std::vector<float> scale_factors(num_used_grids);
1801  lambda1Up = sim_params.getElecLambda(alchLambda1);
1802  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1803  scale_factors[0] = lambda1Up;
1804  scale_factors[1] = lambda1Down;
1805  if (sim_params.alchDecouple) {
1806  scale_factors[2] = 1.0 - lambda1Up;
1807  scale_factors[3] = 1.0 - lambda1Down;
1808  }
1809  if (bool(sim_params.alchElecLambdaStart) || sim_params.alchThermIntOn) {
1810  scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
1811  }
1812  copy_HtoD<float>(scale_factors.data(), d_scaling_factors, num_used_grids);
1814  }
1815  scaleAndMergeForceWrapper(d_forces, d_scaling_factors, num_used_grids, natoms, stream);
1816 }
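
scaleAndMergeForce() above rebuilds the per-grid scale factors only when the electrostatic lambda factors change, then hands the per-grid force arrays to scaleAndMergeForceWrapper(). The sketch below restates the factor layout implied by the code: grid 0 and 1 carry the "up" and "down" lambda factors, grids 2 and 3 carry their complements when alchDecouple is on, and the last grid carries the correction weight -(lambdaUp + lambdaDown - 1). The helper name and the hasCorrectionGrid flag are illustrative, not part of this file.

    // Sketch of the per-grid scale-factor layout used when merging alchemical PME forces.
    #include <cstdio>
    #include <vector>

    static std::vector<float> makeScaleFactors(double lambdaUp, double lambdaDown,
                                               bool alchDecouple, bool hasCorrectionGrid) {
      const size_t n = 2 + (alchDecouple ? 2 : 0) + (hasCorrectionGrid ? 1 : 0);
      std::vector<float> s(n, 0.0f);
      s[0] = (float)lambdaUp;                       // grid 0: "up" partition
      s[1] = (float)lambdaDown;                     // grid 1: "down" partition
      if (alchDecouple) {
        s[2] = (float)(1.0 - lambdaUp);             // grid 2: decoupled partition 1
        s[3] = (float)(1.0 - lambdaDown);           // grid 3: decoupled partition 2
      }
      if (hasCorrectionGrid)
        s[n - 1] = (float)(-(lambdaUp + lambdaDown - 1.0));  // last grid: correction term
      return s;
    }

    int main() {
      for (float f : makeScaleFactors(0.7, 0.3, true, true)) printf("%g ", f);
      printf("\n");
      return 0;
    }

The wrapper presumably multiplies each per-grid force array by its factor and accumulates the result into a single force array; since the forces change every step, that call sits outside the lambda-change check, while the factors themselves are cached behind static locals.
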
1817 
1818 void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_F, double (&virial)[9]) {
1819  double scale1 = 1.0;
1820  double scale2 = 1.0;
1821  energy = 0;
1822  energy_F = 0;
1823  for (unsigned int i = 0; i < 9; ++i) {
1824  virial[i] = 0;
1825  }
1826  SimParameters& sim_params = *(Node::Object()->simParameters);
1827  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1828  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1829  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1830  const BigReal elecLambda2Up = sim_params.getElecLambda(alchLambda2);
1831  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1832  const BigReal elecLambda2Down = sim_params.getElecLambda(1 - alchLambda2);
1833  energy += energyVirials[0].energy * elecLambdaUp;
1834  energy_F += energyVirials[0].energy * elecLambda2Up;
1835  energy += energyVirials[1].energy * elecLambdaDown;
1836  energy_F += energyVirials[1].energy * elecLambda2Down;
1837  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1838  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1839  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1840  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1841  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1842  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1843  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1844  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1845  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1846  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1847  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1848  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1849  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1850  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1851  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1852  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1853  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1854  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1855  if (sim_params.alchDecouple) {
1856  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1857  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
1858  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1859  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
1860  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1861  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1862  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1863  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1864  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1865  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1866  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1867  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1868  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1869  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1870  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1871  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1872  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1873  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1874  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1875  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1876  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1877  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1878  if (sim_params.alchElecLambdaStart > 0) {
1879  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1880  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1881  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1882  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1883  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1884  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1885  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1886  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1887  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1888  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1889  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1890  }
1891  } else {
1892  if (sim_params.alchElecLambdaStart > 0) {
1893  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1894  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1895  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1896  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1897  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1898  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1899  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1900  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1901  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1902  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1903  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1904  }
1905  }
1906 }
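
scaleAndComputeFEPEnergyVirials() above weights each grid's energy and packed virial with a grid-specific factor (elecLambdaUp, elecLambdaDown, their complements for the decoupled grids, and the negative correction weight). The repetitive statements can be read as one weighted accumulation per grid; the sketch below expresses that accumulation as a reusable helper. This is an equivalent reformulation, not the form used in the source.

    // Accumulate energy, FEP energy, and the expanded virial for one grid with
    // the given weights, mirroring the per-grid blocks above.
    #include <cstdio>

    struct EV { double energy; double virial[6]; };  // packed xx, xy, xz, yy, yz, zz

    static void accumulate(const EV &g, double w, double w2,
                           double &e, double &eF, double (&v)[9]) {
      e  += g.energy * w;    // energy at lambda
      eF += g.energy * w2;   // energy at lambda2 (FEP target window)
      const int map[9] = {0, 1, 2, 1, 3, 4, 2, 4, 5};  // symmetric 6 -> 9 expansion
      for (int i = 0; i < 9; i++) v[i] += g.virial[map[i]] * w;
    }

    int main() {
      EV grids[2] = {{10.0, {1, 0, 0, 1, 0, 1}}, {20.0, {2, 0, 0, 2, 0, 2}}};
      const double lUp = 0.7, lDown = 0.3, l2Up = 0.8, l2Down = 0.2;
      double e = 0, eF = 0, v[9] = {0};
      accumulate(grids[0], lUp,   l2Up,   e, eF, v);   // "up" grid
      accumulate(grids[1], lDown, l2Down, e, eF, v);   // "down" grid
      printf("E = %g  E_F = %g  v_xx = %g\n", e, eF, v[0]);
      return 0;
    }

Writing it as data plus a loop keeps the 6-to-9 index mapping in one place, at the cost of hiding the per-grid weights that the explicit form in the source makes easy to audit.
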
1907 
1908 void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
1909  double scale1 = 1.0;
1910  energy = 0;
1911  energy_TI_1 = 0;
1912  energy_TI_2 = 0;
1913  for (unsigned int i = 0; i < 9; ++i) {
1914  virial[i] = 0;
1915  }
1916  SimParameters& sim_params = *(Node::Object()->simParameters);
1917  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1918  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1919  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1920  energy += energyVirials[0].energy * elecLambdaUp;
1921  energy += energyVirials[1].energy * elecLambdaDown;
1922  energy_TI_1 += energyVirials[0].energy;
1923  energy_TI_2 += energyVirials[1].energy;
1924  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1925  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1926  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1927  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1928  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1929  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1930  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1931  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1932  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1933  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1934  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1935  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1936  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1937  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1938  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1939  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1940  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1941  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1942  if (sim_params.alchDecouple) {
1943  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1944  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1945  energy_TI_1 += -1.0 * energyVirials[2].energy;
1946  energy_TI_2 += -1.0 * energyVirials[3].energy;
1947  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1948  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1949  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1950  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1951  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1952  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1953  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1954  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1955  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1956  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1957  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1958  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1959  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1960  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1961  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1962  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1963  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1964  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1965  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1966  energy_TI_1 += -1.0 * energyVirials[4].energy;
1967  energy_TI_2 += -1.0 * energyVirials[4].energy;
1968  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1969  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1970  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1971  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1972  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1973  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1974  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1975  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1976  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1977  } else {
1978  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1979  energy_TI_1 += -1.0 * energyVirials[2].energy;
1980  energy_TI_2 += -1.0 * energyVirials[2].energy;
1981  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1982  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1983  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1984  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1985  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1986  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1987  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1988  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1989  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1990  }
1991 }
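
For thermodynamic integration the per-grid energies are used twice: weighted by the lambda factors to build the potential energy, and with weights of +1 or -1 to accumulate the dU/dlambda terms energy_TI_1 and energy_TI_2 (decoupled and correction grids enter with a negative sign). The sketch below restates those TI weights for the alchDecouple case with illustrative energies.

    // TI bookkeeping sketch (alchDecouple case), restating scaleAndComputeTIEnergyVirials():
    //   grid 0 (up)          -> energy_TI_1 weight +1
    //   grid 1 (down)        -> energy_TI_2 weight +1
    //   grid 2 (decoupled 1) -> energy_TI_1 weight -1
    //   grid 3 (decoupled 2) -> energy_TI_2 weight -1
    //   grid 4 (correction)  -> energy_TI_1 and energy_TI_2 weight -1
    #include <cstdio>

    int main() {
      const double E[5]  = {10.0, 20.0, 1.0, 2.0, 0.5};  // per-grid energies (illustrative)
      const double w1[5] = {+1, 0, -1, 0, -1};            // weights for energy_TI_1
      const double w2[5] = {0, +1, 0, -1, -1};            // weights for energy_TI_2
      double dUdl1 = 0, dUdl2 = 0;
      for (int i = 0; i < 5; i++) { dUdl1 += w1[i] * E[i]; dUdl2 += w2[i] * E[i]; }
      printf("energy_TI_1 = %g  energy_TI_2 = %g\n", dUdl1, dUdl2);
      return 0;
    }
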
1992 
1993 
1994 
1995 int CudaPmeOneDevice::getShiftedGrid(const double x, const int grid) {
1996  double w = x + 0.5;
1997  double gw = w * grid;
1998  return floor(gw);
1999 }
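
getShiftedGrid() maps a scaled (fractional) coordinate in [-0.5, 0.5) to a grid index by shifting it into [0, 1) and multiplying by the grid dimension: index = floor((x + 0.5) * K). With K = 64, a coordinate of -0.5 maps to grid line 0 and 0.0 maps to line 32. The snippet below simply exercises that formula on the host.

    // Same formula as CudaPmeOneDevice::getShiftedGrid(): fractional coordinate
    // in [-0.5, 0.5) -> grid index in [0, K).
    #include <cmath>
    #include <cstdio>

    static int shiftedGrid(double x, int K) { return (int)std::floor((x + 0.5) * K); }

    int main() {
      const int K = 64;
      const double samples[] = {-0.5, -0.25, 0.0, 0.49};
      for (double x : samples)
        printf("x = %5.2f -> grid %d\n", x, shiftedGrid(x, K));
      return 0;
    }
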
2000 
2001 int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
2002  const int numThreads, const int3 patchGridDim, const int order) {
2003 
2004  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2005  const int thetaBytes = PatchLevelPmeData::kDim * (numThreads + PatchLevelPmeData::kThetaPad) *
2006  order * sizeof(float);
2007  const int indexBytes = numThreads * sizeof(char4);
2008 
2009  return gridBytes + thetaBytes + indexBytes;
2010 }
2011 
2012 int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
2013  const int numThreads, const int3 patchGridDim, const int order) {
2014 
2015  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2016  const int thetaBytes = (numThreads + PatchLevelPmeData::kThetaPad) * order *
2017  2 /* theta and dtheta */ * sizeof(float);
2018 
2019  return gridBytes + thetaBytes;
2020 }
2021 
2022 void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ) {
2023  bool use = true;
2024  use = use && (order == 8);
2025  use = use && (periodicY);
2026  use = use && (periodicZ);
2027 
2028  use = use && (deviceCUDA->getNumDevice() == 1); // This is only supported for single GPU currently
2029 
2031 }
2032 
2033 void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility() {
2034  cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);
2035 
2036  const int3 constexprPatchGridDim = make_int3(
2040 
2043  constexprPatchGridDim, 8 /* order */);
2046  constexprPatchGridDim, 8 /* order */);
2047 
2051 }
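
checkPatchLevelDeviceCompatibility() above compares the shared-memory footprint of the patch-level spread/gather kernels against the device's opt-in per-block limit. The standalone sketch below performs the same kind of check: query cudaDevAttrMaxSharedMemoryPerBlockOptin and compare it with a requirement built from the byte terms used in computeSharedMemoryPatchLevelSpreadCharge() (grid block + theta storage + per-atom indices). The numeric constants stand in for PatchLevelPmeData's compile-time values and are assumptions.

    // Standalone sketch: compare a computed shared-memory requirement against the
    // device's opt-in per-block shared memory limit.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      const int device = 0;
      int maxOptInShared = 0;
      cudaDeviceGetAttribute(&maxOptInShared,
                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

      // Illustrative sizing: an 11^3 patch sub-grid, 128 threads, order-8 splines,
      // 3 spatial dimensions, and a small padding term (constants are assumptions).
      const int numThreads = 128, order = 8, thetaPad = 1, kDim = 3;
      const int3 patchGridDim = {11, 11, 11};
      const int gridBytes  = patchGridDim.x * patchGridDim.y * patchGridDim.z * (int)sizeof(float);
      const int thetaBytes = kDim * (numThreads + thetaPad) * order * (int)sizeof(float);
      const int indexBytes = numThreads * (int)sizeof(char4);
      const int required   = gridBytes + thetaBytes + indexBytes;

      printf("required %d bytes, device opt-in limit %d bytes -> %s\n",
             required, maxOptInShared,
             required <= maxOptInShared ? "compatible" : "fall back to global-memory path");
      return 0;
    }
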
2052 
2054  const int numPatches, const CudaLocalRecord* localRecords,
2055  double3* patchMin, double3* patchMax, double3* awayDists) {
2056 
2057  patchLevelPmeData.localRecords = localRecords;
2058 
2059  // If the simulation isn't compatible or the device isn't compatible then no point in checking
2060  // patch sizes
2062 
2063  patchLevelPmeData.numPatches = numPatches;
2064 
2065  if (patchLevelPmeData.h_patchGridOffsets == nullptr) {
2066  allocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets, numPatches);
2067  }
2068  if (patchLevelPmeData.d_patchGridOffsets == nullptr) {
2069  allocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets, numPatches);
2070  }
2071 
2072  SimParameters *simParams = Node::Object()->simParameters;
2073  const int order = pmeGrid.order;
2074 
2075  // We only need to recompute the grid offsets if the lattice has changed
2076  if (!lattice.isEqual(currentLattice)) {
2077  currentLattice = lattice;
2078 
2079  double sysdima = currentLattice.a_r().unit() * currentLattice.a();
2080  double sysdimb = currentLattice.b_r().unit() * currentLattice.b();
2081  double sysdimc = currentLattice.c_r().unit() * currentLattice.c();
2082 
2083  patchLevelPmeData.patchGridDim = make_int3(0,0,0);
2084 
2085  for (int i = 0; i < numPatches; i++) {
2086  double3 pmin = currentLattice.unscale(patchMin[i]);
2087  double3 pmax = currentLattice.unscale(patchMax[i]);
2088  double3 width = pmax - pmin;
2089 
2090  // Logic copied from margin violation check
2091  double3 marginVal;
2092  marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
2093  marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
2094  marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
2095  marginVal = currentLattice.unscale(marginVal);
2096 
2097  double3 minAtom = pmin - marginVal;
2098  double3 maxAtom = pmax + marginVal;
2099 
2100  double3 minScaled = currentLattice.scale(minAtom);
2101  double3 maxScaled = currentLattice.scale(maxAtom);
2102 
2103  int3 gridMin;
2104  gridMin.x = getShiftedGrid(minScaled.x, pmeGrid.K1);
2105  gridMin.y = getShiftedGrid(minScaled.y, pmeGrid.K2);
2106  gridMin.z = getShiftedGrid(minScaled.z, pmeGrid.K3);
2107 
2108  int3 gridMax;
2109  gridMax.x = getShiftedGrid(maxScaled.x, pmeGrid.K1);
2110  gridMax.y = getShiftedGrid(maxScaled.y, pmeGrid.K2);
2111  gridMax.z = getShiftedGrid(maxScaled.z, pmeGrid.K3);
2112 
2113  int3 gridWidth;
2114  gridWidth.x = gridMax.x - gridMin.x + order;
2115  gridWidth.y = gridMax.y - gridMin.y + order;
2116  gridWidth.z = gridMax.z - gridMin.z + order;
2117 
2118  patchLevelPmeData.h_patchGridOffsets[i] = gridMin;
2119  patchLevelPmeData.patchGridDim.x = std::max(patchLevelPmeData.patchGridDim.x, gridWidth.x);
2120  patchLevelPmeData.patchGridDim.y = std::max(patchLevelPmeData.patchGridDim.y, gridWidth.y);
2121  patchLevelPmeData.patchGridDim.z = std::max(patchLevelPmeData.patchGridDim.z, gridWidth.z);
2122  }
2123  copy_HtoD<int3>(patchLevelPmeData.h_patchGridOffsets, patchLevelPmeData.d_patchGridOffsets,
2124  numPatches, nullptr);
2125  cudaStreamSynchronize(nullptr);
2126  const int maxGridPoints = patchLevelPmeData.patchGridDim.x *
2127  patchLevelPmeData.patchGridDim.y * patchLevelPmeData.patchGridDim.z;
2128 
2133  }
2134 }
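
The lattice-compatibility pass above turns each patch's bounds plus its margin into a span of PME grid lines: unscale the corners, pad by the margin, rescale, map both corners through getShiftedGrid(), and add order extra lines for the B-spline support; the largest width over all patches becomes patchGridDim. A reduced one-axis, host-only sketch of that extent calculation follows (margins and the Lattice transforms are omitted, and the patch bounds are illustrative).

    // One-axis sketch of the per-patch grid extent: the number of grid lines a patch
    // can touch is (maxIndex - minIndex + order), where order is the B-spline order.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    static int shiftedGrid(double x, int K) { return (int)std::floor((x + 0.5) * K); }

    int main() {
      const int K = 96, order = 8;
      struct { double lo, hi; } patches[] = {{-0.50, -0.38}, {-0.10, 0.05}, {0.30, 0.46}};
      int maxWidth = 0;
      for (auto &p : patches) {
        const int gMin = shiftedGrid(p.lo, K);
        const int gMax = shiftedGrid(p.hi, K);
        const int width = gMax - gMin + order;   // grid lines spanned incl. spline support
        maxWidth = std::max(maxWidth, width);
        printf("patch [%.2f, %.2f] -> grid [%d, %d], width %d\n", p.lo, p.hi, gMin, gMax, width);
      }
      printf("patchGridDim (this axis) = %d\n", maxWidth);
      return 0;
    }
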
2135 
2136 
2137 #endif // NAMD_CUDA
2138 
static Node * Object()
Definition: Node.h:86
void spread_charge_v2(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const float nfftx_f, const float nffty_f, const float nfftz_f, const int order3, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
void finishReduction(bool doEnergyVirial)
int zBlocks
Definition: PmeBase.h:25
bool isEqual(const Lattice &other) const
Definition: Lattice.h:298
CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_)
#define NAMD_EVENT_STOP(eon, id)
cufftHandle * backwardPlans
void compute_b_moduli(double *bm, int K, int order)
Definition: PmeKSpace.C:42
NAMD_HOST_DEVICE Vector c() const
Definition: Lattice.h:270
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
const int permutation
void gatherForceDone(unsigned int iGrid)
static constexpr int kNumThreads
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
BigReal alchElecLambdaStart
Definition: Vector.h:72
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
EnergyVirial * d_energyVirials
NAMD_HOST_DEVICE Position unscale(ScaledPosition s) const
Definition: Lattice.h:77
BigReal z
Definition: Vector.h:74
int getNumDevice()
Definition: DeviceCUDA.h:125
#define cufftCheck(stmt)
void checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice, const int numPatches, const CudaLocalRecord *localRecords, double3 *patchMin, double3 *patchMax, double3 *awayDists)
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:32
void energyAndVirialDone(unsigned int iGrid)
void checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ)
NodeReduction * reduction
Definition: PatchData.h:133
void spreadCharge(Lattice &lattice)
BigReal getElecLambda(const BigReal) const
std::vector< int > pos
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
#define WARPSIZE
Definition: CudaUtils.h:17
PatchLevelPmeData patchLevelPmeData
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
PmeGrid pmeGrid
bool dataDstAllocated
void calcSelfEnergyTIWrapper(double *d_selfEnergy, double *d_selfEnergy_TI_1, double *d_selfEnergy_TI_2, double &h_selfEnergy, double &h_selfEnergy_TI_1, double &h_selfEnergy_TI_2, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream)
CudaFFTCompute(int deviceID, cudaStream_t stream)
int yBlocks
Definition: PmeBase.h:25
int computeSharedMemoryPatchLevelSpreadCharge(const int numThreads, const int3 patchGridDim, const int order)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
#define order
Definition: PmeRealSpace.C:235
#define NAMD_EVENT_START(eon, id)
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
const int jblock
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
NAMD_HOST_DEVICE ScaledPosition scale(Position p) const
Definition: Lattice.h:83
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
BigReal x
Definition: Vector.h:74
void gather_force(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
NAMD_HOST_DEVICE BigReal volume(void) const
Definition: Lattice.h:293
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void calcSelfEnergyFEPWrapper(double *d_selfEnergy, double *d_selfEnergy_FEP, double &h_selfEnergy, double &h_selfEnergyFEP, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
int numAtoms
Definition: Molecule.h:585
void getVirial(double *virial)
cudaTextureObject_t * gridTexObjArrays
static constexpr int kPatchGridDimPad
NAMD_HOST_DEVICE Vector c_r() const
Definition: Lattice.h:286
const int kblock
void energyAndVirialDone(unsigned int iGrid)
Definition: CudaPmeSolver.C:61
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:53
bool dataSrcAllocated
double compute_selfEnergy(double *d_selfEnergy, const float4 *d_atoms, int natoms, double ewaldcof, cudaStream_t stream)
#define simParams
Definition: Output.C:129
int K3
Definition: PmeBase.h:21
const int permutation
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
void spread_charge(const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
BigReal getCurrentLambda2(const int) const
BigReal getCurrentLambda(const int) const
void gatherForce(Lattice &lattice, CudaForce *force)
BigReal y
Definition: Vector.h:74
unsigned int multipleGridIndex
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void scaleAndMergeForceWrapper(float3 *forces, const float *factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream)
void setDataPtrsYZX(std::vector< float2 *> &dataPtrsNew, float2 *data)
const CudaLocalRecord * localRecords
static constexpr int kDim
void transposeXYZtoYZX(const float2 *data)
void setDataPtrsZXY(std::vector< float2 *> &dataPtrsNew, float2 *data)
float * dataDst
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid=0)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
cufftHandle * forwardPlans
size_t alchGetNumOfPMEGrids() const
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
static constexpr int kThetaPad
void transposeXYZtoZXY(const float2 *data)
static constexpr int kPatchGridDim
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
float * dataSrc
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
void CcdCallBacksReset(void *ignored, double curWallTime)
Molecule * molecule
Definition: Node.h:179
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
int computeSharedMemoryPatchLevelGatherForce(const int numThreads, const int3 patchGridDim, const int order)
float2 * getBuffer(const int iblock)
int getShiftedGrid(const double x, const int grid)
EnergyVirial * h_energyVirials
unsigned int multipleGridIndex
double BigReal
Definition: common.h:123
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)
void checkPatchLevelDeviceCompatibility()
void compute(const Lattice &lattice, int doEnergyVirial, int step)