NAMD
CudaPmeSolverUtil.C
1 #include <stdio.h>
2 #include <algorithm>
3 #include <cstdlib>
4 #ifdef NAMD_CUDA
5 #include <cuda_runtime.h>
6 #endif
7 #ifdef NAMD_HIP
8 #include <hip/hip_runtime.h>
9 #endif
10 #include "HipDefines.h"
11 #include "ComputeNonbondedUtil.h"
12 #include "ComputePmeCUDAMgr.h"
13 #include "CudaPmeSolver.h"
14 #include "CudaPmeSolverUtil.h"
15 #include "Node.h"
16 #include "PatchData.h"
17 
18 #include "NamdEventsProfiling.h"
19 #include "TestArray.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
23 extern __thread DeviceCUDA *deviceCUDA;
24 
25 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
26 
27 void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
28  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
29  float2* h_data = new float2[size];
30  copy_DtoH<float2>(d_data, h_data, size, stream);
31  cudaCheck(cudaStreamSynchronize(stream));
32  FILE *handle = fopen(filename, "w");
33  for (int i=0;i < size;i++)
34  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
35  fclose(handle);
36  delete [] h_data;
37 }
38 
39 void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
40  FILE *handle = fopen(filename, "w");
41  for (int i=0;i < size;i++)
42  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
43  fclose(handle);
44 }
45 
46 void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
47  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
48  float* h_data = new float[size];
49  copy_DtoH<float>(d_data, h_data, size, stream);
50  cudaCheck(cudaStreamSynchronize(stream));
51  FILE *handle = fopen(filename, "w");
52  for (int i=0;i < size;i++)
53  fprintf(handle, "%f\n", h_data[i]);
54  fclose(handle);
55  delete [] h_data;
56 }
57 
58  CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream)
59  : deviceID(deviceID), stream(stream) {
60  }
61 
62 void CudaFFTCompute::plan3D(int *n, int flags) {
63  cudaCheck(cudaSetDevice(deviceID));
64  forwardType = CUFFT_R2C;
65  backwardType = CUFFT_C2R;
66  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
67  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
68  setStream();
69  // plantype = 3;
70 }
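  // Note: cufftPlan3d() takes the transform sizes ordered from the slowest- to the
  // fastest-varying dimension, so the grid sizes are passed reversed as n[2], n[1], n[0];
  // n[0] is then the contiguous dimension that the R2C/C2R transforms shorten to
  // n[0]/2+1 complex elements.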
71 
72 void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
73  cudaCheck(cudaSetDevice(deviceID));
74  forwardType = CUFFT_R2C;
75  backwardType = CUFFT_C2R;
76  int nt[2] = {n[1], n[0]};
77  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
78  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
79  setStream();
80  // plantype = 2;
81 }
82 
83 void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
84  cudaCheck(cudaSetDevice(deviceID));
85  forwardType = CUFFT_R2C;
86  backwardType = CUFFT_C2R;
87  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
88  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
89  setStream();
90  // plantype = 1;
91 }
92 
93 void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
94  cudaCheck(cudaSetDevice(deviceID));
95  forwardType = CUFFT_C2C;
96  backwardType = CUFFT_C2C;
97  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
98  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
99  setStream();
100  // plantype = 1;
101 }
102 
103 void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
104  cudaCheck(cudaSetDevice(deviceID));
105  forwardType = CUFFT_C2C;
106  backwardType = CUFFT_C2C;
107  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
108  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
109  setStream();
110  // plantype = 1;
111 }
112 
113 CudaFFTCompute::~CudaFFTCompute() {
114  cudaCheck(cudaSetDevice(deviceID));
115  cufftCheck(cufftDestroy(forwardPlan));
116  cufftCheck(cufftDestroy(backwardPlan));
117  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
118  if (dataDstAllocated) deallocate_device<float>(&dataDst);
119 }
120 
121 float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
122  cudaCheck(cudaSetDevice(deviceID));
123  float* tmp = NULL;
124  allocate_device<float>(&tmp, dataSizeRequired);
125  return tmp;
126 }
127 
128 // int ncall = 0;
129 
130 void CudaFFTCompute::forward() {
131  cudaCheck(cudaSetDevice(deviceID));
132  // ncall++;
133  if (forwardType == CUFFT_R2C) {
134  cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));
135 #ifdef TESTPID
136  if (1) {
137  cudaCheck(cudaStreamSynchronize(stream));
138  fprintf(stderr, "AP FORWARD FFT\n");
139  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
140  int m = dataDstSize;
141  float *tran = 0;
142  allocate_host<float>(&tran, m);
143  copy_DtoH<float>(dataDst, tran, m, stream);
144  cudaCheck(cudaStreamSynchronize(stream));
145  TestArray_write<float>("tran_charge_grid_good.bin",
146  "transformed charge grid good", tran, m);
147  deallocate_host<float>(&tran);
148  }
149 #endif
150 
151  // if (ncall == 1) {
152  // writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
153  // }
154 
155  // if (ncall == 1 && plantype == 2) {
156  // writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
157  // }
158 
159  } else if (forwardType == CUFFT_C2C) {
160  // nc2cf++;
161  // if (ncall == 1 && nc2cf == 1)
162  // writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
163  // else if (ncall == 1 && nc2cf == 2)
164  // writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
165  cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
166  // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
167  // if (ncall == 1 && plantype == 1 && isize == 62) {
168  // writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
169  // }
170  // if (ncall == 1 && nc2cf == 1)
171  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
172  // else if (ncall == 1 && nc2cf == 2)
173  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
174  } else {
175  cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
176  }
177 }
178 
179 void CudaFFTCompute::backward() {
180  cudaCheck(cudaSetDevice(deviceID));
181  if (backwardType == CUFFT_C2R) {
182  // if (ncall == 1) {
183  // if (plantype == 1)
184  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
185  // else
186  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
187  // }
188  cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));
189 #ifdef TESTPID
190  if (1) {
191  cudaCheck(cudaStreamSynchronize(stream));
192  fprintf(stderr, "AP BACKWARD FFT\n");
193  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
194  float *grid;
195  int gridsize = dataSrcSize;
196  allocate_host<float>(&grid, gridsize);
197  copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
198  cudaCheck(cudaStreamSynchronize(stream));
199  TestArray_write<float>("potential_grid_good.bin",
200  "potential grid good", grid, gridsize);
201  deallocate_host<float>(&grid);
202  }
203 #endif
204 
205  // if (ncall == 1)
206  // if (plantype == 1)
207  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
208  // else
209  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
210  } else if (backwardType == CUFFT_C2C) {
211  // nc2cb++;
212  // if (ncall == 1 && nc2cb == 1)
213  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
214  // else if (ncall == 1 && nc2cb == 2)
215  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
216  cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
217  // if (ncall == 1 && nc2cb == 1)
218  // writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
219  // else if (ncall == 1 && nc2cb == 2)
220  // writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
221  } else {
222  cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
223  }
224 }
225 
226 void CudaFFTCompute::setStream() {
227  cudaCheck(cudaSetDevice(deviceID));
228  cufftCheck(cufftSetStream(forwardPlan, stream));
229  cufftCheck(cufftSetStream(backwardPlan, stream));
230 }
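  // Note: rebinding both plans to the compute stream keeps the forward and backward
  // FFTs ordered with the spread-charge, scalar-sum, and gather-force work that is
  // enqueued on the same stream.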
231 
232 
233 CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
234  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid) :
235  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa, iGrid),
236  deviceID(deviceID), stream(stream) {
237 
238  cudaCheck(cudaSetDevice(deviceID));
239 
240  // Copy bm1 -> prefac_x on GPU memory
241  float *bm1f = new float[pmeGrid.K1];
242  float *bm2f = new float[pmeGrid.K2];
243  float *bm3f = new float[pmeGrid.K3];
244  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
245  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
246  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
247  allocate_device<float>(&d_bm1, pmeGrid.K1);
248  allocate_device<float>(&d_bm2, pmeGrid.K2);
249  allocate_device<float>(&d_bm3, pmeGrid.K3);
250  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
251  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
252  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
253  delete [] bm1f;
254  delete [] bm2f;
255  delete [] bm3f;
256  allocate_device<EnergyVirial>(&d_energyVirial, 1);
257  allocate_host<EnergyVirial>(&h_energyVirial, 1);
258  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
259  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
260  // ncall = 0;
261 }
262 
263 CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
264  cudaCheck(cudaSetDevice(deviceID));
265  deallocate_device<float>(&d_bm1);
266  deallocate_device<float>(&d_bm2);
267  deallocate_device<float>(&d_bm3);
268  deallocate_device<EnergyVirial>(&d_energyVirial);
269  deallocate_host<EnergyVirial>(&h_energyVirial);
270  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
271 }
272 
273 void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
274 #if 0
275  // Check lattice to make sure it is updating for constant pressure
276  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
277  lattice.a().x, lattice.a().y, lattice.a().z,
278  lattice.b().x, lattice.b().y, lattice.b().z,
279  lattice.c().x, lattice.c().y, lattice.c().z);
280 #endif
281  cudaCheck(cudaSetDevice(deviceID));
282 
283  const bool doEnergyVirial = (doEnergy || doVirial);
284 
285  int nfft1, nfft2, nfft3;
286  float *prefac1, *prefac2, *prefac3;
287 
288  BigReal volume = lattice.volume();
289  Vector a_r = lattice.a_r();
290  Vector b_r = lattice.b_r();
291  Vector c_r = lattice.c_r();
292  float recip1x, recip1y, recip1z;
293  float recip2x, recip2y, recip2z;
294  float recip3x, recip3y, recip3z;
295 
296  if (permutation == Perm_Z_cX_Y) {
297  // Z, X, Y
298  nfft1 = pmeGrid.K3;
299  nfft2 = pmeGrid.K1;
300  nfft3 = pmeGrid.K2;
301  prefac1 = d_bm3;
302  prefac2 = d_bm1;
303  prefac3 = d_bm2;
304  recip1x = c_r.z;
305  recip1y = c_r.x;
306  recip1z = c_r.y;
307  recip2x = a_r.z;
308  recip2y = a_r.x;
309  recip2z = a_r.y;
310  recip3x = b_r.z;
311  recip3y = b_r.x;
312  recip3z = b_r.y;
313  } else if (permutation == Perm_cX_Y_Z) {
314  // X, Y, Z
315  nfft1 = pmeGrid.K1;
316  nfft2 = pmeGrid.K2;
317  nfft3 = pmeGrid.K3;
318  prefac1 = d_bm1;
319  prefac2 = d_bm2;
320  prefac3 = d_bm3;
321  recip1x = a_r.x;
322  recip1y = a_r.y;
323  recip1z = a_r.z;
324  recip2x = b_r.x;
325  recip2y = b_r.y;
326  recip2z = b_r.z;
327  recip3x = c_r.x;
328  recip3y = c_r.y;
329  recip3z = c_r.z;
330  } else {
331  NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
332  }
333 
334  // ncall++;
335  // if (ncall == 1) {
336  // char filename[256];
337  // sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
338  // writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
339  // }
340 
341  // if (ncall == 1) {
342  // float2* h_data = new float2[size1*size2*size3];
343  // float2* d_data = (float2*)data;
344  // copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
345  // cudaCheck(cudaStreamSynchronize(stream));
346  // FILE *handle = fopen("dataf.txt", "w");
347  // for (int z=0;z < pmeGrid.K3;z++) {
348  // for (int y=0;y < pmeGrid.K2;y++) {
349  // for (int x=0;x < pmeGrid.K1/2+1;x++) {
350  // int i;
351  // if (permutation == Perm_cX_Y_Z) {
352  // i = x + y*size1 + z*size1*size2;
353  // } else {
354  // i = z + x*size1 + y*size1*size2;
355  // }
356  // fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
357  // }
358  // }
359  // }
360  // fclose(handle);
361  // delete [] h_data;
362  // }
363 
364  // Clear energy and virial array if needed
365  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
366 
367 #ifdef TESTPID
368  if (1) {
369  cudaCheck(cudaStreamSynchronize(stream));
370  fprintf(stderr, "AP calling scalar sum\n");
371  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
372  (permutation == Perm_cX_Y_Z ? "true" : "false"));
373  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
374  fprintf(stderr, "size1=%d size2=%d size3=%d\n", size1, size2, size3);
375  fprintf(stderr, "kappa=%g\n", kappa);
376  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
377  (double)recip1x, (double)recip1y, (double)recip1z);
378  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
379  (double)recip2x, (double)recip2y, (double)recip2z);
380  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
381  (double)recip3x, (double)recip3y, (double)recip3z);
382  fprintf(stderr, "volume=%g\n", volume);
383  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
384  float *bm1, *bm2, *bm3;
385  allocate_host<float>(&bm1, nfft1);
386  allocate_host<float>(&bm2, nfft2);
387  allocate_host<float>(&bm3, nfft3);
388  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
389  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
390  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
391  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good",
392  bm1, nfft1);
393  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good",
394  bm2, nfft2);
395  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good",
396  bm3, nfft3);
397  deallocate_host<float>(&bm1);
398  deallocate_host<float>(&bm2);
399  deallocate_host<float>(&bm3);
400  }
401 #endif
402 
403  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
404  recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
405  volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
406  &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
407  stream);
408 #ifdef TESTPID
409  if (1) {
410  cudaCheck(cudaStreamSynchronize(stream));
411  fprintf(stderr, "AP SCALAR SUM\n");
412  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
413  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
414  float *tran = 0;
415  allocate_host<float>(&tran, m);
416  copy_DtoH<float>((float*)data, tran, m, stream);
417  cudaCheck(cudaStreamSynchronize(stream));
418  TestArray_write<float>("tran_potential_grid_good.bin",
419  "transformed potential grid good", tran, m);
420  deallocate_host<float>(&tran);
421  }
422 #endif
423 
424  // Copy energy and virial to host if needed
425  if (doEnergyVirial) {
426  copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
427  cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
428  // cudaCheck(cudaStreamSynchronize(stream));
429  }
430 
431 }
432 
433 // void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
434 // cudaCheck(cudaSetDevice(deviceID));
435 // cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
436 // }
437 
438 void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
439  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;
440 
441  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
442  if (err == cudaSuccess) {
443  // Event has occurred
444  c->checkCount = 0;
445  if (c->pencilXYZPtr != NULL)
446  c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
447  else if (c->pencilZPtr != NULL)
448  c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
449  else
450  NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
451  return;
452  } else if (err == cudaErrorNotReady) {
453  // Event has not occurred
454  c->checkCount++;
455  if (c->checkCount >= 1000000) {
456  char errmsg[256];
457  sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
458  c->checkCount);
459  cudaDie(errmsg,err);
460  }
461  } else {
462  // Anything else is an error
463  char errmsg[256];
464  sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
465  c->checkCount);
466  cudaDie(errmsg,err);
467  }
468 
469  // Call again
470  CcdCallBacksReset(0, walltime);
471  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
472 }
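// Note: instead of blocking in cudaEventSynchronize(), the callback setters below arm
// this poll loop through Charm++'s CcdCallFnAfter() timer; energyAndVirialCheck()
// re-queries copyEnergyVirialEvent every 0.1 ms until the device-to-host copy of the
// energy/virial has finished, then notifies the owning pencil.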
473 
474 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
475  cudaCheck(cudaSetDevice(deviceID));
476  pencilXYZPtr = pencilPtr;
477  pencilZPtr = NULL;
478  checkCount = 0;
479  CcdCallBacksReset(0, CmiWallTimer());
480  // Set the call back at 0.1ms
481  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
482 }
483 
484 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
485  cudaCheck(cudaSetDevice(deviceID));
486  pencilXYZPtr = NULL;
487  pencilZPtr = pencilPtr;
488  checkCount = 0;
489  CcdCallBacksReset(0, CmiWallTimer());
490  // Set the call back at 0.1ms
491  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
492 }
493 
494 double CudaPmeKSpaceCompute::getEnergy() {
495  return h_energyVirial->energy;
496 }
497 
498 void CudaPmeKSpaceCompute::getVirial(double *virial) {
499  if (permutation == Perm_Z_cX_Y) {
500  // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
501  virial[0] = h_energyVirial->virial[3];
502  virial[1] = h_energyVirial->virial[4];
503  virial[2] = h_energyVirial->virial[1];
504 
505  virial[3] = h_energyVirial->virial[4];
506  virial[4] = h_energyVirial->virial[5];
507  virial[5] = h_energyVirial->virial[2];
508 
509  virial[6] = h_energyVirial->virial[1];
510  virial[7] = h_energyVirial->virial[2]; // ZY, symmetric with virial[5]; index 7 would read past the 6-element array
511  virial[8] = h_energyVirial->virial[0];
512  } else if (permutation == Perm_cX_Y_Z) {
513  // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
514  virial[0] = h_energyVirial->virial[0];
515  virial[1] = h_energyVirial->virial[1];
516  virial[2] = h_energyVirial->virial[2];
517 
518  virial[3] = h_energyVirial->virial[1];
519  virial[4] = h_energyVirial->virial[3];
520  virial[5] = h_energyVirial->virial[4];
521 
522  virial[6] = h_energyVirial->virial[2];
523  virial[7] = h_energyVirial->virial[4];
524  virial[8] = h_energyVirial->virial[5];
525  }
526 #if 0
527  fprintf(stderr, "AP PME VIRIAL =\n"
528  " %g %g %g\n %g %g %g\n %g %g %g\n",
529  virial[0], virial[1], virial[2], virial[3], virial[4],
530  virial[5], virial[6], virial[7], virial[8]);
531 #endif
532 }
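// Note: the packed 6-element virial holds the upper triangle of the symmetric 3x3
// tensor; getVirial() expands it row-major. For Perm_cX_Y_Z, with v = h_energyVirial->virial:
//
//   [ XX XY XZ ]   [ v[0] v[1] v[2] ]
//   [ YX YY YZ ] = [ v[1] v[3] v[4] ]
//   [ ZX ZY ZZ ]   [ v[2] v[4] v[5] ]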
533 
534 
535 //###########################################################################
536 //###########################################################################
537 //###########################################################################
538 
539 //
540 // Class constructor
541 //
542 CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
543  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
544  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
545  if (dataSize < xsize*ysize*zsize)
546  NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
547  cudaCheck(cudaSetDevice(deviceID));
548  d_atomsCapacity = 0;
549  d_atoms = NULL;
550  d_forceCapacity = 0;
551  d_force = NULL;
552  #ifdef NAMD_CUDA
553  tex_data = NULL;
554  tex_data_len = 0;
555  #else
556  grid_data = NULL;
557  grid_data_len = 0;
558  #endif
559  allocate_device<float>(&data, dataSize);
560  setupGridData(data, xsize*ysize*zsize);
561  cudaCheck(cudaEventCreate(&gatherForceEvent));
562 }
563 
564 //
565 // Class destructor
566 //
567 CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
568  cudaCheck(cudaSetDevice(deviceID));
569  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
570  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
571  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
572  // deallocate_device<double>(&d_selfEnergy);
573  deallocate_device<float>(&data);
574  cudaCheck(cudaEventDestroy(gatherForceEvent));
575 }
576 
577 // //
578 // // Copy patches and atoms to device memory
579 // //
580 // void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
581 // const int numAtoms, const CudaAtom* atoms) {
582 
583 // this->numPatches = numPatches;
584 // this->numAtoms = numAtoms;
585 
586 // // Reallocate device arrays as necessary
587 // reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
588 // reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);
589 
590 // // Copy atom and patch data to device
591 // copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
592 // copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
593 // }
594 
595 //
596 // Copy atoms to device memory
597 //
598 void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
599  cudaCheck(cudaSetDevice(deviceID));
600  this->numAtoms = numAtoms;
601 
602  // Reallocate device arrays as necessary
603  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
604 
605  // Copy atom data to device
606  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
607 }
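  // Note: reallocate_device() with the 1.5f factor over-allocates d_atoms, presumably so
  // that small step-to-step fluctuations in the per-pencil atom count do not force a
  // reallocation on every migration.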
608 
609 //
610 // Spread charges on grid
611 //
612 void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
613  cudaCheck(cudaSetDevice(deviceID));
614 #if 0
615  if (1) {
616  static int step = 0;
617  float *xyzq;
618  int natoms = numAtoms;
619  allocate_host<float>(&xyzq, 4*natoms);
620  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
621  cudaCheck(cudaStreamSynchronize(stream));
622  char fname[64], remark[64];
623  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
624  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
625  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
626  deallocate_host<float>(&xyzq);
627  step += 2;
628  }
629 #endif
630 
631  NAMD_EVENT_START(1, NamdProfileEvent::SPREAD_CHARGE);
632 
633  // Clear grid
634  clear_device_array<float>(data, xsize*ysize*zsize, stream);
635 
636 #if defined(TESTPID)
637  fprintf(stderr, "Calling spread_charge with parameters:\n");
638  fprintf(stderr, "numAtoms = %d\n", numAtoms);
639  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
640  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
641  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
642  fprintf(stderr, "xsize = %d\n", xsize);
643  fprintf(stderr, "ysize = %d\n", ysize);
644  fprintf(stderr, "zsize = %d\n", zsize);
645  fprintf(stderr, "y0 = %d\n", y0);
646  fprintf(stderr, "z0 = %d\n", z0);
647  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
648  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
649  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
650 #endif
651  spread_charge((const float4*)d_atoms, numAtoms,
652  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
653  xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
654  data, pmeGrid.order, stream);
655 #ifdef TESTPID
656  if (1) {
657  cudaCheck(cudaStreamSynchronize(stream));
658  fprintf(stderr, "AP SPREAD CHARGES\n");
659  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
660  float *xyzq;
661  allocate_host<float>(&xyzq, 4*numAtoms);
662  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
663  int gridlen = pmeGrid.K1 * pmeGrid.K2 * pmeGrid.K3;
664  float *grid;
665  allocate_host<float>(&grid, gridlen);
666  copy_DtoH<float>(data, grid, gridlen, stream);
667  cudaCheck(cudaStreamSynchronize(stream));
668  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
669  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
670  grid, gridlen);
671  deallocate_host<float>(&xyzq);
672  deallocate_host<float>(&grid);
673  }
674 #endif
675 
676  // ncall++;
677 
678  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
679  NAMD_EVENT_STOP(1, NamdProfileEvent::SPREAD_CHARGE);
680 }
681 
682 void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
683  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
684  cudaCheck(cudaSetDevice(c->deviceID));
685 
686  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
687  if (err == cudaSuccess) {
688  // Event has occurred
689  c->checkCount = 0;
690 // c->deviceProxy[CkMyNode()].gatherForceDone();
691  c->devicePtr->gatherForceDone(c->multipleGridIndex);
692  return;
693  } else if (err == cudaErrorNotReady) {
694  // Event has not occurred
695  c->checkCount++;
696  if (c->checkCount >= 1000000) {
697  char errmsg[256];
698  sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
699  c->checkCount);
700  cudaDie(errmsg,err);
701  }
702  } else {
703  // Anything else is an error
704  char errmsg[256];
705  sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
706  c->checkCount);
707  cudaDie(errmsg,err);
708  }
709 
710  // Call again
711  CcdCallBacksReset(0, walltime);
712  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
713 }
714 
715 void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
716  cudaCheck(cudaSetDevice(deviceID));
717  devicePtr = devicePtr_in;
718  checkCount = 0;
719  CcdCallBacksReset(0, CmiWallTimer());
720  // Set the call back at 0.1ms
721  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
722 }
723 
724 void CudaPmeRealSpaceCompute::waitGatherForceDone() {
725  cudaCheck(cudaSetDevice(deviceID));
726  cudaCheck(cudaEventSynchronize(gatherForceEvent));
727 }
728 
729 void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
730  #ifdef NAMD_CUDA
731  // hipCreateTextureObject triggers a HIP runtime error (shown below); texture objects are no longer needed on HIP anyway, so the HIP path does without them.
732  /*
733 
734  FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
735  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
736 ------------- Processor 11 Exiting: Called CmiAbort ------------
737 Reason: FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
738  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
739 
740  */
741  if (tex_data == data && tex_data_len == data_len) return;
742  tex_data = data;
743  tex_data_len = data_len;
744  // Use texture objects
745  cudaResourceDesc resDesc;
746  memset(&resDesc, 0, sizeof(resDesc));
747  resDesc.resType = cudaResourceTypeLinear;
748  resDesc.res.linear.devPtr = data;
749  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
750  resDesc.res.linear.desc.x = sizeof(float)*8;
751  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
752  cudaTextureDesc texDesc;
753  memset(&texDesc, 0, sizeof(texDesc));
754  texDesc.readMode = cudaReadModeElementType;
755  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
756  #else
757  if (grid_data == data && grid_data_len == data_len) return;
758  grid_data = data;
759  grid_data_len = data_len;
760  #endif
761 }
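// Note: on CUDA the grid is exposed to gather_force() through a linear texture object
// (read-only, cached loads); the HIP build skips texture creation (see the comment above)
// and only records the raw grid pointer and length.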
762 
763 void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
764  cudaCheck(cudaSetDevice(deviceID));
765 
766  NAMD_EVENT_START(1, NamdProfileEvent::GATHER_FORCE);
767 
768  // Re-allocate force array if needed
769  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
770 
771 #ifdef TESTPID
772  if (1) {
773  fprintf(stderr, "AP gather force arguments\n");
774  fprintf(stderr, "numAtoms = %d\n", numAtoms);
775  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
776  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
777  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
778  fprintf(stderr, "xsize = %d\n", xsize);
779  fprintf(stderr, "ysize = %d\n", ysize);
780  fprintf(stderr, "zsize = %d\n", zsize);
781  fprintf(stderr, "y0 = %d\n", y0);
782  fprintf(stderr, "z0 = %d\n", z0);
783  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
784  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
785  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
786  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
787  }
788 #endif
789  // The patch-level PME kernels are only used for the GPU-resident code path. The default constructor
790  // of PatchLevelPmeData will initialize the compatibility variables to false, so the patch-level kernels
791  // won't be used here.
792  PatchLevelPmeData patchLevelPmeData;
793  gather_force(patchLevelPmeData,
794  (const float4*)d_atoms, numAtoms,
795  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
796  xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
797  data, pmeGrid.order, (float3*)d_force,
798 #ifdef NAMD_CUDA
799  gridTexObj,
800 #endif
801  stream);
802 #ifdef TESTPID
803  if (1) {
804  cudaCheck(cudaStreamSynchronize(stream));
805  fprintf(stderr, "AP GATHER FORCE\n");
806  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
807  float *xyz;
808  int natoms = numAtoms;
809  allocate_host<float>(&xyz, 3*natoms);
810  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
811  cudaCheck(cudaStreamSynchronize(stream));
812  TestArray_write<float>("gather_force_good.bin",
813  "gather force good", xyz, 3*natoms);
814  deallocate_host<float>(&xyz);
815  }
816 #endif
817 
818  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
819 
820  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
821 
822  NAMD_EVENT_STOP(1, NamdProfileEvent::GATHER_FORCE);
823 }
824 
825 /*
826 double CudaPmeRealSpaceCompute::calcSelfEnergy() {
827  double h_selfEnergy;
828  clear_device_array<double>(d_selfEnergy, 1);
829  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
830  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
831  cudaCheck(cudaStreamSynchronize(stream));
832  // 1.7724538509055160273 = sqrt(pi)
833  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
834  return h_selfEnergy;
835 }
836 */
837 
838 //###########################################################################
839 //###########################################################################
840 //###########################################################################
841 
842 CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
843  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
844  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
845  cudaCheck(cudaSetDevice(deviceID));
846 
847  allocate_device<float2>(&d_data, dataSize);
848 #ifndef P2P_ENABLE_3D
849  allocate_device<float2>(&d_buffer, dataSize);
850 #endif
851 
852  // Set data pointers to NULL; they can be overridden later on by using setDataPtrs()
853  dataPtrsYZX.resize(nblock, NULL);
854  dataPtrsZXY.resize(nblock, NULL);
855 
856  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
857  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
858 }
859 
860 CudaPmeTranspose::~CudaPmeTranspose() {
861  cudaCheck(cudaSetDevice(deviceID));
862  deallocate_device<float2>(&d_data);
863 #ifndef P2P_ENABLE_3D
864  deallocate_device<float2>(&d_buffer);
865 #endif
866  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
867  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
868 }
869 
870 //
871 // Set dataPtrsYZX
872 //
873 void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
874  if (dataPtrsYZX.size() != dataPtrsNew.size())
875  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
876  for (int iblock=0;iblock < nblock;iblock++) {
877  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
878  }
879  // Build batched data structures
880  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
881 
882  for (int iperm=0;iperm < 3;iperm++) {
883  int isize_out;
884  if (iperm == 0) {
885  // Perm_Z_cX_Y:
886  // ZXY -> XYZ
887  isize_out = pmeGrid.K1/2+1;
888  } else if (iperm == 1) {
889  // Perm_cX_Y_Z:
890  // XYZ -> YZX
891  isize_out = pmeGrid.K2;
892  } else {
893  // Perm_Y_Z_cX:
894  // YZX -> ZXY
895  isize_out = pmeGrid.K3;
896  }
897 
898  int max_nx = 0;
899  for (int iblock=0;iblock < nblock;iblock++) {
900 
901  int x0 = pos[iblock];
902  int nx = pos[iblock+1] - x0;
903  max_nx = std::max(max_nx, nx);
904 
905  int width_out;
906  float2* data_out;
907  if (dataPtrsYZX[iblock] == NULL) {
908  // Local transpose, use internal buffer
909  data_out = d_data + jsize*ksize*x0;
910  width_out = jsize;
911  } else {
912  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
913  data_out = dataPtrsYZX[iblock];
914  width_out = isize_out;
915  }
916 
917  TransposeBatch<float2> batch;
918  batch.nx = nx;
919  batch.ysize_out = width_out;
920  batch.zsize_out = ksize;
921  batch.data_in = data+x0;
922  batch.data_out = data_out;
923 
924  h_batchesYZX[iperm*nblock + iblock] = batch;
925 
926  // transpose_xyz_yzx(
927  // nx, jsize, ksize,
928  // isize, jsize,
929  // width_out, ksize,
930  // data+x0, data_out, stream);
931  }
932 
933  max_nx_YZX[iperm] = max_nx;
934  }
935 
936  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
937  cudaCheck(cudaStreamSynchronize(stream));
938  delete [] h_batchesYZX;
939 }
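// Note: h_batchesYZX holds three contiguous groups of nblock entries, one group per input
// permutation, so transposeXYZtoYZX() can pick its group with batchesYZX + iperm*nblock
// and launch all blocks in a single batched transpose call.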
940 
941 //
942 // Set dataPtrsZXY
943 //
944 void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
945  if (dataPtrsZXY.size() != dataPtrsNew.size())
946  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
947  for (int iblock=0;iblock < nblock;iblock++) {
948  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
949  }
950 
951  // Build batched data structures
952  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
953 
954  for (int iperm=0;iperm < 3;iperm++) {
955  int isize_out;
956  if (iperm == 0) {
957  // Perm_cX_Y_Z:
958  // XYZ -> ZXY
959  isize_out = pmeGrid.K3;
960  } else if (iperm == 1) {
961  // Perm_Z_cX_Y:
962  // ZXY -> YZX
963  isize_out = pmeGrid.K2;
964  } else {
965  // Perm_Y_Z_cX:
966  // YZX -> XYZ
967  isize_out = pmeGrid.K1/2+1;
968  }
969 
970  int max_nx = 0;
971  for (int iblock=0;iblock < nblock;iblock++) {
972 
973  int x0 = pos[iblock];
974  int nx = pos[iblock+1] - x0;
975  max_nx = std::max(max_nx, nx);
976 
977  int width_out;
978  float2* data_out;
979  if (dataPtrsZXY[iblock] == NULL) {
980  // Local transpose, use internal buffer
981  data_out = d_data + jsize*ksize*x0;
982  width_out = ksize;
983  } else {
984  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
985  data_out = dataPtrsZXY[iblock];
986  width_out = isize_out;
987  }
988 
989  TransposeBatch<float2> batch;
990  batch.nx = nx;
991  batch.zsize_out = width_out;
992  batch.xsize_out = nx;
993  batch.data_in = data+x0;
994  batch.data_out = data_out;
995  h_batchesZXY[iperm*nblock + iblock] = batch;
996  }
997 
998  max_nx_ZXY[iperm] = max_nx;
999  }
1000 
1001  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1002  cudaCheck(cudaStreamSynchronize(stream));
1003  delete [] h_batchesZXY;
1004 }
1005 
1006 void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
1007  cudaCheck(cudaSetDevice(deviceID));
1008 
1009  int iperm;
1010  switch(permutation) {
1011  case Perm_Z_cX_Y:
1012  // ZXY -> XYZ
1013  iperm = 0;
1014  break;
1015  case Perm_cX_Y_Z:
1016  // XYZ -> YZX
1017  iperm = 1;
1018  break;
1019  case Perm_Y_Z_cX:
1020  // YZX -> ZXY
1021  iperm = 2;
1022  break;
1023  default:
1024  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1025  break;
1026  }
1027 
1028  batchTranspose_xyz_yzx(
1029  nblock, batchesYZX + iperm*nblock,
1030  max_nx_YZX[iperm], jsize, ksize,
1031  isize, jsize, stream);
1032 
1033 
1034 /*
1035  int isize_out;
1036  switch(permutation) {
1037  case Perm_Z_cX_Y:
1038  // ZXY -> XYZ
1039  isize_out = pmeGrid.K1/2+1;
1040  break;
1041  case Perm_cX_Y_Z:
1042  // XYZ -> YZX
1043  isize_out = pmeGrid.K2;
1044  break;
1045  case Perm_Y_Z_cX:
1046  // YZX -> ZXY
1047  isize_out = pmeGrid.K3;
1048  break;
1049  default:
1050  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1051  break;
1052  }
1053 
1054  for (int iblock=0;iblock < nblock;iblock++) {
1055 
1056  int x0 = pos[iblock];
1057  int nx = pos[iblock+1] - x0;
1058 
1059  int width_out;
1060  float2* data_out;
1061  if (dataPtrsYZX[iblock] == NULL) {
1062  // Local transpose, use internal buffer
1063  data_out = d_data + jsize*ksize*x0;
1064  width_out = jsize;
1065  } else {
1066  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1067  data_out = dataPtrsYZX[iblock];
1068  width_out = isize_out;
1069  }
1070 
1071  transpose_xyz_yzx(
1072  nx, jsize, ksize,
1073  isize, jsize,
1074  width_out, ksize,
1075  data+x0, data_out, stream);
1076  }
1077 */
1078 }
1079 
1080 void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
1081  cudaCheck(cudaSetDevice(deviceID));
1082 
1083  int iperm;
1084  switch(permutation) {
1085  case Perm_cX_Y_Z:
1086  // XYZ -> ZXY
1087  iperm = 0;
1088  break;
1089  case Perm_Z_cX_Y:
1090  // ZXY -> YZX
1091  iperm = 1;
1092  break;
1093  case Perm_Y_Z_cX:
1094  // YZX -> XYZ
1095  iperm = 2;
1096  break;
1097  default:
1098  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1099  break;
1100  }
1101 
1102  batchTranspose_xyz_zxy(
1103  nblock, batchesZXY + iperm*nblock,
1104  max_nx_ZXY[iperm], jsize, ksize,
1105  isize, jsize, stream);
1106 
1107 /*
1108  int isize_out;
1109  switch(permutation) {
1110  case Perm_cX_Y_Z:
1111  // XYZ -> ZXY
1112  isize_out = pmeGrid.K3;
1113  break;
1114  case Perm_Z_cX_Y:
1115  // ZXY -> YZX
1116  isize_out = pmeGrid.K2;
1117  break;
1118  case Perm_Y_Z_cX:
1119  // YZX -> XYZ
1120  isize_out = pmeGrid.K1/2+1;
1121  break;
1122  default:
1123  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1124  break;
1125  }
1126 
1127  for (int iblock=0;iblock < nblock;iblock++) {
1128 
1129  int x0 = pos[iblock];
1130  int nx = pos[iblock+1] - x0;
1131 
1132  int width_out;
1133  float2* data_out;
1134  if (dataPtrsZXY[iblock] == NULL) {
1135  // Local transpose, use internal buffer
1136  data_out = d_data + jsize*ksize*x0;
1137  width_out = ksize;
1138  } else {
1139  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1140  data_out = dataPtrsZXY[iblock];
1141  width_out = isize_out;
1142  }
1143 
1144  transpose_xyz_zxy(
1145  nx, jsize, ksize,
1146  isize, jsize,
1147  width_out, nx,
1148  data+x0, data_out, stream);
1149  }
1150 */
1151 }
1152 
1153 void CudaPmeTranspose::waitStreamSynchronize() {
1154  cudaCheck(cudaSetDevice(deviceID));
1155  cudaCheck(cudaStreamSynchronize(stream));
1156 }
1157 
1158 void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
1159  cudaCheck(cudaSetDevice(deviceID));
1160 
1161  if (iblock >= nblock)
1162  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1163 
1164  int x0 = pos[iblock];
1165  int nx = pos[iblock+1] - x0;
1166 
1167  int copySize = jsize*ksize*nx;
1168  int copyStart = jsize*ksize*x0;
1169 
1170  if (copyStart + copySize > dataSize)
1171  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1172 
1173  if (copySize > h_dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1175 
1176  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1177 }
1178 
1179 void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
1180  cudaCheck(cudaSetDevice(deviceID));
1181 
1182  if (iblock >= nblock)
1183  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1184 
1185  // Determine block size = how much we're copying
1186  int i0, i1, j0, j1, k0, k1;
1187  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1188  int ni = i1-i0+1;
1189  int nj = j1-j0+1;
1190  int nk = k1-k0+1;
1191 
1192  copy3D_HtoD<float2>(data_in, data_out,
1193  0, 0, 0,
1194  ni, nj,
1195  i0, 0, 0,
1196  isize, jsize,
1197  ni, nj, nk, stream);
1198 }
1199 
1200 #ifndef P2P_ENABLE_3D
1201 //
1202 // Copy from temporary buffer to final buffer
1203 //
1204 void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
1205  cudaCheck(cudaSetDevice(deviceID));
1206 
1207  if (iblock >= nblock)
1208  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1209 
1210  // Determine block size = how much we're copying
1211  int i0, i1, j0, j1, k0, k1;
1212  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1213  int ni = i1-i0+1;
1214  int nj = j1-j0+1;
1215  int nk = k1-k0+1;
1216 
1217  float2* data_in = d_buffer + i0*nj*nk;
1218 
1219  copy3D_DtoD<float2>(data_in, data_out,
1220  0, 0, 0,
1221  ni, nj,
1222  i0, 0, 0,
1223  isize, jsize,
1224  ni, nj, nk, stream);
1225 }
1226 
1227 //
1228 // Return temporary buffer for block "iblock"
1229 //
1230 float2* CudaPmeTranspose::getBuffer(const int iblock) {
1231  if (iblock >= nblock)
1232  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1233 
1234  // Determine block size = how much we're copying
1235  int i0, i1, j0, j1, k0, k1;
1236  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1237  int ni = i1-i0+1;
1238  int nj = j1-j0+1;
1239  int nk = k1-k0+1;
1240 
1241  return d_buffer + i0*nj*nk;
1242 }
1243 #endif
1244 
1245 void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
1246  float2* data_out) {
1247 
1248  int iblock_out = jblock;
1249  int jblock_out = kblock;
1250  int kblock_out = iblock;
1251 
1252  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1253 }
1254 
1255 void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
1256  float2* data_out) {
1257 
1258  int iblock_out = kblock;
1259  int jblock_out = iblock;
1260  int kblock_out = jblock;
1261 
1262  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1263 }
1264 
1265 void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
1266  const int iblock_out, const int jblock_out, const int kblock_out,
1267  int deviceID_out, int permutation_out, float2* data_out) {
1268 
1269  cudaCheck(cudaSetDevice(deviceID));
1270 
1271  // Determine block size = how much we're copying
1272  int i0, i1, j0, j1, k0, k1;
1273  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1274  int ni = i1-i0+1;
1275  int nj = j1-j0+1;
1276  int nk = k1-k0+1;
1277 
1278  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1279  int isize_out = i1-i0+1;
1280  int jsize_out = j1-j0+1;
1281 
1282  int x0 = pos[iblock];
1283  float2* data_in = d_data + jsize*ksize*x0;
1284 
1285 #ifndef P2P_ENABLE_3D
1286  // Copy into temporary peer device buffer
1287  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
1288 #else
1289  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
1290  0, 0, 0,
1291  ni, nj,
1292  0, 0, 0,
1293  isize_out, jsize_out,
1294  ni, nj, nk, stream);
1295 #endif
1296 
1297 }
1298 
1299 
1300 CudaPmeOneDevice::CudaPmeOneDevice(
1301  PmeGrid pmeGrid_,
1302  int deviceID_,
1303  int deviceIndex_
1304  ) :
1305  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
1306  natoms(0), d_atoms(0), d_forces(0),
1307  d_grids(0), gridsize(0),
1308  d_trans(0), transize(0),
1309  d_bm1(0), d_bm2(0), d_bm3(0),
1310  kappa(ComputeNonbondedUtil::ewaldcof),
1311  self_energy_alch_first_time(true),
1312  force_scaling_alch_first_time(true),
1313  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
1314 {
1315 // fprintf(stderr, "CudaPmeOneDevice constructor START ******************************************\n");
1316  const SimParameters& sim_params = *(Node::Object()->simParameters);
1318  // Determine how many grids we need for the alchemical route
1319  if (sim_params.alchOn) {
1320  num_used_grids = sim_params.alchGetNumOfPMEGrids();
1321  } else {
1322  num_used_grids = 1;
1323  }
1324  cudaCheck(cudaSetDevice(deviceID));
1325 
1326  // Check whether the simulation and the device are compatible with the patch-level kernels.
1327  // The results are stored in the patchLevelPmeData field.
1328  checkPatchLevelSimParamCompatibility(pmeGrid.order, true /* periodic Y */, true /* periodic Z */);
1330 
1331  if (!sim_params.CUDASOAintegrateMode) {
1332  NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
1333  }
1334  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
1335 
1336  // create our own CUDA stream
1337 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1338  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1339  int leastPriority, greatestPriority;
1340  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1341  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1342 #else
1343  cudaCheck(cudaStreamCreate(&stream));
1344 #endif
1345 
1346  allocate_host<EnergyVirial>(&h_energyVirials, num_used_grids);
1347  allocate_device<EnergyVirial>(&d_energyVirials, num_used_grids);
1348  allocate_device<float>(&d_scaling_factors, num_used_grids);
1349  allocate_device<double>(&d_selfEnergy, 1);
1350  if (sim_params.alchFepOn) {
1351  allocate_device<double>(&d_selfEnergy_FEP, 1);
1352  } else {
1353  d_selfEnergy_FEP = NULL;
1354  }
1355  if (sim_params.alchThermIntOn) {
1356  allocate_device<double>(&d_selfEnergy_TI_1, 1);
1357  allocate_device<double>(&d_selfEnergy_TI_2, 1);
1358  } else {
1359  d_selfEnergy_TI_1 = NULL;
1360  d_selfEnergy_TI_2 = NULL;
1361  }
1362 
1363  // create device buffer space for atom positions and forces
1364  // to be accessed externally through PatchData
1365  allocate_device<float4>(&d_atoms, num_used_grids * natoms);
1366  allocate_device<float3>(&d_forces, num_used_grids * natoms);
1367  if (sim_params.alchOn) {
1368  allocate_device<int>(&d_partition, natoms);
1369  } else {
1370  d_partition = NULL;
1371  }
1372 #ifdef NODEGROUP_FORCE_REGISTER
1373  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
1374  devData.s_datoms = (CudaAtom *) (d_atoms);
1375  devData.f_slow = (CudaForce *) (d_forces);
1376  devData.f_slow_size = natoms;
1377  devData.s_datoms_partition = d_partition;
1378 #endif
1379  int k1 = pmeGrid.K1;
1380  int k2 = pmeGrid.K2;
1381  int k3 = pmeGrid.K3;
1382  int order = pmeGrid.order;
1383  gridsize = k1 * k2 * k3;
1384  transize = (k1/2 + 1) * k2 * k3;
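  // Note: gridsize counts the real grid points (K1*K2*K3), while transize counts the
  // complex elements of the half-spectrum produced by the R2C transform, i.e.
  // (K1/2+1)*K2*K3 float2 values per grid.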
1385 
1386 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1387 
1388  // set up cufft
1389  forwardPlans = new cufftHandle[num_used_grids];
1390  backwardPlans = new cufftHandle[num_used_grids];
1391  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1392  cufftCheck(cufftPlan3d(&(forwardPlans[iGrid]), k3, k2, k1, CUFFT_R2C));
1393  cufftCheck(cufftPlan3d(&(backwardPlans[iGrid]), k3, k2, k1, CUFFT_C2R));
1394  cufftCheck(cufftSetStream(forwardPlans[iGrid], stream));
1395  cufftCheck(cufftSetStream(backwardPlans[iGrid], stream));
1396  }
1397 #endif
1398 
1399 #ifdef NAMD_CUDA
1400  cudaDeviceProp deviceProp;
1401  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
1402  const int texture_alignment = int(deviceProp.textureAlignment);
1403  // d_grids and d_grids + N * gridsize will be used as device pointers for ::cudaResourceDesc::res::linear::devPtr
1404  // check if (d_grids + N * gridsize) is an address aligned to ::cudaDeviceProp::textureAlignment
1405  // which is required by cudaCreateTextureObject()
1406  // or maybe I should use cudaMallocPitch()?
1407  if ((gridsize % texture_alignment) != 0) {
1408  // if it is not aligned, padding is required
1409  gridsize = (int(gridsize / texture_alignment) + 1) * texture_alignment;
1410  }
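  // Note: the statement above rounds gridsize up to the next multiple of textureAlignment,
  // equivalent to gridsize = ((gridsize + texture_alignment - 1) / texture_alignment) * texture_alignment,
  // so that every d_grids + iGrid*gridsize offset below is a valid devPtr for cudaCreateTextureObject().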
1411  // Is it necessary to align transize too?
1412 // if ((transize % texture_alignment) != 0) {
1413 // // if it is not aligned, padding is required
1414 // transize = (int(transize / texture_alignment) + 1) * texture_alignment;
1415 // }
1416  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1417  allocate_device<float2>(&d_trans, num_used_grids * transize);
1418  gridTexObjArrays = new cudaTextureObject_t[num_used_grids];
1419  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1420  // set up texture object
1421  cudaResourceDesc resDesc;
1422  memset(&resDesc, 0, sizeof(resDesc));
1423  resDesc.resType = cudaResourceTypeLinear;
1424  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
1425  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
1426  resDesc.res.linear.desc.x = sizeof(float)*8;
1427  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
1428  cudaTextureDesc texDesc;
1429  memset(&texDesc, 0, sizeof(texDesc));
1430  texDesc.readMode = cudaReadModeElementType;
1431  cudaCheck(cudaCreateTextureObject(&(gridTexObjArrays[iGrid]), &resDesc, &texDesc, NULL));
1432  }
1433 #else
1434  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1435  allocate_device<float2>(&d_trans, num_used_grids * transize);
1436 #endif
1437  // calculate prefactors
1438  double *bm1 = new double[k1];
1439  double *bm2 = new double[k2];
1440  double *bm3 = new double[k3];
1441  // Use compute_b_moduli from PmeKSpace.C
1442  extern void compute_b_moduli(double *bm, int k, int order);
1443  compute_b_moduli(bm1, k1, order);
1444  compute_b_moduli(bm2, k2, order);
1445  compute_b_moduli(bm3, k3, order);
1446 
1447  // allocate space for and copy prefactors onto GPU
1448  float *bm1f = new float[k1];
1449  float *bm2f = new float[k2];
1450  float *bm3f = new float[k3];
1451  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
1452  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
1453  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
1454  allocate_device<float>(&d_bm1, k1);
1455  allocate_device<float>(&d_bm2, k2);
1456  allocate_device<float>(&d_bm3, k3);
1457  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
1458  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
1459  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
1460  delete [] bm1f;
1461  delete [] bm2f;
1462  delete [] bm3f;
1463  delete [] bm1;
1464  delete [] bm2;
1465  delete [] bm3;
1466 
1467  cudaCheck(cudaStreamSynchronize(stream));
1468 
1469 // fprintf(stderr, "CudaPmeOneDevice constructor END ********************************************\n");
1470 }
1471 
1472 CudaPmeOneDevice::~CudaPmeOneDevice() {
1473  deallocate_device<float4>(&d_atoms);
1474  deallocate_device<float3>(&d_forces);
1475  deallocate_device<float2>(&d_trans);
1476  deallocate_device<float>(&d_grids);
1477  deallocate_host<EnergyVirial>(&h_energyVirials);
1478  deallocate_device<EnergyVirial>(&d_energyVirials);
1479  deallocate_device<float>(&d_scaling_factors);
1480 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1481  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1482  cufftCheck(cufftDestroy(forwardPlans[iGrid]));
1483  cufftCheck(cufftDestroy(backwardPlans[iGrid]));
1484 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1485  cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
1486 #endif
1487  }
1488 
1489  if (patchLevelPmeData.h_patchGridOffsets != nullptr) {
1490  deallocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets);
1491  }
1492  if (patchLevelPmeData.d_patchGridOffsets != nullptr) {
1493  deallocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets);
1494  }
1495 
1496  delete[] forwardPlans;
1497  delete[] backwardPlans;
1498 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1499  delete[] gridTexObjArrays;
1500 #endif
1501 
1502 
1503 #endif
1504  deallocate_device<double>(&d_selfEnergy);
1505  if (d_partition != NULL) deallocate_device<int>(&d_partition);
1506  if (d_selfEnergy_FEP != NULL) deallocate_device<double>(&d_selfEnergy_FEP);
1507  if (d_selfEnergy_TI_1 != NULL) deallocate_device<double>(&d_selfEnergy_TI_1);
1508  if (d_selfEnergy_TI_2 != NULL) deallocate_device<double>(&d_selfEnergy_TI_2);
1509  deallocate_device<float>(&d_bm1);
1510  deallocate_device<float>(&d_bm2);
1511  deallocate_device<float>(&d_bm3);
1512  cudaCheck(cudaStreamDestroy(stream));
1513 
1514  if (reductionGpuResident) {
1515  delete reductionGpuResident;
1516  }
1517 }
1518 
1519 void CudaPmeOneDevice::compute(
1520  const Lattice &lattice,
1521 // const CudaAtom *d_atoms,
1522 // CudaForce *d_force,
1523 // int natoms,
1524  int doEnergyVirial,
1525  int step
1526 #if 0
1527  double d_energy[1],
1528  double d_virial[6]
1529 #endif
1530  ) {
1531 // fprintf(stderr, "CudaPmeOneDevice compute ****************************************************\n");
1532  int k1 = pmeGrid.K1;
1533  int k2 = pmeGrid.K2;
1534  int k3 = pmeGrid.K3;
1535  int order = pmeGrid.order;
1536  double volume = lattice.volume();
1537  Vector a_r = lattice.a_r();
1538  Vector b_r = lattice.b_r();
1539  Vector c_r = lattice.c_r();
1540  float arx = a_r.x;
1541  float ary = a_r.y;
1542  float arz = a_r.z;
1543  float brx = b_r.x;
1544  float bry = b_r.y;
1545  float brz = b_r.z;
1546  float crx = c_r.x;
1547  float cry = c_r.y;
1548  float crz = c_r.z;
1549  m_step = step;
1550 
1551  //JM: actually necessary if you reserve a PME device!
1552  cudaCheck(cudaSetDevice(deviceID));
1553  const SimParameters& sim_params = *(Node::Object()->simParameters);
1554 
1555  // clear force array
1556  //fprintf(stderr, "Calling clear_device_array on d_force\n");
1557  clear_device_array<float3>(d_forces, num_used_grids * natoms, stream);
1558  // clear grid
1559  //fprintf(stderr, "Calling clear_device_array on d_grid\n");
1560  clear_device_array<float>(d_grids, num_used_grids * gridsize, stream);
1561  clear_device_array<float2>(d_trans, num_used_grids * transize, stream);
1562 
1563  // Clear energy and virial array if needed
1564  if (doEnergyVirial) {
1565  // clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
1566  clear_device_array<EnergyVirial>(d_energyVirials, num_used_grids * 1, stream);
1567  const bool updateSelfEnergy = (step == sim_params.firstTimestep) || (selfEnergy == 0);
1568  if (updateSelfEnergy && (sim_params.alchOn == false)) {
1569  clear_device_array<double>(d_selfEnergy, 1, stream);
1570  // calculate self energy term if not yet done
1572  kappa, stream);
1573  //fprintf(stderr, "selfEnergy = %12.8f\n", selfEnergy);
1574  }
1575  /* The self energy depends on the scaling factor, i.e. lambda.
1576  * Cases in which the self energy changes:
1577  * 1. If alchLambdaFreq > 0, lambda is scaled linearly and changes EVERY STEP.
1578  * 2. More commonly alchLambdaFreq is not used, but a simulation may still move to another lambda window through TCL scripting.
1579  * In summary, the self energy does not change unless lambda changes,
1580  * so calcSelfEnergyAlch() compares the lambda of the current step with that of the previous step;
1581  * only if lambda has changed are calcSelfEnergyFEPKernel or calcSelfEnergyTIKernel executed again.
1582  */
1583  if (sim_params.alchOn) calcSelfEnergyAlch(m_step);
1584  }
1585 
1586 #if 0
1587 
1588  spread_charge(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1589  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1590  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1591  d_grid, order, stream);
1592 #else
1593  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
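  // Note: order3 is order^3 (the number of grid points each atom spreads to, e.g. 64 for the
  // default 4th-order interpolation) rounded up to the next multiple of WARPSIZE before being
  // passed to the batched spread-charge call in the loop below.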
1594  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1596  d_atoms + iGrid * natoms, natoms, k1, k2, k3,
1597  float(k1), (float)k2, (float)k3, order3,
1598  k1, k2, k3,
1599  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1600  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1601  d_grids + iGrid * gridsize, order, stream);
1602  }
1603 
1604 #endif
1605  //cudaCheck(cudaStreamSynchronize(stream));
1606 
1607  // forward FFT
1608  //fprintf(stderr, "Calling cufftExecR2C\n");
1609  //cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)d_grid,
1610  // (cufftComplex *)d_tran));
1611 
1612  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1613  cufftCheck(cufftExecR2C(forwardPlans[iGrid],
1614  (cufftReal *)(d_grids + iGrid * gridsize),
1615  (cufftComplex *)(d_trans + iGrid * transize)));
1616  }
1617 
1618  //cudaCheck(cudaStreamSynchronize(stream));
1619 
1620  // reciprocal space calculation
1621  //fprintf(stderr, "Calling scalar_sum\n");
1622  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1623  scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1624  kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1625  d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1626  (bool) doEnergyVirial, &(d_energyVirials[iGrid].energy),
1627  d_energyVirials[iGrid].virial, d_trans + iGrid * transize, stream);
1628  }
1629  //scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1630  // kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1631  // d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1632  // (bool) doEnergyVirial, &(d_energyVirial->energy),
1633  // d_energyVirial->virial, d_tran, stream);
1634  //cudaCheck(cudaStreamSynchronize(stream));
1635 
1636  // backward FFT
1637  //fprintf(stderr, "Calling cufftExecC2R\n");
1638  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1639  cufftCheck(cufftExecC2R(backwardPlans[iGrid],
1640  (cufftComplex *)(d_trans + iGrid * transize),
1641  (cufftReal *)(d_grids + iGrid * gridsize)));
1642  }
1643 
1644  //cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)d_tran,
1645  // (cufftReal *)d_grid));
1646  //cudaCheck(cudaStreamSynchronize(stream));
1647 
1648  // gather force from grid to atoms
1649  // missing cudaTextureObject_t below works for __CUDA_ARCH__ >= 350
1650  //fprintf(stderr, "Calling gather_force\n");
1651  for (unsigned int iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1652  gather_force(patchLevelPmeData,
1653  &(d_atoms[iGrid * natoms]), natoms, k1, k2, k3, k1, k2, k3,
1654  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1655  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1656  d_grids + iGrid * gridsize, order, d_forces + iGrid * natoms,
1657 #ifdef NAMD_CUDA
1658  gridTexObjArrays[iGrid] /* cudaTextureObject_t */,
1659 #endif
1660  stream);
1661  }
1662 
1663  //gather_force(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1664  // k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1665  // true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1666  // d_grid, order, d_force, gridTexObj /* cudaTextureObject_t */,
1667  // stream);
1668  //cudaCheck(cudaStreamSynchronize(stream));
1669 
1670  // Copy energy and virial to host if needed
1671  if (doEnergyVirial) {
1672  //fprintf(stderr, "Calling copy_DtoH on d_energyVirial\n");
1673  copy_DtoH<EnergyVirial>(d_energyVirials, h_energyVirials,
1674  num_used_grids, stream);
1675  //cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
1676  //cudaCheck(cudaStreamSynchronize(stream));
1677  }
1678 
1679  // XXX debugging, quick test for borked forces
1680  //clear_device_array<float3>(d_force, natoms, stream);
1681  if (sim_params.alchOn) {
1682  scaleAndMergeForce(m_step);
1683  }
1684 }
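  // Typical call sequence (a minimal sketch; the instance name "pme" is assumed):
  //   pme->compute(lattice, 1 /* doEnergyVirial */, step);
  //   pme->finishReduction(true /* doEnergyVirial */);
  // finishReduction() synchronizes the stream before reading h_energyVirials, so it must
  // run after compute() has enqueued the device-to-host copy above.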
1685 
1686 
1687 // call this after device-host memory transfer has completed
1688 void CudaPmeOneDevice::finishReduction(
1689  bool doEnergyVirial
1690  ) {
1691  cudaCheck(cudaStreamSynchronize(stream));
1692  SubmitReduction *reduction = getCurrentReduction();
1693  if(doEnergyVirial){
1694  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1695  PatchData* patchData = cpdata.ckLocalBranch();
1696  cudaCheck(cudaSetDevice(deviceID));
1697  double virial[9];
1698  double energy, energy_F, energy_TI_1, energy_TI_2;
1699  const SimParameters& sim_params = *(Node::Object()->simParameters);
1700  if (sim_params.alchOn) {
1701  if (sim_params.alchFepOn) {
1702  scaleAndComputeFEPEnergyVirials(h_energyVirials, m_step, energy, energy_F, virial);
1703  energy += selfEnergy;
1704  energy_F += selfEnergy_FEP;
1705  }
1706  if (sim_params.alchThermIntOn) {
1707  scaleAndComputeTIEnergyVirials(h_energyVirials, m_step, energy, energy_TI_1, energy_TI_2, virial);
1708  energy += selfEnergy;
1709  energy_TI_1 += selfEnergy_TI_1;
1710  energy_TI_2 += selfEnergy_TI_2;
1711  }
1712  } else {
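  // scalar_sum accumulates the symmetric virial as 6 packed components
  // (xx, xy, xz, yy, yz, zz); expand them here into the full row-major 3x3
  // tensor expected by the reduction below.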
1713  virial[0] = h_energyVirials[0].virial[0];
1714  virial[1] = h_energyVirials[0].virial[1];
1715  virial[2] = h_energyVirials[0].virial[2];
1716  virial[3] = h_energyVirials[0].virial[1];
1717  virial[4] = h_energyVirials[0].virial[3];
1718  virial[5] = h_energyVirials[0].virial[4];
1719  virial[6] = h_energyVirials[0].virial[2];
1720  virial[7] = h_energyVirials[0].virial[4];
1721  virial[8] = h_energyVirials[0].virial[5];
1722  energy = h_energyVirials[0].energy + selfEnergy;
1723  }
1724  #if 0
1725  fprintf(stderr, "PME ENERGY = %g %g\n", h_energyVirials[0].energy, selfEnergy );
1726  fprintf(stderr, "PME VIRIAL =\n"
1727  " %g %g %g\n %g %g %g\n %g %g %g\n",
1728  virial[0], virial[1], virial[2], virial[3], virial[4],
1729  virial[5], virial[6], virial[7], virial[8]);
1730  #endif
1731  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
1732  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
1733  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
1734  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
1735  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
1736  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
1737  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
1738  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
1739  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
1740  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energy;
1741  if (sim_params.alchFepOn) {
1742  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += energy_F;
1743  }
1744  if (sim_params.alchThermIntOn) {
1745  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += energy_TI_1;
1746  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += energy_TI_2;
1747  }
1748  }
1749  reduction->submit();
1750 }
1751 
1752 void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
1753  SimParameters& sim_params = *(Node::Object()->simParameters);
1754  if (sim_params.alchFepOn) {
1755  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1756  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1757  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1758  static BigReal lambda2Up = sim_params.getElecLambda(alchLambda2);
1759  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1760  static BigReal lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1761  // compute self energy at the first call
1762  // only compute self energy if factors are changed
1763  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1764  (lambda2Up != sim_params.getElecLambda(alchLambda2)) ||
1765  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1766  (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
1767  (step == sim_params.firstTimestep)) {
1768  lambda1Up = sim_params.getElecLambda(alchLambda1);
1769  lambda2Up = sim_params.getElecLambda(alchLambda2);
1770  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1771  lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1772  selfEnergy = 0.0; // self energy for λ_1
1773  selfEnergy_FEP = 0.0; // self energy for λ_2
1774  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1775  cudaCheck(cudaMemsetAsync(d_selfEnergy_FEP, 0, sizeof(double), stream));
1776  calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP, d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);
1778  }
1779  }
1780  if (sim_params.alchThermIntOn) {
1781  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1782  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1783  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1784  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1785  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1786  (step == sim_params.firstTimestep)) {
1787  lambda1Up = sim_params.getElecLambda(alchLambda1);
1788  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1789  selfEnergy = 0.0;
1790  selfEnergy_TI_1 = 0.0;
1791  selfEnergy_TI_2 = 0.0;
1792  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1793  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_1, 0, sizeof(double), stream));
1794  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_2, 0, sizeof(double), stream));
1795  calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
1796  d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda1Down, stream);
1797  }
1798  }
1799 }
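  // Design note: calcSelfEnergyAlch() (and scaleAndMergeForce() below) cache the previous
  // electrostatic lambda factors in function-local statics, so the self-energy kernels and
  // the scale-factor upload rerun only when a factor actually changes, e.g. every step when
  // alchLambdaFreq > 0, or when a TCL script switches lambda windows.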
1800 
1801 void CudaPmeOneDevice::scaleAndMergeForce(int step) {
1802  SimParameters& sim_params = *(Node::Object()->simParameters);
1803  const double alchLambda1 = sim_params.getCurrentLambda(step);
1804  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1805  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1806  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1807  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1808  (step == sim_params.firstTimestep)) {
1809  std::vector<float> scale_factors(num_used_grids);
1810  lambda1Up = sim_params.getElecLambda(alchLambda1);
1811  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1812  scale_factors[0] = lambda1Up;
1813  scale_factors[1] = lambda1Down;
1814  if (sim_params.alchDecouple) {
1815  scale_factors[2] = 1.0 - lambda1Up;
1816  scale_factors[3] = 1.0 - lambda1Down;
1817  }
1818  if (bool(sim_params.alchElecLambdaStart) || sim_params.alchThermIntOn) {
1819  scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
1820  }
1821  copy_HtoD<float>(scale_factors.data(), d_scaling_factors, num_used_grids);
1823  }
1824  scaleAndMergeForceWrapper(d_forces, d_scaling_factors, num_used_grids, natoms, stream);
1825 }
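  // Per-grid scale factors set above: grid 0 gets elecLambda(lambda1), grid 1 gets
  // elecLambda(1 - lambda1); with alchDecouple, grids 2 and 3 get the complementary
  // (1 - factor) weights; the last grid (used with alchElecLambdaStart or TI) gets
  // -(lambdaUp + lambdaDown - 1). scaleAndMergeForceWrapper() then presumably combines
  // the num_used_grids force arrays with these weights.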
1826 
1827 void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_F, double (&virial)[9]) {
1828  double scale1 = 1.0;
1829  double scale2 = 1.0;
1830  energy = 0;
1831  energy_F = 0;
1832  for (unsigned int i = 0; i < 9; ++i) {
1833  virial[i] = 0;
1834  }
1835  SimParameters& sim_params = *(Node::Object()->simParameters);
1836  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1837  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1838  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1839  const BigReal elecLambda2Up = sim_params.getElecLambda(alchLambda2);
1840  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1841  const BigReal elecLambda2Down = sim_params.getElecLambda(1 - alchLambda2);
1842  energy += energyVirials[0].energy * elecLambdaUp;
1843  energy_F += energyVirials[0].energy * elecLambda2Up;
1844  energy += energyVirials[1].energy * elecLambdaDown;
1845  energy_F += energyVirials[1].energy * elecLambda2Down;
1846  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1847  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1848  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1849  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1850  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1851  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1852  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1853  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1854  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1855  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1856  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1857  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1858  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1859  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1860  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1861  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1862  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1863  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1864  if (sim_params.alchDecouple) {
1865  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1866  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
1867  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1868  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
1869  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1870  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1871  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1872  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1873  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1874  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1875  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1876  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1877  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1878  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1879  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1880  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1881  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1882  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1883  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1884  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1885  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1886  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1887  if (sim_params.alchElecLambdaStart > 0) {
1888  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1889  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1890  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1891  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1892  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1893  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1894  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1895  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1896  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1897  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1898  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1899  }
1900  } else {
1901  if (sim_params.alchElecLambdaStart > 0) {
1902  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1903  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1904  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1905  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1906  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1907  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1908  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1909  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1910  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1911  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1912  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1913  }
1914  }
1915 }
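  // In equation form, with E_g the reciprocal-space energy of grid g and s = getElecLambda(.):
  //   energy   = s(l1)*E_0 + s(1-l1)*E_1  [+ (1-s(l1))*E_2 + (1-s(1-l1))*E_3 with alchDecouple]
  //              - (s(l1) + s(1-l1) - 1)*E_last   (only when alchElecLambdaStart > 0)
  //   energy_F = the same combination evaluated with the lambda2 factors.
  // Each grid's virial is scaled by the same factor as its energy.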
1916 
1917 void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
1918  double scale1 = 1.0;
1919  energy = 0;
1920  energy_TI_1 = 0;
1921  energy_TI_2 = 0;
1922  for (unsigned int i = 0; i < 9; ++i) {
1923  virial[i] = 0;
1924  }
1925  SimParameters& sim_params = *(Node::Object()->simParameters);
1926  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1927  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1928  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1929  energy += energyVirials[0].energy * elecLambdaUp;
1930  energy += energyVirials[1].energy * elecLambdaDown;
1931  energy_TI_1 += energyVirials[0].energy;
1932  energy_TI_2 += energyVirials[1].energy;
1933  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1934  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1935  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1936  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1937  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1938  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1939  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1940  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1941  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1942  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1943  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1944  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1945  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1946  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1947  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1948  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1949  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1950  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1951  if (sim_params.alchDecouple) {
1952  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1953  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1954  energy_TI_1 += -1.0 * energyVirials[2].energy;
1955  energy_TI_2 += -1.0 * energyVirials[3].energy;
1956  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1957  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1958  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1959  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1960  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1961  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1962  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1963  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1964  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1965  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1966  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1967  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1968  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1969  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1970  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1971  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1972  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1973  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1974  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1975  energy_TI_1 += -1.0 * energyVirials[4].energy;
1976  energy_TI_2 += -1.0 * energyVirials[4].energy;
1977  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1978  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1979  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1980  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1981  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1982  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1983  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1984  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1985  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1986  } else {
1987  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1988  energy_TI_1 += -1.0 * energyVirials[2].energy;
1989  energy_TI_2 += -1.0 * energyVirials[2].energy;
1990  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1991  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1992  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1993  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1994  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1995  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1996  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1997  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1998  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1999  }
2000 }
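  // energy_TI_1 and energy_TI_2 above collect the derivatives of the combined energy with
  // respect to the up/down electrostatic scale factors: grid 0 (coefficient s_up) adds +E_0
  // to TI_1, grid 1 adds +E_1 to TI_2, the decoupled grids (coefficient 1 - s) add -E, and
  // the extra grid (coefficient -(s_up + s_down - 1)) adds -E to both.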
2001 
2002 
2003 
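  // Map a scaled (fractional) coordinate to a grid index: shift the cell origin from -0.5
  // to 0, multiply by the grid dimension, and floor.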
2004 int CudaPmeOneDevice::getShiftedGrid(const double x, const int grid) {
2005  double w = x + 0.5;
2006  double gw = w * grid;
2007  return floor(gw);
2008 }
2009 
2010 int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
2011  const int numThreads, const int3 patchGridDim, const int order) {
2012 
2013  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2014  const int thetaBytes = PatchLevelPmeData::kDim * (numThreads + PatchLevelPmeData::kThetaPad) *
2015  order * sizeof(float);
2016  const int indexBytes = numThreads * sizeof(char4);
2017 
2018  return gridBytes + thetaBytes + indexBytes;
2019 }
2020 
2021 int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
2022  const int numThreads, const int3 patchGridDim, const int order) {
2023 
2024  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2025  const int thetaBytes = (numThreads + PatchLevelPmeData::kThetaPad) * order *
2026  2 /* theta and dtheta */ * sizeof(float);
2027 
2028  return gridBytes + thetaBytes;
2029 }
2030 
2031 void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ) {
2032  bool use = true;
2033  use = use && (order == 8);
2034  use = use && (periodicY);
2035  use = use && (periodicZ);
2036 
2037  use = use && (deviceCUDA->getNumDevice() == 1); // This is only supported for single GPU currently
2038 
2040 }
2041 
2042 void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility() {
2043  cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);
2044 
2045  const int3 constexprPatchGridDim = make_int3(
2049 
2052  constexprPatchGridDim, 8 /* order */);
2055  constexprPatchGridDim, 8 /* order */);
2056 
2060 }
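  // checkPatchLevelDeviceCompatibility() presumably compares the shared-memory needs of the
  // patch-level kernels, sized with the compile-time patch grid dimensions above, against the
  // device's opt-in per-block limit queried via cudaDevAttrMaxSharedMemoryPerBlockOptin.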
2061 
2062 void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice& lattice,
2063  const int numPatches, const CudaLocalRecord* localRecords,
2064  double3* patchMin, double3* patchMax, double3* awayDists) {
2065 
2066  patchLevelPmeData.localRecords = localRecords;
2067 
2068  // If the simulation isn't compatible or the device isn't compatible then no point in checking
2069  // patch sizes
2071 
2072  patchLevelPmeData.numPatches = numPatches;
2073 
2074  if (patchLevelPmeData.h_patchGridOffsets == nullptr) {
2075  allocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets, numPatches);
2076  }
2077  if (patchLevelPmeData.d_patchGridOffsets == nullptr) {
2078  allocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets, numPatches);
2079  }
2080 
2081  SimParameters *simParams = Node::Object()->simParameters;
2082  const int order = pmeGrid.order;
2083 
2084  // We only need to recompute the grid offsets if the lattice has changed
2085  if (!lattice.isEqual(currentLattice)) {
2086  currentLattice = lattice;
2087 
2088  double sysdima = currentLattice.a_r().unit() * currentLattice.a();
2089  double sysdimb = currentLattice.b_r().unit() * currentLattice.b();
2090  double sysdimc = currentLattice.c_r().unit() * currentLattice.c();
2091 
2092  patchLevelPmeData.patchGridDim = make_int3(0,0,0);
2093 
2094  for (int i = 0; i < numPatches; i++) {
2095  double3 pmin = currentLattice.unscale(patchMin[i]);
2096  double3 pmax = currentLattice.unscale(patchMax[i]);
2097  double3 width = pmax - pmin;
2098 
2099  // Logic copied from margin violation check
2100  double3 marginVal;
2101  marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
2102  marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
2103  marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
2104  marginVal = currentLattice.unscale(marginVal);
2105 
2106  double3 minAtom = pmin - marginVal;
2107  double3 maxAtom = pmax + marginVal;
2108 
2109  double3 minScaled = currentLattice.scale(minAtom);
2110  double3 maxScaled = currentLattice.scale(maxAtom);
2111 
2112  int3 gridMin;
2113  gridMin.x = getShiftedGrid(minScaled.x, pmeGrid.K1);
2114  gridMin.y = getShiftedGrid(minScaled.y, pmeGrid.K2);
2115  gridMin.z = getShiftedGrid(minScaled.z, pmeGrid.K3);
2116 
2117  int3 gridMax;
2118  gridMax.x = getShiftedGrid(maxScaled.x, pmeGrid.K1);
2119  gridMax.y = getShiftedGrid(maxScaled.y, pmeGrid.K2);
2120  gridMax.z = getShiftedGrid(maxScaled.z, pmeGrid.K3);
2121 
2122  int3 gridWidth;
2123  gridWidth.x = gridMax.x - gridMin.x + order;
2124  gridWidth.y = gridMax.y - gridMin.y + order;
2125  gridWidth.z = gridMax.z - gridMin.z + order;
2126 
2127  patchLevelPmeData.h_patchGridOffsets[i] = gridMin;
2128  patchLevelPmeData.patchGridDim.x = std::max(patchLevelPmeData.patchGridDim.x, gridWidth.x);
2129  patchLevelPmeData.patchGridDim.y = std::max(patchLevelPmeData.patchGridDim.y, gridWidth.y);
2130  patchLevelPmeData.patchGridDim.z = std::max(patchLevelPmeData.patchGridDim.z, gridWidth.z);
2131  }
2132  copy_HtoD<int3>(patchLevelPmeData.h_patchGridOffsets, patchLevelPmeData.d_patchGridOffsets,
2133  numPatches, nullptr);
2134  cudaStreamSynchronize(nullptr);
2135  const int maxGridPoints = patchLevelPmeData.patchGridDim.x *
2136  patchLevelPmeData.patchGridDim.y * patchLevelPmeData.patchGridDim.z;
2137 
2142  }
2143 }
2144 
2145 #endif // NAMD_CUDA
2146 