NAMD
CudaPmeSolverUtil.C
1 #include <stdio.h>
2 #include <algorithm>
3 #include <cstdlib>
4 #ifdef NAMD_CUDA
5 #include <cuda_runtime.h>
6 #endif
7 #ifdef NAMD_HIP
8 #include <hip/hip_runtime.h>
9 #endif
10 #include "HipDefines.h"
11 #include "ComputeNonbondedUtil.h"
12 #include "ComputePmeCUDAMgr.h"
13 #include "CudaPmeSolver.h"
14 #include "CudaPmeSolverUtil.h"
15 #include "Node.h"
16 #include "PatchData.h"
17 
18 #include "NamdEventsProfiling.h"
19 #include "TestArray.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
23 #if !defined(WIN64)
24 extern __thread DeviceCUDA *deviceCUDA;
25 #else
26 extern __declspec(thread) DeviceCUDA *deviceCUDA;
27 #endif
28 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
29 
30 void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
31  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
32  float2* h_data = new float2[size];
33  copy_DtoH<float2>(d_data, h_data, size, stream);
34  cudaCheck(cudaStreamSynchronize(stream));
35  FILE *handle = fopen(filename, "w");
36  for (int i=0;i < size;i++)
37  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
38  fclose(handle);
39  delete [] h_data;
40 }
41 
42 void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
43  FILE *handle = fopen(filename, "w");
44  for (int i=0;i < size;i++)
45  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
46  fclose(handle);
47 }
48 
49 void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
50  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
51  float* h_data = new float[size];
52  copy_DtoH<float>(d_data, h_data, size, stream);
53  cudaCheck(cudaStreamSynchronize(stream));
54  FILE *handle = fopen(filename, "w");
55  for (int i=0;i < size;i++)
56  fprintf(handle, "%f\n", h_data[i]);
57  fclose(handle);
58  delete [] h_data;
59 }
60 
61  CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream)
62  : deviceID(deviceID), stream(stream) {
63  }
64 
65 void CudaFFTCompute::plan3D(int *n, int flags) {
66  cudaCheck(cudaSetDevice(deviceID));
67  forwardType = CUFFT_R2C;
68  backwardType = CUFFT_C2R;
69  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
70  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
71  setStream();
72  // plantype = 3;
73 }
74 
75 void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
76  cudaCheck(cudaSetDevice(deviceID));
77  forwardType = CUFFT_R2C;
78  backwardType = CUFFT_C2R;
79  int nt[2] = {n[1], n[0]};
80  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
81  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
82  setStream();
83  // plantype = 2;
84 }
85 
86 void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
87  cudaCheck(cudaSetDevice(deviceID));
88  forwardType = CUFFT_R2C;
89  backwardType = CUFFT_C2R;
90  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
91  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
92  setStream();
93  // plantype = 1;
94 }
95 
96 void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
97  cudaCheck(cudaSetDevice(deviceID));
98  forwardType = CUFFT_C2C;
99  backwardType = CUFFT_C2C;
100  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
101  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
102  setStream();
103  // plantype = 1;
104 }
105 
106 void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
107  cudaCheck(cudaSetDevice(deviceID));
108  forwardType = CUFFT_C2C;
109  backwardType = CUFFT_C2C;
110  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
111  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
112  setStream();
113  // plantype = 1;
114 }
115 
116 CudaFFTCompute::~CudaFFTCompute() {
117  cudaCheck(cudaSetDevice(deviceID));
118  cufftCheck(cufftDestroy(forwardPlan));
119  cufftCheck(cufftDestroy(backwardPlan));
120  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
121  if (dataDstAllocated) deallocate_device<float>(&dataDst);
122 }
123 
124 float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
125  cudaCheck(cudaSetDevice(deviceID));
126  float* tmp = NULL;
127  allocate_device<float>(&tmp, dataSizeRequired);
128  return tmp;
129 }
130 
131 // int ncall = 0;
132 
133 void CudaFFTCompute::forward() {
134  cudaCheck(cudaSetDevice(deviceID));
135  // ncall++;
136  if (forwardType == CUFFT_R2C) {
137  cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));
138 #ifdef TESTPID
139  if (1) {
140  cudaCheck(cudaStreamSynchronize(stream));
141  fprintf(stderr, "AP FORWARD FFT\n");
142  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
143  int m = dataDstSize;
144  float *tran = 0;
145  allocate_host<float>(&tran, m);
146  copy_DtoH<float>(dataDst, tran, m, stream);
147  cudaCheck(cudaStreamSynchronize(stream));
148  TestArray_write<float>("tran_charge_grid_good.bin",
149  "transformed charge grid good", tran, m);
150  deallocate_host<float>(&tran);
151  }
152 #endif
153 
154  // if (ncall == 1) {
155  // writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
156  // }
157 
158  // if (ncall == 1 && plantype == 2) {
159  // writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
160  // }
161 
162  } else if (forwardType == CUFFT_C2C) {
163  // nc2cf++;
164  // if (ncall == 1 && nc2cf == 1)
165  // writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
166  // else if (ncall == 1 && nc2cf == 2)
167  // writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
168  cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
169  // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
170  // if (ncall == 1 && plantype == 1 && isize == 62) {
171  // writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
172  // }
173  // if (ncall == 1 && nc2cf == 1)
174  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
175  // else if (ncall == 1 && nc2cf == 2)
176  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
177  } else {
178  cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
179  }
180 }
181 
182 void CudaFFTCompute::backward() {
183  cudaCheck(cudaSetDevice(deviceID));
184  if (backwardType == CUFFT_C2R) {
185  // if (ncall == 1) {
186  // if (plantype == 1)
187  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
188  // else
189  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
190  // }
191  cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));
192 #ifdef TESTPID
193  if (1) {
194  cudaCheck(cudaStreamSynchronize(stream));
195  fprintf(stderr, "AP BACKWARD FFT\n");
196  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
197  float *grid;
198  int gridsize = dataSrcSize;
199  allocate_host<float>(&grid, gridsize);
200  copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
201  cudaCheck(cudaStreamSynchronize(stream));
202  TestArray_write<float>("potential_grid_good.bin",
203  "potential grid good", grid, gridsize);
204  deallocate_host<float>(&grid);
205  }
206 #endif
207 
208  // if (ncall == 1)
209  // if (plantype == 1)
210  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
211  // else
212  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
213  } else if (backwardType == CUFFT_C2C) {
214  // nc2cb++;
215  // if (ncall == 1 && nc2cb == 1)
216  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
217  // else if (ncall == 1 && nc2cb == 2)
218  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
219  cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
220  // if (ncall == 1 && nc2cb == 1)
221  // writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
222  // else if (ncall == 1 && nc2cb == 2)
223  // writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
224  } else {
225  cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
226  }
227 }
228 
229 void CudaFFTCompute::setStream() {
230  cudaCheck(cudaSetDevice(deviceID));
231  cufftCheck(cufftSetStream(forwardPlan, stream));
232  cufftCheck(cufftSetStream(backwardPlan, stream));
233 }
234 
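The plan* methods above build matched single-precision cuFFT plans (R2C forward / C2R backward along the real dimension, C2C both ways for the Y and Z pencils), and forward()/backward() execute them on the class's stream. Below is a minimal, self-contained sketch of the same round trip using the plain cuFFT API; the function and buffer names are illustrative, error checking is omitted, and note that cuFFT leaves the inverse transform unnormalized, so the round trip returns the input scaled by nx*ny*nz.

    #include <cuda_runtime.h>
    #include <cufft.h>

    // Illustrative R2C/C2R round trip on an nx*ny*nz grid (error checks omitted).
    void fftRoundTrip(float* d_real, cufftComplex* d_cplx,
                      int nx, int ny, int nz, cudaStream_t stream) {
      cufftHandle fwd, bwd;
      // cuFFT takes dimensions slowest-to-fastest; passing (nz, ny, nx) keeps
      // nx as the contiguous dimension, matching the plans created above.
      cufftPlan3d(&fwd, nz, ny, nx, CUFFT_R2C);
      cufftPlan3d(&bwd, nz, ny, nx, CUFFT_C2R);
      cufftSetStream(fwd, stream);
      cufftSetStream(bwd, stream);
      cufftExecR2C(fwd, d_real, d_cplx);   // real grid -> (nx/2+1)*ny*nz complex values
      cufftExecC2R(bwd, d_cplx, d_real);   // back to real space, scaled by nx*ny*nz
      cufftDestroy(fwd);
      cufftDestroy(bwd);
    }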
235 
236 CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
237  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid) :
238  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa, iGrid),
239  deviceID(deviceID), stream(stream) {
240 
241  cudaCheck(cudaSetDevice(deviceID));
242 
243  // Copy bm1 -> prefac_x on GPU memory
244  float *bm1f = new float[pmeGrid.K1];
245  float *bm2f = new float[pmeGrid.K2];
246  float *bm3f = new float[pmeGrid.K3];
247  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
248  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
249  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
250  allocate_device<float>(&d_bm1, pmeGrid.K1);
251  allocate_device<float>(&d_bm2, pmeGrid.K2);
252  allocate_device<float>(&d_bm3, pmeGrid.K3);
253  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
254  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
255  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
256  delete [] bm1f;
257  delete [] bm2f;
258  delete [] bm3f;
259  allocate_device<EnergyVirial>(&d_energyVirial, 1);
260  allocate_host<EnergyVirial>(&h_energyVirial, 1);
261  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
262  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
263  // ncall = 0;
264 }
265 
266 CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
267  cudaCheck(cudaSetDevice(deviceID));
268  deallocate_device<float>(&d_bm1);
269  deallocate_device<float>(&d_bm2);
270  deallocate_device<float>(&d_bm3);
271  deallocate_device<EnergyVirial>(&d_energyVirial);
272  deallocate_host<EnergyVirial>(&h_energyVirial);
273  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
274 }
275 
276 void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
277 #if 0
278  // Check lattice to make sure it is updating for constant pressure
279  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
280  lattice.a().x, lattice.a().y, lattice.a().z,
281  lattice.b().x, lattice.b().y, lattice.b().z,
282  lattice.c().x, lattice.c().y, lattice.c().z);
283 #endif
284  cudaCheck(cudaSetDevice(deviceID));
285 
286  const bool doEnergyVirial = (doEnergy || doVirial);
287 
288  int nfft1, nfft2, nfft3;
289  float *prefac1, *prefac2, *prefac3;
290 
291  BigReal volume = lattice.volume();
292  Vector a_r = lattice.a_r();
293  Vector b_r = lattice.b_r();
294  Vector c_r = lattice.c_r();
295  float recip1x, recip1y, recip1z;
296  float recip2x, recip2y, recip2z;
297  float recip3x, recip3y, recip3z;
298 
299  if (permutation == Perm_Z_cX_Y) {
300  // Z, X, Y
301  nfft1 = pmeGrid.K3;
302  nfft2 = pmeGrid.K1;
303  nfft3 = pmeGrid.K2;
304  prefac1 = d_bm3;
305  prefac2 = d_bm1;
306  prefac3 = d_bm2;
307  recip1x = c_r.z;
308  recip1y = c_r.x;
309  recip1z = c_r.y;
310  recip2x = a_r.z;
311  recip2y = a_r.x;
312  recip2z = a_r.y;
313  recip3x = b_r.z;
314  recip3y = b_r.x;
315  recip3z = b_r.y;
316  } else if (permutation == Perm_cX_Y_Z) {
317  // X, Y, Z
318  nfft1 = pmeGrid.K1;
319  nfft2 = pmeGrid.K2;
320  nfft3 = pmeGrid.K3;
321  prefac1 = d_bm1;
322  prefac2 = d_bm2;
323  prefac3 = d_bm3;
324  recip1x = a_r.x;
325  recip1y = a_r.y;
326  recip1z = a_r.z;
327  recip2x = b_r.x;
328  recip2y = b_r.y;
329  recip2z = b_r.z;
330  recip3x = c_r.x;
331  recip3y = c_r.y;
332  recip3z = c_r.z;
333  } else {
334  NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
335  }
336 
337  // ncall++;
338  // if (ncall == 1) {
339  // char filename[256];
340  // sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
341  // writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
342  // }
343 
344  // if (ncall == 1) {
345  // float2* h_data = new float2[size1*size2*size3];
346  // float2* d_data = (float2*)data;
347  // copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
348  // cudaCheck(cudaStreamSynchronize(stream));
349  // FILE *handle = fopen("dataf.txt", "w");
350  // for (int z=0;z < pmeGrid.K3;z++) {
351  // for (int y=0;y < pmeGrid.K2;y++) {
352  // for (int x=0;x < pmeGrid.K1/2+1;x++) {
353  // int i;
354  // if (permutation == Perm_cX_Y_Z) {
355  // i = x + y*size1 + z*size1*size2;
356  // } else {
357  // i = z + x*size1 + y*size1*size2;
358  // }
359  // fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
360  // }
361  // }
362  // }
363  // fclose(handle);
364  // delete [] h_data;
365  // }
366 
367  // Clear energy and virial array if needed
368  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
369 
370 #ifdef TESTPID
371  if (1) {
372  cudaCheck(cudaStreamSynchronize(stream));
373  fprintf(stderr, "AP calling scalar sum\n");
374  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
375  (permutation == Perm_cX_Y_Z ? "true" : "false"));
376  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
377  fprintf(stderr, "size1=%d size2=%d size3=%d\n", size1, size2, size3);
378  fprintf(stderr, "kappa=%g\n", kappa);
379  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
380  (double)recip1x, (double)recip1y, (double)recip1z);
381  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
382  (double)recip2x, (double)recip2y, (double)recip2z);
383  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
384  (double)recip3x, (double)recip3y, (double)recip3z);
385  fprintf(stderr, "volume=%g\n", volume);
386  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
387  float *bm1, *bm2, *bm3;
388  allocate_host<float>(&bm1, nfft1);
389  allocate_host<float>(&bm2, nfft2);
390  allocate_host<float>(&bm3, nfft3);
391  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
392  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
393  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
394  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good",
395  bm1, nfft1);
396  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good",
397  bm2, nfft2);
398  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good",
399  bm3, nfft3);
400  deallocate_host<float>(&bm1);
401  deallocate_host<float>(&bm2);
402  deallocate_host<float>(&bm3);
403  }
404 #endif
405 
406  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
407  recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
408  volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
409  &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
410  stream);
411 #ifdef TESTPID
412  if (1) {
413  cudaCheck(cudaStreamSynchronize(stream));
414  fprintf(stderr, "AP SCALAR SUM\n");
415  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
416  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
417  float *tran = 0;
418  allocate_host<float>(&tran, m);
419  copy_DtoH<float>((float*)data, tran, m, stream);
420  cudaCheck(cudaStreamSynchronize(stream));
421  TestArray_write<float>("tran_potential_grid_good.bin",
422  "transformed potential grid good", tran, m);
423  deallocate_host<float>(&tran);
424  }
425 #endif
426 
427  // Copy energy and virial to host if needed
428  if (doEnergyVirial) {
429  copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
430  cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
431  // cudaCheck(cudaStreamSynchronize(stream));
432  }
433 
434 }
435 
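solve() above permutes the reciprocal lattice vectors and B-spline prefactors into the current pencil's transform order and then calls scalar_sum, which weights every transformed grid value by, in essence, the smooth-PME reciprocal-space factor of Essmann et al. (1995). The host-side sketch below shows that per-mode factor; the function name and signature are illustrative, not NAMD's API. The reciprocal-space energy is then the sum over modes of factor(m) * B1(m1)*B2(m2)*B3(m3) * |S(m)|^2, where S is the FFT of the spread charge grid and B1..B3 are the per-dimension B-spline moduli held in prefac1..prefac3.

    #include <cmath>

    // Textbook smooth-PME reciprocal-space weight for one mesh mode (m1,m2,m3).
    // Mode indices above K/2 wrap to negative frequencies (m -> m - K) before
    // this is evaluated.  All names here are illustrative.
    double pmeReciprocalFactor(int m1, int m2, int m3,
                               const double recip1[3], const double recip2[3],
                               const double recip3[3],
                               double kappa, double volume) {
      const double pi = 3.14159265358979323846;
      // m = m1*a_r + m2*b_r + m3*c_r, the reciprocal lattice vector of this mode
      double mx = m1*recip1[0] + m2*recip2[0] + m3*recip3[0];
      double my = m1*recip1[1] + m2*recip2[1] + m3*recip3[1];
      double mz = m1*recip1[2] + m2*recip2[2] + m3*recip3[2];
      double msq = mx*mx + my*my + mz*mz;
      if (msq == 0.0) return 0.0;  // the m = 0 term is excluded
      return std::exp(-pi*pi*msq/(kappa*kappa)) / (2.0*pi*volume*msq);
    }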
436 // void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
437 // cudaCheck(cudaSetDevice(deviceID));
438 // cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
439 // }
440 
441 void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
442  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;
443 
444  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
445  if (err == cudaSuccess) {
446  // Event has occurred
447  c->checkCount = 0;
448  if (c->pencilXYZPtr != NULL)
449  c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
450  else if (c->pencilZPtr != NULL)
451  c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
452  else
453  NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
454  return;
455  } else if (err == cudaErrorNotReady) {
456  // Event has not occurred
457  c->checkCount++;
458  if (c->checkCount >= 1000000) {
459  char errmsg[256];
460  sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
461  c->checkCount);
462  cudaDie(errmsg,err);
463  }
464  } else {
465  // Anything else is an error
466  char errmsg[256];
467  sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
468  c->checkCount);
469  cudaDie(errmsg,err);
470  }
471 
472  // Call again
473  CcdCallBacksReset(0, walltime);
474  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
475 }
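energyAndVirialCheck() above (and cuda_gatherforce_check() later in this file) follow the same non-blocking pattern: poll a recorded CUDA event from a Charm++ timed callback and fire the completion handler only once the event has completed, rescheduling otherwise. Below is a stripped-down sketch of the idiom, reusing helpers this file already uses (cudaEventQuery, cudaDie, NAMD_bug, CcdCallBacksReset, CcdCallFnAfter); the PollState struct and onDone callback are illustrative names, not NAMD types.

    // Generic event-polling callback: query the event, finish on success,
    // otherwise re-arm the timed callback.
    struct PollState {
      cudaEvent_t event;        // event recorded on the stream being waited on
      int checkCount;           // number of unsuccessful polls so far
      void (*onDone)(void*);    // completion handler (illustrative)
      void* userData;
    };

    static void pollEvent(void* arg, double walltime) {
      PollState* s = (PollState*)arg;
      cudaError_t err = cudaEventQuery(s->event);
      if (err == cudaSuccess) {
        s->checkCount = 0;
        s->onDone(s->userData);                 // GPU work has completed
        return;
      } else if (err != cudaErrorNotReady) {
        cudaDie("pollEvent, cudaEventQuery failed", err);
      }
      if (++s->checkCount >= 1000000)
        NAMD_bug("pollEvent polled too many times");
      CcdCallBacksReset(0, walltime);
      CcdCallFnAfter(pollEvent, arg, 0.1);      // poll again after 0.1 ms
    }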
476 
477 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
478  cudaCheck(cudaSetDevice(deviceID));
479  pencilXYZPtr = pencilPtr;
480  pencilZPtr = NULL;
481  checkCount = 0;
482  CcdCallBacksReset(0, CmiWallTimer());
483  // Set the call back at 0.1ms
484  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
485 }
486 
487 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
488  cudaCheck(cudaSetDevice(deviceID));
489  pencilXYZPtr = NULL;
490  pencilZPtr = pencilPtr;
491  checkCount = 0;
492  CcdCallBacksReset(0, CmiWallTimer());
493  // Set the call back at 0.1ms
494  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
495 }
496 
497 double CudaPmeKSpaceCompute::getEnergy() {
498  return h_energyVirial->energy;
499 }
500 
501 void CudaPmeKSpaceCompute::getVirial(double *virial) {
502  if (permutation == Perm_Z_cX_Y) {
503  // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
504  virial[0] = h_energyVirial->virial[3];
505  virial[1] = h_energyVirial->virial[4];
506  virial[2] = h_energyVirial->virial[1];
507 
508  virial[3] = h_energyVirial->virial[4];
509  virial[4] = h_energyVirial->virial[5];
510  virial[5] = h_energyVirial->virial[2];
511 
512  virial[6] = h_energyVirial->virial[1];
513  virial[7] = h_energyVirial->virial[2];
514  virial[8] = h_energyVirial->virial[0];
515  } else if (permutation == Perm_cX_Y_Z) {
516  // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
517  virial[0] = h_energyVirial->virial[0];
518  virial[1] = h_energyVirial->virial[1];
519  virial[2] = h_energyVirial->virial[2];
520 
521  virial[3] = h_energyVirial->virial[1];
522  virial[4] = h_energyVirial->virial[3];
523  virial[5] = h_energyVirial->virial[4];
524 
525  virial[6] = h_energyVirial->virial[2];
526  virial[7] = h_energyVirial->virial[4];
527  virial[8] = h_energyVirial->virial[5];
528  }
529 #if 0
530  fprintf(stderr, "AP PME VIRIAL =\n"
531  " %g %g %g\n %g %g %g\n %g %g %g\n",
532  virial[0], virial[1], virial[2], virial[3], virial[4],
533  virial[5], virial[6], virial[7], virial[8]);
534 #endif
535 }
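The device kernel accumulates only the six independent components of the symmetric PME virial; getVirial() above expands them into the full row-major 3x3 tensor, remapping indices when the pencil data were permuted. Below is a minimal sketch of that expansion for the Perm_cX_Y_Z storage order (XX, XY, XZ, YY, YZ, ZZ); expandSymmetricVirial is an illustrative helper, not part of NAMD.

    // Expand 6 stored components of a symmetric tensor into a 3x3 row-major matrix.
    inline void expandSymmetricVirial(const double v6[6], double v9[9]) {
      v9[0] = v6[0];  v9[1] = v6[1];  v9[2] = v6[2];   // XX XY XZ
      v9[3] = v6[1];  v9[4] = v6[3];  v9[5] = v6[4];   // YX YY YZ
      v9[6] = v6[2];  v9[7] = v6[4];  v9[8] = v6[5];   // ZX ZY ZZ
    }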
536 
537 
538 //###########################################################################
539 //###########################################################################
540 //###########################################################################
541 
542 //
543 // Class constructor
544 //
545 CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
546  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
547  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
548  if (dataSize < xsize*ysize*zsize)
549  NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
550  cudaCheck(cudaSetDevice(deviceID));
551  d_atomsCapacity = 0;
552  d_atoms = NULL;
553  d_forceCapacity = 0;
554  d_force = NULL;
555  #ifdef NAMD_CUDA
556  tex_data = NULL;
557  tex_data_len = 0;
558  #else
559  grid_data = NULL;
560  grid_data_len = 0;
561  #endif
562  allocate_device<float>(&data, dataSize);
563  setupGridData(data, xsize*ysize*zsize);
564  cudaCheck(cudaEventCreate(&gatherForceEvent));
565 }
566 
567 //
568 // Class destructor
569 //
570 CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
571  cudaCheck(cudaSetDevice(deviceID));
572  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
573  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
574  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
575  // deallocate_device<double>(&d_selfEnergy);
576  deallocate_device<float>(&data);
577  cudaCheck(cudaEventDestroy(gatherForceEvent));
578 }
579 
580 // //
581 // // Copy patches and atoms to device memory
582 // //
583 // void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
584 // const int numAtoms, const CudaAtom* atoms) {
585 
586 // this->numPatches = numPatches;
587 // this->numAtoms = numAtoms;
588 
589 // // Reallocate device arrays as necessary
590 // reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
591 // reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);
592 
593 // // Copy atom and patch data to device
594 // copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
595 // copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
596 // }
597 
598 //
599 // Copy atoms to device memory
600 //
601 void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
602  cudaCheck(cudaSetDevice(deviceID));
603  this->numAtoms = numAtoms;
604 
605  // Reallocate device arrays as necessary
606  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
607 
608  // Copy atom data to device
609  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
610 }
611 
612 //
613 // Spread charges on grid
614 //
615 void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
616  cudaCheck(cudaSetDevice(deviceID));
617 #if 0
618  if (1) {
619  static int step = 0;
620  float *xyzq;
621  int natoms = numAtoms;
622  allocate_host<float>(&xyzq, 4*natoms);
623  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
624  cudaCheck(cudaStreamSynchronize(stream));
625  char fname[64], remark[64];
626  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
627  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
628  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
629  deallocate_host<float>(&xyzq);
630  step += 2;
631  }
632 #endif
633 
634  NAMD_EVENT_START(1, NamdProfileEvent::SPREAD_CHARGE);
635 
636  // Clear grid
637  clear_device_array<float>(data, xsize*ysize*zsize, stream);
638 
639 #if defined(TESTPID)
640  fprintf(stderr, "Calling spread_charge with parameters:\n");
641  fprintf(stderr, "numAtoms = %d\n", numAtoms);
642  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
643  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
644  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
645  fprintf(stderr, "xsize = %d\n", xsize);
646  fprintf(stderr, "ysize = %d\n", ysize);
647  fprintf(stderr, "zsize = %d\n", zsize);
648  fprintf(stderr, "y0 = %d\n", y0);
649  fprintf(stderr, "z0 = %d\n", z0);
650  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
651  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
652  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
653 #endif
654  spread_charge((const float4*)d_atoms, numAtoms,
655  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
656  xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
657  data, pmeGrid.order, stream);
658 #ifdef TESTPID
659  if (1) {
660  cudaCheck(cudaStreamSynchronize(stream));
661  fprintf(stderr, "AP SPREAD CHARGES\n");
662  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
663  float *xyzq;
664  allocate_host<float>(&xyzq, 4*numAtoms);
665  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
666  int gridlen = pmeGrid.K1 * pmeGrid.K2 * pmeGrid.K3;
667  float *grid;
668  allocate_host<float>(&grid, gridlen);
669  copy_DtoH<float>(data, grid, gridlen, stream);
670  cudaCheck(cudaStreamSynchronize(stream));
671  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
672  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
673  grid, gridlen);
674  deallocate_host<float>(&xyzq);
675  deallocate_host<float>(&grid);
676  }
677 #endif
678 
679  // ncall++;
680 
681  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
682  NAMD_EVENT_STOP(1, NamdProfileEvent::SPREAD_CHARGE);
683 }
684 
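spreadCharge() above deposits each charge onto order^3 neighboring grid points, weighted per dimension by cardinal B-spline values of the atom's fractional grid coordinate. The host-side sketch below shows the standard B-spline weight recursion such a spreading kernel relies on (pmeGrid.order is typically 4 or 8); bsplineWeights is an illustrative name and theta must hold at least order entries.

    // Cardinal B-spline weights for fractional offset w in [0,1) at the given
    // interpolation order (textbook recursion; names are illustrative).
    void bsplineWeights(float w, int order, float* theta) {
      theta[order-1] = 0.0f;
      theta[1] = w;
      theta[0] = 1.0f - w;
      for (int k = 3; k <= order; k++) {
        float div = 1.0f / (float)(k - 1);
        theta[k-1] = div * w * theta[k-2];
        for (int j = 1; j <= k-2; j++)
          theta[k-j-1] = div * ((w + j) * theta[k-j-2] + (k - j - w) * theta[k-j-1]);
        theta[0] = div * (1.0f - w) * theta[0];
      }
    }
    // Each charge q is then added to the grid as q * thetaX[i] * thetaY[j] * thetaZ[k]
    // over the order^3 surrounding grid points.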
685 void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
686  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
687  cudaCheck(cudaSetDevice(c->deviceID));
688 
689  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
690  if (err == cudaSuccess) {
691  // Event has occurred
692  c->checkCount = 0;
693 // c->deviceProxy[CkMyNode()].gatherForceDone();
694  c->devicePtr->gatherForceDone(c->multipleGridIndex);
695  return;
696  } else if (err == cudaErrorNotReady) {
697  // Event has not occurred
698  c->checkCount++;
699  if (c->checkCount >= 1000000) {
700  char errmsg[256];
701  sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
702  c->checkCount);
703  cudaDie(errmsg,err);
704  }
705  } else {
706  // Anything else is an error
707  char errmsg[256];
708  sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
709  c->checkCount);
710  cudaDie(errmsg,err);
711  }
712 
713  // Call again
714  CcdCallBacksReset(0, walltime);
715  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
716 }
717 
718 void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
719  cudaCheck(cudaSetDevice(deviceID));
720  devicePtr = devicePtr_in;
721  checkCount = 0;
722  CcdCallBacksReset(0, CmiWallTimer());
723  // Set the call back at 0.1ms
724  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
725 }
726 
727 void CudaPmeRealSpaceCompute::waitGatherForceDone() {
728  cudaCheck(cudaSetDevice(deviceID));
729  cudaCheck(cudaEventSynchronize(gatherForceEvent));
730 }
731 
732 void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
733  #ifdef NAMD_CUDA
734  // hipCreateTextureObject fails at runtime under HIP; the texture object is no longer needed there anyway, so the HIP build bypasses it. Sample failure for reference:
735  /*
736 
737  FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
738  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
739 ------------- Processor 11 Exiting: Called CmiAbort ------------
740 Reason: FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
741  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
742 
743  */
744  if (tex_data == data && tex_data_len == data_len) return;
745  tex_data = data;
746  tex_data_len = data_len;
747  // Use texture objects
748  cudaResourceDesc resDesc;
749  memset(&resDesc, 0, sizeof(resDesc));
750  resDesc.resType = cudaResourceTypeLinear;
751  resDesc.res.linear.devPtr = data;
752  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
753  resDesc.res.linear.desc.x = sizeof(float)*8;
754  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
755  cudaTextureDesc texDesc;
756  memset(&texDesc, 0, sizeof(texDesc));
757  texDesc.readMode = cudaReadModeElementType;
758  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
759  #else
760  if (grid_data == data && grid_data_len == data_len) return;
761  grid_data = data;
762  grid_data_len = data_len;
763  #endif
764 }
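On CUDA builds, setupGridData() above wraps the real-space grid in a linear texture object so the force-gathering kernel can read it through the texture cache; the HIP build just keeps the raw grid_data pointer. Below is a minimal sketch of how a kernel consumes such a texture object (the kernel and its launch are illustrative, not NAMD's gather kernel).

    // Sum all grid values through a linear float texture object such as gridTexObj.
    __global__ void sumGridTex(cudaTextureObject_t tex, int n, float* out) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float v = tex1Dfetch<float>(tex, i);  // element i, read via the texture path
        atomicAdd(out, v);
      }
    }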
765 
766 void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
767  cudaCheck(cudaSetDevice(deviceID));
768 
769  NAMD_EVENT_START(1, NamdProfileEvent::GATHER_FORCE);
770 
771  // Re-allocate force array if needed
772  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
773 
774 #ifdef TESTPID
775  if (1) {
776  fprintf(stderr, "AP gather force arguments\n");
777  fprintf(stderr, "numAtoms = %d\n", numAtoms);
778  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
779  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
780  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
781  fprintf(stderr, "xsize = %d\n", xsize);
782  fprintf(stderr, "ysize = %d\n", ysize);
783  fprintf(stderr, "zsize = %d\n", zsize);
784  fprintf(stderr, "y0 = %d\n", y0);
785  fprintf(stderr, "z0 = %d\n", z0);
786  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
787  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
788  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
789  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
790  }
791 #endif
792  // The patch-level PME kernels are only used for the GPU-resident code path. The default constructor
793  // of PatchLevelPmeData will initialize the compatibility variables to false, so the patch-level kernels
794  // won't be used here.
795  PatchLevelPmeData patchLevelPmeData;
796  gather_force(patchLevelPmeData,
797  (const float4*)d_atoms, numAtoms,
798  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
799  xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
800  data, pmeGrid.order, (float3*)d_force,
801 #ifdef NAMD_CUDA
802  gridTexObj,
803 #endif
804  stream);
805 #ifdef TESTPID
806  if (1) {
807  cudaCheck(cudaStreamSynchronize(stream));
808  fprintf(stderr, "AP GATHER FORCE\n");
809  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
810  float *xyz;
811  int natoms = numAtoms;
812  allocate_host<float>(&xyz, 3*natoms);
813  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
814  cudaCheck(cudaStreamSynchronize(stream));
815  TestArray_write<float>("gather_force_good.bin",
816  "gather force good", xyz, 3*natoms);
817  deallocate_host<float>(&xyz);
818  }
819 #endif
820 
821  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
822 
823  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
824 
825  NAMD_EVENT_STOP(1, NamdProfileEvent::GATHER_FORCE);
826 }
827 
828 /*
829 double CudaPmeRealSpaceCompute::calcSelfEnergy() {
830  double h_selfEnergy;
831  clear_device_array<double>(d_selfEnergy, 1);
832  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
833  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
834  cudaCheck(cudaStreamSynchronize(stream));
835  // 1.7724538509055160273 = sqrt(pi)
836  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
837  return h_selfEnergy;
838 }
839 */
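The commented-out calcSelfEnergy() above applies the standard Ewald self-energy correction, E_self = -kappa/sqrt(pi) * sum_i q_i^2, with kappa = ewaldcof. Below is a host-side sketch of the same formula (ewaldSelfEnergy and its arguments are illustrative names).

    // Ewald self-energy correction: -kappa/sqrt(pi) * sum of squared charges.
    double ewaldSelfEnergy(const float* charges, int n, double kappa) {
      double q2sum = 0.0;
      for (int i = 0; i < n; i++) q2sum += (double)charges[i] * charges[i];
      return -kappa * q2sum / 1.7724538509055160273;  // 1.77245... = sqrt(pi)
    }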
840 
841 //###########################################################################
842 //###########################################################################
843 //###########################################################################
844 
845 CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
846  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
847  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
848  cudaCheck(cudaSetDevice(deviceID));
849 
850  allocate_device<float2>(&d_data, dataSize);
851 #ifndef P2P_ENABLE_3D
852  allocate_device<float2>(&d_buffer, dataSize);
853 #endif
854 
855  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
856  dataPtrsYZX.resize(nblock, NULL);
857  dataPtrsZXY.resize(nblock, NULL);
858 
859  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
860  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
861 }
862 
863 CudaPmeTranspose::~CudaPmeTranspose() {
864  cudaCheck(cudaSetDevice(deviceID));
865  deallocate_device<float2>(&d_data);
866 #ifndef P2P_ENABLE_3D
867  deallocate_device<float2>(&d_buffer);
868 #endif
869  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
870  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
871 }
872 
873 //
874 // Set dataPtrsYZX
875 //
876 void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
877  if (dataPtrsYZX.size() != dataPtrsNew.size())
878  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
879  for (int iblock=0;iblock < nblock;iblock++) {
880  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
881  }
882  // Build batched data structures
883  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
884 
885  for (int iperm=0;iperm < 3;iperm++) {
886  int isize_out;
887  if (iperm == 0) {
888  // Perm_Z_cX_Y:
889  // ZXY -> XYZ
890  isize_out = pmeGrid.K1/2+1;
891  } else if (iperm == 1) {
892  // Perm_cX_Y_Z:
893  // XYZ -> YZX
894  isize_out = pmeGrid.K2;
895  } else {
896  // Perm_Y_Z_cX:
897  // YZX -> ZXY
898  isize_out = pmeGrid.K3;
899  }
900 
901  int max_nx = 0;
902  for (int iblock=0;iblock < nblock;iblock++) {
903 
904  int x0 = pos[iblock];
905  int nx = pos[iblock+1] - x0;
906  max_nx = std::max(max_nx, nx);
907 
908  int width_out;
909  float2* data_out;
910  if (dataPtrsYZX[iblock] == NULL) {
911  // Local transpose, use internal buffer
912  data_out = d_data + jsize*ksize*x0;
913  width_out = jsize;
914  } else {
915  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
916  data_out = dataPtrsYZX[iblock];
917  width_out = isize_out;
918  }
919 
920  TransposeBatch<float2> batch;
921  batch.nx = nx;
922  batch.ysize_out = width_out;
923  batch.zsize_out = ksize;
924  batch.data_in = data+x0;
925  batch.data_out = data_out;
926 
927  h_batchesYZX[iperm*nblock + iblock] = batch;
928 
929  // transpose_xyz_yzx(
930  // nx, jsize, ksize,
931  // isize, jsize,
932  // width_out, ksize,
933  // data+x0, data_out, stream);
934  }
935 
936  max_nx_YZX[iperm] = max_nx;
937  }
938 
939  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
940  cudaCheck(cudaStreamSynchronize(stream));
941  delete [] h_batchesYZX;
942 }
943 
944 //
945 // Set dataPtrsZXY
946 //
947 void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
948  if (dataPtrsZXY.size() != dataPtrsNew.size())
949  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
950  for (int iblock=0;iblock < nblock;iblock++) {
951  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
952  }
953 
954  // Build batched data structures
955  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
956 
957  for (int iperm=0;iperm < 3;iperm++) {
958  int isize_out;
959  if (iperm == 0) {
960  // Perm_cX_Y_Z:
961  // XYZ -> ZXY
962  isize_out = pmeGrid.K3;
963  } else if (iperm == 1) {
964  // Perm_Z_cX_Y:
965  // ZXY -> YZX
966  isize_out = pmeGrid.K2;
967  } else {
968  // Perm_Y_Z_cX:
969  // YZX -> XYZ
970  isize_out = pmeGrid.K1/2+1;
971  }
972 
973  int max_nx = 0;
974  for (int iblock=0;iblock < nblock;iblock++) {
975 
976  int x0 = pos[iblock];
977  int nx = pos[iblock+1] - x0;
978  max_nx = std::max(max_nx, nx);
979 
980  int width_out;
981  float2* data_out;
982  if (dataPtrsZXY[iblock] == NULL) {
983  // Local transpose, use internal buffer
984  data_out = d_data + jsize*ksize*x0;
985  width_out = ksize;
986  } else {
987  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
988  data_out = dataPtrsZXY[iblock];
989  width_out = isize_out;
990  }
991 
992  TransposeBatch<float2> batch;
993  batch.nx = nx;
994  batch.zsize_out = width_out;
995  batch.xsize_out = nx;
996  batch.data_in = data+x0;
997  batch.data_out = data_out;
998  h_batchesZXY[iperm*nblock + iblock] = batch;
999  }
1000 
1001  max_nx_ZXY[iperm] = max_nx;
1002  }
1003 
1004  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1005  cudaCheck(cudaStreamSynchronize(stream));
1006  delete [] h_batchesZXY;
1007 }
1008 
1009 void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
1010  cudaCheck(cudaSetDevice(deviceID));
1011 
1012  int iperm;
1013  switch(permutation) {
1014  case Perm_Z_cX_Y:
1015  // ZXY -> XYZ
1016  iperm = 0;
1017  break;
1018  case Perm_cX_Y_Z:
1019  // XYZ -> YZX
1020  iperm = 1;
1021  break;
1022  case Perm_Y_Z_cX:
1023  // YZX -> ZXY
1024  iperm = 2;
1025  break;
1026  default:
1027  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1028  break;
1029  }
1030 
1031  batchTranspose_xyz_yzx(
1032  nblock, batchesYZX + iperm*nblock,
1033  max_nx_YZX[iperm], jsize, ksize,
1034  isize, jsize, stream);
1035 
1036 
1037 /*
1038  int isize_out;
1039  switch(permutation) {
1040  case Perm_Z_cX_Y:
1041  // ZXY -> XYZ
1042  isize_out = pmeGrid.K1/2+1;
1043  break;
1044  case Perm_cX_Y_Z:
1045  // XYZ -> YZX
1046  isize_out = pmeGrid.K2;
1047  break;
1048  case Perm_Y_Z_cX:
1049  // YZX -> ZXY
1050  isize_out = pmeGrid.K3;
1051  break;
1052  default:
1053  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1054  break;
1055  }
1056 
1057  for (int iblock=0;iblock < nblock;iblock++) {
1058 
1059  int x0 = pos[iblock];
1060  int nx = pos[iblock+1] - x0;
1061 
1062  int width_out;
1063  float2* data_out;
1064  if (dataPtrsYZX[iblock] == NULL) {
1065  // Local transpose, use internal buffer
1066  data_out = d_data + jsize*ksize*x0;
1067  width_out = jsize;
1068  } else {
1069  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1070  data_out = dataPtrsYZX[iblock];
1071  width_out = isize_out;
1072  }
1073 
1074  transpose_xyz_yzx(
1075  nx, jsize, ksize,
1076  isize, jsize,
1077  width_out, ksize,
1078  data+x0, data_out, stream);
1079  }
1080 */
1081 }
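transposeXYZtoYZX() above launches a batched device transpose; the commented-out loop shows the per-block form it replaced. For reference, the host sketch below spells out the index mapping a single xyz -> yzx transpose of an nx*ny*nz complex block performs (the function and stride names are illustrative).

    #include <cuda_runtime.h>  // for float2

    // Reference xyz -> yzx transpose of an nx*ny*nz block: the output's fastest
    // index is y, then z, then x.
    void transpose_xyz_yzx_host(const float2* in, float2* out,
                                int nx, int ny, int nz,
                                int xsize_in, int ysize_in,     // input leading dimensions
                                int ysize_out, int zsize_out) { // output leading dimensions
      for (int z = 0; z < nz; z++)
        for (int y = 0; y < ny; y++)
          for (int x = 0; x < nx; x++)
            out[y + z*ysize_out + x*ysize_out*zsize_out] =
                in[x + y*xsize_in + z*xsize_in*ysize_in];
    }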
1082 
1083 void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
1084  cudaCheck(cudaSetDevice(deviceID));
1085 
1086  int iperm;
1087  switch(permutation) {
1088  case Perm_cX_Y_Z:
1089  // XYZ -> ZXY
1090  iperm = 0;
1091  break;
1092  case Perm_Z_cX_Y:
1093  // ZXY -> YZX
1094  iperm = 1;
1095  break;
1096  case Perm_Y_Z_cX:
1097  // YZX -> XYZ
1098  iperm = 2;
1099  break;
1100  default:
1101  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1102  break;
1103  }
1104 
1105  batchTranspose_xyz_zxy(
1106  nblock, batchesZXY + iperm*nblock,
1107  max_nx_ZXY[iperm], jsize, ksize,
1108  isize, jsize, stream);
1109 
1110 /*
1111  int isize_out;
1112  switch(permutation) {
1113  case Perm_cX_Y_Z:
1114  // XYZ -> ZXY
1115  isize_out = pmeGrid.K3;
1116  break;
1117  case Perm_Z_cX_Y:
1118  // ZXY -> YZX
1119  isize_out = pmeGrid.K2;
1120  break;
1121  case Perm_Y_Z_cX:
1122  // YZX -> XYZ
1123  isize_out = pmeGrid.K1/2+1;
1124  break;
1125  default:
1126  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1127  break;
1128  }
1129 
1130  for (int iblock=0;iblock < nblock;iblock++) {
1131 
1132  int x0 = pos[iblock];
1133  int nx = pos[iblock+1] - x0;
1134 
1135  int width_out;
1136  float2* data_out;
1137  if (dataPtrsZXY[iblock] == NULL) {
1138  // Local transpose, use internal buffer
1139  data_out = d_data + jsize*ksize*x0;
1140  width_out = ksize;
1141  } else {
1142  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1143  data_out = dataPtrsZXY[iblock];
1144  width_out = isize_out;
1145  }
1146 
1147  transpose_xyz_zxy(
1148  nx, jsize, ksize,
1149  isize, jsize,
1150  width_out, nx,
1151  data+x0, data_out, stream);
1152  }
1153 */
1154 }
1155 
1156 void CudaPmeTranspose::waitStreamSynchronize() {
1157  cudaCheck(cudaSetDevice(deviceID));
1158  cudaCheck(cudaStreamSynchronize(stream));
1159 }
1160 
1161 void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
1162  cudaCheck(cudaSetDevice(deviceID));
1163 
1164  if (iblock >= nblock)
1165  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1166 
1167  int x0 = pos[iblock];
1168  int nx = pos[iblock+1] - x0;
1169 
1170  int copySize = jsize*ksize*nx;
1171  int copyStart = jsize*ksize*x0;
1172 
1173  if (copyStart + copySize > dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1175 
1176  if (copySize > h_dataSize)
1177  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1178 
1179  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1180 }
1181 
1182 void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
1183  cudaCheck(cudaSetDevice(deviceID));
1184 
1185  if (iblock >= nblock)
1186  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1187 
1188  // Determine block size = how much we're copying
1189  int i0, i1, j0, j1, k0, k1;
1190  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1191  int ni = i1-i0+1;
1192  int nj = j1-j0+1;
1193  int nk = k1-k0+1;
1194 
1195  copy3D_HtoD<float2>(data_in, data_out,
1196  0, 0, 0,
1197  ni, nj,
1198  i0, 0, 0,
1199  isize, jsize,
1200  ni, nj, nk, stream);
1201 }
1202 
1203 #ifndef P2P_ENABLE_3D
1204 //
1205 // Copy from temporary buffer to final buffer
1206 //
1207 void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
1208  cudaCheck(cudaSetDevice(deviceID));
1209 
1210  if (iblock >= nblock)
1211  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1212 
1213  // Determine block size = how much we're copying
1214  int i0, i1, j0, j1, k0, k1;
1215  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1216  int ni = i1-i0+1;
1217  int nj = j1-j0+1;
1218  int nk = k1-k0+1;
1219 
1220  float2* data_in = d_buffer + i0*nj*nk;
1221 
1222  copy3D_DtoD<float2>(data_in, data_out,
1223  0, 0, 0,
1224  ni, nj,
1225  i0, 0, 0,
1226  isize, jsize,
1227  ni, nj, nk, stream);
1228 }
1229 
1230 //
1231 // Return temporary buffer for block "iblock"
1232 //
1233 float2* CudaPmeTranspose::getBuffer(const int iblock) {
1234  if (iblock >= nblock)
1235  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1236 
1237  // Determine block size = how much we're copying
1238  int i0, i1, j0, j1, k0, k1;
1239  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1240  int ni = i1-i0+1;
1241  int nj = j1-j0+1;
1242  int nk = k1-k0+1;
1243 
1244  return d_buffer + i0*nj*nk;
1245 }
1246 #endif
1247 
1248 void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
1249  float2* data_out) {
1250 
1251  int iblock_out = jblock;
1252  int jblock_out = kblock;
1253  int kblock_out = iblock;
1254 
1255  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1256 }
1257 
1258 void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
1259  float2* data_out) {
1260 
1261  int iblock_out = kblock;
1262  int jblock_out = iblock;
1263  int kblock_out = jblock;
1264 
1265  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1266 }
1267 
1268 void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
1269  const int iblock_out, const int jblock_out, const int kblock_out,
1270  int deviceID_out, int permutation_out, float2* data_out) {
1271 
1272  cudaCheck(cudaSetDevice(deviceID));
1273 
1274  // Determine block size = how much we're copying
1275  int i0, i1, j0, j1, k0, k1;
1276  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1277  int ni = i1-i0+1;
1278  int nj = j1-j0+1;
1279  int nk = k1-k0+1;
1280 
1281  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1282  int isize_out = i1-i0+1;
1283  int jsize_out = j1-j0+1;
1284 
1285  int x0 = pos[iblock];
1286  float2* data_in = d_data + jsize*ksize*x0;
1287 
1288 #ifndef P2P_ENABLE_3D
1289  // Copy into temporary peer device buffer
1290  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
1291 #else
1292  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
1293  0, 0, 0,
1294  ni, nj,
1295  0, 0, 0,
1296  isize_out, jsize_out,
1297  ni, nj, nk, stream);
1298 #endif
1299 
1300 }
1301 
1302 
1303 CudaPmeOneDevice::CudaPmeOneDevice(
1304  PmeGrid pmeGrid_,
1305  int deviceID_,
1306  int deviceIndex_
1307  ) :
1308  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
1309  natoms(0), d_atoms(0), d_forces(0),
1310  d_grids(0), gridsize(0),
1311  d_trans(0), transize(0),
1312  d_bm1(0), d_bm2(0), d_bm3(0),
1313  kappa(ComputeNonbondedUtil::ewaldcof),
1314  self_energy_alch_first_time(true),
1315  force_scaling_alch_first_time(true),
1316  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
1317 {
1318 // fprintf(stderr, "CudaPmeOneDevice constructor START ******************************************\n");
1319  const SimParameters& sim_params = *(Node::Object()->simParameters);
1321  // Determine how many grids we need for the alchemical route
1322  if (sim_params.alchOn) {
1323  num_used_grids = sim_params.alchGetNumOfPMEGrids();
1324  } else {
1325  num_used_grids = 1;
1326  }
1327  cudaCheck(cudaSetDevice(deviceID));
1328 
1329  // Check whether the simulation and device are compatible with the patch-level kernels. The results
1330  // will be stored in the PatchLevelPmeData field
1331  checkPatchLevelSimParamCompatibility(pmeGrid.order, true /* periodic Y */, true /* periodic Z */);
1333 
1334  if (!sim_params.CUDASOAintegrateMode) {
1335  NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
1336  }
1337  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
1338 
1339  // create our own CUDA stream
1340 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1341  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1342  int leastPriority, greatestPriority;
1343  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1344  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1345 #else
1346  cudaCheck(cudaStreamCreate(&stream));
1347 #endif
1348 
1349  allocate_host<EnergyVirial>(&h_energyVirials, num_used_grids);
1350  allocate_device<EnergyVirial>(&d_energyVirials, num_used_grids);
1351  allocate_device<float>(&d_scaling_factors, num_used_grids);
1352  allocate_device<double>(&d_selfEnergy, 1);
1353  if (sim_params.alchFepOn) {
1354  allocate_device<double>(&d_selfEnergy_FEP, 1);
1355  } else {
1356  d_selfEnergy_FEP = NULL;
1357  }
1358  if (sim_params.alchThermIntOn) {
1359  allocate_device<double>(&d_selfEnergy_TI_1, 1);
1360  allocate_device<double>(&d_selfEnergy_TI_2, 1);
1361  } else {
1362  d_selfEnergy_TI_1 = NULL;
1363  d_selfEnergy_TI_2 = NULL;
1364  }
1365 
1366  // create device buffer space for atom positions and forces
1367  // to be accessed externally through PatchData
1368  allocate_device<float4>(&d_atoms, num_used_grids * natoms);
1369  allocate_device<float3>(&d_forces, num_used_grids * natoms);
1370  if (sim_params.alchOn) {
1371  allocate_device<int>(&d_partition, natoms);
1372  } else {
1373  d_partition = NULL;
1374  }
1375 #ifdef NODEGROUP_FORCE_REGISTER
1376  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
1377  devData.s_datoms = (CudaAtom *) (d_atoms);
1378  devData.f_slow = (CudaForce *) (d_forces);
1379  devData.f_slow_size = natoms;
1380  devData.s_datoms_partition = d_partition;
1381 #endif
1382  int k1 = pmeGrid.K1;
1383  int k2 = pmeGrid.K2;
1384  int k3 = pmeGrid.K3;
1385  int order = pmeGrid.order;
1386  gridsize = k1 * k2 * k3;
1387  transize = (k1/2 + 1) * k2 * k3;
1388 
1389 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1390 
1391  // set up cufft
1392  forwardPlans = new cufftHandle[num_used_grids];
1393  backwardPlans = new cufftHandle[num_used_grids];
1394  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1395  cufftCheck(cufftPlan3d(&(forwardPlans[iGrid]), k3, k2, k1, CUFFT_R2C));
1396  cufftCheck(cufftPlan3d(&(backwardPlans[iGrid]), k3, k2, k1, CUFFT_C2R));
1397  cufftCheck(cufftSetStream(forwardPlans[iGrid], stream));
1398  cufftCheck(cufftSetStream(backwardPlans[iGrid], stream));
1399  }
1400 #endif
1401 
1402 #ifdef NAMD_CUDA
1403  cudaDeviceProp deviceProp;
1404  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
1405  const int texture_alignment = int(deviceProp.textureAlignment);
1406  // d_grids and d_grids + N * gridsize will be used as device pointers for ::cudaResourceDesc::res::linear::devPtr
1407  // check if (d_grids + N * gridsize) is an address aligned to ::cudaDeviceProp::textureAlignment
1408  // which is required by cudaCreateTextureObject()
1409  // or maybe I should use cudaMallocPitch()?
1410  if ((gridsize % texture_alignment) != 0) {
1411  // if it is not aligned, padding is required
1412  gridsize = (int(gridsize / texture_alignment) + 1) * texture_alignment;
1413  }
1414  // Is it necessary to align transize too?
1415 // if ((transize % texture_alignment) != 0) {
1416 // // if it is not aligned, padding is required
1417 // transize = (int(transize / texture_alignment) + 1) * texture_alignment;
1418 // }
1419  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1420  allocate_device<float2>(&d_trans, num_used_grids * transize);
1421  gridTexObjArrays = new cudaTextureObject_t[num_used_grids];
1422  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1423  // set up texture object
1424  cudaResourceDesc resDesc;
1425  memset(&resDesc, 0, sizeof(resDesc));
1426  resDesc.resType = cudaResourceTypeLinear;
1427  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
1428  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
1429  resDesc.res.linear.desc.x = sizeof(float)*8;
1430  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
1431  cudaTextureDesc texDesc;
1432  memset(&texDesc, 0, sizeof(texDesc));
1433  texDesc.readMode = cudaReadModeElementType;
1434  cudaCheck(cudaCreateTextureObject(&(gridTexObjArrays[iGrid]), &resDesc, &texDesc, NULL));
1435  }
1436 #else
1437  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1438  allocate_device<float2>(&d_trans, num_used_grids * transize);
1439 #endif
1440  // calculate prefactors
1441  double *bm1 = new double[k1];
1442  double *bm2 = new double[k2];
1443  double *bm3 = new double[k3];
1444  // Use compute_b_moduli from PmeKSpace.C
1445  extern void compute_b_moduli(double *bm, int k, int order);
1446  compute_b_moduli(bm1, k1, order);
1447  compute_b_moduli(bm2, k2, order);
1448  compute_b_moduli(bm3, k3, order);
1449 
1450  // allocate space for and copy prefactors onto GPU
1451  float *bm1f = new float[k1];
1452  float *bm2f = new float[k2];
1453  float *bm3f = new float[k3];
1454  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
1455  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
1456  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
1457  allocate_device<float>(&d_bm1, k1);
1458  allocate_device<float>(&d_bm2, k2);
1459  allocate_device<float>(&d_bm3, k3);
1460  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
1461  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
1462  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
1463  delete [] bm1f;
1464  delete [] bm2f;
1465  delete [] bm3f;
1466  delete [] bm1;
1467  delete [] bm2;
1468  delete [] bm3;
1469 
1470  cudaCheck(cudaStreamSynchronize(stream));
1471 
1472 // fprintf(stderr, "CudaPmeOneDevice constructor END ********************************************\n");
1473 }
1474 
1475 CudaPmeOneDevice::~CudaPmeOneDevice() {
1476  deallocate_device<float4>(&d_atoms);
1477  deallocate_device<float3>(&d_forces);
1478  deallocate_device<float2>(&d_trans);
1479  deallocate_device<float>(&d_grids);
1480  deallocate_host<EnergyVirial>(&h_energyVirials);
1481  deallocate_device<EnergyVirial>(&d_energyVirials);
1482  deallocate_device<float>(&d_scaling_factors);
1483 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1484  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1485  cufftCheck(cufftDestroy(forwardPlans[iGrid]));
1486  cufftCheck(cufftDestroy(backwardPlans[iGrid]));
1487 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1488  cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
1489 #endif
1490  }
1491 
1492  if (patchLevelPmeData.h_patchGridOffsets != nullptr) {
1493  deallocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets);
1494  }
1495  if (patchLevelPmeData.d_patchGridOffsets != nullptr) {
1496  deallocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets);
1497  }
1498 
1499  delete[] forwardPlans;
1500  delete[] backwardPlans;
1501 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1502  delete[] gridTexObjArrays;
1503 #endif
1504 
1505 
1506 #endif
1507  deallocate_device<double>(&d_selfEnergy);
1508  if (d_partition != NULL) deallocate_device<int>(&d_partition);
1509  if (d_selfEnergy_FEP != NULL) deallocate_device<double>(&d_selfEnergy_FEP);
1510  if (d_selfEnergy_TI_1 != NULL) deallocate_device<double>(&d_selfEnergy_TI_1);
1511  if (d_selfEnergy_TI_2 != NULL) deallocate_device<double>(&d_selfEnergy_TI_2);
1512  deallocate_device<float>(&d_bm1);
1513  deallocate_device<float>(&d_bm2);
1514  deallocate_device<float>(&d_bm3);
1515  cudaCheck(cudaStreamDestroy(stream));
1516 
1517  if (reductionGpuResident) {
1518  delete reductionGpuResident;
1519  }
1520 }
1521 
1522 void CudaPmeOneDevice::compute(
1523  const Lattice &lattice,
1524 // const CudaAtom *d_atoms,
1525 // CudaForce *d_force,
1526 // int natoms,
1527  int doEnergyVirial,
1528  int step
1529 #if 0
1530  double d_energy[1],
1531  double d_virial[6]
1532 #endif
1533  ) {
1534 // fprintf(stderr, "CudaPmeOneDevice compute ****************************************************\n");
1535  int k1 = pmeGrid.K1;
1536  int k2 = pmeGrid.K2;
1537  int k3 = pmeGrid.K3;
1538  int order = pmeGrid.order;
1539  double volume = lattice.volume();
1540  Vector a_r = lattice.a_r();
1541  Vector b_r = lattice.b_r();
1542  Vector c_r = lattice.c_r();
1543  float arx = a_r.x;
1544  float ary = a_r.y;
1545  float arz = a_r.z;
1546  float brx = b_r.x;
1547  float bry = b_r.y;
1548  float brz = b_r.z;
1549  float crx = c_r.x;
1550  float cry = c_r.y;
1551  float crz = c_r.z;
1552  m_step = step;
1553 
1554  //JM: actually necessary if you reserve a PME device!
1555  cudaCheck(cudaSetDevice(deviceID));
1556  const SimParameters& sim_params = *(Node::Object()->simParameters);
1557 
1558  // clear force array
1559  //fprintf(stderr, "Calling clear_device_array on d_force\n");
1560  clear_device_array<float3>(d_forces, num_used_grids * natoms, stream);
1561  // clear grid
1562  //fprintf(stderr, "Calling clear_device_array on d_grid\n");
1563  clear_device_array<float>(d_grids, num_used_grids * gridsize, stream);
1564  clear_device_array<float2>(d_trans, num_used_grids * transize, stream);
1565 
1566  // Clear energy and virial array if needed
1567  if (doEnergyVirial) {
1568  // clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
1569  clear_device_array<EnergyVirial>(d_energyVirials, num_used_grids * 1, stream);
1570  const bool updateSelfEnergy = (step == sim_params.firstTimestep) || (selfEnergy == 0);
1571  if (updateSelfEnergy && (sim_params.alchOn == false)) {
1572  clear_device_array<double>(d_selfEnergy, 1, stream);
1573  // calculate self energy term if not yet done
1575  kappa, stream);
1576  //fprintf(stderr, "selfEnergy = %12.8f\n", selfEnergy);
1577  }
1578  /* The self energy depends on the scaling factor, lambda.
1579  * Cases where the self energy changes:
1580  * 1. If alchLambdaFreq > 0, lambda is scaled linearly and changes EVERY STEP.
1581  * 2. Otherwise, simulations may still move to another lambda window through TCL scripts.
1582  * In summary, the self energy does not change unless lambda changes,
1583  * so calcSelfEnergyAlch() compares the current step's lambda with that of the last step
1584  * and re-runs calcSelfEnergyFEPKernel or calcSelfEnergyTIKernel only if lambda changed.
1585  */
1586  if (sim_params.alchOn) calcSelfEnergyAlch(m_step);
1587  }
1588 
1589 #if 0
1590 
1591  spread_charge(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1592  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1593  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1594  d_grid, order, stream);
1595 #else
1596  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1597  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1599  d_atoms + iGrid * natoms, natoms, k1, k2, k3,
1600  float(k1), (float)k2, (float)k3, order3,
1601  k1, k2, k3,
1602  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1603  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1604  d_grids + iGrid * gridsize, order, stream);
1605  }
1606 
1607 #endif
1608  //cudaCheck(cudaStreamSynchronize(stream));
1609 
1610  // forward FFT
1611  //fprintf(stderr, "Calling cufftExecR2C\n");
1612  //cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)d_grid,
1613  // (cufftComplex *)d_tran));
1614 
1615  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1616  cufftCheck(cufftExecR2C(forwardPlans[iGrid],
1617  (cufftReal *)(d_grids + iGrid * gridsize),
1618  (cufftComplex *)(d_trans + iGrid * transize)));
1619  }
1620 
1621  //cudaCheck(cudaStreamSynchronize(stream));
1622 
1623  // reciprocal space calculation
1624  //fprintf(stderr, "Calling scalar_sum\n");
1625  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1626  scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1627  kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1628  d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1629  (bool) doEnergyVirial, &(d_energyVirials[iGrid].energy),
1630  d_energyVirials[iGrid].virial, d_trans + iGrid * transize, stream);
1631  }
1632  //scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1633  // kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1634  // d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1635  // (bool) doEnergyVirial, &(d_energyVirial->energy),
1636  // d_energyVirial->virial, d_tran, stream);
1637  //cudaCheck(cudaStreamSynchronize(stream));
1638 
1639  // backward FFT
1640  //fprintf(stderr, "Calling cufftExecC2R\n");
1641  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1642  cufftCheck(cufftExecC2R(backwardPlans[iGrid],
1643  (cufftComplex *)(d_trans + iGrid * transize),
1644  (cufftReal *)(d_grids + iGrid * gridsize)));
1645  }
1646 
1647  //cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)d_tran,
1648  // (cufftReal *)d_grid));
1649  //cudaCheck(cudaStreamSynchronize(stream));
1650 
1651  // gather force from grid to atoms
1652  // omitting the cudaTextureObject_t argument below works for __CUDA_ARCH__ >= 350
1653  //fprintf(stderr, "Calling gather_force\n");
1654  for (unsigned int iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1655  gather_force(patchLevelPmeData,
1656  &(d_atoms[iGrid * natoms]), natoms, k1, k2, k3, k1, k2, k3,
1657  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1658  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1659  d_grids + iGrid * gridsize, order, d_forces + iGrid * natoms,
1660 #ifdef NAMD_CUDA
1661  gridTexObjArrays[iGrid] /* cudaTextureObject_t */,
1662 #endif
1663  stream);
1664  }
1665 
1666  //gather_force(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1667  // k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1668  // true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1669  // d_grid, order, d_force, gridTexObj /* cudaTextureObject_t */,
1670  // stream);
1671  //cudaCheck(cudaStreamSynchronize(stream));
1672 
1673  // Copy energy and virial to host if needed
1674  if (doEnergyVirial) {
1675  //fprintf(stderr, "Calling copy_DtoH on d_energyVirial\n");
1676  copy_DtoH<EnergyVirial>(d_energyVirials, h_energyVirials,
1677  num_used_grids, stream);
1678  //cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
1679  //cudaCheck(cudaStreamSynchronize(stream));
1680  }
1681 
1682  // XXX debugging, quick test for borked forces
1683  //clear_device_array<float3>(d_force, natoms, stream);
1684  if (sim_params.alchOn) {
1685  scaleAndMergeForce(m_step);
1686  }
1687 }
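The loop structure above runs the whole PME pipeline once per grid, with each grid stored at a fixed offset into d_grids / d_trans and owning its own cuFFT plan pair. The standalone sketch below reproduces just that batching pattern; the grid dimensions, grid count, and names (K1, fwd, bwd, numGrids) are made-up illustrations, not NAMD code, and error checking is omitted for brevity.

  #include <cstdio>
  #include <vector>
  #include <cuda_runtime.h>
  #include <cufft.h>

  int main() {
    const int K1 = 32, K2 = 32, K3 = 32;        // assumed grid dimensions
    const int numGrids = 2;                     // e.g. two alchemical grids
    const size_t gridsize = (size_t)K1 * K2 * K3;
    const size_t transize = (size_t)(K1 / 2 + 1) * K2 * K3;   // R2C output size

    float*  d_grids = nullptr;
    float2* d_trans = nullptr;
    cudaMalloc((void**)&d_grids, numGrids * gridsize * sizeof(float));
    cudaMalloc((void**)&d_trans, numGrids * transize * sizeof(float2));
    cudaMemset(d_grids, 0, numGrids * gridsize * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // One forward and one backward plan per grid, all bound to the same stream.
    std::vector<cufftHandle> fwd(numGrids), bwd(numGrids);
    for (int i = 0; i < numGrids; i++) {
      cufftPlan3d(&fwd[i], K3, K2, K1, CUFFT_R2C);   // slowest-to-fastest dimension order
      cufftPlan3d(&bwd[i], K3, K2, K1, CUFFT_C2R);
      cufftSetStream(fwd[i], stream);
      cufftSetStream(bwd[i], stream);
    }

    for (int i = 0; i < numGrids; i++) {
      // (real-space charge spreading would fill d_grids + i*gridsize here)
      cufftExecR2C(fwd[i], d_grids + i * gridsize,
                   (cufftComplex*)(d_trans + i * transize));
      // (the reciprocal-space convolution would scale d_trans + i*transize here)
      cufftExecC2R(bwd[i], (cufftComplex*)(d_trans + i * transize),
                   d_grids + i * gridsize);
    }
    cudaStreamSynchronize(stream);
    printf("ran %d R2C/C2R round trips\n", numGrids);

    for (int i = 0; i < numGrids; i++) { cufftDestroy(fwd[i]); cufftDestroy(bwd[i]); }
    cudaFree(d_grids); cudaFree(d_trans);
    cudaStreamDestroy(stream);
    return 0;
  }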
1688 
1689 
1690 // Call this after the device-to-host memory transfer has completed
1691 void CudaPmeOneDevice::finishReduction(
1692  bool doEnergyVirial
1693  ) {
1694  cudaCheck(cudaStreamSynchronize(stream));
1695  SubmitReduction *reduction = getCurrentReduction();
1696  if(doEnergyVirial){
1697  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1698  PatchData* patchData = cpdata.ckLocalBranch();
1699  cudaCheck(cudaSetDevice(deviceID));
1700  double virial[9];
1701  double energy, energy_F, energy_TI_1, energy_TI_2;
1702  const SimParameters& sim_params = *(Node::Object()->simParameters);
1703  if (sim_params.alchOn) {
1704  if (sim_params.alchFepOn) {
1705  scaleAndComputeFEPEnergyVirials(h_energyVirials, m_step, energy, energy_F, virial);
1706  energy += selfEnergy;
1707  energy_F += selfEnergy_FEP;
1708  }
1709  if (sim_params.alchThermIntOn) {
1710  scaleAndComputeTIEnergyVirials(h_energyVirials, m_step, energy, energy_TI_1, energy_TI_2, virial);
1711  energy += selfEnergy;
1712  energy_TI_1 += selfEnergy_TI_1;
1713  energy_TI_2 += selfEnergy_TI_2;
1714  }
1715  } else {
1716  virial[0] = h_energyVirials[0].virial[0];
1717  virial[1] = h_energyVirials[0].virial[1];
1718  virial[2] = h_energyVirials[0].virial[2];
1719  virial[3] = h_energyVirials[0].virial[1];
1720  virial[4] = h_energyVirials[0].virial[3];
1721  virial[5] = h_energyVirials[0].virial[4];
1722  virial[6] = h_energyVirials[0].virial[2];
1723  virial[7] = h_energyVirials[0].virial[4];
1724  virial[8] = h_energyVirials[0].virial[5];
1725  energy = h_energyVirials[0].energy + selfEnergy;
1726  }
1727  #if 0
1728  fprintf(stderr, "PME ENERGY = %g %g\n", h_energyVirials[0].energy, selfEnergy );
1729  fprintf(stderr, "PME VIRIAL =\n"
1730  " %g %g %g\n %g %g %g\n %g %g %g\n",
1731  virial[0], virial[1], virial[2], virial[3], virial[4],
1732  virial[5], virial[6], virial[7], virial[8]);
1733  #endif
1734  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
1735  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
1736  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
1737  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
1738  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
1739  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
1740  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
1741  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
1742  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
1743  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energy;
1744  if (sim_params.alchFepOn) {
1745  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += energy_F;
1746  }
1747  if (sim_params.alchThermIntOn) {
1748  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += energy_TI_1;
1749  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += energy_TI_2;
1750  }
1751  }
1752  reduction->submit();
1753 }
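The nine virial assignments above expand the six packed components produced by the kernels (the upper triangle, in the order xx, xy, xz, yy, yz, zz as implied by the indexing) into a full symmetric 3x3 tensor. A small self-contained sketch of that mapping, with illustrative names:

  #include <cstdio>

  static void expandVirial(const double packed[6], double full[9]) {
    full[0] = packed[0];  // xx
    full[1] = packed[1];  // xy
    full[2] = packed[2];  // xz
    full[3] = packed[1];  // yx = xy
    full[4] = packed[3];  // yy
    full[5] = packed[4];  // yz
    full[6] = packed[2];  // zx = xz
    full[7] = packed[4];  // zy = yz
    full[8] = packed[5];  // zz
  }

  int main() {
    const double packed[6] = {1, 2, 3, 4, 5, 6};
    double full[9];
    expandVirial(packed, full);
    for (int r = 0; r < 3; r++)
      printf("%g %g %g\n", full[3*r], full[3*r+1], full[3*r+2]);
    return 0;
  }

Only six numbers need to be stored because the PME virial tensor is symmetric.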
1754 
1755 void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
1756  SimParameters& sim_params = *(Node::Object()->simParameters);
1757  if (sim_params.alchFepOn) {
1758  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1759  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1760  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1761  static BigReal lambda2Up = sim_params.getElecLambda(alchLambda2);
1762  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1763  static BigReal lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1764  // compute self energy at the first call
1765  // only compute self energy if factors are changed
1766  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1767  (lambda2Up != sim_params.getElecLambda(alchLambda2)) ||
1768  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1769  (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
1770  (step == sim_params.firstTimestep)) {
1771  lambda1Up = sim_params.getElecLambda(alchLambda1);
1772  lambda2Up = sim_params.getElecLambda(alchLambda2);
1773  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1774  lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1775  selfEnergy = 0.0; // self energy for λ_1
1776  selfEnergy_FEP = 0.0; // self energy for λ_2
1777  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1778  cudaCheck(cudaMemsetAsync(d_selfEnergy_FEP, 0, sizeof(double), stream));
1779  calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP, d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);
1781  }
1782  }
1783  if (sim_params.alchThermIntOn) {
1784  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1785  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1786  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1787  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1788  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1789  (step == sim_params.firstTimestep)) {
1790  lambda1Up = sim_params.getElecLambda(alchLambda1);
1791  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1792  selfEnergy = 0.0;
1793  selfEnergy_TI_1 = 0.0;
1794  selfEnergy_TI_2 = 0.0;
1795  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1796  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_1, 0, sizeof(double), stream));
1797  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_2, 0, sizeof(double), stream));
1798  calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
1799  d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda1Down, stream);
1800  }
1801  }
1802 }
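As the comment block inside compute() explains, the self energies only need to be recomputed when the electrostatic lambda factors change (or on the first call). A minimal sketch of that caching pattern, with stand-in functions (elecScale, expensiveSelfEnergy) and names in place of getElecLambda() and the CUDA wrappers:

  #include <cstdio>

  static double elecScale(double lambda) { return lambda; }      // stand-in for getElecLambda()
  static double expensiveSelfEnergy(double scaleUp, double scaleDown) {
    return 10.0 * scaleUp + 20.0 * scaleDown;                    // stand-in for the kernel launch
  }

  static double cachedSelfEnergy(double lambda, bool firstCall) {
    static double lastUp   = elecScale(lambda);
    static double lastDown = elecScale(1.0 - lambda);
    static double cached = 0.0;
    const double up = elecScale(lambda), down = elecScale(1.0 - lambda);
    if (firstCall || up != lastUp || down != lastDown) {
      lastUp = up; lastDown = down;
      cached = expensiveSelfEnergy(up, down);                    // recompute only here
    }
    return cached;
  }

  int main() {
    printf("%g\n", cachedSelfEnergy(0.25, true));   // first call: compute
    printf("%g\n", cachedSelfEnergy(0.25, false));  // unchanged lambda: cached value
    printf("%g\n", cachedSelfEnergy(0.50, false));  // lambda changed: recompute
    return 0;
  }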
1803 
1804 void CudaPmeOneDevice::scaleAndMergeForce(int step) {
1805  SimParameters& sim_params = *(Node::Object()->simParameters);
1806  const double alchLambda1 = sim_params.getCurrentLambda(step);
1807  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1808  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1809  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1810  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1811  (step == sim_params.firstTimestep)) {
1812  std::vector<float> scale_factors(num_used_grids);
1813  lambda1Up = sim_params.getElecLambda(alchLambda1);
1814  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1815  scale_factors[0] = lambda1Up;
1816  scale_factors[1] = lambda1Down;
1817  if (sim_params.alchDecouple) {
1818  scale_factors[2] = 1.0 - lambda1Up;
1819  scale_factors[3] = 1.0 - lambda1Down;
1820  }
1821  if (bool(sim_params.alchElecLambdaStart) || sim_params.alchThermIntOn) {
1822  scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
1823  }
1824  copy_HtoD<float>(scale_factors.data(), d_scaling_factors, num_used_grids);
1826  }
1827  scaleAndMergeForceWrapper(d_forces, d_scaling_factors, num_used_grids, natoms, stream);
1828 }
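A sketch of the scale-factor layout built above, assuming the usual grid assignment (grid 0 = "up"/appearing atoms, grid 1 = "down"/disappearing atoms, grids 2/3 = decoupled copies when alchDecouple is on, last grid = the -(lambdaUp + lambdaDown - 1) correction term); buildScaleFactors and its arguments are illustrative names, not NAMD's:

  #include <cstdio>
  #include <vector>

  static std::vector<float> buildScaleFactors(double lambdaUp, double lambdaDown,
                                              bool alchDecouple, bool hasCorrectionGrid) {
    const size_t numGrids = 2 + (alchDecouple ? 2 : 0) + (hasCorrectionGrid ? 1 : 0);
    std::vector<float> s(numGrids);
    s[0] = (float)lambdaUp;
    s[1] = (float)lambdaDown;
    if (alchDecouple) {
      s[2] = (float)(1.0 - lambdaUp);
      s[3] = (float)(1.0 - lambdaDown);
    }
    if (hasCorrectionGrid)
      s[numGrids - 1] = (float)(-(lambdaUp + lambdaDown - 1.0));  // correction grid weight
    return s;
  }

  int main() {
    for (float f : buildScaleFactors(0.75, 0.25, true, true)) printf("%g ", f);
    printf("\n");
    return 0;
  }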
1829 
1830 void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_F, double (&virial)[9]) {
1831  double scale1 = 1.0;
1832  double scale2 = 1.0;
1833  energy = 0;
1834  energy_F = 0;
1835  for (unsigned int i = 0; i < 9; ++i) {
1836  virial[i] = 0;
1837  }
1838  SimParameters& sim_params = *(Node::Object()->simParameters);
1839  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1840  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1841  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1842  const BigReal elecLambda2Up = sim_params.getElecLambda(alchLambda2);
1843  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1844  const BigReal elecLambda2Down = sim_params.getElecLambda(1 - alchLambda2);
1845  energy += energyVirials[0].energy * elecLambdaUp;
1846  energy_F += energyVirials[0].energy * elecLambda2Up;
1847  energy += energyVirials[1].energy * elecLambdaDown;
1848  energy_F += energyVirials[1].energy * elecLambda2Down;
1849  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1850  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1851  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1852  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1853  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1854  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1855  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1856  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1857  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1858  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1859  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1860  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1861  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1862  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1863  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1864  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1865  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1866  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1867  if (sim_params.alchDecouple) {
1868  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1869  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
1870  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1871  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
1872  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1873  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1874  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1875  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1876  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1877  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1878  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1879  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1880  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1881  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1882  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1883  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1884  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1885  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1886  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1887  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1888  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1889  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1890  if (sim_params.alchElecLambdaStart > 0) {
1891  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1892  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1893  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1894  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1895  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1896  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1897  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1898  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1899  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1900  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1901  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1902  }
1903  } else {
1904  if (sim_params.alchElecLambdaStart > 0) {
1905  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1906  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1907  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1908  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1909  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1910  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1911  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1912  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1913  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1914  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1915  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1916  }
1917  }
1918 }
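The unrolled block above is, term by term, a weighted sum over grids: the total energy is sum_g w_g * E_g and each virial component is accumulated the same way, with w_g the per-grid lambda factor. A compact, self-contained sketch of that accumulation using the same packed 6-component virial layout (types and names here are illustrative):

  #include <cstdio>

  struct GridEnergyVirial { double energy; double virial[6]; };

  static void accumulate(const GridEnergyVirial* grids, const double* weights, int numGrids,
                         double& energy, double (&virial)[9]) {
    static const int map[9] = {0, 1, 2, 1, 3, 4, 2, 4, 5};  // packed -> symmetric 3x3
    energy = 0.0;
    for (int i = 0; i < 9; i++) virial[i] = 0.0;
    for (int g = 0; g < numGrids; g++) {
      energy += weights[g] * grids[g].energy;
      for (int i = 0; i < 9; i++) virial[i] += weights[g] * grids[g].virial[map[i]];
    }
  }

  int main() {
    GridEnergyVirial grids[2] = {{1.0, {1,0,0,1,0,1}}, {2.0, {2,0,0,2,0,2}}};
    const double weights[2] = {0.75, 0.25};   // e.g. elecLambdaUp, elecLambdaDown
    double e, v[9];
    accumulate(grids, weights, 2, e, v);
    printf("E = %g, Vxx = %g\n", e, v[0]);
    return 0;
  }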
1919 
1920 void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
1921  double scale1 = 1.0;
1922  energy = 0;
1923  energy_TI_1 = 0;
1924  energy_TI_2 = 0;
1925  for (unsigned int i = 0; i < 9; ++i) {
1926  virial[i] = 0;
1927  }
1928  SimParameters& sim_params = *(Node::Object()->simParameters);
1929  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1930  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1931  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1932  energy += energyVirials[0].energy * elecLambdaUp;
1933  energy += energyVirials[1].energy * elecLambdaDown;
1934  energy_TI_1 += energyVirials[0].energy;
1935  energy_TI_2 += energyVirials[1].energy;
1936  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1937  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1938  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1939  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1940  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1941  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1942  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1943  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1944  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1945  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1946  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1947  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1948  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1949  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1950  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1951  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1952  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1953  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1954  if (sim_params.alchDecouple) {
1955  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1956  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1957  energy_TI_1 += -1.0 * energyVirials[2].energy;
1958  energy_TI_2 += -1.0 * energyVirials[3].energy;
1959  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1960  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1961  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1962  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1963  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1964  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1965  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1966  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1967  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1968  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1969  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1970  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1971  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1972  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1973  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1974  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1975  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1976  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1977  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1978  energy_TI_1 += -1.0 * energyVirials[4].energy;
1979  energy_TI_2 += -1.0 * energyVirials[4].energy;
1980  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1981  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1982  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1983  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1984  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1985  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1986  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1987  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1988  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1989  } else {
1990  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1991  energy_TI_1 += -1.0 * energyVirials[2].energy;
1992  energy_TI_2 += -1.0 * energyVirials[2].energy;
1993  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1994  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1995  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1996  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1997  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1998  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1999  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2000  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2001  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2002  }
2003 }
2004 
2005 
2006 
2007 int CudaPmeOneDevice::getShiftedGrid(const double x, const int grid) {
2008  double w = x + 0.5;
2009  double gw = w * grid;
2010  return floor(gw);
2011 }
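getShiftedGrid() maps a scaled (fractional) coordinate near [-0.5, 0.5) to a grid index: shift by 0.5, multiply by the grid size, and take the floor. A quick worked example with assumed values:

  #include <cstdio>
  #include <cmath>

  static int shiftedGrid(double x, int grid) { return (int)floor((x + 0.5) * grid); }

  int main() {
    printf("%d\n", shiftedGrid(-0.5,  64));  // -> 0   (lower box edge)
    printf("%d\n", shiftedGrid( 0.0,  64));  // -> 32  (box center)
    printf("%d\n", shiftedGrid( 0.49, 64));  // -> 63  (just below the upper edge)
    return 0;
  }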
2012 
2013 int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
2014  const int numThreads, const int3 patchGridDim, const int order) {
2015 
2016  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2017  const int thetaBytes = PatchLevelPmeData::kDim * (numThreads + PatchLevelPmeData::kThetaPad) *
2018  order * sizeof(float);
2019  const int indexBytes = numThreads * sizeof(char4);
2020 
2021  return gridBytes + thetaBytes + indexBytes;
2022 }
2023 
2024 int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
2025  const int numThreads, const int3 patchGridDim, const int order) {
2026 
2027  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2028  const int thetaBytes = (numThreads + PatchLevelPmeData::kThetaPad) * order *
2029  2 /* theta and dtheta */ * sizeof(float);
2030 
2031  return gridBytes + thetaBytes;
2032 }
2033 
2034 void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ) {
2035  bool use = true;
2036  use = use && (order == 8);
2037  use = use && (periodicY);
2038  use = use && (periodicZ);
2039 
2040  use = use && (deviceCUDA->getNumDevice() == 1); // This is only supported for single GPU currently
2041 
2043 }
2044 
2045 void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility() {
2046  cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);
2047 
2048  const int3 constexprPatchGridDim = make_int3(
2052 
2055  constexprPatchGridDim, 8 /* order */);
2058  constexprPatchGridDim, 8 /* order */);
2059 
2063 }
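A minimal sketch of the device query used above: read the opt-in per-block shared-memory limit with cudaDeviceGetAttribute and compare it against a kernel's requirement. The 48 KB requirement below is a placeholder:

  #include <cstdio>
  #include <cuda_runtime.h>

  int main() {
    int device = 0, maxSharedOptin = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&maxSharedOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
    const int requiredBytes = 48 * 1024;   // placeholder requirement
    printf("opt-in shared memory per block: %d bytes -> %s\n", maxSharedOptin,
           maxSharedOptin >= requiredBytes ? "ok" : "too small");
    return 0;
  }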
2064 
2065 void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice,
2066  const int numPatches, const CudaLocalRecord* localRecords,
2067  double3* patchMin, double3* patchMax, double3* awayDists) {
2068 
2069  patchLevelPmeData.localRecords = localRecords;
2070 
2071  // If the simulation isn't compatible or the device isn't compatible then no point in checking
2072  // patch sizes
2074 
2075  patchLevelPmeData.numPatches = numPatches;
2076 
2077  if (patchLevelPmeData.h_patchGridOffsets == nullptr) {
2078  allocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets, numPatches);
2079  }
2080  if (patchLevelPmeData.d_patchGridOffsets == nullptr) {
2081  allocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets, numPatches);
2082  }
2083 
2084  SimParameters *simParams = Node::Object()->simParameters;
2085  const int order = pmeGrid.order;
2086 
2087  // We only need to recompute the grid offsets if the lattice has changed
2088  if (!lattice.isEqual(currentLattice)) {
2089  currentLattice = lattice;
2090 
2091  double sysdima = currentLattice.a_r().unit() * currentLattice.a();
2092  double sysdimb = currentLattice.b_r().unit() * currentLattice.b();
2093  double sysdimc = currentLattice.c_r().unit() * currentLattice.c();
2094 
2095  patchLevelPmeData.patchGridDim = make_int3(0,0,0);
2096 
2097  for (int i = 0; i < numPatches; i++) {
2098  double3 pmin = currentLattice.unscale(patchMin[i]);
2099  double3 pmax = currentLattice.unscale(patchMax[i]);
2100  double3 width = pmax - pmin;
2101 
2102  // Logic copied from margin violation check
2103  double3 marginVal;
2104  marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
2105  marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
2106  marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
2107  marginVal = currentLattice.unscale(marginVal);
2108 
2109  double3 minAtom = pmin - marginVal;
2110  double3 maxAtom = pmax + marginVal;
2111 
2112  double3 minScaled = currentLattice.scale(minAtom);
2113  double3 maxScaled = currentLattice.scale(maxAtom);
2114 
2115  int3 gridMin;
2116  gridMin.x = getShiftedGrid(minScaled.x, pmeGrid.K1);
2117  gridMin.y = getShiftedGrid(minScaled.y, pmeGrid.K2);
2118  gridMin.z = getShiftedGrid(minScaled.z, pmeGrid.K3);
2119 
2120  int3 gridMax;
2121  gridMax.x = getShiftedGrid(maxScaled.x, pmeGrid.K1);
2122  gridMax.y = getShiftedGrid(maxScaled.y, pmeGrid.K2);
2123  gridMax.z = getShiftedGrid(maxScaled.z, pmeGrid.K3);
2124 
2125  int3 gridWidth;
2126  gridWidth.x = gridMax.x - gridMin.x + order;
2127  gridWidth.y = gridMax.y - gridMin.y + order;
2128  gridWidth.z = gridMax.z - gridMin.z + order;
2129 
2131  patchLevelPmeData.patchGridDim.x = std::max(patchLevelPmeData.patchGridDim.x, gridWidth.x);
2132  patchLevelPmeData.patchGridDim.y = std::max(patchLevelPmeData.patchGridDim.y, gridWidth.y);
2133  patchLevelPmeData.patchGridDim.z = std::max(patchLevelPmeData.patchGridDim.z, gridWidth.z);
2134  }
2135  copy_HtoD<int3>(patchLevelPmeData.h_patchGridOffsets, patchLevelPmeData.d_patchGridOffsets,
2136  numPatches, nullptr);
2137  cudaStreamSynchronize(nullptr);
2138  const int maxGridPoints = patchLevelPmeData.patchGridDim.x *
2139  patchLevelPmeData.patchGridDim.y * patchLevelPmeData.patchGridDim.z;
2140 
2145  }
2146 }
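A condensed sketch of the per-patch extent computation in the loop above: the patch's min/max scaled coordinates (already padded by the margin) are mapped to grid indices with the same shift-and-floor rule as getShiftedGrid(), and the spread stencil adds order points per dimension. The numbers below are illustrative:

  #include <cstdio>
  #include <cmath>

  int main() {
    const int K = 64, order = 8;                        // assumed grid size and PME order
    const double minScaled = -0.10, maxScaled = 0.05;   // assumed padded patch extent (scaled coords)
    const int gridMin = (int)floor((minScaled + 0.5) * K);
    const int gridMax = (int)floor((maxScaled + 0.5) * K);
    const int gridWidth = gridMax - gridMin + order;    // grid points touched by the spread stencil
    printf("gridMin=%d gridMax=%d gridWidth=%d\n", gridMin, gridMax, gridWidth);
    return 0;
  }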
2147 
2148 #endif // NAMD_CUDA
2149 
static Node * Object()
Definition: Node.h:86
void spread_charge_v2(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const float nfftx_f, const float nffty_f, const float nfftz_f, const int order3, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
void finishReduction(bool doEnergyVirial)
int zBlocks
Definition: PmeBase.h:25
bool isEqual(const Lattice &other) const
Definition: Lattice.h:298
CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_)
#define NAMD_EVENT_STOP(eon, id)
cufftHandle * backwardPlans
void compute_b_moduli(double *bm, int K, int order)
Definition: PmeKSpace.C:42
NAMD_HOST_DEVICE Vector c() const
Definition: Lattice.h:270
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
const int permutation
void gatherForceDone(unsigned int iGrid)
static constexpr int kNumThreads
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
BigReal alchElecLambdaStart
Definition: Vector.h:72
int K2
Definition: PmeBase.h:21
virtual void submit(void)=0
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
Bool CUDASOAintegrateMode
EnergyVirial * d_energyVirials
NAMD_HOST_DEVICE Position unscale(ScaledPosition s) const
Definition: Lattice.h:77
BigReal & item(int i)
Definition: ReductionMgr.h:336
BigReal z
Definition: Vector.h:74
int getNumDevice()
Definition: DeviceCUDA.h:125
#define cufftCheck(stmt)
void checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice, const int numPatches, const CudaLocalRecord *localRecords, double3 *patchMin, double3 *patchMax, double3 *awayDists)
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:368
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:32
void energyAndVirialDone(unsigned int iGrid)
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:290
void checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ)
void spreadCharge(Lattice &lattice)
BigReal getElecLambda(const BigReal) const
std::vector< int > pos
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
#define WARPSIZE
Definition: CudaUtils.h:17
PatchLevelPmeData patchLevelPmeData
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
PmeGrid pmeGrid
bool dataDstAllocated
void calcSelfEnergyTIWrapper(double *d_selfEnergy, double *d_selfEnergy_TI_1, double *d_selfEnergy_TI_2, double &h_selfEnergy, double &h_selfEnergy_TI_1, double &h_selfEnergy_TI_2, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream)
CudaFFTCompute(int deviceID, cudaStream_t stream)
int yBlocks
Definition: PmeBase.h:25
int computeSharedMemoryPatchLevelSpreadCharge(const int numThreads, const int3 patchGridDim, const int order)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
#define order
Definition: PmeRealSpace.C:235
#define NAMD_EVENT_START(eon, id)
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:196
const int jblock
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
NAMD_HOST_DEVICE ScaledPosition scale(Position p) const
Definition: Lattice.h:83
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
BigReal x
Definition: Vector.h:74
void gather_force(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
NAMD_HOST_DEVICE BigReal volume(void) const
Definition: Lattice.h:293
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void calcSelfEnergyFEPWrapper(double *d_selfEnergy, double *d_selfEnergy_FEP, double &h_selfEnergy, double &h_selfEnergyFEP, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
int numAtoms
Definition: Molecule.h:586
void getVirial(double *virial)
cudaTextureObject_t * gridTexObjArrays
static constexpr int kPatchGridDimPad
NAMD_HOST_DEVICE Vector c_r() const
Definition: Lattice.h:286
const int kblock
void energyAndVirialDone(unsigned int iGrid)
Definition: CudaPmeSolver.C:61
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:53
bool dataSrcAllocated
double compute_selfEnergy(double *d_selfEnergy, const float4 *d_atoms, int natoms, double ewaldcof, cudaStream_t stream)
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
const int permutation
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
void spread_charge(const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
BigReal getCurrentLambda2(const int) const
BigReal getCurrentLambda(const int) const
void gatherForce(Lattice &lattice, CudaForce *force)
BigReal y
Definition: Vector.h:74
unsigned int multipleGridIndex
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void scaleAndMergeForceWrapper(float3 *forces, const float *factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream)
void setDataPtrsYZX(std::vector< float2 *> &dataPtrsNew, float2 *data)
const CudaLocalRecord * localRecords
static constexpr int kDim
void transposeXYZtoYZX(const float2 *data)
void setDataPtrsZXY(std::vector< float2 *> &dataPtrsNew, float2 *data)
float * dataDst
#define cudaCheck(stmt)
Definition: CudaUtils.h:242
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid=0)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
cufftHandle * forwardPlans
size_t alchGetNumOfPMEGrids() const
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
static constexpr int kThetaPad
void transposeXYZtoZXY(const float2 *data)
static constexpr int kPatchGridDim
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
float * dataSrc
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
void CcdCallBacksReset(void *ignored, double curWallTime)
Molecule * molecule
Definition: Node.h:179
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
int computeSharedMemoryPatchLevelGatherForce(const int numThreads, const int3 patchGridDim, const int order)
float2 * getBuffer(const int iblock)
int getShiftedGrid(const double x, const int grid)
EnergyVirial * h_energyVirials
unsigned int multipleGridIndex
double BigReal
Definition: common.h:123
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)
void checkPatchLevelDeviceCompatibility()
void compute(const Lattice &lattice, int doEnergyVirial, int step)