NAMD
CudaPmeSolverUtil.C
#include <stdio.h>
#include <algorithm>
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#ifdef NAMD_HIP
#include "HipDefines.h"
#include <hip/hip_runtime.h>
#endif
#include "ComputeNonbondedUtil.h"
#include "ComputePmeCUDAMgr.h"
#include "CudaPmeSolver.h"
#include "CudaPmeSolverUtil.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++

void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}

void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
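
// Illustrative usage (a sketch, not part of this file): the helpers above are
// debugging aids for dumping intermediate PME grids; the sizes below are
// examples only:
//
//   writeRealToDisk(data, xsize*ysize*zsize, "grid_real.txt", stream);
//   writeComplexToDisk((float2 *)data, (K1/2+1)*K2*K3, "grid_fft.txt", stream);
//
// Both synchronize 'stream' before writing, so they are safe to call between
// kernel launches at the cost of stalling the GPU pipeline.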
// Use cuFFT on NVIDIA architectures.
#ifdef NAMD_CUDA
void CudaFFTCompute::plan3D(int *n, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  setStream();
  // plantype = 3;
}

void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 2;
}

void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}
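
// Note on the plan types above: the x pencils start from real-space charge
// data, so plan1DX (like plan3D and plan2D) uses a real-to-complex forward
// plan and a complex-to-real backward plan, storing only n/2+1 complex
// elements along the transformed dimension. The y and z pencils operate on
// data that is already complex, so plan1DY and plan1DZ create C2C plans in
// both directions.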

CudaFFTCompute::~CudaFFTCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftDestroy(forwardPlan));
  cufftCheck(cufftDestroy(backwardPlan));
  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
  if (dataDstAllocated) deallocate_device<float>(&dataDst);
}

float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  cudaCheck(cudaSetDevice(deviceID));
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}

// int ncall = 0;

void CudaFFTCompute::forward() {
  cudaCheck(cudaSetDevice(deviceID));
  // ncall++;
  if (forwardType == CUFFT_R2C) {

    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));

    // if (ncall == 1) {
    //   writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
    // }

    // if (ncall == 1 && plantype == 2) {
    //   writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
    // }

  } else if (forwardType == CUFFT_C2C) {
    // nc2cf++;
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
    // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
    // if (ncall == 1 && plantype == 1 && isize == 62) {
    //   writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
    // }
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}

void CudaFFTCompute::backward() {
  cudaCheck(cudaSetDevice(deviceID));
  if (backwardType == CUFFT_C2R) {
    // if (ncall == 1) {
    //   if (plantype == 1)
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
    //   else
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
    // }

    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));

    // if (ncall == 1)
    //   if (plantype == 1)
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
    //   else
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
  } else if (backwardType == CUFFT_C2C) {
    // nc2cb++;
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}

void CudaFFTCompute::setStream() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}
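
// Lifecycle sketch for this cuFFT path (illustrative only; the grid size and
// call sequence below are assumptions, not code from this file):
//
//   int n[3] = {64, 64, 64};
//   fft.plan3D(n, 0);    // R2C forward / C2R backward plans, bound to stream
//   fft.forward();       // dataSrc (real) -> dataDst (complex)
//   ...                  // e.g. scalar_sum on the transformed grid
//   fft.backward();      // dataDst (complex) -> dataSrc (real)
//
// As in plain cuFFT, the inverse transform is unnormalized, so any required
// 1/N scaling must be applied elsewhere in the PME pipeline.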
// On AMD, cuFFT can't be called and hipFFT is missing a lot of functionality, so plan directly through rocFFT.
#else
void CudaFFTCompute::createPlans(
  rocfft_transform_type forwardTransformType, rocfft_transform_type backwardTransformType,
  size_t dimensions, const size_t* lengths, size_t transforms)
{
  rocfftCheck(
    rocfft_plan_create(
      &forwardPlan,
      rocfft_placement_notinplace,
      forwardTransformType,
      rocfft_precision_single,
      dimensions, lengths, transforms, nullptr
    )
  );
  rocfftCheck(
    rocfft_plan_create(
      &backwardPlan,
      rocfft_placement_notinplace,
      backwardTransformType,
      rocfft_precision_single,
      dimensions, lengths, transforms, nullptr
    )
  );

  rocfftCheck(rocfft_execution_info_create(&forwardPlanInfo));
  rocfftCheck(rocfft_execution_info_create(&backwardPlanInfo));

  size_t workBufferSize;
  rocfftCheck(rocfft_plan_get_work_buffer_size(forwardPlan, &workBufferSize));
  if (workBufferSize > 0) {
    cudaCheck(hipMalloc(&forwardWorkBuffer, workBufferSize));
    rocfftCheck(rocfft_execution_info_set_work_buffer(forwardPlanInfo, forwardWorkBuffer, workBufferSize));
  } else {
    forwardWorkBuffer = nullptr;
  }
  rocfftCheck(rocfft_plan_get_work_buffer_size(backwardPlan, &workBufferSize));
  if (workBufferSize > 0) {
    cudaCheck(hipMalloc(&backwardWorkBuffer, workBufferSize));
    rocfftCheck(rocfft_execution_info_set_work_buffer(backwardPlanInfo, backwardWorkBuffer, workBufferSize));
  } else {
    backwardWorkBuffer = nullptr;
  }
}

void CudaFFTCompute::plan3D(int *n, int flags) {
  cudaCheck(hipSetDevice(deviceID));
  size_t lengths[3] = { (size_t)n[0], (size_t)n[1], (size_t)n[2] };
  createPlans(
    rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
    3, lengths, 1
  );
  setStream();
}

void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  cudaCheck(hipSetDevice(deviceID));
  size_t lengths[2] = { (size_t)n[0], (size_t)n[1] };
  createPlans(
    rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
    2, lengths, howmany
  );
  setStream();
}

void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  cudaCheck(hipSetDevice(deviceID));
  size_t lengths[1] = { (size_t)n[0] };
  createPlans(
    rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
    1, lengths, howmany
  );
  setStream();
}

void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  cudaCheck(hipSetDevice(deviceID));
  size_t lengths[1] = { (size_t)n[0] };
  createPlans(
    rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse,
    1, lengths, howmany
  );
  setStream();
}

void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  cudaCheck(hipSetDevice(deviceID));
  size_t lengths[1] = { (size_t)n[0] };
  createPlans(
    rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse,
    1, lengths, howmany
  );
  setStream();
}

CudaFFTCompute::~CudaFFTCompute() {
  cudaCheck(hipSetDevice(deviceID));
  rocfftCheck(rocfft_plan_destroy(forwardPlan));
  rocfftCheck(rocfft_plan_destroy(backwardPlan));
  rocfftCheck(rocfft_execution_info_destroy(forwardPlanInfo));
  rocfftCheck(rocfft_execution_info_destroy(backwardPlanInfo));
  if (forwardWorkBuffer != nullptr)
  {
    cudaCheck(hipFree(forwardWorkBuffer));
  }
  if (backwardWorkBuffer != nullptr)
  {
    cudaCheck(hipFree(backwardWorkBuffer));
  }
  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
  if (dataDstAllocated) deallocate_device<float>(&dataDst);
}

float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  cudaCheck(hipSetDevice(deviceID));
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}

void CudaFFTCompute::forward() {
  cudaCheck(hipSetDevice(deviceID));
  rocfftCheck(rocfft_execute(forwardPlan, (void**)&dataSrc, (void**)&dataDst, forwardPlanInfo));
}

void CudaFFTCompute::backward() {
  cudaCheck(hipSetDevice(deviceID));
  rocfftCheck(rocfft_execute(backwardPlan, (void**)&dataDst, (void**)&dataSrc, backwardPlanInfo));
}

void CudaFFTCompute::setStream() {
  cudaCheck(hipSetDevice(deviceID));
  rocfftCheck(rocfft_execution_info_set_stream(forwardPlanInfo, stream));
  rocfftCheck(rocfft_execution_info_set_stream(backwardPlanInfo, stream));
}
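
// rocFFT differs from cuFFT in that a rocfft_plan only describes the
// transform, while per-execution state (stream, scratch space) lives on a
// rocfft_execution_info object. The sequence used by this class, in outline:
//
//   rocfft_plan_create(&plan, rocfft_placement_notinplace, type,
//                      rocfft_precision_single, dim, lengths, batch, nullptr);
//   rocfft_execution_info_create(&info);
//   rocfft_plan_get_work_buffer_size(plan, &bytes);
//   if (bytes > 0) {
//     hipMalloc(&buf, bytes);
//     rocfft_execution_info_set_work_buffer(info, buf, bytes);
//   }
//   rocfft_execution_info_set_stream(info, stream);
//   rocfft_execute(plan, (void**)&in, (void**)&out, info);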
#endif
//###########################################################################
//###########################################################################
//###########################################################################

CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream) :
  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa),
  deviceID(deviceID), stream(stream) {

  cudaCheck(cudaSetDevice(deviceID));

  // Copy bm1 -> prefac_x on GPU memory
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  // ncall = 0;
}

CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}

void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
#if 0
  // Check lattice to make sure it is updating for constant pressure
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
    lattice.a().x, lattice.a().y, lattice.a().z,
    lattice.b().x, lattice.b().y, lattice.b().z,
    lattice.c().x, lattice.c().y, lattice.c().z);
#endif
  cudaCheck(cudaSetDevice(deviceID));

  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  BigReal volume = lattice.volume();
  Vector a_r = lattice.a_r();
  Vector b_r = lattice.b_r();
  Vector c_r = lattice.c_r();
  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

  if (permutation == Perm_Z_cX_Y) {
    // Z, X, Y
    nfft1 = pmeGrid.K3;
    nfft2 = pmeGrid.K1;
    nfft3 = pmeGrid.K2;
    prefac1 = d_bm3;
    prefac2 = d_bm1;
    prefac3 = d_bm2;
    recip1x = c_r.z;
    recip1y = c_r.x;
    recip1z = c_r.y;
    recip2x = a_r.z;
    recip2y = a_r.x;
    recip2z = a_r.y;
    recip3x = b_r.z;
    recip3y = b_r.x;
    recip3z = b_r.y;
  } else if (permutation == Perm_cX_Y_Z) {
    // X, Y, Z
    nfft1 = pmeGrid.K1;
    nfft2 = pmeGrid.K2;
    nfft3 = pmeGrid.K3;
    prefac1 = d_bm1;
    prefac2 = d_bm2;
    prefac3 = d_bm3;
    recip1x = a_r.x;
    recip1y = a_r.y;
    recip1z = a_r.z;
    recip2x = b_r.x;
    recip2y = b_r.y;
    recip2z = b_r.z;
    recip3x = c_r.x;
    recip3y = c_r.y;
    recip3z = c_r.z;
  } else {
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  }

  // ncall++;
  // if (ncall == 1) {
  //   char filename[256];
  //   sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
  //   writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
  // }

  // if (ncall == 1) {
  //   float2* h_data = new float2[size1*size2*size3];
  //   float2* d_data = (float2*)data;
  //   copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
  //   cudaCheck(cudaStreamSynchronize(stream));
  //   FILE *handle = fopen("dataf.txt", "w");
  //   for (int z=0;z < pmeGrid.K3;z++) {
  //     for (int y=0;y < pmeGrid.K2;y++) {
  //       for (int x=0;x < pmeGrid.K1/2+1;x++) {
  //         int i;
  //         if (permutation == Perm_cX_Y_Z) {
  //           i = x + y*size1 + z*size1*size2;
  //         } else {
  //           i = z + x*size1 + y*size1*size2;
  //         }
  //         fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  //       }
  //     }
  //   }
  //   fclose(handle);
  //   delete [] h_data;
  // }

  // Clear energy and virial array if needed
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
    recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
    volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
    &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
    stream);

  // Copy energy and virial to host if needed
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
    // cudaCheck(cudaStreamSynchronize(stream));
  }

}

// void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
//   cudaCheck(cudaSetDevice(deviceID));
//   cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
// }

void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone();
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone();
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
        c->checkCount);
      cudaDie(errmsg,err);
    }
  } else {
    // Anything else is an error
    char errmsg[256];
    sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
      c->checkCount);
    cudaDie(errmsg,err);
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}
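
// This event-polling idiom recurs below and in CudaPmeRealSpaceCompute: the
// asynchronous GPU work is followed by cudaEventRecord() on the stream, and
// instead of blocking, a Charm++ timed callback re-checks the event until
// cudaEventQuery() returns cudaSuccess, then invokes the completion handler.
// In outline:
//
//   cudaEventRecord(event, stream);        // queued after the async work
//   CcdCallFnAfter(checkFn, this, 0.1);    // re-check in 0.1 ms, don't block
//   // checkFn: done ? handler() : re-arm CcdCallFnAfter(checkFn, arg, 0.1)
//
// This keeps the processing element responsive while the GPU finishes.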

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = pencilPtr;
  pencilZPtr = NULL;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = NULL;
  pencilZPtr = pencilPtr;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}

void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }
}
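
// Worked example of the mapping above: the kernel accumulates only the 6
// unique elements of the symmetric virial tensor, in permuted order. For
// Perm_Z_cX_Y the storage order is ZZ, ZX, ZY, XX, XY, YY (v[0]..v[5]), so
// the row-major 3x3 tensor is reassembled as
//
//   [ XX XY XZ ]   [ v[3] v[4] v[1] ]
//   [ YX YY YZ ] = [ v[4] v[5] v[2] ]
//   [ ZX ZY ZZ ]   [ v[1] v[2] v[0] ]
//
// which is exactly the assignment in the Perm_Z_cX_Y branch (note virial[7],
// the ZY element, must read v[2] to keep the tensor symmetric).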

//###########################################################################
//###########################################################################
//###########################################################################

//
// Class constructor
//
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  cudaCheck(cudaSetDevice(deviceID));
  d_atomsCapacity = 0;
  d_atoms = NULL;
  d_forceCapacity = 0;
  d_force = NULL;
#ifdef NAMD_CUDA
  tex_data = NULL;
  tex_data_len = 0;
#else
  grid_data = NULL;
  grid_data_len = 0;
#endif
  allocate_device<float>(&data, dataSize);
  setupGridData(data, xsize*ysize*zsize);
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}

//
// Class destructor
//
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
  // deallocate_device<double>(&d_selfEnergy);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}

// //
// // Copy patches and atoms to device memory
// //
// void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
//   const int numAtoms, const CudaAtom* atoms) {

//   this->numPatches = numPatches;
//   this->numAtoms = numAtoms;

//   // Reallocate device arrays as necessary
//   reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
//   reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);

//   // Copy atom and patch data to device
//   copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
//   copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
// }

//
// Copy atoms to device memory
//
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  cudaCheck(cudaSetDevice(deviceID));
  this->numAtoms = numAtoms;

  // Reallocate device arrays as necessary
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);

  // Copy atom data to device
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
}

//
// Spread charges on grid
//
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  cudaCheck(cudaSetDevice(deviceID));

  // Clear grid
  clear_device_array<float>(data, xsize*ysize*zsize, stream);

  spread_charge((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
    xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, stream);

  // ncall++;

  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
}
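
// spread_charge interpolates each atom's charge onto pmeGrid.order grid
// points per dimension around its position. The (xsize, ysize, zsize, y0, z0)
// arguments locate this pencil's sub-grid inside the full K1 x K2 x K3 charge
// grid, and the (pmeGrid.yBlocks == 1) / (pmeGrid.zBlocks == 1) flags tell
// the kernel whether it owns an entire dimension (and may wrap charges
// periodically) or only one block of a decomposed dimension.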

void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaCheck(cudaSetDevice(c->deviceID));

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
//    c->deviceProxy[CkMyNode()].gatherForceDone();
    c->devicePtr->gatherForceDone();
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
        c->checkCount);
      cudaDie(errmsg,err);
    }
  } else {
    // Anything else is an error
    char errmsg[256];
    sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
      c->checkCount);
    cudaDie(errmsg,err);
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}

void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  cudaCheck(cudaSetDevice(deviceID));
  devicePtr = devicePtr_in;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}

void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
#ifdef NAMD_CUDA
  // The texture path is CUDA-only: hipCreateTextureObject fails at runtime on
  // HIP, and the texture object is no longer needed there anyway. Example of
  // the failure:
  /*
  FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
  ------------- Processor 11 Exiting: Called CmiAbort ------------
  Reason: FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
  */
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Use texture objects
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
#else
  if (grid_data == data && grid_data_len == data_len) return;
  grid_data = data;
  grid_data_len = data_len;
#endif
}
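
// On the CUDA path the gather kernel reads the charge grid through this
// linear texture object; gather_force takes gridTexObj as a parameter and can
// fetch grid values with tex1Dfetch<float>(gridTexObj, idx). On HIP the
// texture path is disabled (see the quoted failure above), so only the raw
// pointer and length are recorded and the kernel reads global memory
// directly.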

void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  cudaCheck(cudaSetDevice(deviceID));

  // Re-allocate force array if needed
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  gather_force((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
    xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, (float3*)d_force,
#ifdef NAMD_CUDA
    gridTexObj,
#endif
    stream);

  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);

  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}

/*
double CudaPmeRealSpaceCompute::calcSelfEnergy() {
  double h_selfEnergy;
  clear_device_array<double>(d_selfEnergy, 1);
  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // 1.7724538509055160273 = sqrt(pi)
  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
  return h_selfEnergy;
}
*/

//###########################################################################
//###########################################################################
//###########################################################################

CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
  cudaCheck(cudaSetDevice(deviceID));

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}

CudaPmeTranspose::~CudaPmeTranspose() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}

//
// Set dataPtrsYZX
//
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }
  // Build batched data structures
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_Z_cX_Y:
      // ZXY -> XYZ
      isize_out = pmeGrid.K1/2+1;
    } else if (iperm == 1) {
      // Perm_cX_Y_Z:
      // XYZ -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> ZXY
      isize_out = pmeGrid.K3;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = jsize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx = nx;
      batch.ysize_out = width_out;
      batch.zsize_out = ksize;
      batch.data_in = data+x0;
      batch.data_out = data_out;

      h_batchesYZX[iperm*nblock + iblock] = batch;

      // transpose_xyz_yzx(
      //   nx, jsize, ksize,
      //   isize, jsize,
      //   width_out, ksize,
      //   data+x0, data_out, stream);
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}
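
// Layout note: h_batchesYZX (and h_batchesZXY below) hold 3*nblock entries
// indexed as iperm*nblock + iblock, i.e. for nblock == 2:
//
//   [ perm0/blk0, perm0/blk1, perm1/blk0, perm1/blk1, perm2/blk0, perm2/blk1 ]
//
// This lets transposeXYZtoYZX()/transposeXYZtoZXY() launch one batched kernel
// per permutation by passing batchesYZX + iperm*nblock together with the
// per-permutation max_nx computed here.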

//
// Set dataPtrsZXY
//
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Build batched data structures
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_cX_Y_Z:
      // XYZ -> ZXY
      isize_out = pmeGrid.K3;
    } else if (iperm == 1) {
      // Perm_Z_cX_Y:
      // ZXY -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> XYZ
      isize_out = pmeGrid.K1/2+1;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = ksize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx = nx;
      batch.zsize_out = width_out;
      batch.xsize_out = nx;
      batch.data_in = data+x0;
      batch.data_out = data_out;

      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}

void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
  case Perm_Z_cX_Y:
    // ZXY -> XYZ
    iperm = 0;
    break;
  case Perm_cX_Y_Z:
    // XYZ -> YZX
    iperm = 1;
    break;
  case Perm_Y_Z_cX:
    // YZX -> ZXY
    iperm = 2;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  batchTranspose_xyz_yzx(
    nblock, batchesYZX + iperm*nblock,
    max_nx_YZX[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
  case Perm_Z_cX_Y:
    // ZXY -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
  case Perm_cX_Y_Z:
    // XYZ -> YZX
    isize_out = pmeGrid.K2;
    break;
  case Perm_Y_Z_cX:
    // YZX -> ZXY
    isize_out = pmeGrid.K3;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsYZX[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = jsize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsYZX[iblock];
      width_out = isize_out;
    }

    transpose_xyz_yzx(
      nx, jsize, ksize,
      isize, jsize,
      width_out, ksize,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
  case Perm_cX_Y_Z:
    // XYZ -> ZXY
    iperm = 0;
    break;
  case Perm_Z_cX_Y:
    // ZXY -> YZX
    iperm = 1;
    break;
  case Perm_Y_Z_cX:
    // YZX -> XYZ
    iperm = 2;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  batchTranspose_xyz_zxy(
    nblock, batchesZXY + iperm*nblock,
    max_nx_ZXY[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
  case Perm_cX_Y_Z:
    // XYZ -> ZXY
    isize_out = pmeGrid.K3;
    break;
  case Perm_Z_cX_Y:
    // ZXY -> YZX
    isize_out = pmeGrid.K2;
    break;
  case Perm_Y_Z_cX:
    // YZX -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsZXY[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = ksize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsZXY[iblock];
      width_out = isize_out;
    }

    transpose_xyz_zxy(
      nx, jsize, ksize,
      isize, jsize,
      width_out, nx,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaStreamSynchronize(stream));
}

void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  int copySize = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}

void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

#ifndef P2P_ENABLE_3D
//
// Copy from temporary buffer to final buffer
//
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

//
// Return temporary buffer for block "iblock"
//
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
#endif

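// Peer-to-peer staging summary: without P2P_ENABLE_3D, strided 3D copies
// between GPUs are avoided. copyDataToPeerDevice() below pushes a contiguous
// ni*nj*nk chunk into the destination device's d_buffer (exposed via
// getBuffer()), and the receiving side then runs copyDataDeviceToDevice()
// locally to scatter that chunk into its final strided layout. With
// P2P_ENABLE_3D defined, a single copy3D_PeerDtoD() performs the strided
// transfer directly between devices.
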
void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
  const int iblock_out, const int jblock_out, const int kblock_out,
  int deviceID_out, int permutation_out, float2* data_out) {

  cudaCheck(cudaSetDevice(deviceID));

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // Copy into temporary peer device buffer
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
    0, 0, 0,
    ni, nj,
    0, 0, 0,
    isize_out, jsize_out,
    ni, nj, nk, stream);
#endif

}

#endif // NAMD_CUDA