NAMD
CudaPmeSolverUtil.C
1 #include <stdio.h>
2 #include <algorithm>
3 #include <cstdlib>
4 #ifdef NAMD_CUDA
5 #include <cuda_runtime.h>
6 #endif
7 #ifdef NAMD_HIP
8 #include <hip/hip_runtime.h>
9 #endif
10 #include "HipDefines.h"
11 #include "ComputeNonbondedUtil.h"
12 #include "ComputePmeCUDAMgr.h"
13 #include "CudaPmeSolver.h"
14 #include "CudaPmeSolverUtil.h"
15 #include "Node.h"
16 #include "PatchData.h"
17 
18 #include "NamdEventsProfiling.h"
19 #include "TestArray.h"
20 #include "DeviceCUDA.h"
21 
22 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
23 #if !defined(WIN64)
24 extern __thread DeviceCUDA *deviceCUDA;
25 #else
26 extern __declspec(thread) DeviceCUDA *deviceCUDA;
27 #endif
28 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime); // fix Charm++
29 
30 void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
31  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
32  float2* h_data = new float2[size];
33  copy_DtoH<float2>(d_data, h_data, size, stream);
34  cudaCheck(cudaStreamSynchronize(stream));
35  FILE *handle = fopen(filename, "w");
36  for (int i=0;i < size;i++)
37  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
38  fclose(handle);
39  delete [] h_data;
40 }
41 
42 void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
43  FILE *handle = fopen(filename, "w");
44  for (int i=0;i < size;i++)
45  fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
46  fclose(handle);
47 }
48 
49 void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
50  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
51  float* h_data = new float[size];
52  copy_DtoH<float>(d_data, h_data, size, stream);
53  cudaCheck(cudaStreamSynchronize(stream));
54  FILE *handle = fopen(filename, "w");
55  for (int i=0;i < size;i++)
56  fprintf(handle, "%f\n", h_data[i]);
57  fclose(handle);
58  delete [] h_data;
59 }
60 
61  CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream)
62  : deviceID(deviceID), stream(stream) {
63  }
64 
65 void CudaFFTCompute::plan3D(int *n, int flags) {
66  cudaCheck(cudaSetDevice(deviceID));
67  forwardType = CUFFT_R2C;
68  backwardType = CUFFT_C2R;
69  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
70  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
71  setStream();
72  // plantype = 3;
73 }
74 
75 void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
76  cudaCheck(cudaSetDevice(deviceID));
77  forwardType = CUFFT_R2C;
78  backwardType = CUFFT_C2R;
79  int nt[2] = {n[1], n[0]};
80  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
81  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
82  setStream();
83  // plantype = 2;
84 }
85 
86 void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
87  cudaCheck(cudaSetDevice(deviceID));
88  forwardType = CUFFT_R2C;
89  backwardType = CUFFT_C2R;
90  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
91  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
92  setStream();
93  // plantype = 1;
94 }
95 
96 void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
97  cudaCheck(cudaSetDevice(deviceID));
98  forwardType = CUFFT_C2C;
99  backwardType = CUFFT_C2C;
100  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
101  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
102  setStream();
103  // plantype = 1;
104 }
105 
106 void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
107  cudaCheck(cudaSetDevice(deviceID));
108  forwardType = CUFFT_C2C;
109  backwardType = CUFFT_C2C;
110  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
111  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
112  setStream();
113  // plantype = 1;
114 }
115 
116 CudaFFTCompute::~CudaFFTCompute() {
117  cudaCheck(cudaSetDevice(deviceID));
118  cufftCheck(cufftDestroy(forwardPlan));
119  cufftCheck(cufftDestroy(backwardPlan));
120  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
121  if (dataDstAllocated) deallocate_device<float>(&dataDst);
122 }
123 
124 float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
125  cudaCheck(cudaSetDevice(deviceID));
126  float* tmp = NULL;
127  allocate_device<float>(&tmp, dataSizeRequired);
128  return tmp;
129 }
130 
131 // int ncall = 0;
132 
133 void CudaFFTCompute::forward() {
134  cudaCheck(cudaSetDevice(deviceID));
135  // ncall++;
136  if (forwardType == CUFFT_R2C) {
137  cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));
138 #ifdef TESTPID
139  if (1) {
140  cudaCheck(cudaStreamSynchronize(stream));
141  fprintf(stderr, "AP FORWARD FFT\n");
142  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
143  int m = dataDstSize;
144  float *tran = 0;
145  allocate_host<float>(&tran, m);
146  copy_DtoH<float>(dataDst, tran, m, stream);
147  cudaCheck(cudaStreamSynchronize(stream));
148  TestArray_write<float>("tran_charge_grid_good.bin",
149  "transformed charge grid good", tran, m);
150  deallocate_host<float>(&tran);
151  }
152 #endif
153 
154  // if (ncall == 1) {
155  // writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
156  // }
157 
158  // if (ncall == 1 && plantype == 2) {
159  // writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
160  // }
161 
162  } else if (forwardType == CUFFT_C2C) {
163  // nc2cf++;
164  // if (ncall == 1 && nc2cf == 1)
165  // writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
166  // else if (ncall == 1 && nc2cf == 2)
167  // writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
168  cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
169  // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
170  // if (ncall == 1 && plantype == 1 && isize == 62) {
171  // writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
172  // }
173  // if (ncall == 1 && nc2cf == 1)
174  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
175  // else if (ncall == 1 && nc2cf == 2)
176  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
177  } else {
178  cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
179  }
180 }
181 
182 void CudaFFTCompute::backward() {
183  cudaCheck(cudaSetDevice(deviceID));
184  if (backwardType == CUFFT_C2R) {
185  // if (ncall == 1) {
186  // if (plantype == 1)
187  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
188  // else
189  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
190  // }
191  cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));
192 #ifdef TESTPID
193  if (1) {
194  cudaCheck(cudaStreamSynchronize(stream));
195  fprintf(stderr, "AP BACKWARD FFT\n");
196  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
197  float *grid;
198  int gridsize = dataSrcSize;
199  allocate_host<float>(&grid, gridsize);
200  copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
201  cudaCheck(cudaStreamSynchronize(stream));
202  TestArray_write<float>("potential_grid_good.bin",
203  "potential grid good", grid, gridsize);
204  deallocate_host<float>(&grid);
205  }
206 #endif
207 
208  // if (ncall == 1)
209  // if (plantype == 1)
210  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
211  // else
212  // writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
213  } else if (backwardType == CUFFT_C2C) {
214  // nc2cb++;
215  // if (ncall == 1 && nc2cb == 1)
216  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
217  // else if (ncall == 1 && nc2cb == 2)
218  // writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
219  cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
220  // if (ncall == 1 && nc2cb == 1)
221  // writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
222  // else if (ncall == 1 && nc2cb == 2)
223  // writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
224  } else {
225  cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
226  }
227 }
228 
229 void CudaFFTCompute::setStream() {
230  cudaCheck(cudaSetDevice(deviceID));
231  cufftCheck(cufftSetStream(forwardPlan, stream));
232  cufftCheck(cufftSetStream(backwardPlan, stream));
233 }
234 
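The plan* methods above build matched single-precision cuFFT plans (R2C forward / C2R backward along the real dimension, C2C both ways for the Y and Z pencils), and forward()/backward() execute them on the class's stream. Below is a minimal, self-contained sketch of the same round trip using the plain cuFFT API; the function and buffer names are illustrative, error checking is omitted, and note that cuFFT leaves the inverse transform unnormalized, so the round trip returns the input scaled by nx*ny*nz.

    #include <cuda_runtime.h>
    #include <cufft.h>

    // Illustrative R2C/C2R round trip on an nx*ny*nz grid (error checks omitted).
    void fftRoundTrip(float* d_real, cufftComplex* d_cplx,
                      int nx, int ny, int nz, cudaStream_t stream) {
      cufftHandle fwd, bwd;
      // cuFFT takes dimensions slowest-to-fastest; passing (nz, ny, nx) keeps
      // nx as the contiguous dimension, matching the plans created above.
      cufftPlan3d(&fwd, nz, ny, nx, CUFFT_R2C);
      cufftPlan3d(&bwd, nz, ny, nx, CUFFT_C2R);
      cufftSetStream(fwd, stream);
      cufftSetStream(bwd, stream);
      cufftExecR2C(fwd, d_real, d_cplx);   // real grid -> (nx/2+1)*ny*nz complex values
      cufftExecC2R(bwd, d_cplx, d_real);   // back to real space, scaled by nx*ny*nz
      cufftDestroy(fwd);
      cufftDestroy(bwd);
    }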
235 
236 CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
237  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid) :
238  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa, iGrid),
239  deviceID(deviceID), stream(stream) {
240 
241  cudaCheck(cudaSetDevice(deviceID));
242 
243  // Copy bm1 -> prefac_x on GPU memory
244  float *bm1f = new float[pmeGrid.K1];
245  float *bm2f = new float[pmeGrid.K2];
246  float *bm3f = new float[pmeGrid.K3];
247  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
248  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
249  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
250  allocate_device<float>(&d_bm1, pmeGrid.K1);
251  allocate_device<float>(&d_bm2, pmeGrid.K2);
252  allocate_device<float>(&d_bm3, pmeGrid.K3);
253  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
254  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
255  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
256  delete [] bm1f;
257  delete [] bm2f;
258  delete [] bm3f;
259  allocate_device<EnergyVirial>(&d_energyVirial, 1);
260  allocate_host<EnergyVirial>(&h_energyVirial, 1);
261  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
262  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
263  // ncall = 0;
264 }
265 
266 CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
267  cudaCheck(cudaSetDevice(deviceID));
268  deallocate_device<float>(&d_bm1);
269  deallocate_device<float>(&d_bm2);
270  deallocate_device<float>(&d_bm3);
271  deallocate_device<EnergyVirial>(&d_energyVirial);
272  deallocate_host<EnergyVirial>(&h_energyVirial);
273  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
274 }
275 
276 void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
277 #if 0
278  // Check lattice to make sure it is updating for constant pressure
279  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
280  lattice.a().x, lattice.a().y, lattice.a().z,
281  lattice.b().x, lattice.b().y, lattice.b().z,
282  lattice.c().x, lattice.c().y, lattice.c().z);
283 #endif
284  cudaCheck(cudaSetDevice(deviceID));
285 
286  const bool doEnergyVirial = (doEnergy || doVirial);
287 
288  int nfft1, nfft2, nfft3;
289  float *prefac1, *prefac2, *prefac3;
290 
291  BigReal volume = lattice.volume();
292  Vector a_r = lattice.a_r();
293  Vector b_r = lattice.b_r();
294  Vector c_r = lattice.c_r();
295  float recip1x, recip1y, recip1z;
296  float recip2x, recip2y, recip2z;
297  float recip3x, recip3y, recip3z;
298 
299  if (permutation == Perm_Z_cX_Y) {
300  // Z, X, Y
301  nfft1 = pmeGrid.K3;
302  nfft2 = pmeGrid.K1;
303  nfft3 = pmeGrid.K2;
304  prefac1 = d_bm3;
305  prefac2 = d_bm1;
306  prefac3 = d_bm2;
307  recip1x = c_r.z;
308  recip1y = c_r.x;
309  recip1z = c_r.y;
310  recip2x = a_r.z;
311  recip2y = a_r.x;
312  recip2z = a_r.y;
313  recip3x = b_r.z;
314  recip3y = b_r.x;
315  recip3z = b_r.y;
316  } else if (permutation == Perm_cX_Y_Z) {
317  // X, Y, Z
318  nfft1 = pmeGrid.K1;
319  nfft2 = pmeGrid.K2;
320  nfft3 = pmeGrid.K3;
321  prefac1 = d_bm1;
322  prefac2 = d_bm2;
323  prefac3 = d_bm3;
324  recip1x = a_r.x;
325  recip1y = a_r.y;
326  recip1z = a_r.z;
327  recip2x = b_r.x;
328  recip2y = b_r.y;
329  recip2z = b_r.z;
330  recip3x = c_r.x;
331  recip3y = c_r.y;
332  recip3z = c_r.z;
333  } else {
334  NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
335  }
336 
337  // ncall++;
338  // if (ncall == 1) {
339  // char filename[256];
340  // sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
341  // writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
342  // }
343 
344  // if (ncall == 1) {
345  // float2* h_data = new float2[size1*size2*size3];
346  // float2* d_data = (float2*)data;
347  // copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
348  // cudaCheck(cudaStreamSynchronize(stream));
349  // FILE *handle = fopen("dataf.txt", "w");
350  // for (int z=0;z < pmeGrid.K3;z++) {
351  // for (int y=0;y < pmeGrid.K2;y++) {
352  // for (int x=0;x < pmeGrid.K1/2+1;x++) {
353  // int i;
354  // if (permutation == Perm_cX_Y_Z) {
355  // i = x + y*size1 + z*size1*size2;
356  // } else {
357  // i = z + x*size1 + y*size1*size2;
358  // }
359  // fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
360  // }
361  // }
362  // }
363  // fclose(handle);
364  // delete [] h_data;
365  // }
366 
367  // Clear energy and virial array if needed
368  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
369 
370 #ifdef TESTPID
371  if (1) {
372  cudaCheck(cudaStreamSynchronize(stream));
373  fprintf(stderr, "AP calling scalar sum\n");
374  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
375  (permutation == Perm_cX_Y_Z ? "true" : "false"));
376  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
377  fprintf(stderr, "size1=%d size2=%d size3=%d\n", size1, size2, size3);
378  fprintf(stderr, "kappa=%g\n", kappa);
379  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
380  (double)recip1x, (double)recip1y, (double)recip1z);
381  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
382  (double)recip2x, (double)recip2y, (double)recip2z);
383  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
384  (double)recip3x, (double)recip3y, (double)recip3z);
385  fprintf(stderr, "volume=%g\n", volume);
386  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
387  float *bm1, *bm2, *bm3;
388  allocate_host<float>(&bm1, nfft1);
389  allocate_host<float>(&bm2, nfft2);
390  allocate_host<float>(&bm3, nfft3);
391  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
392  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
393  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
394  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good",
395  bm1, nfft1);
396  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good",
397  bm2, nfft2);
398  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good",
399  bm3, nfft3);
400  deallocate_host<float>(&bm1);
401  deallocate_host<float>(&bm2);
402  deallocate_host<float>(&bm3);
403  }
404 #endif
405 
406  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
407  recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
408  volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
409  &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
410  stream);
411 #ifdef TESTPID
412  if (1) {
413  cudaCheck(cudaStreamSynchronize(stream));
414  fprintf(stderr, "AP SCALAR SUM\n");
415  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
416  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
417  float *tran = 0;
418  allocate_host<float>(&tran, m);
419  copy_DtoH<float>((float*)data, tran, m, stream);
420  cudaCheck(cudaStreamSynchronize(stream));
421  TestArray_write<float>("tran_potential_grid_good.bin",
422  "transformed potential grid good", tran, m);
423  deallocate_host<float>(&tran);
424  }
425 #endif
426 
427  // Copy energy and virial to host if needed
428  if (doEnergyVirial) {
429  copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
430  cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
431  // cudaCheck(cudaStreamSynchronize(stream));
432  }
433 
434 }
435 
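solve() above permutes the reciprocal lattice vectors and B-spline prefactors into the current pencil's transform order and then calls scalar_sum, which weights every transformed grid value by, in essence, the smooth-PME reciprocal-space factor of Essmann et al. (1995). The host-side sketch below shows that per-mode factor; the function name and signature are illustrative, not NAMD's API. The reciprocal-space energy is then the sum over modes of factor(m) * B1(m1)*B2(m2)*B3(m3) * |S(m)|^2, where S is the FFT of the spread charge grid and B1..B3 are the per-dimension B-spline moduli held in prefac1..prefac3.

    #include <cmath>

    // Textbook smooth-PME reciprocal-space weight for one mesh mode (m1,m2,m3).
    // Mode indices above K/2 wrap to negative frequencies (m -> m - K) before
    // this is evaluated.  All names here are illustrative.
    double pmeReciprocalFactor(int m1, int m2, int m3,
                               const double recip1[3], const double recip2[3],
                               const double recip3[3],
                               double kappa, double volume) {
      const double pi = 3.14159265358979323846;
      // m = m1*a_r + m2*b_r + m3*c_r, the reciprocal lattice vector of this mode
      double mx = m1*recip1[0] + m2*recip2[0] + m3*recip3[0];
      double my = m1*recip1[1] + m2*recip2[1] + m3*recip3[1];
      double mz = m1*recip1[2] + m2*recip2[2] + m3*recip3[2];
      double msq = mx*mx + my*my + mz*mz;
      if (msq == 0.0) return 0.0;  // the m = 0 term is excluded
      return std::exp(-pi*pi*msq/(kappa*kappa)) / (2.0*pi*volume*msq);
    }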
436 // void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
437 // cudaCheck(cudaSetDevice(deviceID));
438 // cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
439 // }
440 
441 void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
442  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;
443 
444  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
445  if (err == cudaSuccess) {
446  // Event has occurred
447  c->checkCount = 0;
448  if (c->pencilXYZPtr != NULL)
449  c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
450  else if (c->pencilZPtr != NULL)
451  c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
452  else
453  NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
454  return;
455  } else if (err == cudaErrorNotReady) {
456  // Event has not occurred
457  c->checkCount++;
458  if (c->checkCount >= 1000000) {
459  char errmsg[256];
460  sprintf(errmsg,"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
461  c->checkCount);
462  cudaDie(errmsg,err);
463  }
464  } else {
465  // Anything else is an error
466  char errmsg[256];
467  sprintf(errmsg,"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
468  c->checkCount);
469  cudaDie(errmsg,err);
470  }
471 
472  // Call again
473  CcdCallBacksReset(0, walltime);
474  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
475 }
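energyAndVirialCheck() above (and cuda_gatherforce_check() later in this file) follow the same non-blocking pattern: poll a recorded CUDA event from a Charm++ timed callback and fire the completion handler only once the event has completed, rescheduling otherwise. Below is a stripped-down sketch of the idiom, reusing helpers this file already uses (cudaEventQuery, cudaDie, NAMD_bug, CcdCallBacksReset, CcdCallFnAfter); the PollState struct and onDone callback are illustrative names, not NAMD types.

    // Generic event-polling callback: query the event, finish on success,
    // otherwise re-arm the timed callback.
    struct PollState {
      cudaEvent_t event;        // event recorded on the stream being waited on
      int checkCount;           // number of unsuccessful polls so far
      void (*onDone)(void*);    // completion handler (illustrative)
      void* userData;
    };

    static void pollEvent(void* arg, double walltime) {
      PollState* s = (PollState*)arg;
      cudaError_t err = cudaEventQuery(s->event);
      if (err == cudaSuccess) {
        s->checkCount = 0;
        s->onDone(s->userData);                 // GPU work has completed
        return;
      } else if (err != cudaErrorNotReady) {
        cudaDie("pollEvent, cudaEventQuery failed", err);
      }
      if (++s->checkCount >= 1000000)
        NAMD_bug("pollEvent polled too many times");
      CcdCallBacksReset(0, walltime);
      CcdCallFnAfter(pollEvent, arg, 0.1);      // poll again after 0.1 ms
    }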
476 
477 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
478  cudaCheck(cudaSetDevice(deviceID));
479  pencilXYZPtr = pencilPtr;
480  pencilZPtr = NULL;
481  checkCount = 0;
482  CcdCallBacksReset(0, CmiWallTimer());
483  // Set the call back at 0.1ms
484  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
485 }
486 
487 void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
488  cudaCheck(cudaSetDevice(deviceID));
489  pencilXYZPtr = NULL;
490  pencilZPtr = pencilPtr;
491  checkCount = 0;
492  CcdCallBacksReset(0, CmiWallTimer());
493  // Set the call back at 0.1ms
494  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
495 }
496 
497 double CudaPmeKSpaceCompute::getEnergy() {
498  return h_energyVirial->energy;
499 }
500 
501 void CudaPmeKSpaceCompute::getVirial(double *virial) {
502  if (permutation == Perm_Z_cX_Y) {
503  // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
504  virial[0] = h_energyVirial->virial[3];
505  virial[1] = h_energyVirial->virial[4];
506  virial[2] = h_energyVirial->virial[1];
507 
508  virial[3] = h_energyVirial->virial[4];
509  virial[4] = h_energyVirial->virial[5];
510  virial[5] = h_energyVirial->virial[2];
511 
512  virial[6] = h_energyVirial->virial[1];
513  virial[7] = h_energyVirial->virial[2];
514  virial[8] = h_energyVirial->virial[0];
515  } else if (permutation == Perm_cX_Y_Z) {
516  // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
517  virial[0] = h_energyVirial->virial[0];
518  virial[1] = h_energyVirial->virial[1];
519  virial[2] = h_energyVirial->virial[2];
520 
521  virial[3] = h_energyVirial->virial[1];
522  virial[4] = h_energyVirial->virial[3];
523  virial[5] = h_energyVirial->virial[4];
524 
525  virial[6] = h_energyVirial->virial[2];
526  virial[7] = h_energyVirial->virial[4];
527  virial[8] = h_energyVirial->virial[5];
528  }
529 #if 0
530  fprintf(stderr, "AP PME VIRIAL =\n"
531  " %g %g %g\n %g %g %g\n %g %g %g\n",
532  virial[0], virial[1], virial[2], virial[3], virial[4],
533  virial[5], virial[6], virial[7], virial[8]);
534 #endif
535 }
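The device kernel accumulates only the six independent components of the symmetric PME virial; getVirial() above expands them into the full row-major 3x3 tensor, remapping indices when the pencil data were permuted. Below is a minimal sketch of that expansion for the Perm_cX_Y_Z storage order (XX, XY, XZ, YY, YZ, ZZ); expandSymmetricVirial is an illustrative helper, not part of NAMD.

    // Expand 6 stored components of a symmetric tensor into a 3x3 row-major matrix.
    inline void expandSymmetricVirial(const double v6[6], double v9[9]) {
      v9[0] = v6[0];  v9[1] = v6[1];  v9[2] = v6[2];   // XX XY XZ
      v9[3] = v6[1];  v9[4] = v6[3];  v9[5] = v6[4];   // YX YY YZ
      v9[6] = v6[2];  v9[7] = v6[4];  v9[8] = v6[5];   // ZX ZY ZZ
    }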
536 
537 
538 //###########################################################################
539 //###########################################################################
540 //###########################################################################
541 
542 //
543 // Class constructor
544 //
545 CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
546  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
547  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
548  if (dataSize < xsize*ysize*zsize)
549  NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
550  cudaCheck(cudaSetDevice(deviceID));
551  d_atomsCapacity = 0;
552  d_atoms = NULL;
553  d_forceCapacity = 0;
554  d_force = NULL;
555  #ifdef NAMD_CUDA
556  tex_data = NULL;
557  tex_data_len = 0;
558  #else
559  grid_data = NULL;
560  grid_data_len = 0;
561  #endif
562  allocate_device<float>(&data, dataSize);
563  setupGridData(data, xsize*ysize*zsize);
564  cudaCheck(cudaEventCreate(&gatherForceEvent));
565 }
566 
567 //
568 // Class destructor
569 //
570 CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
571  cudaCheck(cudaSetDevice(deviceID));
572  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
573  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
574  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
575  // deallocate_device<double>(&d_selfEnergy);
576  deallocate_device<float>(&data);
577  cudaCheck(cudaEventDestroy(gatherForceEvent));
578 }
579 
580 // //
581 // // Copy patches and atoms to device memory
582 // //
583 // void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
584 // const int numAtoms, const CudaAtom* atoms) {
585 
586 // this->numPatches = numPatches;
587 // this->numAtoms = numAtoms;
588 
589 // // Reallocate device arrays as necessary
590 // reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
591 // reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);
592 
593 // // Copy atom and patch data to device
594 // copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
595 // copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
596 // }
597 
598 //
599 // Copy atoms to device memory
600 //
601 void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
602  cudaCheck(cudaSetDevice(deviceID));
603  this->numAtoms = numAtoms;
604 
605  // Reallocate device arrays as necessary
606  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
607 
608  // Copy atom data to device
609  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
610 }
611 
612 //
613 // Spread charges on grid
614 //
615 void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
616  cudaCheck(cudaSetDevice(deviceID));
617 #if 0
618  if (1) {
619  static int step = 0;
620  float *xyzq;
621  int natoms = numAtoms;
622  allocate_host<float>(&xyzq, 4*natoms);
623  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
624  cudaCheck(cudaStreamSynchronize(stream));
625  char fname[64], remark[64];
626  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
627  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
628  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
629  deallocate_host<float>(&xyzq);
630  step += 2;
631  }
632 #endif
633 
634  NAMD_EVENT_START(1, NamdProfileEvent::SPREAD_CHARGE);
635 
636  // Clear grid
637  clear_device_array<float>(data, xsize*ysize*zsize, stream);
638 
639 #if defined(TESTPID)
640  fprintf(stderr, "Calling spread_charge with parameters:\n");
641  fprintf(stderr, "numAtoms = %d\n", numAtoms);
642  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
643  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
644  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
645  fprintf(stderr, "xsize = %d\n", xsize);
646  fprintf(stderr, "ysize = %d\n", ysize);
647  fprintf(stderr, "zsize = %d\n", zsize);
648  fprintf(stderr, "y0 = %d\n", y0);
649  fprintf(stderr, "z0 = %d\n", z0);
650  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
651  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
652  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
653 #endif
654  spread_charge((const float4*)d_atoms, numAtoms,
655  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
656  xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
657  data, pmeGrid.order, stream);
658 #ifdef TESTPID
659  if (1) {
660  cudaCheck(cudaStreamSynchronize(stream));
661  fprintf(stderr, "AP SPREAD CHARGES\n");
662  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
663  float *xyzq;
664  allocate_host<float>(&xyzq, 4*numAtoms);
665  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
666  int gridlen = pmeGrid.K1 * pmeGrid.K2 * pmeGrid.K3;
667  float *grid;
668  allocate_host<float>(&grid, gridlen);
669  copy_DtoH<float>(data, grid, gridlen, stream);
670  cudaCheck(cudaStreamSynchronize(stream));
671  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
672  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
673  grid, gridlen);
674  deallocate_host<float>(&xyzq);
675  deallocate_host<float>(&grid);
676  }
677 #endif
678 
679  // ncall++;
680 
681  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
682  NAMD_EVENT_STOP(1, NamdProfileEvent::SPREAD_CHARGE);
683 }
684 
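spreadCharge() above deposits each charge onto order^3 neighboring grid points, weighted per dimension by cardinal B-spline values of the atom's fractional grid coordinate. The host-side sketch below shows the standard B-spline weight recursion such a spreading kernel relies on (pmeGrid.order is typically 4 or 8); bsplineWeights is an illustrative name and theta must hold at least order entries.

    // Cardinal B-spline weights for fractional offset w in [0,1) at the given
    // interpolation order (textbook recursion; names are illustrative).
    void bsplineWeights(float w, int order, float* theta) {
      theta[order-1] = 0.0f;
      theta[1] = w;
      theta[0] = 1.0f - w;
      for (int k = 3; k <= order; k++) {
        float div = 1.0f / (float)(k - 1);
        theta[k-1] = div * w * theta[k-2];
        for (int j = 1; j <= k-2; j++)
          theta[k-j-1] = div * ((w + j) * theta[k-j-2] + (k - j - w) * theta[k-j-1]);
        theta[0] = div * (1.0f - w) * theta[0];
      }
    }
    // Each charge q is then added to the grid as q * thetaX[i] * thetaY[j] * thetaZ[k]
    // over the order^3 surrounding grid points.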
685 void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
686  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
687  cudaCheck(cudaSetDevice(c->deviceID));
688 
689  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
690  if (err == cudaSuccess) {
691  // Event has occurred
692  c->checkCount = 0;
693 // c->deviceProxy[CkMyNode()].gatherForceDone();
694  c->devicePtr->gatherForceDone(c->multipleGridIndex);
695  return;
696  } else if (err == cudaErrorNotReady) {
697  // Event has not occurred
698  c->checkCount++;
699  if (c->checkCount >= 1000000) {
700  char errmsg[256];
701  sprintf(errmsg,"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
702  c->checkCount);
703  cudaDie(errmsg,err);
704  }
705  } else {
706  // Anything else is an error
707  char errmsg[256];
708  sprintf(errmsg,"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
709  c->checkCount);
710  cudaDie(errmsg,err);
711  }
712 
713  // Call again
714  CcdCallBacksReset(0, walltime);
715  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
716 }
717 
718 void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
719  cudaCheck(cudaSetDevice(deviceID));
720  devicePtr = devicePtr_in;
721  checkCount = 0;
722  CcdCallBacksReset(0, CmiWallTimer());
723  // Set the call back at 0.1ms
724  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
725 }
726 
727 void CudaPmeRealSpaceCompute::waitGatherForceDone() {
728  cudaCheck(cudaSetDevice(deviceID));
729  cudaCheck(cudaEventSynchronize(gatherForceEvent));
730 }
731 
732 void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
733  #ifdef NAMD_CUDA
734  // hipCreateTextureObject fails at runtime under HIP; the texture object is no longer needed there anyway, so the HIP build bypasses it. Sample failure for reference:
735  /*
736 
737  FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
738  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
739 ------------- Processor 11 Exiting: Called CmiAbort ------------
740 Reason: FATAL ERROR: CUDA error hipCreateTextureObject(&gridTexObj, &desc, &tdesc, NULL) in file src/CudaPmeSolverUtil.C, function setupGridTexture, line 744
741  on Pe 11 (jparada-MS-7B09 device 0 pci 0:43:0): hipErrorRuntimeOther
742 
743  */
744  if (tex_data == data && tex_data_len == data_len) return;
745  tex_data = data;
746  tex_data_len = data_len;
747  // Use texture objects
748  cudaResourceDesc resDesc;
749  memset(&resDesc, 0, sizeof(resDesc));
750  resDesc.resType = cudaResourceTypeLinear;
751  resDesc.res.linear.devPtr = data;
752  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
753  resDesc.res.linear.desc.x = sizeof(float)*8;
754  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
755  cudaTextureDesc texDesc;
756  memset(&texDesc, 0, sizeof(texDesc));
757  texDesc.readMode = cudaReadModeElementType;
758  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
759  #else
760  if (grid_data == data && grid_data_len == data_len) return;
761  grid_data = data;
762  grid_data_len = data_len;
763  #endif
764 }
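On CUDA builds, setupGridData() above wraps the real-space grid in a linear texture object so the force-gathering kernel can read it through the texture cache; the HIP build just keeps the raw grid_data pointer. Below is a minimal sketch of how a kernel consumes such a texture object (the kernel and its launch are illustrative, not NAMD's gather kernel).

    // Sum all grid values through a linear float texture object such as gridTexObj.
    __global__ void sumGridTex(cudaTextureObject_t tex, int n, float* out) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float v = tex1Dfetch<float>(tex, i);  // element i, read via the texture path
        atomicAdd(out, v);
      }
    }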
765 
766 void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
767  cudaCheck(cudaSetDevice(deviceID));
768 
769  NAMD_EVENT_START(1, NamdProfileEvent::GATHER_FORCE);
770 
771  // Re-allocate force array if needed
772  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
773 
774 #ifdef TESTPID
775  if (1) {
776  fprintf(stderr, "AP gather force arguments\n");
777  fprintf(stderr, "numAtoms = %d\n", numAtoms);
778  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
779  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
780  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
781  fprintf(stderr, "xsize = %d\n", xsize);
782  fprintf(stderr, "ysize = %d\n", ysize);
783  fprintf(stderr, "zsize = %d\n", zsize);
784  fprintf(stderr, "y0 = %d\n", y0);
785  fprintf(stderr, "z0 = %d\n", z0);
786  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
787  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
788  fprintf(stderr, "pmeGrid.order = %d\n", pmeGrid.order);
789  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
790  }
791 #endif
792  // The patch-level PME kernels are only used for the GPU-resident code path. The default constructor
793  // of PatchLevelPmeData will initialize the compatibility variables to false, so the patch-level kernels
794  // won't be used here.
795  PatchLevelPmeData patchLevelPmeData;
796  gather_force(patchLevelPmeData,
797  (const float4*)d_atoms, numAtoms,
798  pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
799  xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
800  data, pmeGrid.order, (float3*)d_force,
801 #ifdef NAMD_CUDA
802  gridTexObj,
803 #endif
804  stream);
805 #ifdef TESTPID
806  if (1) {
807  cudaCheck(cudaStreamSynchronize(stream));
808  fprintf(stderr, "AP GATHER FORCE\n");
809  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
810  float *xyz;
811  int natoms = numAtoms;
812  allocate_host<float>(&xyz, 3*natoms);
813  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
814  cudaCheck(cudaStreamSynchronize(stream));
815  TestArray_write<float>("gather_force_good.bin",
816  "gather force good", xyz, 3*natoms);
817  deallocate_host<float>(&xyz);
818  }
819 #endif
820 
821  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
822 
823  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
824 
825  NAMD_EVENT_STOP(1, NamdProfileEvent::GATHER_FORCE);
826 }
827 
828 /*
829 double CudaPmeRealSpaceCompute::calcSelfEnergy() {
830  double h_selfEnergy;
831  clear_device_array<double>(d_selfEnergy, 1);
832  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
833  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
834  cudaCheck(cudaStreamSynchronize(stream));
835  // 1.7724538509055160273 = sqrt(pi)
836  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
837  return h_selfEnergy;
838 }
839 */
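The commented-out calcSelfEnergy() above applies the standard Ewald self-energy correction, E_self = -kappa/sqrt(pi) * sum_i q_i^2, with kappa = ewaldcof. Below is a host-side sketch of the same formula (ewaldSelfEnergy and its arguments are illustrative names).

    // Ewald self-energy correction: -kappa/sqrt(pi) * sum of squared charges.
    double ewaldSelfEnergy(const float* charges, int n, double kappa) {
      double q2sum = 0.0;
      for (int i = 0; i < n; i++) q2sum += (double)charges[i] * charges[i];
      return -kappa * q2sum / 1.7724538509055160273;  // 1.77245... = sqrt(pi)
    }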
840 
841 //###########################################################################
842 //###########################################################################
843 //###########################################################################
844 
845 CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
846  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
847  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
848  cudaCheck(cudaSetDevice(deviceID));
849 
850  allocate_device<float2>(&d_data, dataSize);
851 #ifndef P2P_ENABLE_3D
852  allocate_device<float2>(&d_buffer, dataSize);
853 #endif
854 
855  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
856  dataPtrsYZX.resize(nblock, NULL);
857  dataPtrsZXY.resize(nblock, NULL);
858 
859  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
860  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
861 }
862 
863 CudaPmeTranspose::~CudaPmeTranspose() {
864  cudaCheck(cudaSetDevice(deviceID));
865  deallocate_device<float2>(&d_data);
866 #ifndef P2P_ENABLE_3D
867  deallocate_device<float2>(&d_buffer);
868 #endif
869  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
870  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
871 }
872 
873 //
874 // Set dataPtrsYZX
875 //
876 void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
877  if (dataPtrsYZX.size() != dataPtrsNew.size())
878  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
879  for (int iblock=0;iblock < nblock;iblock++) {
880  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
881  }
882  // Build batched data structures
883  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
884 
885  for (int iperm=0;iperm < 3;iperm++) {
886  int isize_out;
887  if (iperm == 0) {
888  // Perm_Z_cX_Y:
889  // ZXY -> XYZ
890  isize_out = pmeGrid.K1/2+1;
891  } else if (iperm == 1) {
892  // Perm_cX_Y_Z:
893  // XYZ -> YZX
894  isize_out = pmeGrid.K2;
895  } else {
896  // Perm_Y_Z_cX:
897  // YZX -> ZXY
898  isize_out = pmeGrid.K3;
899  }
900 
901  int max_nx = 0;
902  for (int iblock=0;iblock < nblock;iblock++) {
903 
904  int x0 = pos[iblock];
905  int nx = pos[iblock+1] - x0;
906  max_nx = std::max(max_nx, nx);
907 
908  int width_out;
909  float2* data_out;
910  if (dataPtrsYZX[iblock] == NULL) {
911  // Local transpose, use internal buffer
912  data_out = d_data + jsize*ksize*x0;
913  width_out = jsize;
914  } else {
915  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
916  data_out = dataPtrsYZX[iblock];
917  width_out = isize_out;
918  }
919 
920  TransposeBatch<float2> batch;
921  batch.nx = nx;
922  batch.ysize_out = width_out;
923  batch.zsize_out = ksize;
924  batch.data_in = data+x0;
925  batch.data_out = data_out;
926 
927  h_batchesYZX[iperm*nblock + iblock] = batch;
928 
929  // transpose_xyz_yzx(
930  // nx, jsize, ksize,
931  // isize, jsize,
932  // width_out, ksize,
933  // data+x0, data_out, stream);
934  }
935 
936  max_nx_YZX[iperm] = max_nx;
937  }
938 
939  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
940  cudaCheck(cudaStreamSynchronize(stream));
941  delete [] h_batchesYZX;
942 }
943 
944 //
945 // Set dataPtrsZXY
946 //
947 void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
948  if (dataPtrsZXY.size() != dataPtrsNew.size())
949  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
950  for (int iblock=0;iblock < nblock;iblock++) {
951  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
952  }
953 
954  // Build batched data structures
955  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
956 
957  for (int iperm=0;iperm < 3;iperm++) {
958  int isize_out;
959  if (iperm == 0) {
960  // Perm_cX_Y_Z:
961  // XYZ -> ZXY
962  isize_out = pmeGrid.K3;
963  } else if (iperm == 1) {
964  // Perm_Z_cX_Y:
965  // ZXY -> YZX
966  isize_out = pmeGrid.K2;
967  } else {
968  // Perm_Y_Z_cX:
969  // YZX -> XYZ
970  isize_out = pmeGrid.K1/2+1;
971  }
972 
973  int max_nx = 0;
974  for (int iblock=0;iblock < nblock;iblock++) {
975 
976  int x0 = pos[iblock];
977  int nx = pos[iblock+1] - x0;
978  max_nx = std::max(max_nx, nx);
979 
980  int width_out;
981  float2* data_out;
982  if (dataPtrsZXY[iblock] == NULL) {
983  // Local transpose, use internal buffer
984  data_out = d_data + jsize*ksize*x0;
985  width_out = ksize;
986  } else {
987  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
988  data_out = dataPtrsZXY[iblock];
989  width_out = isize_out;
990  }
991 
992  TransposeBatch<float2> batch;
993  batch.nx = nx;
994  batch.zsize_out = width_out;
995  batch.xsize_out = nx;
996  batch.data_in = data+x0;
997  batch.data_out = data_out;
998  h_batchesZXY[iperm*nblock + iblock] = batch;
999  }
1000 
1001  max_nx_ZXY[iperm] = max_nx;
1002  }
1003 
1004  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1005  cudaCheck(cudaStreamSynchronize(stream));
1006  delete [] h_batchesZXY;
1007 }
1008 
1009 void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
1010  cudaCheck(cudaSetDevice(deviceID));
1011 
1012  int iperm;
1013  switch(permutation) {
1014  case Perm_Z_cX_Y:
1015  // ZXY -> XYZ
1016  iperm = 0;
1017  break;
1018  case Perm_cX_Y_Z:
1019  // XYZ -> YZX
1020  iperm = 1;
1021  break;
1022  case Perm_Y_Z_cX:
1023  // YZX -> ZXY
1024  iperm = 2;
1025  break;
1026  default:
1027  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1028  break;
1029  }
1030 
1031  batchTranspose_xyz_yzx(
1032  nblock, batchesYZX + iperm*nblock,
1033  max_nx_YZX[iperm], jsize, ksize,
1034  isize, jsize, stream);
1035 
1036 
1037 /*
1038  int isize_out;
1039  switch(permutation) {
1040  case Perm_Z_cX_Y:
1041  // ZXY -> XYZ
1042  isize_out = pmeGrid.K1/2+1;
1043  break;
1044  case Perm_cX_Y_Z:
1045  // XYZ -> YZX
1046  isize_out = pmeGrid.K2;
1047  break;
1048  case Perm_Y_Z_cX:
1049  // YZX -> ZXY
1050  isize_out = pmeGrid.K3;
1051  break;
1052  default:
1053  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1054  break;
1055  }
1056 
1057  for (int iblock=0;iblock < nblock;iblock++) {
1058 
1059  int x0 = pos[iblock];
1060  int nx = pos[iblock+1] - x0;
1061 
1062  int width_out;
1063  float2* data_out;
1064  if (dataPtrsYZX[iblock] == NULL) {
1065  // Local transpose, use internal buffer
1066  data_out = d_data + jsize*ksize*x0;
1067  width_out = jsize;
1068  } else {
1069  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1070  data_out = dataPtrsYZX[iblock];
1071  width_out = isize_out;
1072  }
1073 
1074  transpose_xyz_yzx(
1075  nx, jsize, ksize,
1076  isize, jsize,
1077  width_out, ksize,
1078  data+x0, data_out, stream);
1079  }
1080 */
1081 }
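transposeXYZtoYZX() above launches a batched device transpose; the commented-out loop shows the per-block form it replaced. For reference, the host sketch below spells out the index mapping a single xyz -> yzx transpose of an nx*ny*nz complex block performs (the function and stride names are illustrative).

    #include <cuda_runtime.h>  // for float2

    // Reference xyz -> yzx transpose of an nx*ny*nz block: the output's fastest
    // index is y, then z, then x.
    void transpose_xyz_yzx_host(const float2* in, float2* out,
                                int nx, int ny, int nz,
                                int xsize_in, int ysize_in,     // input leading dimensions
                                int ysize_out, int zsize_out) { // output leading dimensions
      for (int z = 0; z < nz; z++)
        for (int y = 0; y < ny; y++)
          for (int x = 0; x < nx; x++)
            out[y + z*ysize_out + x*ysize_out*zsize_out] =
                in[x + y*xsize_in + z*xsize_in*ysize_in];
    }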
1082 
1083 void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
1084  cudaCheck(cudaSetDevice(deviceID));
1085 
1086  int iperm;
1087  switch(permutation) {
1088  case Perm_cX_Y_Z:
1089  // XYZ -> ZXY
1090  iperm = 0;
1091  break;
1092  case Perm_Z_cX_Y:
1093  // ZXY -> YZX
1094  iperm = 1;
1095  break;
1096  case Perm_Y_Z_cX:
1097  // YZX -> XYZ
1098  iperm = 2;
1099  break;
1100  default:
1101  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1102  break;
1103  }
1104 
1105  batchTranspose_xyz_zxy(
1106  nblock, batchesZXY + iperm*nblock,
1107  max_nx_ZXY[iperm], jsize, ksize,
1108  isize, jsize, stream);
1109 
1110 /*
1111  int isize_out;
1112  switch(permutation) {
1113  case Perm_cX_Y_Z:
1114  // XYZ -> ZXY
1115  isize_out = pmeGrid.K3;
1116  break;
1117  case Perm_Z_cX_Y:
1118  // ZXY -> YZX
1119  isize_out = pmeGrid.K2;
1120  break;
1121  case Perm_Y_Z_cX:
1122  // YZX -> XYZ
1123  isize_out = pmeGrid.K1/2+1;
1124  break;
1125  default:
1126  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1127  break;
1128  }
1129 
1130  for (int iblock=0;iblock < nblock;iblock++) {
1131 
1132  int x0 = pos[iblock];
1133  int nx = pos[iblock+1] - x0;
1134 
1135  int width_out;
1136  float2* data_out;
1137  if (dataPtrsZXY[iblock] == NULL) {
1138  // Local transpose, use internal buffer
1139  data_out = d_data + jsize*ksize*x0;
1140  width_out = ksize;
1141  } else {
1142  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1143  data_out = dataPtrsZXY[iblock];
1144  width_out = isize_out;
1145  }
1146 
1147  transpose_xyz_zxy(
1148  nx, jsize, ksize,
1149  isize, jsize,
1150  width_out, nx,
1151  data+x0, data_out, stream);
1152  }
1153 */
1154 }
1155 
1156 void CudaPmeTranspose::waitStreamSynchronize() {
1157  cudaCheck(cudaSetDevice(deviceID));
1158  cudaCheck(cudaStreamSynchronize(stream));
1159 }
1160 
1161 void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
1162  cudaCheck(cudaSetDevice(deviceID));
1163 
1164  if (iblock >= nblock)
1165  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1166 
1167  int x0 = pos[iblock];
1168  int nx = pos[iblock+1] - x0;
1169 
1170  int copySize = jsize*ksize*nx;
1171  int copyStart = jsize*ksize*x0;
1172 
1173  if (copyStart + copySize > dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1175 
1176  if (copySize > h_dataSize)
1177  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1178 
1179  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1180 }
1181 
1182 void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
1183  cudaCheck(cudaSetDevice(deviceID));
1184 
1185  if (iblock >= nblock)
1186  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1187 
1188  // Determine block size = how much we're copying
1189  int i0, i1, j0, j1, k0, k1;
1190  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1191  int ni = i1-i0+1;
1192  int nj = j1-j0+1;
1193  int nk = k1-k0+1;
1194 
1195  copy3D_HtoD<float2>(data_in, data_out,
1196  0, 0, 0,
1197  ni, nj,
1198  i0, 0, 0,
1199  isize, jsize,
1200  ni, nj, nk, stream);
1201 }
1202 
1203 #ifndef P2P_ENABLE_3D
1204 //
1205 // Copy from temporary buffer to final buffer
1206 //
1207 void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
1208  cudaCheck(cudaSetDevice(deviceID));
1209 
1210  if (iblock >= nblock)
1211  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1212 
1213  // Determine block size = how much we're copying
1214  int i0, i1, j0, j1, k0, k1;
1215  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1216  int ni = i1-i0+1;
1217  int nj = j1-j0+1;
1218  int nk = k1-k0+1;
1219 
1220  float2* data_in = d_buffer + i0*nj*nk;
1221 
1222  copy3D_DtoD<float2>(data_in, data_out,
1223  0, 0, 0,
1224  ni, nj,
1225  i0, 0, 0,
1226  isize, jsize,
1227  ni, nj, nk, stream);
1228 }
1229 
1230 //
1231 // Return temporary buffer for block "iblock"
1232 //
1233 float2* CudaPmeTranspose::getBuffer(const int iblock) {
1234  if (iblock >= nblock)
1235  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1236 
1237  // Determine block size = how much we're copying
1238  int i0, i1, j0, j1, k0, k1;
1239  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1240  int ni = i1-i0+1;
1241  int nj = j1-j0+1;
1242  int nk = k1-k0+1;
1243 
1244  return d_buffer + i0*nj*nk;
1245 }
1246 #endif
1247 
1248 void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
1249  float2* data_out) {
1250 
1251  int iblock_out = jblock;
1252  int jblock_out = kblock;
1253  int kblock_out = iblock;
1254 
1255  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1256 }
1257 
1258 void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
1259  float2* data_out) {
1260 
1261  int iblock_out = kblock;
1262  int jblock_out = iblock;
1263  int kblock_out = jblock;
1264 
1265  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1266 }
1267 
1268 void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
1269  const int iblock_out, const int jblock_out, const int kblock_out,
1270  int deviceID_out, int permutation_out, float2* data_out) {
1271 
1272  cudaCheck(cudaSetDevice(deviceID));
1273 
1274  // Determine block size = how much we're copying
1275  int i0, i1, j0, j1, k0, k1;
1276  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1277  int ni = i1-i0+1;
1278  int nj = j1-j0+1;
1279  int nk = k1-k0+1;
1280 
1281  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1282  int isize_out = i1-i0+1;
1283  int jsize_out = j1-j0+1;
1284 
1285  int x0 = pos[iblock];
1286  float2* data_in = d_data + jsize*ksize*x0;
1287 
1288 #ifndef P2P_ENABLE_3D
1289  // Copy into temporary peer device buffer
1290  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
1291 #else
1292  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
1293  0, 0, 0,
1294  ni, nj,
1295  0, 0, 0,
1296  isize_out, jsize_out,
1297  ni, nj, nk, stream);
1298 #endif
1299 
1300 }
1301 
1302 
1303 CudaPmeOneDevice::CudaPmeOneDevice(
1304  PmeGrid pmeGrid_,
1305  int deviceID_,
1306  int deviceIndex_
1307  ) :
1308  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
1309  natoms(0), d_atoms(0), d_forces(0),
1310  d_grids(0), gridsize(0),
1311  d_trans(0), transize(0),
1312  d_bm1(0), d_bm2(0), d_bm3(0),
1313  kappa(ComputeNonbondedUtil::ewaldcof),
1314  self_energy_alch_first_time(true),
1315  force_scaling_alch_first_time(true),
1316  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
1317 {
1318 // fprintf(stderr, "CudaPmeOneDevice constructor START ******************************************\n");
1319  const SimParameters& sim_params = *(Node::Object()->simParameters);
1321  // Determine how many grids we need for the alchemical route
1322  if (sim_params.alchOn) {
1323  num_used_grids = sim_params.alchGetNumOfPMEGrids();
1324  } else {
1325  num_used_grids = 1;
1326  }
1327  cudaCheck(cudaSetDevice(deviceID));
1328 
1329  // Check whether the simulation and device are compatible with the patch-level kernels. The results
1330  // will be stored in the PatchLevelPmeData field
1331  checkPatchLevelSimParamCompatibility(pmeGrid.order, true /* periodic Y */, true /* periodic Z */);
1333 
1334  if (!sim_params.CUDASOAintegrateMode) {
1335  NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
1336  }
1337  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
1338 
1339  // create our own CUDA stream
1340 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
1341  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1342  int leastPriority, greatestPriority;
1343  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
1344  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
1345 #else
1346  cudaCheck(cudaStreamCreate(&stream));
1347 #endif
1348 
1349  allocate_host<EnergyVirial>(&h_energyVirials, num_used_grids);
1350  allocate_device<EnergyVirial>(&d_energyVirials, num_used_grids);
1351  allocate_device<float>(&d_scaling_factors, num_used_grids);
1352  allocate_device<double>(&d_selfEnergy, 1);
1353  if (sim_params.alchFepOn) {
1354  allocate_device<double>(&d_selfEnergy_FEP, 1);
1355  } else {
1356  d_selfEnergy_FEP = NULL;
1357  }
1358  if (sim_params.alchThermIntOn) {
1359  allocate_device<double>(&d_selfEnergy_TI_1, 1);
1360  allocate_device<double>(&d_selfEnergy_TI_2, 1);
1361  } else {
1362  d_selfEnergy_TI_1 = NULL;
1363  d_selfEnergy_TI_2 = NULL;
1364  }
1365 
1366  // create device buffer space for atom positions and forces
1367  // to be accessed externally through PatchData
1368  allocate_device<float4>(&d_atoms, num_used_grids * natoms);
1369  allocate_device<float3>(&d_forces, num_used_grids * natoms);
1370  if (sim_params.alchOn) {
1371  allocate_device<int>(&d_partition, natoms);
1372  } else {
1373  d_partition = NULL;
1374  }
1375 #ifdef NODEGROUP_FORCE_REGISTER
1376  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
1377  devData.s_datoms = (CudaAtom *) (d_atoms);
1378  devData.f_slow = (CudaForce *) (d_forces);
1379  devData.f_slow_size = natoms;
1380  devData.s_datoms_partition = d_partition;
1381 #endif
1382  int k1 = pmeGrid.K1;
1383  int k2 = pmeGrid.K2;
1384  int k3 = pmeGrid.K3;
1385  int order = pmeGrid.order;
1386  gridsize = k1 * k2 * k3;
1387  transize = (k1/2 + 1) * k2 * k3;
1388 
1389 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1390 
1391  // set up cufft
1392  forwardPlans = new cufftHandle[num_used_grids];
1393  backwardPlans = new cufftHandle[num_used_grids];
1394  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1395  cufftCheck(cufftPlan3d(&(forwardPlans[iGrid]), k3, k2, k1, CUFFT_R2C));
1396  cufftCheck(cufftPlan3d(&(backwardPlans[iGrid]), k3, k2, k1, CUFFT_C2R));
1397  cufftCheck(cufftSetStream(forwardPlans[iGrid], stream));
1398  cufftCheck(cufftSetStream(backwardPlans[iGrid], stream));
1399  }
1400 #endif
1401 
1402 #ifdef NAMD_CUDA
1403  cudaDeviceProp deviceProp;
1404  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
1405  const int texture_alignment = int(deviceProp.textureAlignment);
1406  // d_grids and d_grids + N * gridsize will be used as device pointers for ::cudaResourceDesc::res::linear::devPtr
1407  // check if (d_grids + N * gridsize) is an address aligned to ::cudaDeviceProp::textureAlignment
1408  // which is required by cudaCreateTextureObject()
1409  // or maybe I should use cudaMallocPitch()?
1410  if ((gridsize % texture_alignment) != 0) {
1411  // if it is not aligned, padding is required
1412  gridsize = (int(gridsize / texture_alignment) + 1) * texture_alignment;
1413  }
1414  // Is it necessary to align transize too?
1415 // if ((transize % texture_alignment) != 0) {
1416 // // if it is not aligned, padding is required
1417 // transize = (int(transize / texture_alignment) + 1) * texture_alignment;
1418 // }
1419  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1420  allocate_device<float2>(&d_trans, num_used_grids * transize);
1421  gridTexObjArrays = new cudaTextureObject_t[num_used_grids];
1422  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1423  // set up texture object
1424  cudaResourceDesc resDesc;
1425  memset(&resDesc, 0, sizeof(resDesc));
1426  resDesc.resType = cudaResourceTypeLinear;
1427  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
1428  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
1429  resDesc.res.linear.desc.x = sizeof(float)*8;
1430  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
1431  cudaTextureDesc texDesc;
1432  memset(&texDesc, 0, sizeof(texDesc));
1433  texDesc.readMode = cudaReadModeElementType;
1434  cudaCheck(cudaCreateTextureObject(&(gridTexObjArrays[iGrid]), &resDesc, &texDesc, NULL));
1435  }
1436 #else
1437  allocate_device<float>(&d_grids, num_used_grids * gridsize);
1438  allocate_device<float2>(&d_trans, num_used_grids * transize);
1439 #endif
1440  // calculate prefactors
1441  double *bm1 = new double[k1];
1442  double *bm2 = new double[k2];
1443  double *bm3 = new double[k3];
1444  // Use compute_b_moduli from PmeKSpace.C
1445  extern void compute_b_moduli(double *bm, int k, int order);
1446  compute_b_moduli(bm1, k1, order);
1447  compute_b_moduli(bm2, k2, order);
1448  compute_b_moduli(bm3, k3, order);
1449 
1450  // allocate space for and copy prefactors onto GPU
1451  float *bm1f = new float[k1];
1452  float *bm2f = new float[k2];
1453  float *bm3f = new float[k3];
1454  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
1455  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
1456  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
1457  allocate_device<float>(&d_bm1, k1);
1458  allocate_device<float>(&d_bm2, k2);
1459  allocate_device<float>(&d_bm3, k3);
1460  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
1461  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
1462  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
1463  delete [] bm1f;
1464  delete [] bm2f;
1465  delete [] bm3f;
1466  delete [] bm1;
1467  delete [] bm2;
1468  delete [] bm3;
1469 
1470  cudaCheck(cudaStreamSynchronize(stream));
1471 
1472 // fprintf(stderr, "CudaPmeOneDevice constructor END ********************************************\n");
1473 }
1474 
1475 CudaPmeOneDevice::~CudaPmeOneDevice() {
1476  deallocate_device<float4>(&d_atoms);
1477  deallocate_device<float3>(&d_forces);
1478  deallocate_device<float2>(&d_trans);
1479  deallocate_device<float>(&d_grids);
1480  deallocate_host<EnergyVirial>(&h_energyVirials);
1481  deallocate_device<EnergyVirial>(&d_energyVirials);
1482  deallocate_device<float>(&d_scaling_factors);
1483 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1484  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1485  cufftCheck(cufftDestroy(forwardPlans[iGrid]));
1486  cufftCheck(cufftDestroy(backwardPlans[iGrid]));
1487 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1488  cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
1489 #endif
1490  }
1491 
1492  if (patchLevelPmeData.h_patchGridOffsets != nullptr) {
1493  deallocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets);
1494  }
1495  if (patchLevelPmeData.d_patchGridOffsets != nullptr) {
1496  deallocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets);
1497  }
1498 
1499  delete[] forwardPlans;
1500  delete[] backwardPlans;
1501 #if defined(NAMD_CUDA) // only CUDA uses texture objects
1502  delete[] gridTexObjArrays;
1503 #endif
1504 
1505 
1506 #endif
1507  deallocate_device<double>(&d_selfEnergy);
1508  if (d_partition != NULL) deallocate_device<int>(&d_partition);
1509  if (d_selfEnergy_FEP != NULL) deallocate_device<double>(&d_selfEnergy_FEP);
1510  if (d_selfEnergy_TI_1 != NULL) deallocate_device<double>(&d_selfEnergy_TI_1);
1511  if (d_selfEnergy_TI_2 != NULL) deallocate_device<double>(&d_selfEnergy_TI_2);
1512  deallocate_device<float>(&d_bm1);
1513  deallocate_device<float>(&d_bm2);
1514  deallocate_device<float>(&d_bm3);
1515  cudaCheck(cudaStreamDestroy(stream));
1516 
1517  if (reductionGpuResident) {
1518  delete reductionGpuResident;
1519  }
1520 }
1521 
1522 void CudaPmeOneDevice::compute(
1523  const Lattice &lattice,
1524 // const CudaAtom *d_atoms,
1525 // CudaForce *d_force,
1526 // int natoms,
1527  int doEnergyVirial,
1528  int step
1529 #if 0
1530  double d_energy[1],
1531  double d_virial[6]
1532 #endif
1533  ) {
1534 // fprintf(stderr, "CudaPmeOneDevice compute ****************************************************\n");
1535  int k1 = pmeGrid.K1;
1536  int k2 = pmeGrid.K2;
1537  int k3 = pmeGrid.K3;
1538  int order = pmeGrid.order;
1539  double volume = lattice.volume();
1540  Vector a_r = lattice.a_r();
1541  Vector b_r = lattice.b_r();
1542  Vector c_r = lattice.c_r();
1543  float arx = a_r.x;
1544  float ary = a_r.y;
1545  float arz = a_r.z;
1546  float brx = b_r.x;
1547  float bry = b_r.y;
1548  float brz = b_r.z;
1549  float crx = c_r.x;
1550  float cry = c_r.y;
1551  float crz = c_r.z;
1552  m_step = step;
1553 
1554  //JM: actually necessary if you reserve a PME device!
1555  cudaCheck(cudaSetDevice(deviceID));
1556  const SimParameters& sim_params = *(Node::Object()->simParameters);
1557 
1558  // clear force array
1559  //fprintf(stderr, "Calling clear_device_array on d_force\n");
1560  clear_device_array<float3>(d_forces, num_used_grids * natoms, stream);
1561  // clear grid
1562  //fprintf(stderr, "Calling clear_device_array on d_grid\n");
1563  clear_device_array<float>(d_grids, num_used_grids * gridsize, stream);
1564  clear_device_array<float2>(d_trans, num_used_grids * transize, stream);
1565 
1566  // Clear energy and virial array if needed
1567  if (doEnergyVirial) {
1568  // clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
1569  clear_device_array<EnergyVirial>(d_energyVirials, num_used_grids * 1, stream);
1570  const bool updateSelfEnergy = (step == sim_params.firstTimestep) || (selfEnergy == 0);
1571  if (updateSelfEnergy && (sim_params.alchOn == false)) {
1572  clear_device_array<double>(d_selfEnergy, 1, stream);
1573  // calculate self energy term if not yet done
1575  kappa, stream);
1576  //fprintf(stderr, "selfEnergy = %12.8f\n", selfEnergy);
1577  }
1578  /* The self energy depends on the scaling factor, lambda.
1579  * Cases where the self energy changes:
1580  * 1. If alchLambdaFreq > 0, lambda is scaled linearly and changes EVERY STEP.
1581  * 2. Otherwise, simulations may still move to another lambda window through TCL scripts.
1582  * In summary, the self energy does not change unless lambda changes,
1583  * so calcSelfEnergyAlch() compares the current step's lambda with that of the last step
1584  * and re-runs calcSelfEnergyFEPKernel or calcSelfEnergyTIKernel only if lambda changed.
1585  */
1586  if (sim_params.alchOn) calcSelfEnergyAlch(m_step);
1587  }
1588 
1589 #if 0
1590 
1591  spread_charge(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1592  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1593  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1594  d_grid, order, stream);
1595 #else
1596  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1597  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1599  d_atoms + iGrid * natoms, natoms, k1, k2, k3,
1600  float(k1), (float)k2, (float)k3, order3,
1601  k1, k2, k3,
1602  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1603  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1604  d_grids + iGrid * gridsize, order, stream);
1605  }
1606 
1607 #endif
1608  //cudaCheck(cudaStreamSynchronize(stream));
1609 
1610  // forward FFT
1611  //fprintf(stderr, "Calling cufftExecR2C\n");
1612  //cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)d_grid,
1613  // (cufftComplex *)d_tran));
1614 
1615  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1616  cufftCheck(cufftExecR2C(forwardPlans[iGrid],
1617  (cufftReal *)(d_grids + iGrid * gridsize),
1618  (cufftComplex *)(d_trans + iGrid * transize)));
1619  }
1620 
1621  //cudaCheck(cudaStreamSynchronize(stream));
1622 
1623  // reciprocal space calculation
1624  //fprintf(stderr, "Calling scalar_sum\n");
1625  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1626  scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1627  kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1628  d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1629  (bool) doEnergyVirial, &(d_energyVirials[iGrid].energy),
1630  d_energyVirials[iGrid].virial, d_trans + iGrid * transize, stream);
1631  }
1632  //scalar_sum(true /* Perm_cX_Y_Z */, k1, k2, k3, (k1/2 + 1), k2, k3,
1633  // kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
1634  // d_bm1, d_bm2, d_bm3, 0 /* jBlock */, 0 /* kBlock */,
1635  // (bool) doEnergyVirial, &(d_energyVirial->energy),
1636  // d_energyVirial->virial, d_tran, stream);
1637  //cudaCheck(cudaStreamSynchronize(stream));
1638 
1639  // backward FFT
1640  //fprintf(stderr, "Calling cufftExecC2R\n");
1641  for (size_t iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1642  cufftCheck(cufftExecC2R(backwardPlans[iGrid],
1643  (cufftComplex *)(d_trans + iGrid * transize),
1644  (cufftReal *)(d_grids + iGrid * gridsize)));
1645  }
1646 
1647  //cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)d_tran,
1648  // (cufftReal *)d_grid));
1649  //cudaCheck(cudaStreamSynchronize(stream));
1650 
1651  // gather force from grid to atoms
1652  // omitting the cudaTextureObject_t argument below works for __CUDA_ARCH__ >= 350
1653  //fprintf(stderr, "Calling gather_force\n");
1654  for (unsigned int iGrid = 0; iGrid < num_used_grids; ++iGrid) {
1655  gather_force(patchLevelPmeData,
1656  &(d_atoms[iGrid * natoms]), natoms, k1, k2, k3, k1, k2, k3,
1657  k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1658  true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1659  d_grids + iGrid * gridsize, order, d_forces + iGrid * natoms,
1660 #ifdef NAMD_CUDA
1661  gridTexObjArrays[iGrid] /* cudaTextureObject_t */,
1662 #endif
1663  stream);
1664  }
1665 
1666  //gather_force(d_atoms, natoms, k1, k2, k3, k1, k2, k3,
1667  // k1 /* xsize */, 0 /* jBlock */, 0 /* kBlock */,
1668  // true /* pmeGrid.yBlocks == 1 */, true /* pmeGrid.zBlocks == 1 */,
1669  // d_grid, order, d_force, gridTexObj /* cudaTextureObject_t */,
1670  // stream);
1671  //cudaCheck(cudaStreamSynchronize(stream));
1672 
1673  // Copy energy and virial to host if needed
1674  if (doEnergyVirial) {
1675  //fprintf(stderr, "Calling copy_DtoH on d_energyVirial\n");
1676  copy_DtoH<EnergyVirial>(d_energyVirials, h_energyVirials,
1677  num_used_grids, stream);
1678  //cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
1679  //cudaCheck(cudaStreamSynchronize(stream));
1680  }
1681 
1682  // XXX debugging, quick test for borked forces
1683  //clear_device_array<float3>(d_force, natoms, stream);
1684  if (sim_params.alchOn) {
1685  scaleAndMergeForce(m_step);
1686  }
1687 }
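The loop structure above runs the whole PME pipeline once per grid, with each grid stored at a fixed offset into d_grids / d_trans and owning its own cuFFT plan pair. The standalone sketch below reproduces just that batching pattern; the grid dimensions, grid count, and names (K1, fwd, bwd, numGrids) are made-up illustrations, not NAMD code, and error checking is omitted for brevity.

  #include <cstdio>
  #include <vector>
  #include <cuda_runtime.h>
  #include <cufft.h>

  int main() {
    const int K1 = 32, K2 = 32, K3 = 32;        // assumed grid dimensions
    const int numGrids = 2;                     // e.g. two alchemical grids
    const size_t gridsize = (size_t)K1 * K2 * K3;
    const size_t transize = (size_t)(K1 / 2 + 1) * K2 * K3;   // R2C output size

    float*  d_grids = nullptr;
    float2* d_trans = nullptr;
    cudaMalloc((void**)&d_grids, numGrids * gridsize * sizeof(float));
    cudaMalloc((void**)&d_trans, numGrids * transize * sizeof(float2));
    cudaMemset(d_grids, 0, numGrids * gridsize * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // One forward and one backward plan per grid, all bound to the same stream.
    std::vector<cufftHandle> fwd(numGrids), bwd(numGrids);
    for (int i = 0; i < numGrids; i++) {
      cufftPlan3d(&fwd[i], K3, K2, K1, CUFFT_R2C);   // slowest-to-fastest dimension order
      cufftPlan3d(&bwd[i], K3, K2, K1, CUFFT_C2R);
      cufftSetStream(fwd[i], stream);
      cufftSetStream(bwd[i], stream);
    }

    for (int i = 0; i < numGrids; i++) {
      // (real-space charge spreading would fill d_grids + i*gridsize here)
      cufftExecR2C(fwd[i], d_grids + i * gridsize,
                   (cufftComplex*)(d_trans + i * transize));
      // (the reciprocal-space convolution would scale d_trans + i*transize here)
      cufftExecC2R(bwd[i], (cufftComplex*)(d_trans + i * transize),
                   d_grids + i * gridsize);
    }
    cudaStreamSynchronize(stream);
    printf("ran %d R2C/C2R round trips\n", numGrids);

    for (int i = 0; i < numGrids; i++) { cufftDestroy(fwd[i]); cufftDestroy(bwd[i]); }
    cudaFree(d_grids); cudaFree(d_trans);
    cudaStreamDestroy(stream);
    return 0;
  }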
1688 
1689 
1690 // Call this after the device-to-host memory transfer has completed
1691 void CudaPmeOneDevice::finishReduction(
1692  bool doEnergyVirial
1693  ) {
1694  cudaCheck(cudaStreamSynchronize(stream));
1695  SubmitReduction *reduction = getCurrentReduction();
1696  if(doEnergyVirial){
1697  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1698  PatchData* patchData = cpdata.ckLocalBranch();
1699  cudaCheck(cudaSetDevice(deviceID));
1700  double virial[9];
1701  double energy, energy_F, energy_TI_1, energy_TI_2;
1702  const SimParameters& sim_params = *(Node::Object()->simParameters);
1703  if (sim_params.alchOn) {
1704  if (sim_params.alchFepOn) {
1705  scaleAndComputeFEPEnergyVirials(h_energyVirials, m_step, energy, energy_F, virial);
1706  energy += selfEnergy;
1707  energy_F += selfEnergy_FEP;
1708  }
1709  if (sim_params.alchThermIntOn) {
1710  scaleAndComputeTIEnergyVirials(h_energyVirials, m_step, energy, energy_TI_1, energy_TI_2, virial);
1711  energy += selfEnergy;
1712  energy_TI_1 += selfEnergy_TI_1;
1713  energy_TI_2 += selfEnergy_TI_2;
1714  }
1715  } else {
1716  virial[0] = h_energyVirials[0].virial[0];
1717  virial[1] = h_energyVirials[0].virial[1];
1718  virial[2] = h_energyVirials[0].virial[2];
1719  virial[3] = h_energyVirials[0].virial[1];
1720  virial[4] = h_energyVirials[0].virial[3];
1721  virial[5] = h_energyVirials[0].virial[4];
1722  virial[6] = h_energyVirials[0].virial[2];
1723  virial[7] = h_energyVirials[0].virial[4];
1724  virial[8] = h_energyVirials[0].virial[5];
1725  energy = h_energyVirials[0].energy + selfEnergy;
1726  }
1727  #if 0
1728  fprintf(stderr, "PME ENERGY = %g %g\n", h_energyVirials[0].energy, selfEnergy );
1729  fprintf(stderr, "PME VIRIAL =\n"
1730  " %g %g %g\n %g %g %g\n %g %g %g\n",
1731  virial[0], virial[1], virial[2], virial[3], virial[4],
1732  virial[5], virial[6], virial[7], virial[8]);
1733  #endif
1734  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
1735  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
1736  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
1737  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
1738  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
1739  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
1740  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
1741  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
1742  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
1743  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energy;
1744  if (sim_params.alchFepOn) {
1745  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += energy_F;
1746  }
1747  if (sim_params.alchThermIntOn) {
1748  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += energy_TI_1;
1749  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += energy_TI_2;
1750  }
1751  }
1752  reduction->submit();
1753 }
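The nine virial assignments above expand the six packed components produced by the kernels (the upper triangle, in the order xx, xy, xz, yy, yz, zz as implied by the indexing) into a full symmetric 3x3 tensor. A small self-contained sketch of that mapping, with illustrative names:

  #include <cstdio>

  static void expandVirial(const double packed[6], double full[9]) {
    full[0] = packed[0];  // xx
    full[1] = packed[1];  // xy
    full[2] = packed[2];  // xz
    full[3] = packed[1];  // yx = xy
    full[4] = packed[3];  // yy
    full[5] = packed[4];  // yz
    full[6] = packed[2];  // zx = xz
    full[7] = packed[4];  // zy = yz
    full[8] = packed[5];  // zz
  }

  int main() {
    const double packed[6] = {1, 2, 3, 4, 5, 6};
    double full[9];
    expandVirial(packed, full);
    for (int r = 0; r < 3; r++)
      printf("%g %g %g\n", full[3*r], full[3*r+1], full[3*r+2]);
    return 0;
  }

Only six numbers need to be stored because the PME virial tensor is symmetric.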
1754 
1755 void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
1756  SimParameters& sim_params = *(Node::Object()->simParameters);
1757  if (sim_params.alchFepOn) {
1758  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1759  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1760  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1761  static BigReal lambda2Up = sim_params.getElecLambda(alchLambda2);
1762  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1763  static BigReal lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1764  // compute self energy at the first call
1765  // only compute self energy if factors are changed
1766  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1767  (lambda2Up != sim_params.getElecLambda(alchLambda2)) ||
1768  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1769  (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
1770  (step == sim_params.firstTimestep)) {
1771  lambda1Up = sim_params.getElecLambda(alchLambda1);
1772  lambda2Up = sim_params.getElecLambda(alchLambda2);
1773  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1774  lambda2Down = sim_params.getElecLambda(1.0 - alchLambda2);
1775  selfEnergy = 0.0; // self energy for λ_1
1776  selfEnergy_FEP = 0.0; // self energy for λ_2
1777  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1778  cudaCheck(cudaMemsetAsync(d_selfEnergy_FEP, 0, sizeof(double), stream));
1779  calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP, d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);
1781  }
1782  }
1783  if (sim_params.alchThermIntOn) {
1784  const BigReal alchLambda1 = sim_params.getCurrentLambda(step);
1785  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1786  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1787  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1788  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1789  (step == sim_params.firstTimestep)) {
1790  lambda1Up = sim_params.getElecLambda(alchLambda1);
1791  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1792  selfEnergy = 0.0;
1793  selfEnergy_TI_1 = 0.0;
1794  selfEnergy_TI_2 = 0.0;
1795  cudaCheck(cudaMemsetAsync(d_selfEnergy, 0, sizeof(double), stream));
1796  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_1, 0, sizeof(double), stream));
1797  cudaCheck(cudaMemsetAsync(d_selfEnergy_TI_2, 0, sizeof(double), stream));
1798  calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
1799  d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple, lambda1Up, lambda1Down, stream);
1800  }
1801  }
1802 }
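As the comment block inside compute() explains, the self energies only need to be recomputed when the electrostatic lambda factors change (or on the first call). A minimal sketch of that caching pattern, with stand-in functions (elecScale, expensiveSelfEnergy) and names in place of getElecLambda() and the CUDA wrappers:

  #include <cstdio>

  static double elecScale(double lambda) { return lambda; }      // stand-in for getElecLambda()
  static double expensiveSelfEnergy(double scaleUp, double scaleDown) {
    return 10.0 * scaleUp + 20.0 * scaleDown;                    // stand-in for the kernel launch
  }

  static double cachedSelfEnergy(double lambda, bool firstCall) {
    static double lastUp   = elecScale(lambda);
    static double lastDown = elecScale(1.0 - lambda);
    static double cached = 0.0;
    const double up = elecScale(lambda), down = elecScale(1.0 - lambda);
    if (firstCall || up != lastUp || down != lastDown) {
      lastUp = up; lastDown = down;
      cached = expensiveSelfEnergy(up, down);                    // recompute only here
    }
    return cached;
  }

  int main() {
    printf("%g\n", cachedSelfEnergy(0.25, true));   // first call: compute
    printf("%g\n", cachedSelfEnergy(0.25, false));  // unchanged lambda: cached value
    printf("%g\n", cachedSelfEnergy(0.50, false));  // lambda changed: recompute
    return 0;
  }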
1803 
1804 void CudaPmeOneDevice::scaleAndMergeForce(int step) {
1805  SimParameters& sim_params = *(Node::Object()->simParameters);
1806  const double alchLambda1 = sim_params.getCurrentLambda(step);
1807  static BigReal lambda1Up = sim_params.getElecLambda(alchLambda1);
1808  static BigReal lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1809  if ((lambda1Up != sim_params.getElecLambda(alchLambda1)) ||
1810  (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
1811  (step == sim_params.firstTimestep)) {
1812  std::vector<float> scale_factors(num_used_grids);
1813  lambda1Up = sim_params.getElecLambda(alchLambda1);
1814  lambda1Down = sim_params.getElecLambda(1.0 - alchLambda1);
1815  scale_factors[0] = lambda1Up;
1816  scale_factors[1] = lambda1Down;
1817  if (sim_params.alchDecouple) {
1818  scale_factors[2] = 1.0 - lambda1Up;
1819  scale_factors[3] = 1.0 - lambda1Down;
1820  }
1821  if (bool(sim_params.alchElecLambdaStart) || sim_params.alchThermIntOn) {
1822  scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
1823  }
1824  copy_HtoD<float>(scale_factors.data(), d_scaling_factors, num_used_grids);
1826  }
1827  scaleAndMergeForceWrapper(d_forces, d_scaling_factors, num_used_grids, natoms, stream);
1828 }
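A sketch of the scale-factor layout built above, assuming the usual grid assignment (grid 0 = "up"/appearing atoms, grid 1 = "down"/disappearing atoms, grids 2/3 = decoupled copies when alchDecouple is on, last grid = the -(lambdaUp + lambdaDown - 1) correction term); buildScaleFactors and its arguments are illustrative names, not NAMD's:

  #include <cstdio>
  #include <vector>

  static std::vector<float> buildScaleFactors(double lambdaUp, double lambdaDown,
                                              bool alchDecouple, bool hasCorrectionGrid) {
    const size_t numGrids = 2 + (alchDecouple ? 2 : 0) + (hasCorrectionGrid ? 1 : 0);
    std::vector<float> s(numGrids);
    s[0] = (float)lambdaUp;
    s[1] = (float)lambdaDown;
    if (alchDecouple) {
      s[2] = (float)(1.0 - lambdaUp);
      s[3] = (float)(1.0 - lambdaDown);
    }
    if (hasCorrectionGrid)
      s[numGrids - 1] = (float)(-(lambdaUp + lambdaDown - 1.0));  // correction grid weight
    return s;
  }

  int main() {
    for (float f : buildScaleFactors(0.75, 0.25, true, true)) printf("%g ", f);
    printf("\n");
    return 0;
  }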
1829 
1830 void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_F, double (&virial)[9]) {
1831  double scale1 = 1.0;
1832  double scale2 = 1.0;
1833  energy = 0;
1834  energy_F = 0;
1835  for (unsigned int i = 0; i < 9; ++i) {
1836  virial[i] = 0;
1837  }
1838  SimParameters& sim_params = *(Node::Object()->simParameters);
1839  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1840  const BigReal alchLambda2 = sim_params.getCurrentLambda2(step);
1841  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1842  const BigReal elecLambda2Up = sim_params.getElecLambda(alchLambda2);
1843  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1844  const BigReal elecLambda2Down = sim_params.getElecLambda(1 - alchLambda2);
1845  energy += energyVirials[0].energy * elecLambdaUp;
1846  energy_F += energyVirials[0].energy * elecLambda2Up;
1847  energy += energyVirials[1].energy * elecLambdaDown;
1848  energy_F += energyVirials[1].energy * elecLambda2Down;
1849  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1850  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1851  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1852  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1853  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1854  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1855  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1856  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1857  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1858  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1859  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1860  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1861  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1862  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1863  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1864  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1865  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1866  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1867  if (sim_params.alchDecouple) {
1868  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1869  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
1870  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1871  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
1872  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1873  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1874  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1875  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1876  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1877  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1878  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1879  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1880  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1881  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1882  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1883  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1884  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1885  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1886  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1887  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1888  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1889  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1890  if (sim_params.alchElecLambdaStart > 0) {
1891  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1892  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1893  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1894  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1895  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1896  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1897  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1898  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1899  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1900  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1901  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1902  }
1903  } else {
1904  if (sim_params.alchElecLambdaStart > 0) {
1905  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1906  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
1907  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1908  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1909  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1910  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1911  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1912  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1913  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1914  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1915  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1916  }
1917  }
1918 }
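The unrolled block above is, term by term, a weighted sum over grids: the total energy is sum_g w_g * E_g and each virial component is accumulated the same way, with w_g the per-grid lambda factor. A compact, self-contained sketch of that accumulation using the same packed 6-component virial layout (types and names here are illustrative):

  #include <cstdio>

  struct GridEnergyVirial { double energy; double virial[6]; };

  static void accumulate(const GridEnergyVirial* grids, const double* weights, int numGrids,
                         double& energy, double (&virial)[9]) {
    static const int map[9] = {0, 1, 2, 1, 3, 4, 2, 4, 5};  // packed -> symmetric 3x3
    energy = 0.0;
    for (int i = 0; i < 9; i++) virial[i] = 0.0;
    for (int g = 0; g < numGrids; g++) {
      energy += weights[g] * grids[g].energy;
      for (int i = 0; i < 9; i++) virial[i] += weights[g] * grids[g].virial[map[i]];
    }
  }

  int main() {
    GridEnergyVirial grids[2] = {{1.0, {1,0,0,1,0,1}}, {2.0, {2,0,0,2,0,2}}};
    const double weights[2] = {0.75, 0.25};   // e.g. elecLambdaUp, elecLambdaDown
    double e, v[9];
    accumulate(grids, weights, 2, e, v);
    printf("E = %g, Vxx = %g\n", e, v[0]);
    return 0;
  }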
1919 
1920 void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
1921  double scale1 = 1.0;
1922  energy = 0;
1923  energy_TI_1 = 0;
1924  energy_TI_2 = 0;
1925  for (unsigned int i = 0; i < 9; ++i) {
1926  virial[i] = 0;
1927  }
1928  SimParameters& sim_params = *(Node::Object()->simParameters);
1929  const BigReal alchLambda = sim_params.getCurrentLambda(step);
1930  const BigReal elecLambdaUp = sim_params.getElecLambda(alchLambda);
1931  const BigReal elecLambdaDown = sim_params.getElecLambda(1 - alchLambda);
1932  energy += energyVirials[0].energy * elecLambdaUp;
1933  energy += energyVirials[1].energy * elecLambdaDown;
1934  energy_TI_1 += energyVirials[0].energy;
1935  energy_TI_2 += energyVirials[1].energy;
1936  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
1937  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
1938  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
1939  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
1940  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
1941  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
1942  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
1943  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
1944  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
1945  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
1946  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
1947  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
1948  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
1949  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
1950  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
1951  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
1952  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
1953  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
1954  if (sim_params.alchDecouple) {
1955  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
1956  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
1957  energy_TI_1 += -1.0 * energyVirials[2].energy;
1958  energy_TI_2 += -1.0 * energyVirials[3].energy;
1959  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
1960  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1961  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1962  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
1963  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
1964  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1965  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
1966  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
1967  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
1968  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
1969  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1970  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1971  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
1972  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
1973  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1974  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
1975  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
1976  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
1977  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1978  energy_TI_1 += -1.0 * energyVirials[4].energy;
1979  energy_TI_2 += -1.0 * energyVirials[4].energy;
1980  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1981  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1982  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1983  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1984  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1985  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1986  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1987  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1988  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1989  } else {
1990  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1991  energy_TI_1 += -1.0 * energyVirials[2].energy;
1992  energy_TI_2 += -1.0 * energyVirials[2].energy;
1993  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1994  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1995  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1996  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1997  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1998  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
1999  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2000  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2001  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
2002  }
2003 }
2004 
2005 
2006 
2007 int CudaPmeOneDevice::getShiftedGrid(const double x, const int grid) {
2008  double w = x + 0.5;
2009  double gw = w * grid;
2010  return floor(gw);
2011 }
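getShiftedGrid() maps a scaled (fractional) coordinate near [-0.5, 0.5) to a grid index: shift by 0.5, multiply by the grid size, and take the floor. A quick worked example with assumed values:

  #include <cstdio>
  #include <cmath>

  static int shiftedGrid(double x, int grid) { return (int)floor((x + 0.5) * grid); }

  int main() {
    printf("%d\n", shiftedGrid(-0.5,  64));  // -> 0   (lower box edge)
    printf("%d\n", shiftedGrid( 0.0,  64));  // -> 32  (box center)
    printf("%d\n", shiftedGrid( 0.49, 64));  // -> 63  (just below the upper edge)
    return 0;
  }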
2012 
2013 int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
2014  const int numThreads, const int3 patchGridDim, const int order) {
2015 
2016  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2017  const int thetaBytes = PatchLevelPmeData::kDim * (numThreads + PatchLevelPmeData::kThetaPad) *
2018  order * sizeof(float);
2019  const int indexBytes = numThreads * sizeof(char4);
2020 
2021  return gridBytes + thetaBytes + indexBytes;
2022 }
2023 
2024 int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
2025  const int numThreads, const int3 patchGridDim, const int order) {
2026 
2027  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
2028  const int thetaBytes = (numThreads + PatchLevelPmeData::kThetaPad) * order *
2029  2 /* theta and dtheta */ * sizeof(float);
2030 
2031  return gridBytes + thetaBytes;
2032 }
2033 
2034 void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ) {
2035  bool use = true;
2036  use = use && (order == 8);
2037  use = use && (periodicY);
2038  use = use && (periodicZ);
2039 
2040  use = use && (deviceCUDA->getNumDevice() == 1); // This is only supported for single GPU currently
2041 
2043 }
2044 
2045 void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility() {
2046  cudaDeviceGetAttribute(&patchLevelPmeData.deviceMaxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceID);
2047 
2048  const int3 constexprPatchGridDim = make_int3(
2052 
2055  constexprPatchGridDim, 8 /* order */);
2058  constexprPatchGridDim, 8 /* order */);
2059 
2063 }
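A minimal sketch of the device query used above: read the opt-in per-block shared-memory limit with cudaDeviceGetAttribute and compare it against a kernel's requirement. The 48 KB requirement below is a placeholder:

  #include <cstdio>
  #include <cuda_runtime.h>

  int main() {
    int device = 0, maxSharedOptin = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&maxSharedOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
    const int requiredBytes = 48 * 1024;   // placeholder requirement
    printf("opt-in shared memory per block: %d bytes -> %s\n", maxSharedOptin,
           maxSharedOptin >= requiredBytes ? "ok" : "too small");
    return 0;
  }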
2064 
2065 void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice,
2066  const int numPatches, const CudaLocalRecord* localRecords,
2067  double3* patchMin, double3* patchMax, double3* awayDists) {
2068 
2069  patchLevelPmeData.localRecords = localRecords;
2070 
2071  // If the simulation isn't compatible or the device isn't compatible then no point in checking
2072  // patch sizes
2074 
2075  patchLevelPmeData.numPatches = numPatches;
2076 
2077  if (patchLevelPmeData.h_patchGridOffsets == nullptr) {
2078  allocate_host<int3>(&patchLevelPmeData.h_patchGridOffsets, numPatches);
2079  }
2080  if (patchLevelPmeData.d_patchGridOffsets == nullptr) {
2081  allocate_device<int3>(&patchLevelPmeData.d_patchGridOffsets, numPatches);
2082  }
2083 
2084  SimParameters *simParams = Node::Object()->simParameters;
2085  const int order = pmeGrid.order;
2086 
2087  // We only need to recompute the grid offsets if the lattice has changed
2088  if (!lattice.isEqual(currentLattice)) {
2089  currentLattice = lattice;
2090 
2091  double sysdima = currentLattice.a_r().unit() * currentLattice.a();
2092  double sysdimb = currentLattice.b_r().unit() * currentLattice.b();
2093  double sysdimc = currentLattice.c_r().unit() * currentLattice.c();
2094 
2095  patchLevelPmeData.patchGridDim = make_int3(0,0,0);
2096 
2097  for (int i = 0; i < numPatches; i++) {
2098  double3 pmin = currentLattice.unscale(patchMin[i]);
2099  double3 pmax = currentLattice.unscale(patchMax[i]);
2100  double3 width = pmax - pmin;
2101 
2102  // Logic copied from margin violation check
2103  double3 marginVal;
2104  marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
2105  marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
2106  marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
2107  marginVal = currentLattice.unscale(marginVal);
2108 
2109  double3 minAtom = pmin - marginVal;
2110  double3 maxAtom = pmax + marginVal;
2111 
2112  double3 minScaled = currentLattice.scale(minAtom);
2113  double3 maxScaled = currentLattice.scale(maxAtom);
2114 
2115  int3 gridMin;
2116  gridMin.x = getShiftedGrid(minScaled.x, pmeGrid.K1);
2117  gridMin.y = getShiftedGrid(minScaled.y, pmeGrid.K2);
2118  gridMin.z = getShiftedGrid(minScaled.z, pmeGrid.K3);
2119 
2120  int3 gridMax;
2121  gridMax.x = getShiftedGrid(maxScaled.x, pmeGrid.K1);
2122  gridMax.y = getShiftedGrid(maxScaled.y, pmeGrid.K2);
2123  gridMax.z = getShiftedGrid(maxScaled.z, pmeGrid.K3);
2124 
2125  int3 gridWidth;
2126  gridWidth.x = gridMax.x - gridMin.x + order;
2127  gridWidth.y = gridMax.y - gridMin.y + order;
2128  gridWidth.z = gridMax.z - gridMin.z + order;
2129 
2131  patchLevelPmeData.patchGridDim.x = std::max(patchLevelPmeData.patchGridDim.x, gridWidth.x);
2132  patchLevelPmeData.patchGridDim.y = std::max(patchLevelPmeData.patchGridDim.y, gridWidth.y);
2133  patchLevelPmeData.patchGridDim.z = std::max(patchLevelPmeData.patchGridDim.z, gridWidth.z);
2134  }
2135  copy_HtoD<int3>(patchLevelPmeData.h_patchGridOffsets, patchLevelPmeData.d_patchGridOffsets,
2136  numPatches, nullptr);
2137  cudaStreamSynchronize(nullptr);
2138  const int maxGridPoints = patchLevelPmeData.patchGridDim.x *
2139  patchLevelPmeData.patchGridDim.y * patchLevelPmeData.patchGridDim.z;
2140 
2145  }
2146 }
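A condensed sketch of the per-patch extent computation in the loop above: the patch's min/max scaled coordinates (already padded by the margin) are mapped to grid indices with the same shift-and-floor rule as getShiftedGrid(), and the spread stencil adds order points per dimension. The numbers below are illustrative:

  #include <cstdio>
  #include <cmath>

  int main() {
    const int K = 64, order = 8;                        // assumed grid size and PME order
    const double minScaled = -0.10, maxScaled = 0.05;   // assumed padded patch extent (scaled coords)
    const int gridMin = (int)floor((minScaled + 0.5) * K);
    const int gridMax = (int)floor((maxScaled + 0.5) * K);
    const int gridWidth = gridMax - gridMin + order;    // grid points touched by the spread stencil
    printf("gridMin=%d gridMax=%d gridWidth=%d\n", gridMin, gridMax, gridWidth);
    return 0;
  }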
2147 
2148 #endif // NAMD_CUDA
2149 
static Node * Object()
Definition: Node.h:86
void spread_charge_v2(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const float nfftx_f, const float nffty_f, const float nfftz_f, const int order3, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
void finishReduction(bool doEnergyVirial)
int zBlocks
Definition: PmeBase.h:25
bool isEqual(const Lattice &other) const
Definition: Lattice.h:298
CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_)
#define NAMD_EVENT_STOP(eon, id)
cufftHandle * backwardPlans
void compute_b_moduli(double *bm, int K, int order)
Definition: PmeKSpace.C:42
NAMD_HOST_DEVICE Vector c() const
Definition: Lattice.h:270
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
const int permutation
void gatherForceDone(unsigned int iGrid)
static constexpr int kNumThreads
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
BigReal alchElecLambdaStart
Definition: Vector.h:72
int K2
Definition: PmeBase.h:21
virtual void submit(void)=0
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
Bool CUDASOAintegrateMode
EnergyVirial * d_energyVirials
NAMD_HOST_DEVICE Position unscale(ScaledPosition s) const
Definition: Lattice.h:77
BigReal & item(int i)
Definition: ReductionMgr.h:336
BigReal z
Definition: Vector.h:74
int getNumDevice()
Definition: DeviceCUDA.h:125
#define cufftCheck(stmt)
void checkPatchLevelLatticeCompatibilityAndComputeOffsets(const Lattice &lattice, const int numPatches, const CudaLocalRecord *localRecords, double3 *patchMin, double3 *patchMax, double3 *awayDists)
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:368
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:32
void energyAndVirialDone(unsigned int iGrid)
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:290
void checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ)
void spreadCharge(Lattice &lattice)
BigReal getElecLambda(const BigReal) const
std::vector< int > pos
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
#define WARPSIZE
Definition: CudaUtils.h:17
PatchLevelPmeData patchLevelPmeData
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
PmeGrid pmeGrid
bool dataDstAllocated
void calcSelfEnergyTIWrapper(double *d_selfEnergy, double *d_selfEnergy_TI_1, double *d_selfEnergy_TI_2, double &h_selfEnergy, double &h_selfEnergy_TI_1, double &h_selfEnergy_TI_2, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream)
CudaFFTCompute(int deviceID, cudaStream_t stream)
int yBlocks
Definition: PmeBase.h:25
int computeSharedMemoryPatchLevelSpreadCharge(const int numThreads, const int3 patchGridDim, const int order)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
#define order
Definition: PmeRealSpace.C:235
#define NAMD_EVENT_START(eon, id)
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:196
const int jblock
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
NAMD_HOST_DEVICE ScaledPosition scale(Position p) const
Definition: Lattice.h:83
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
BigReal x
Definition: Vector.h:74
void gather_force(const PatchLevelPmeData patchLevelPmeData, const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, const float *data, const int order, float3 *force, const cudaTextureObject_t gridTexObj, cudaStream_t stream)
NAMD_HOST_DEVICE BigReal volume(void) const
Definition: Lattice.h:293
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void calcSelfEnergyFEPWrapper(double *d_selfEnergy, double *d_selfEnergy_FEP, double &h_selfEnergy, double &h_selfEnergyFEP, const float4 *d_atoms, const int *d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
int numAtoms
Definition: Molecule.h:586
void getVirial(double *virial)
cudaTextureObject_t * gridTexObjArrays
static constexpr int kPatchGridDimPad
NAMD_HOST_DEVICE Vector c_r() const
Definition: Lattice.h:286
const int kblock
void energyAndVirialDone(unsigned int iGrid)
Definition: CudaPmeSolver.C:61
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
void cudaNAMD_bug(const char *msg)
Definition: CudaUtils.C:53
bool dataSrcAllocated
double compute_selfEnergy(double *d_selfEnergy, const float4 *d_atoms, int natoms, double ewaldcof, cudaStream_t stream)
#define simParams
Definition: Output.C:131
int K3
Definition: PmeBase.h:21
const int permutation
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
void spread_charge(const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
BigReal getCurrentLambda2(const int) const
BigReal getCurrentLambda(const int) const
void gatherForce(Lattice &lattice, CudaForce *force)
BigReal y
Definition: Vector.h:74
unsigned int multipleGridIndex
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void scaleAndMergeForceWrapper(float3 *forces, const float *factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream)
void setDataPtrsYZX(std::vector< float2 *> &dataPtrsNew, float2 *data)
const CudaLocalRecord * localRecords
static constexpr int kDim
void transposeXYZtoYZX(const float2 *data)
void setDataPtrsZXY(std::vector< float2 *> &dataPtrsNew, float2 *data)
float * dataDst
#define cudaCheck(stmt)
Definition: CudaUtils.h:242
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream, unsigned int iGrid=0)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
cufftHandle * forwardPlans
size_t alchGetNumOfPMEGrids() const
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
static constexpr int kThetaPad
void transposeXYZtoZXY(const float2 *data)
static constexpr int kPatchGridDim
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
float * dataSrc
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
void CcdCallBacksReset(void *ignored, double curWallTime)
Molecule * molecule
Definition: Node.h:179
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
int computeSharedMemoryPatchLevelGatherForce(const int numThreads, const int3 patchGridDim, const int order)
float2 * getBuffer(const int iblock)
int getShiftedGrid(const double x, const int grid)
EnergyVirial * h_energyVirials
unsigned int multipleGridIndex
double BigReal
Definition: common.h:123
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)
void checkPatchLevelDeviceCompatibility()
void compute(const Lattice &lattice, int doEnergyVirial, int step)