CudaPmeSolverUtil.C

#include <stdio.h>
#include <algorithm>
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#include "ComputeNonbondedUtil.h"
#include "ComputePmeCUDAMgr.h"
#include "CudaPmeSolver.h"
#include "CudaPmeSolverUtil.h"

#ifdef NAMD_CUDA
extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++

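//
// Debugging helpers: dump a complex or real array to a text file, one
// element per line. The device variants copy through a host staging
// buffer and synchronize on the given stream before writing, so they
// are intended for debugging only.
//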
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}

void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}

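//
// Plan creation. cuFFT expects dimensions ordered from slowest-varying to
// fastest-varying, which is presumably why the sizes are passed as
// n[2], n[1], n[0] below: n[] appears to hold the fastest-varying (x)
// dimension first. Passing NULL for inembed/onembed in cufftPlanMany()
// makes cuFFT assume a packed layout and ignore the stride arguments.
//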
void CudaFFTCompute::plan3D(int *n, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  setStream();
  // plantype = 3;
}

void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 2;
}

void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  setStream();
  // plantype = 1;
}

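//
// The destructor releases both plans and frees the source/destination
// buffers, but only if this object allocated them; the dataSrcAllocated
// and dataDstAllocated flags are assumed to be maintained by the
// FFTCompute base class around calls to allocateData().
//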
CudaFFTCompute::~CudaFFTCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftDestroy(forwardPlan));
  cufftCheck(cufftDestroy(backwardPlan));
  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
  if (dataDstAllocated) deallocate_device<float>(&dataDst);
}

float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  cudaCheck(cudaSetDevice(deviceID));
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}

// int ncall = 0;

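//
// forward() and backward() dispatch on the transform type chosen at plan
// time: real-to-complex (and back) for the x-dimension FFT, complex-to-
// complex for y and z. As an illustrative sketch (hypothetical driver
// code, simplified), one full PME step through this path would read:
//
//   realSpace->spreadCharge(lattice);               // atoms -> charge grid
//   fft->forward();                                 // grid -> reciprocal space
//   kSpace->solve(lattice, doEnergy, doVirial, data);
//   fft->backward();                                // back to real space
//   realSpace->gatherForce(lattice, forces);        // grid -> per-atom forces
//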
void CudaFFTCompute::forward() {
  cudaCheck(cudaSetDevice(deviceID));
  // ncall++;
  if (forwardType == CUFFT_R2C) {

    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));

    // if (ncall == 1) {
    //   writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
    // }

    // if (ncall == 1 && plantype == 2) {
    //   writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
    // }

  } else if (forwardType == CUFFT_C2C) {
    // nc2cf++;
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
    // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
    // if (ncall == 1 && plantype == 1 && isize == 62) {
    //   writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
    // }
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}

void CudaFFTCompute::backward() {
  cudaCheck(cudaSetDevice(deviceID));
  if (backwardType == CUFFT_C2R) {
    // if (ncall == 1) {
    //   if (plantype == 1)
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
    //   else
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
    // }

    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));

    // if (ncall == 1)
    //   if (plantype == 1)
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
    //   else
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
  } else if (backwardType == CUFFT_C2C) {
    // nc2cb++;
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}

void CudaFFTCompute::setStream() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}

//###########################################################################
//###########################################################################
//###########################################################################

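//
// CudaPmeKSpaceCompute performs the reciprocal-space part of PME for one
// pencil (or the whole grid). The constructor converts the double-precision
// B-spline moduli bm1..bm3, inherited from PmeKSpaceCompute, to single
// precision and uploads them as the per-dimension prefactors.
//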
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream) :
  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa),
  deviceID(deviceID), stream(stream) {

  cudaCheck(cudaSetDevice(deviceID));

  // Copy bm1, bm2, bm3 into GPU memory as single-precision prefactors
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  // ncall = 0;
}

CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}

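//
// solve() runs the reciprocal-space scalar sum in place on the transformed
// charge grid. The permutation decides how the reciprocal lattice vectors
// and prefactors map onto the FFT dimensions. If energy or virial is
// requested, the device-side accumulator is cleared first and the result
// is copied back asynchronously; copyEnergyVirialEvent marks completion.
//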
void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
  cudaCheck(cudaSetDevice(deviceID));

  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  BigReal volume = lattice.volume();
  Vector a_r = lattice.a_r();
  Vector b_r = lattice.b_r();
  Vector c_r = lattice.c_r();
  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

  if (permutation == Perm_Z_cX_Y) {
    // Z, X, Y
    nfft1 = pmeGrid.K3;
    nfft2 = pmeGrid.K1;
    nfft3 = pmeGrid.K2;
    prefac1 = d_bm3;
    prefac2 = d_bm1;
    prefac3 = d_bm2;
    recip1x = c_r.z;
    recip1y = c_r.x;
    recip1z = c_r.y;
    recip2x = a_r.z;
    recip2y = a_r.x;
    recip2z = a_r.y;
    recip3x = b_r.z;
    recip3y = b_r.x;
    recip3z = b_r.y;
  } else if (permutation == Perm_cX_Y_Z) {
    // X, Y, Z
    nfft1 = pmeGrid.K1;
    nfft2 = pmeGrid.K2;
    nfft3 = pmeGrid.K3;
    prefac1 = d_bm1;
    prefac2 = d_bm2;
    prefac3 = d_bm3;
    recip1x = a_r.x;
    recip1y = a_r.y;
    recip1z = a_r.z;
    recip2x = b_r.x;
    recip2y = b_r.y;
    recip2z = b_r.z;
    recip3x = c_r.x;
    recip3y = c_r.y;
    recip3z = c_r.z;
  } else {
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  }

  // ncall++;
  // if (ncall == 1) {
  //   char filename[256];
  //   sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
  //   writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
  // }

  // if (ncall == 1) {
  //   float2* h_data = new float2[size1*size2*size3];
  //   float2* d_data = (float2*)data;
  //   copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
  //   cudaCheck(cudaStreamSynchronize(stream));
  //   FILE *handle = fopen("dataf.txt", "w");
  //   for (int z=0;z < pmeGrid.K3;z++) {
  //     for (int y=0;y < pmeGrid.K2;y++) {
  //       for (int x=0;x < pmeGrid.K1/2+1;x++) {
  //         int i;
  //         if (permutation == Perm_cX_Y_Z) {
  //           i = x + y*size1 + z*size1*size2;
  //         } else {
  //           i = z + x*size1 + y*size1*size2;
  //         }
  //         fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  //       }
  //     }
  //   }
  //   fclose(handle);
  //   delete [] h_data;
  // }

  // Clear energy and virial array if needed
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
    recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
    volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
    &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
    stream);

  // Copy energy and virial to host if needed
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
    // cudaCheck(cudaStreamSynchronize(stream));
  }

}

// void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
//   cudaCheck(cudaSetDevice(deviceID));
//   cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
// }

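//
// Non-blocking completion detection: rather than synchronizing the stream,
// energyAndVirialCheck() polls copyEnergyVirialEvent and, while the copy is
// still in flight, re-registers itself with the Charm++ scheduler to try
// again 0.1 ms later. The idiom reduces to (illustrative sketch, simplified):
//
//   if (cudaEventQuery(event) == cudaSuccess) { notifyDone(); return; }
//   CcdCallBacksReset(0, walltime);
//   CcdCallFnAfter(checkFn, arg, 0.1);  // poll again in 0.1 ms
//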
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone();
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone();
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred yet
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, check count exceeded");
    }
  } else {
    // Anything else is an error
    NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, cudaEventQuery returned error");
  }

  // Schedule another check
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = pencilPtr;
  pencilZPtr = NULL;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Start polling; the callback fires in 0.1 ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = NULL;
  pencilZPtr = pencilPtr;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Start polling; the callback fires in 0.1 ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

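//
// The device kernel accumulates the virial as the six unique components of
// the symmetric 3x3 tensor; getVirial() expands them into a full row-major
// 3x3 matrix, with the component ordering depending on the permutation.
//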
double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}

void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // h_energyVirial->virial stores ZZ, ZX, ZY, XX, XY, YY
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];  // ZY; index 7 would read past the six stored components
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    // h_energyVirial->virial stores XX, XY, XZ, YY, YZ, ZZ
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }
}


//###########################################################################
//###########################################################################
//###########################################################################

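//
// CudaPmeRealSpaceCompute owns the real-space charge grid on the device:
// spreadCharge() spreads atom charges onto the grid, and gatherForce()
// later reads the (inverse-transformed) grid back through a texture
// object to produce per-atom forces.
//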
//
// Class constructor
//
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  cudaCheck(cudaSetDevice(deviceID));
  d_atomsCapacity = 0;
  d_atoms = NULL;
  d_forceCapacity = 0;
  d_force = NULL;
  tex_data = NULL;
  tex_data_len = 0;
  allocate_device<float>(&data, dataSize);
  setupGridTexture(data, xsize*ysize*zsize);
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}

//
// Class destructor
//
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
  // deallocate_device<double>(&d_selfEnergy);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}

// //
// // Copy patches and atoms to device memory
// //
// void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
//   const int numAtoms, const CudaAtom* atoms) {

//   this->numPatches = numPatches;
//   this->numAtoms = numAtoms;

//   // Reallocate device arrays as necessary
//   reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
//   reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);

//   // Copy atom and patch data to device
//   copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
//   copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
// }

//
// Copy atoms to device memory
//
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  cudaCheck(cudaSetDevice(deviceID));
  this->numAtoms = numAtoms;

  // Reallocate the device array as necessary
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);

  // Copy atom data to device
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
}

//
// Spread charges on grid
//
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  cudaCheck(cudaSetDevice(deviceID));

  // Clear grid
  clear_device_array<float>(data, xsize*ysize*zsize, stream);

  spread_charge((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
    xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, stream);

  // ncall++;

  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
}

void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaCheck(cudaSetDevice(c->deviceID));

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
//    c->deviceProxy[CkMyNode()].gatherForceDone();
    c->devicePtr->gatherForceDone();
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred yet
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      NAMD_bug("CudaPmeRealSpaceCompute::cuda_gatherforce_check, check count exceeded");
    }
  } else {
    // Anything else is an error
    NAMD_bug("CudaPmeRealSpaceCompute::cuda_gatherforce_check, cudaEventQuery returned error");
  }

  // Schedule another check
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}

void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  cudaCheck(cudaSetDevice(deviceID));
  devicePtr = devicePtr_in;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Start polling; the callback fires in 0.1 ms
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}

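//
// Bind the charge grid as a linear 32-bit float texture object so the
// force-gathering kernel can read it through the texture path. The object
// is only recreated when the data pointer or length changes.
//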
void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) {
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Use texture objects
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
}

void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  cudaCheck(cudaSetDevice(deviceID));

  // Re-allocate force array if needed
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  gather_force((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
    xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, (float3*)d_force,
    gridTexObj,
    stream);

  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);

  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}

/*
double CudaPmeRealSpaceCompute::calcSelfEnergy() {
  double h_selfEnergy;
  clear_device_array<double>(d_selfEnergy, 1);
  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // 1.7724538509055160273 = sqrt(pi)
  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
  return h_selfEnergy;
}
*/

//###########################################################################
//###########################################################################
//###########################################################################

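//
// CudaPmeTranspose implements the pencil-to-pencil transposes between FFT
// stages. For each of the three possible input permutations it precomputes
// a TransposeBatch descriptor per block, so one batched kernel launch can
// transpose all blocks at once. When 3D peer-to-peer copies are disabled
// (P2P_ENABLE_3D undefined), d_buffer stages data arriving from peer
// devices before it is placed with a local 3D copy.
//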
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
  cudaCheck(cudaSetDevice(deviceID));

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  // Set data pointers to NULL; these can be overridden later via setDataPtrsYZX()/setDataPtrsZXY()
  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}

CudaPmeTranspose::~CudaPmeTranspose() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}

//
// Set dataPtrsYZX
//
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }
  // Build batched data structures
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_Z_cX_Y:
      // ZXY -> XYZ
      isize_out = pmeGrid.K1/2+1;
    } else if (iperm == 1) {
      // Perm_cX_Y_Z:
      // XYZ -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> ZXY
      isize_out = pmeGrid.K3;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = jsize;
      } else {
        // Non-local transpose, use buffer in dataPtrsYZX[] and the size of that buffer
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx        = nx;
      batch.ysize_out = width_out;
      batch.zsize_out = ksize;
      batch.data_in   = data+x0;
      batch.data_out  = data_out;

      h_batchesYZX[iperm*nblock + iblock] = batch;

    // transpose_xyz_yzx(
    //   nx, jsize, ksize,
    //   isize, jsize,
    //   width_out, ksize,
    //   data+x0, data_out, stream);
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}

//
// Set dataPtrsZXY
//
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Build batched data structures
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_cX_Y_Z:
      // XYZ -> ZXY
      isize_out = pmeGrid.K3;
    } else if (iperm == 1) {
      // Perm_Z_cX_Y:
      // ZXY -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> XYZ
      isize_out = pmeGrid.K1/2+1;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = ksize;
      } else {
        // Non-local transpose, use buffer in dataPtrsZXY[] and the size of that buffer
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx        = nx;
      batch.zsize_out = width_out;
      batch.xsize_out = nx;
      batch.data_in   = data+x0;
      batch.data_out  = data_out;

      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}

void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
    case Perm_Z_cX_Y:
    // ZXY -> XYZ
    iperm = 0;
    break;
    case Perm_cX_Y_Z:
    // XYZ -> YZX
    iperm = 1;
    break;
    case Perm_Y_Z_cX:
    // YZX -> ZXY
    iperm = 2;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  batchTranspose_xyz_yzx(
    nblock, batchesYZX + iperm*nblock,
    max_nx_YZX[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
    case Perm_Z_cX_Y:
    // ZXY -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
    case Perm_cX_Y_Z:
    // XYZ -> YZX
    isize_out = pmeGrid.K2;
    break;
    case Perm_Y_Z_cX:
    // YZX -> ZXY
    isize_out = pmeGrid.K3;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsYZX[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = jsize;
    } else {
      // Non-local transpose, use buffer in dataPtrsYZX[] and the size of that buffer
      data_out = dataPtrsYZX[iblock];
      width_out = isize_out;
    }

    transpose_xyz_yzx(
      nx, jsize, ksize,
      isize, jsize,
      width_out, ksize,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
    case Perm_cX_Y_Z:
    // XYZ -> ZXY
    iperm = 0;
    break;
    case Perm_Z_cX_Y:
    // ZXY -> YZX
    iperm = 1;
    break;
    case Perm_Y_Z_cX:
    // YZX -> XYZ
    iperm = 2;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  batchTranspose_xyz_zxy(
    nblock, batchesZXY + iperm*nblock,
    max_nx_ZXY[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
    case Perm_cX_Y_Z:
    // XYZ -> ZXY
    isize_out = pmeGrid.K3;
    break;
    case Perm_Z_cX_Y:
    // ZXY -> YZX
    isize_out = pmeGrid.K2;
    break;
    case Perm_Y_Z_cX:
    // YZX -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsZXY[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = ksize;
    } else {
      // Non-local transpose, use buffer in dataPtrsZXY[] and the size of that buffer
      data_out = dataPtrsZXY[iblock];
      width_out = isize_out;
    }

    transpose_xyz_zxy(
      nx, jsize, ksize,
      isize, jsize,
      width_out, nx,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaStreamSynchronize(stream));
}

void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  int copySize  = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}

void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

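//
// Without 3D peer-to-peer copies, peer transfers arrive as flat 1D copies
// into the temporary buffer returned by getBuffer(); the functions below
// then perform the strided 3D placement locally on this device.
//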
#ifndef P2P_ENABLE_3D
//
// Copy from temporary buffer to final buffer
//
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

//
// Return temporary buffer for block "iblock"
//
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
#endif

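//
// When sending a block to the next stage, the (i,j,k) block indices are
// rotated to match the destination pencil's permutation before the
// destination block dimensions are computed.
//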
void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
  const int iblock_out, const int jblock_out, const int kblock_out,
  int deviceID_out, int permutation_out, float2* data_out) {

  cudaCheck(cudaSetDevice(deviceID));

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // Copy into temporary peer device buffer
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
    0, 0, 0,
    ni, nj,
    0, 0, 0,
    isize_out, jsize_out,
    ni, nj, nk, stream);
#endif
}

#endif // NAMD_CUDA
