CudaPmeSolverUtil.C

#include <stdio.h>
#include <algorithm>
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#include "ComputeNonbondedUtil.h"
#include "ComputePmeCUDAMgr.h"
#include "CudaPmeSolver.h"
#include "CudaPmeSolverUtil.h"

#ifdef NAMD_CUDA
extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++
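
// Debug helpers that dump a device or host array to a text file. Every
// call site below is commented out; these are kept only for debugging the
// individual FFT and transpose stages.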
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}

void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0;i < size;i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
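
// cuFFT plan setup. The X-direction transforms are real-to-complex
// (forward) and complex-to-real (backward); the Y and Z stages operate on
// already-complex data. cufftPlan3d() expects the slowest-varying
// dimension first, hence the reversed n[2], n[1], n[0] order.
// FFTW-compatible padding selects the same padded in-place layout that
// FFTW uses, presumably to match the host-side FFTW path.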
void CudaFFTCompute::plan3D(int *n, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  cufftCheck(cufftSetCompatibilityMode(forwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  cufftCheck(cufftSetCompatibilityMode(backwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  setStream();
  // plantype = 3;
}

void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  cufftCheck(cufftSetCompatibilityMode(forwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  cufftCheck(cufftSetCompatibilityMode(backwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  setStream();
  // plantype = 2;
}

void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  cufftCheck(cufftSetCompatibilityMode(forwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  cufftCheck(cufftSetCompatibilityMode(backwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftSetCompatibilityMode(forwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  cufftCheck(cufftSetCompatibilityMode(backwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  setStream();
  // plantype = 1;
}

void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  cudaCheck(cudaSetDevice(deviceID));
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftSetCompatibilityMode(forwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  cufftCheck(cufftSetCompatibilityMode(backwardPlan, CUFFT_COMPATIBILITY_FFTW_PADDING));
  setStream();
  // plantype = 1;
}

CudaFFTCompute::~CudaFFTCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftDestroy(forwardPlan));
  cufftCheck(cufftDestroy(backwardPlan));
  if (dataSrcAllocated) deallocate_device<float>(&dataSrc);
  if (dataDstAllocated) deallocate_device<float>(&dataDst);
}

float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  cudaCheck(cudaSetDevice(deviceID));
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}

// int ncall = 0;

void CudaFFTCompute::forward() {
  cudaCheck(cudaSetDevice(deviceID));
  // ncall++;
  if (forwardType == CUFFT_R2C) {

    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));

    // if (ncall == 1) {
    //   writeComplexToDisk((float2 *)dataSrc, (isize/2+1)*jsize*ksize, "dataSrc.txt", stream);
    // }

    // if (ncall == 1 && plantype == 2) {
    //   writeComplexToDisk((float2 *)data, (isize/2+1)*jsize*ksize, "data_fx_fy_z.txt", stream);
    // }

  } else if (forwardType == CUFFT_C2C) {
    // nc2cf++;
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_y_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_z_fx_fy.txt");
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
    // fprintf(stderr, "ncall %d plantype %d\n", ncall, plantype);
    // if (ncall == 1 && plantype == 1 && isize == 62) {
    //   writeComplexToDisk((float2 *)data, isize*jsize*(ksize/2+1), "data_fy_z_fx.txt", stream);
    // }
    // if (ncall == 1 && nc2cf == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_z_fx.txt");
    // else if (ncall == 1 && nc2cf == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}

void CudaFFTCompute::backward() {
  cudaCheck(cudaSetDevice(deviceID));
  if (backwardType == CUFFT_C2R) {
    // if (ncall == 1) {
    //   if (plantype == 1)
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_by_bz.txt");
    //   else
    //     writeComplexToDisk((float2 *)data, 33*64*64, "data_fx_fy_fz_2.txt");
    // }

    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));

    // if (ncall == 1)
    //   if (plantype == 1)
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_1D.txt");
    //   else
    //     writeRealToDisk(data, 64*64*64, "data_bx_by_bz_3D.txt");
  } else if (backwardType == CUFFT_C2C) {
    // nc2cb++;
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fz_fx_fy_2.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_fy_bz_fx.txt");
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
    // if (ncall == 1 && nc2cb == 1)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_bz_fx_fy.txt");
    // else if (ncall == 1 && nc2cb == 2)
    //   writeComplexToDisk((float2 *)data, 33*64*64, "data_by_bz_fx.txt");
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}

void CudaFFTCompute::setStream() {
  cudaCheck(cudaSetDevice(deviceID));
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}

//###########################################################################
//###########################################################################
//###########################################################################

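// CudaPmeKSpaceCompute performs the reciprocal-space (k-space) part of PME
// on the GPU. The constructor converts the double-precision B-spline
// prefactor arrays bm1/bm2/bm3 from the base class to single precision,
// uploads them, and allocates device and host storage for the energy and
// virial.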
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
  const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream) :
  PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa),
  deviceID(deviceID), stream(stream) {

  cudaCheck(cudaSetDevice(deviceID));

  // Copy bm1 -> prefac_x on GPU memory
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0;i < pmeGrid.K1;i++) bm1f[i] = (float)bm1[i];
  for (int i=0;i < pmeGrid.K2;i++) bm2f[i] = (float)bm2[i];
  for (int i=0;i < pmeGrid.K3;i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  // cudaCheck(cudaEventCreateWithFlags(&copyEnergyVirialEvent, cudaEventDisableTiming));
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  // ncall = 0;
}

CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}

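// Applies the reciprocal-space Ewald kernel to the Fourier-transformed
// charge grid. The permutation selects which grid dimension, prefactor
// array, and reciprocal lattice vector map onto each FFT axis; scalar_sum()
// then scales every k-space element and optionally accumulates the energy
// and virial, which are copied back to the host asynchronously.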
void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
  cudaCheck(cudaSetDevice(deviceID));

  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  BigReal volume = lattice.volume();
  Vector a_r = lattice.a_r();
  Vector b_r = lattice.b_r();
  Vector c_r = lattice.c_r();
  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

  if (permutation == Perm_Z_cX_Y) {
    // Z, X, Y
    nfft1 = pmeGrid.K3;
    nfft2 = pmeGrid.K1;
    nfft3 = pmeGrid.K2;
    prefac1 = d_bm3;
    prefac2 = d_bm1;
    prefac3 = d_bm2;
    recip1x = c_r.z;
    recip1y = c_r.x;
    recip1z = c_r.y;
    recip2x = a_r.z;
    recip2y = a_r.x;
    recip2z = a_r.y;
    recip3x = b_r.z;
    recip3y = b_r.x;
    recip3z = b_r.y;
  } else if (permutation == Perm_cX_Y_Z) {
    // X, Y, Z
    nfft1 = pmeGrid.K1;
    nfft2 = pmeGrid.K2;
    nfft3 = pmeGrid.K3;
    prefac1 = d_bm1;
    prefac2 = d_bm2;
    prefac3 = d_bm3;
    recip1x = a_r.x;
    recip1y = a_r.y;
    recip1z = a_r.z;
    recip2x = b_r.x;
    recip2y = b_r.y;
    recip2z = b_r.z;
    recip3x = c_r.x;
    recip3y = c_r.y;
    recip3z = c_r.z;
  } else {
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  }

  // ncall++;
  // if (ncall == 1) {
  //   char filename[256];
  //   sprintf(filename,"dataf_%d_%d.txt",jblock,kblock);
  //   writeComplexToDisk((float2*)data, size1*size2*size3, filename, stream);
  // }

  // if (ncall == 1) {
  //   float2* h_data = new float2[size1*size2*size3];
  //   float2* d_data = (float2*)data;
  //   copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream);
  //   cudaCheck(cudaStreamSynchronize(stream));
  //   FILE *handle = fopen("dataf.txt", "w");
  //   for (int z=0;z < pmeGrid.K3;z++) {
  //     for (int y=0;y < pmeGrid.K2;y++) {
  //       for (int x=0;x < pmeGrid.K1/2+1;x++) {
  //         int i;
  //         if (permutation == Perm_cX_Y_Z) {
  //           i = x + y*size1 + z*size1*size2;
  //         } else {
  //           i = z + x*size1 + y*size1*size2;
  //         }
  //         fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  //       }
  //     }
  //   }
  //   fclose(handle);
  //   delete [] h_data;
  // }

  // Clear energy and virial array if needed
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa,
    recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
    volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
    &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
    stream);

  // Copy energy and virial to host if needed
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
    // cudaCheck(cudaStreamSynchronize(stream));
  }

}

// void CudaPmeKSpaceCompute::waitEnergyAndVirial() {
//   cudaCheck(cudaSetDevice(deviceID));
//   cudaCheck(cudaEventSynchronize(copyEnergyVirialEvent));
// }

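// Non-blocking completion check for the energy/virial copy: instead of
// synchronizing the stream, cudaEventQuery() is polled from a Charm++
// scheduler callback that re-registers itself via CcdCallFnAfter() until
// the event has completed, then notifies the owning pencil object.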
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone();
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone();
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, check count exceeded");
    }
  } else {
    // Anything else is an error
    NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, cudaEventQuery returned error");
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = pencilPtr;
  pencilZPtr = NULL;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  cudaCheck(cudaSetDevice(deviceID));
  pencilXYZPtr = NULL;
  pencilZPtr = pencilPtr;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}

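// Expands the six unique elements of the symmetric virial tensor into a
// full row-major 3x3 matrix, undoing the axis permutation used on the
// device.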
void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];  // ZY element; by symmetry equal to virial[5]
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    // h_energyVirial->virial is storing XX, XY, XZ, YY, YZ, ZZ
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }
}

//###########################################################################
//###########################################################################
//###########################################################################

//
// Class constructor
//
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
  const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  cudaCheck(cudaSetDevice(deviceID));
  d_atomsCapacity = 0;
  d_atoms = NULL;
  d_forceCapacity = 0;
  d_force = NULL;
  tex_data = NULL;
  tex_data_len = 0;
  allocate_device<float>(&data, dataSize);
  setupGridTexture(data, xsize*ysize*zsize);
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}

//
// Class destructor
//
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  cudaCheck(cudaSetDevice(deviceID));
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // if (d_patches != NULL) deallocate_device<PatchInfo>(&d_patches);
  // deallocate_device<double>(&d_selfEnergy);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}

// //
// // Copy patches and atoms to device memory
// //
// void CudaPmeRealSpaceCompute::setPatchesAtoms(const int numPatches, const PatchInfo* patches,
//   const int numAtoms, const CudaAtom* atoms) {

//   this->numPatches = numPatches;
//   this->numAtoms = numAtoms;

//   // Reallocate device arrays as necessary
//   reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
//   reallocate_device<PatchInfo>(&d_patches, &d_patchesCapacity, numPatches, 1.5f);

//   // Copy atom and patch data to device
//   copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
//   copy_HtoD<PatchInfo>(patches, d_patches, numPatches, stream);
// }

//
// Copy atoms to device memory
//
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  cudaCheck(cudaSetDevice(deviceID));
  this->numAtoms = numAtoms;

  // Reallocate device arrays as necessary
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);

  // Copy atom data to device
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
}

//
// Spread charges on grid
//
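// Each atom's charge is spread onto order^3 neighboring grid points using
// B-spline weights; the grid is cleared first since spread_charge()
// accumulates into it.
//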
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  cudaCheck(cudaSetDevice(deviceID));

  // Clear grid
  clear_device_array<float>(data, xsize*ysize*zsize, stream);

  spread_charge((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize,
    xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, stream);

  // ncall++;

  // if (ncall == 1) writeRealToDisk(data, xsize*ysize*zsize, "data.txt");
}
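
// Same non-blocking polling pattern as CudaPmeKSpaceCompute::energyAndVirialCheck(),
// here waiting on gatherForceEvent before notifying the device object.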
void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaCheck(cudaSetDevice(c->deviceID));

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
//    c->deviceProxy[CkMyNode()].gatherForceDone();
    c->devicePtr->gatherForceDone();
    return;
  } else if (err == cudaErrorNotReady) {
    // Event has not occurred
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      NAMD_bug("CudaPmeRealSpaceCompute::cuda_gatherforce_check, check count exceeded");
    }
  } else {
    // Anything else is an error
    NAMD_bug("CudaPmeRealSpaceCompute::cuda_gatherforce_check, cudaEventQuery returned error");
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}

void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  cudaCheck(cudaSetDevice(deviceID));
  devicePtr = devicePtr_in;
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Set the call back at 0.1ms
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}

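// Binds the charge grid to a linear-memory texture object so that
// gather_force() can read it through the texture path; the object is only
// re-created when the data pointer or length changes. Note that the
// channel descriptor width desc.x is given in bits.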
void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) {
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Use texture objects
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
}

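// Interpolates per-atom forces from the potential grid, then starts an
// asynchronous device-to-host copy of the result; gatherForceEvent marks
// its completion for the polling callback above.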
void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  cudaCheck(cudaSetDevice(deviceID));

  // Re-allocate force array if needed
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  gather_force((const float4*)d_atoms, numAtoms,
    pmeGrid.K1, pmeGrid.K2, pmeGrid.K3,
    xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1),
    data, pmeGrid.order, (float3*)d_force,
    gridTexObj,
    stream);

  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);

  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}

/*
double CudaPmeRealSpaceCompute::calcSelfEnergy() {
  double h_selfEnergy;
  clear_device_array<double>(d_selfEnergy, 1);
  calc_sum_charge_squared((const float4*)d_atoms, numAtoms, d_selfEnergy, stream);
  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // 1.7724538509055160273 = sqrt(pi)
  h_selfEnergy *= -ComputeNonbondedUtil::ewaldcof/1.7724538509055160273;
  return h_selfEnergy;
}
*/

//###########################################################################
//###########################################################################
//###########################################################################

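// CudaPmeTranspose shuffles pencil data between the X, Y, and Z FFT
// stages. When 3D peer-to-peer copies are unavailable (P2P_ENABLE_3D
// undefined), incoming peer copies are staged in the linear buffer
// d_buffer and unpacked by copyDataDeviceToDevice().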
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
  cudaCheck(cudaSetDevice(deviceID));

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  // Set data pointers to NULL; these can be overridden later by setDataPtrs()
  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}

CudaPmeTranspose::~CudaPmeTranspose() {
  cudaCheck(cudaSetDevice(deviceID));
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}

//
// Set dataPtrsYZX
//
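// Precomputes one TransposeBatch per (permutation, block) pair so that a
// single batched kernel launch can transpose all blocks at once; a NULL
// entry in dataPtrsNew means the block is transposed locally into d_data.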
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }
  // Build batched data structures
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_Z_cX_Y:
      // ZXY -> XYZ
      isize_out = pmeGrid.K1/2+1;
    } else if (iperm == 1) {
      // Perm_cX_Y_Z:
      // XYZ -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> ZXY
      isize_out = pmeGrid.K3;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = jsize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx        = nx;
      batch.ysize_out = width_out;
      batch.zsize_out = ksize;
      batch.data_in   = data+x0;
      batch.data_out  = data_out;

      h_batchesYZX[iperm*nblock + iblock] = batch;

    // transpose_xyz_yzx(
    //   nx, jsize, ksize,
    //   isize, jsize,
    //   width_out, ksize,
    //   data+x0, data_out, stream);
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}

//
// Set dataPtrsZXY
//
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0;iblock < nblock;iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Build batched data structures
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];

  for (int iperm=0;iperm < 3;iperm++) {
    int isize_out;
    if (iperm == 0) {
      // Perm_cX_Y_Z:
      // XYZ -> ZXY
      isize_out = pmeGrid.K3;
    } else if (iperm == 1) {
      // Perm_Z_cX_Y:
      // ZXY -> YZX
      isize_out = pmeGrid.K2;
    } else {
      // Perm_Y_Z_cX:
      // YZX -> XYZ
      isize_out = pmeGrid.K1/2+1;
    }

    int max_nx = 0;
    for (int iblock=0;iblock < nblock;iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      int width_out;
      float2* data_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local transpose, use internal buffer
        data_out = d_data + jsize*ksize*x0;
        width_out = ksize;
      } else {
        // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      batch.nx        = nx;
      batch.zsize_out = width_out;
      batch.xsize_out = nx;
      batch.data_in   = data+x0;
      batch.data_out  = data_out;

      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}

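// Launches the batched xyz -> yzx transpose for the current permutation;
// the commented-out block below is the original per-block unbatched
// version, kept for reference.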
void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
    case Perm_Z_cX_Y:
    // ZXY -> XYZ
    iperm = 0;
    break;
    case Perm_cX_Y_Z:
    // XYZ -> YZX
    iperm = 1;
    break;
    case Perm_Y_Z_cX:
    // YZX -> ZXY
    iperm = 2;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  batchTranspose_xyz_yzx(
    nblock, batchesYZX + iperm*nblock,
    max_nx_YZX[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
    case Perm_Z_cX_Y:
    // ZXY -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
    case Perm_cX_Y_Z:
    // XYZ -> YZX
    isize_out = pmeGrid.K2;
    break;
    case Perm_Y_Z_cX:
    // YZX -> ZXY
    isize_out = pmeGrid.K3;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsYZX[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = jsize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsYZX[iblock];
      width_out = isize_out;
    }

    transpose_xyz_yzx(
      nx, jsize, ksize,
      isize, jsize,
      width_out, ksize,
      data+x0, data_out, stream);
  }
*/
}

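// Same as above, but for the xyz -> zxy direction.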
void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  cudaCheck(cudaSetDevice(deviceID));

  int iperm;
  switch(permutation) {
    case Perm_cX_Y_Z:
    // XYZ -> ZXY
    iperm = 0;
    break;
    case Perm_Z_cX_Y:
    // ZXY -> YZX
    iperm = 1;
    break;
    case Perm_Y_Z_cX:
    // YZX -> XYZ
    iperm = 2;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  batchTranspose_xyz_zxy(
    nblock, batchesZXY + iperm*nblock,
    max_nx_ZXY[iperm], jsize, ksize,
    isize, jsize, stream);

/*
  int isize_out;
  switch(permutation) {
    case Perm_cX_Y_Z:
    // XYZ -> ZXY
    isize_out = pmeGrid.K3;
    break;
    case Perm_Z_cX_Y:
    // ZXY -> YZX
    isize_out = pmeGrid.K2;
    break;
    case Perm_Y_Z_cX:
    // YZX -> XYZ
    isize_out = pmeGrid.K1/2+1;
    break;
    default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
    break;
  }

  for (int iblock=0;iblock < nblock;iblock++) {

    int x0 = pos[iblock];
    int nx = pos[iblock+1] - x0;

    int width_out;
    float2* data_out;
    if (dataPtrsZXY[iblock] == NULL) {
      // Local transpose, use internal buffer
      data_out = d_data + jsize*ksize*x0;
      width_out = ksize;
    } else {
      // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
      data_out = dataPtrsZXY[iblock];
      width_out = isize_out;
    }

    transpose_xyz_zxy(
      nx, jsize, ksize,
      isize, jsize,
      width_out, nx,
      data+x0, data_out, stream);
  }
*/
}

void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaStreamSynchronize(stream));
}

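// Copies block "iblock" of the transposed data to a host buffer, with
// bounds checks against both the device data size and the host buffer
// size.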
void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  int copySize  = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}

void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

#ifndef P2P_ENABLE_3D
//
// Copy from temporary buffer to final buffer
//
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  cudaCheck(cudaSetDevice(deviceID));

  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
    0, 0, 0,
    ni, nj,
    i0, 0, 0,
    isize, jsize,
    ni, nj, nk, stream);
}

//
// Return temporary buffer for block "iblock"
//
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
#endif

void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out,
  float2* data_out) {

  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

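// Copies one block to a peer device. Without 3D peer-to-peer copies the
// data is sent as a flat array into the peer's staging buffer (see
// getBuffer()); otherwise a strided 3D peer copy writes directly into the
// destination pencil layout.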
void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
  const int iblock_out, const int jblock_out, const int kblock_out,
  int deviceID_out, int permutation_out, float2* data_out) {

  cudaCheck(cudaSetDevice(deviceID));

  // Determine block size = how much we're copying
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  getPencilDim(pmeGrid, permutation_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // Copy into temporary peer device buffer
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
    0, 0, 0,
    ni, nj,
    0, 0, 0,
    isize_out, jsize_out,
    ni, nj, nk, stream);
#endif
}

#endif // NAMD_CUDA
