CudaPmeSolverUtil.h

#ifndef CUDAPMESOLVERUTIL_H
#define CUDAPMESOLVERUTIL_H
#include <stdio.h>
#ifdef NAMD_CUDA
#include <cuda.h>
#include <cufft.h>
#endif // NAMD_CUDA
#include "PmeSolverUtil.h"
#include "CudaUtils.h"
#include "CudaPmeSolverUtilKernel.h"

#ifdef NAMD_CUDA
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream);
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename);
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream);
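
//
// Debugging sketch (illustrative only, not part of the NAMD source): the
// helpers above presumably copy a device-resident buffer back to the host and
// write it to disk for inspection. The buffer names, sizes and stream below
// are assumptions.
//
//   writeRealToDisk(d_chargeGrid, gridSize, "charge_grid.dat", stream);
//   writeComplexToDisk(d_fftData, fftSize, "charge_grid_fft.dat", stream);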

#define cufftCheck(stmt) do {                                           \
  cufftResult err = stmt;                                               \
  if (err != CUFFT_SUCCESS) {                                           \
    char msg[128];                                                      \
    snprintf(msg, sizeof(msg), "%s in file %s, function %s\n", #stmt, __FILE__, __FUNCTION__); \
    cudaDie(msg);                                                       \
  }                                                                     \
} while(0)
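
//
// Usage sketch (illustrative only): cufftCheck() wraps any cuFFT call that
// returns a cufftResult and aborts via cudaDie() on failure. The plan
// dimensions, device buffers and stream below are assumptions.
//
//   cufftHandle plan;
//   cufftCheck(cufftPlan3d(&plan, nz, ny, nx, CUFFT_R2C));
//   cufftCheck(cufftSetStream(plan, stream));
//   cufftCheck(cufftExecR2C(plan, d_realGrid, d_complexGrid));
//   cufftCheck(cufftDestroy(plan));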

//
// CUDA implementation of FFTCompute
//
class CudaFFTCompute : public FFTCompute {
private:
  cufftHandle forwardPlan, backwardPlan;
  cufftType_t forwardType, backwardType;
  int deviceID;
  cudaStream_t stream;
  void setStream();

private:
  float* allocateData(const int dataSizeRequired);
  void plan3D(int *n, int flags);
  void plan2D(int *n, int howmany, int flags);
  void plan1DX(int *n, int howmany, int flags);
  void plan1DY(int *n, int howmany, int flags);
  void plan1DZ(int *n, int howmany, int flags);
  // int ncall, plantype;

public:
  CudaFFTCompute(int deviceID, cudaStream_t stream) : deviceID(deviceID), stream(stream) {}
  ~CudaFFTCompute();
  void forward();
  void backward();
};
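
//
// Usage sketch (illustrative only, not part of the NAMD source): the grid
// data itself is attached through the FFTCompute base class declared in
// PmeSolverUtil.h; only the calls visible in this header are shown. The
// deviceID and stream are assumed to come from the owning compute object.
//
//   CudaFFTCompute fft(deviceID, stream);
//   // ... base-class initialization attaches the charge-grid data and
//   //     sets up the cuFFT plans (plan1DX()/plan2D()/plan3D()) ...
//   fft.forward();   // real-to-complex FFT of the charge grid on 'stream'
//   // ... K-space solve operates on the transformed data ...
//   fft.backward();  // complex-to-real FFT back to the potential grid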

//
// CUDA implementation of PmeKSpaceCompute class
//
class CudaPmePencilXYZ;
class CudaPmePencilZ;

class CudaPmeKSpaceCompute : public PmeKSpaceCompute {
private:
  int deviceID;
  cudaStream_t stream;
  // Device memory versions of (bm1, bm2, bm3)
  float *d_bm1, *d_bm2, *d_bm3;
  //float *prefac_x, *prefac_y, *prefac_z;
  struct EnergyVirial {
    double energy;
    double virial[9];
  };
  EnergyVirial* d_energyVirial;
  EnergyVirial* h_energyVirial;
  cudaEvent_t copyEnergyVirialEvent;
  bool ortho;
  // Check counter for event polling in energyAndVirialCheck()
  int checkCount;
  static void energyAndVirialCheck(void *arg, double walltime);
  CudaPmePencilXYZ* pencilXYZPtr;
  CudaPmePencilZ* pencilZPtr;
public:
  CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream);
  ~CudaPmeKSpaceCompute();
  void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data);
  // void waitEnergyAndVirial();
  double getEnergy();
  void getVirial(double *virial);
  void energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr);
  void energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr);
};
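
//
// Usage sketch (illustrative only, not part of the NAMD source): a K-space
// solve on the FFT-transformed charge grid. pmeGrid, permutation, jblock,
// kblock, kappa, lattice and d_fftData are assumed to be provided by the
// owning pencil object. The energy/virial transfer appears to be
// asynchronous (see copyEnergyVirialEvent), so getEnergy()/getVirial() are
// read only after completion has been signalled.
//
//   CudaPmeKSpaceCompute kspace(pmeGrid, permutation, jblock, kblock,
//                               kappa, deviceID, stream);
//   kspace.solve(lattice, true /*doEnergy*/, true /*doVirial*/, d_fftData);
//   kspace.energyAndVirialSetCallback(pencilXYZ); // poll for completion
//   // ... once the callback fires ...
//   double energy = kspace.getEnergy();
//   double virial[9];
//   kspace.getVirial(virial);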

//
// CUDA implementation of PmeRealSpaceCompute class
//

class ComputePmeCUDADevice;

class CudaPmeRealSpaceCompute : public PmeRealSpaceCompute {
private:
  bool gridTexObjActive;
  cudaTextureObject_t gridTexObj;
  int tex_data_len;
  float* tex_data;
  int deviceID;
  cudaStream_t stream;
  void setupGridTexture(float* data, int data_len);
  // Device memory for atoms
  int d_atomsCapacity;
  CudaAtom* d_atoms;
  // Device memory for patches
  // int d_patchesCapacity;
  // PatchInfo* d_patches;
  // Device memory for forces
  int d_forceCapacity;
  CudaForce* d_force;
  // Device memory for self energy
  // double* d_selfEnergy;
  // Events
  cudaEvent_t gatherForceEvent;
  // Check counter for event polling
  int checkCount;
  // Store device pointer for event polling
  ComputePmeCUDADevice* devicePtr;
  static void cuda_gatherforce_check(void *arg, double walltime);
public:
  CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock,
    int deviceID, cudaStream_t stream);
  ~CudaPmeRealSpaceCompute();
  void copyAtoms(const int numAtoms, const CudaAtom* atoms);
  void spreadCharge(Lattice &lattice);
  void gatherForce(Lattice &lattice, CudaForce* force);
  void gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in);
  void waitGatherForceDone();
};
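
//
// Usage sketch (illustrative only, not part of the NAMD source): the
// real-space half of a PME step. numAtoms, atoms, lattice, forces and the
// surrounding control flow are assumed to be supplied by the owning
// ComputePmeCUDADevice.
//
//   CudaPmeRealSpaceCompute rspace(pmeGrid, jblock, kblock, deviceID, stream);
//   rspace.copyAtoms(numAtoms, atoms);        // stage atoms on the device
//   rspace.spreadCharge(lattice);             // interpolate charges onto the grid
//   // ... forward FFT, K-space solve and backward FFT run here ...
//   rspace.gatherForce(lattice, forces);      // interpolate forces back to atoms
//   rspace.waitGatherForceDone();             // or use gatherForceSetCallback()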

//
// CUDA implementation of PmeTranspose class
//
class CudaPmeTranspose : public PmeTranspose {
private:
  int deviceID;
  cudaStream_t stream;
  float2* d_data;
#ifndef P2P_ENABLE_3D
  float2* d_buffer;
#endif
  // List of device data pointers for transpose destinations on:
  // (a) this device on a different pencil (e.g. in an XYZ->YZX transpose, on the Y-pencil)
  // (b) a different device on a different pencil
  // If NULL, the local d_data buffer is used
  std::vector<float2*> dataPtrsYZX;
  std::vector<float2*> dataPtrsZXY;

  // Batch data
  int max_nx_YZX[3];
  TransposeBatch<float2> *batchesYZX;
  int max_nx_ZXY[3];
  TransposeBatch<float2> *batchesZXY;

  void copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out);
public:
  CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream);
  ~CudaPmeTranspose();
  void setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data);
  void setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data);
  void transposeXYZtoYZX(const float2* data);
  void transposeXYZtoZXY(const float2* data);
  // void waitTransposeDone();
  void waitStreamSynchronize();
  void copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize);
  void copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out);
#ifndef P2P_ENABLE_3D
  void copyDataDeviceToDevice(const int iblock, float2* data_out);
  float2* getBuffer(const int iblock);
#endif
  void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
  void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2* data_out);
};
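
//
// Usage sketch (illustrative only, not part of the NAMD source): transposing
// pencil data from XYZ to YZX order before the next 1D FFT stage. pmeGrid,
// permutation, jblock, kblock, peerPtrs, d_localData and d_fftData are
// assumptions supplied by the owning pencil object.
//
//   CudaPmeTranspose trans(pmeGrid, permutation, jblock, kblock, deviceID, stream);
//   trans.setDataPtrsYZX(peerPtrs, d_localData); // NULL entries fall back to the local buffer
//   trans.transposeXYZtoYZX(d_fftData);          // queued on 'stream'
//   trans.waitStreamSynchronize();               // block until the transpose completes
//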
#endif // NAMD_CUDA
#endif // CUDAPMESOLVERUTIL_H
