ComputePmeCUDAMgr.h

#ifndef COMPUTEPMECUDAMGR_H
#define COMPUTEPMECUDAMGR_H

#include <vector>
#include <cstring>  // memcpy(), used by PmeAtomStorage below

#include "PmeBase.h"
#include "PmeSolver.h"
#include "PmeSolverUtil.h"
#include "ComputePmeCUDAMgr.decl.h"
#ifdef NAMD_CUDA
#include <cuda_runtime.h>  // Needed for cudaStream_t, which is used in the ComputePmeCUDAMgr class
#endif

#ifdef NAMD_CUDA
class ComputePmeCUDA;

//
// Base class for thread-safe atom storage
//
class PmeAtomStorage {
public:
  PmeAtomStorage(const bool useIndex) : useIndex(useIndex) {
    numAtoms = 0;
    atomCapacity = 0;
    atom = NULL;
    atomIndexCapacity = 0;
    atomIndex = NULL;
    overflowStart = 0;
    overflowEnd = 0;
    overflowAtomCapacity = 0;
    overflowAtom = NULL;
    overflowAtomIndexCapacity = 0;
    overflowAtomIndex = NULL;
    lock_ = CmiCreateLock();
  }
  virtual ~PmeAtomStorage() {
    CmiDestroyLock(lock_);
  }

  int addAtoms(const int natom, const CudaAtom* src) {
    return addAtoms_(natom, src, NULL);
  }

  int addAtomsWithIndex(const int natom, const CudaAtom* src, const int* index) {
    return addAtoms_(natom, src, index);
  }

  // Finish up. Must be called after "done" is returned by addAtoms.
  // Only the last thread that gets the "done" signal from addAtoms can enter here.
  void finish() {
    if (overflowEnd-overflowStart > 0) {
      resize_((void **)&atom, numAtoms, atomCapacity, sizeof(CudaAtom));
      if (useIndex) resize_((void **)&atomIndex, numAtoms, atomIndexCapacity, sizeof(int));
      memcpy_(atom+overflowStart, overflowAtom, (overflowEnd - overflowStart)*sizeof(CudaAtom));
      if (useIndex) memcpy_(atomIndex+overflowStart, overflowAtomIndex, (overflowEnd - overflowStart)*sizeof(int));
      overflowStart = 0;
      overflowEnd = 0;
    }
  }

  // Clear and reset storage to its initial state.
  // Only the last thread that gets the "done" signal from addAtoms can enter here.
  void clear() {
    patchPos.clear();
    numAtoms = 0;
  }

  // Return pointer to atom data
  CudaAtom* getAtoms() {
    return atom;
  }

  // Return pointer to patch positions
  int* getPatchPos() {
    return patchPos.data();
  }

  int getNumPatches() {
    return patchPos.size();
  }

  int getNumAtoms() {
    return numAtoms;
  }

  int* getAtomIndex() {
    if (!useIndex)
      NAMD_bug("PmeAtomStorage::getAtomIndex, no indexing enabled");
    return atomIndex;
  }

protected:
  // Atom array
  CudaAtom* atom;
  // Atom index array
  int* atomIndex;
  // Overflow atom array
  CudaAtom* overflowAtom;
  // Overflow atom index array
  int* overflowAtomIndex;

private:
  // If true, uses indexed atom arrays
  const bool useIndex;
  // Node lock
  CmiNodeLock lock_;
  // Data overflow
  int overflowAtomCapacity;
  int overflowAtomIndexCapacity;
  int overflowStart;
  int overflowEnd;
  // Number of atoms currently in storage
  int numAtoms;
  // Atom patch positions
  std::vector<int> patchPos;
  // Atom array capacity
  int atomCapacity;
  // Atom index array capacity
  int atomIndexCapacity;

  // Resize array to 1.5x the requested size
  void resize_(void **array, int sizeRequested, int& arrayCapacity, const int sizeofType) {
    // If array is not NULL and has enough capacity => we have nothing to do
    if (*array != NULL && arrayCapacity >= sizeRequested) return;

    // Otherwise, allocate new array
    int newArrayCapacity = (int)(sizeRequested*1.5);
    void* newArray = alloc_(sizeofType*newArrayCapacity);

    if (*array != NULL) {
      // We have old array => copy contents to new array
      memcpy_(newArray, *array, arrayCapacity*sizeofType);
      // De-allocate old array
      dealloc_(*array);
    }

    // Set new capacity and array pointer
    arrayCapacity = newArrayCapacity;
    *array = newArray;
  }

  virtual void memcpy_(void *dst, const void* src, const int size) {
    memcpy(dst, src, size);
  }

  virtual void copyWithIndex_(CudaAtom* dst, const CudaAtom* src, const int natom, const int* indexSrc) {
    for (int i=0;i < natom;i++) dst[i] = src[indexSrc[i]];
  }

  // Allocate array of size bytes
  virtual void* alloc_(const int size)=0;

  // Deallocate array
  virtual void dealloc_(void *p)=0;

  // Add atoms in a thread-safe manner.
  // Returns the patch index where the atoms were added
  int addAtoms_(const int natom, const CudaAtom* src, const int* index) {
    CmiLock(lock_);
    // Accumulate position for patches:
    // atoms for patch i are in the range [ patchPos[i-1], patchPos[i]-1 ] (patchPos[-1] is taken to be 0)
    int patchInd = patchPos.size();
    int ppos = (patchInd == 0) ? natom : patchPos[patchInd-1] + natom;
    patchPos.push_back(ppos);
    int pos = numAtoms;
    bool overflow = false;
    numAtoms += natom;
    // Check for overflow
    if (numAtoms > atomCapacity || (useIndex && numAtoms > atomIndexCapacity)) {
      // Number of atoms exceeds capacity, store into the overflow buffer.
      // Note: storing to overflow should be very infrequent, most likely only
      // in the initial call
      if (overflowEnd-overflowStart == 0) {
        overflowStart = pos;
        overflowEnd = pos;
      }
      overflowEnd += natom;
      if (overflowEnd-overflowStart > overflowAtomCapacity) {
        resize_((void **)&overflowAtom, overflowEnd-overflowStart, overflowAtomCapacity, sizeof(CudaAtom));
      }
      if (useIndex && overflowEnd-overflowStart > overflowAtomIndexCapacity) {
        resize_((void **)&overflowAtomIndex, overflowEnd-overflowStart, overflowAtomIndexCapacity, sizeof(int));
      }
      if (index != NULL) {
        if (useIndex) memcpy_(overflowAtomIndex+overflowEnd-overflowStart-natom, index, natom*sizeof(int));
        copyWithIndex_(overflowAtom+overflowEnd-overflowStart-natom, src, natom, index);
      } else {
        memcpy_(overflowAtom+overflowEnd-overflowStart-natom, src, natom*sizeof(CudaAtom));
      }
      overflow = true;
    }
    CmiUnlock(lock_);
    // If no overflow, copy to final position
    if (!overflow) {
      if (index != NULL) {
        if (useIndex) memcpy_(atomIndex+pos, index, natom*sizeof(int));
        copyWithIndex_(atom+pos, src, natom, index);
      } else {
        memcpy_(atom+pos, src, natom*sizeof(CudaAtom));
      }
    }
    return patchInd;
  }

};
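
//
// Illustrative sketch only (not part of NAMD): PmeAtomStorage is abstract, so a
// concrete storage class must supply alloc_() and dealloc_(). The class name
// ExamplePmeAtomStorage is hypothetical; NAMD's real subclasses live in
// ComputePmeCUDAMgr.C and may, for example, use pinned (page-locked) host memory.
//
class ExamplePmeAtomStorage : public PmeAtomStorage {
public:
  ExamplePmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
  ~ExamplePmeAtomStorage() {
    // The base class never frees the arrays; the concrete class owns them
    if (atom != NULL) dealloc_(atom);
    if (atomIndex != NULL) dealloc_(atomIndex);
    if (overflowAtom != NULL) dealloc_(overflowAtom);
    if (overflowAtomIndex != NULL) dealloc_(overflowAtomIndex);
  }
private:
  // Plain heap allocation; a CUDA-aware subclass could use cudaMallocHost() here
  void* alloc_(const int size) {return (void *)(new char[size]);}
  void dealloc_(void *p) {delete [] (char *)p;}
};
//
// Typical call sequence (sketch): each contributing thread calls addAtoms() or
// addAtomsWithIndex() and remembers the returned patch index; after the last
// contribution, a single thread calls finish() to fold in any overflow, reads the
// packed data with getAtoms()/getPatchPos(), and calls clear() before the next step.
//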

class PmeAtomMsg : public CMessage_PmeAtomMsg {
public:
  CudaAtom *atoms;
  int numAtoms;
  int i, j;
  ComputePmeCUDA* compute;
  int pe;
  bool doEnergy, doVirial;
  // int miny, minz;
};

class PmeForceMsg : public CMessage_PmeForceMsg {
public:
  CudaForce *force;
  int pe;
  int numAtoms;
  int numStrayAtoms;
  bool zeroCopy;
  ComputePmeCUDA* compute;
};

class PmeLaunchMsg : public CMessage_PmeLaunchMsg {
public:
  CudaForce* force;
  int natom;
  int pe;
  ComputePmeCUDA* compute;
};

class RegisterPatchMsg : public CMessage_RegisterPatchMsg {
public:
  int i, j;
};

class NumDevicesMsg : public CMessage_NumDevicesMsg {
public:
  NumDevicesMsg(int numDevices) : numDevices(numDevices) {}
  int numDevices;
};

class PmeAtomPencilMsg : public CMessage_PmeAtomPencilMsg {
public:
  CudaAtom* atoms;
  int numAtoms;
  int y, z;
  int srcY, srcZ;
  bool doEnergy, doVirial;
};

class PmeForcePencilMsg : public CMessage_PmeForcePencilMsg {
public:
  CudaForce* force;
  int numAtoms;
  int y, z;
  int srcY, srcZ;
};

class CProxy_ComputePmeCUDADevice;
class RecvDeviceMsg : public CMessage_RecvDeviceMsg {
public:
  CProxy_ComputePmeCUDADevice* dev;
  int numDevicesMax;
};

class PmeAtomFiler : public CBase_PmeAtomFiler {
public:
  PmeAtomFiler();
  PmeAtomFiler(CkMigrateMessage *);
  ~PmeAtomFiler();
  void fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
    const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi);
  // static inline int yBlock(int p) {return p % 3;}
  // static inline int zBlock(int p) {return p / 3;}
  int getNumAtoms(int p) {return pencilSize[p];}
  int* getAtomIndex(int p) {return pencil[p];}
private:
  // 9 neighbor pencils + 1 pencil for stray atoms
  int pencilSize[9+1];
  int pencilCapacity[9+1];
  int* pencil[9+1];
};
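
// Illustrative use of PmeAtomFiler (a sketch only; the real consumer is
// ComputePmeCUDADevice in ComputePmeCUDAMgr.C, and the variable names below
// are made up):
//
//   filer.fileAtoms(natom, atoms, lattice, pmeGrid, pencY, pencZ, ylo, yhi, zlo, zhi);
//   for (int p=0;p < 9;p++) {               // p maps to neighbor (y,z) = (p%3, p/3)
//     int n      = filer.getNumAtoms(p);    // atoms destined for neighbor pencil p
//     int* index = filer.getAtomIndex(p);   // indices into the original atom array
//     // ... copy the selected atoms into the PmeAtomStorage of that pencil ...
//   }
//   int numStray = filer.getNumAtoms(9);    // atoms that fell outside the neighborhood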


class CProxy_ComputePmeCUDAMgr;
class ComputePmeCUDADevice : public CBase_ComputePmeCUDADevice {
public:
  // ComputePmeCUDADevice_SDAG_CODE;
  ComputePmeCUDADevice();
  ComputePmeCUDADevice(CkMigrateMessage *m);
  ~ComputePmeCUDADevice();
  void initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
    int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
    CProxy_PmeAtomFiler pmeAtomFiler_in);
  int getDeviceID();
  cudaStream_t getStream();
  CProxy_ComputePmeCUDAMgr getMgrProxy();
  void setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in);
  void setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in);
  void setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in);
  void activate_pencils();
  void initializePatches(int numHomePatches_in);
  void registerNeighbor();
  void recvAtoms(PmeAtomMsg *msg);
  void sendAtomsToNeighbors();
  void sendAtomsToNeighbor(int y, int z, int atomIval);
  void recvAtomsFromNeighbor(PmeAtomPencilMsg *msg);
  void registerRecvAtomsFromNeighbor();
  void spreadCharge();
  void gatherForce();
  void gatherForceDone();
  void sendForcesToNeighbors();
  void recvForcesFromNeighbor(PmeForcePencilMsg *msg);
  void mergeForcesOnPatch(int homePatchIndex);
  void sendForcesToPatch(PmeForceMsg *forceMsg);

  void gatherForceDoneSubset(int first, int last);

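  // Rough per-step flow, as suggested by the method names above (a sketch; the
  // authoritative sequence is in ComputePmeCUDAMgr.C): recvAtoms() collects atoms
  // from home patches, sendAtomsToNeighbors()/recvAtomsFromNeighbor() exchange
  // atoms between neighboring pencils, spreadCharge() puts charges on the grid,
  // the pencil FFT solver runs, gatherForce() pulls forces off the grid, and
  // sendForcesToNeighbors()/mergeForcesOnPatch()/sendForcesToPatch() return the
  // forces to the home patches.
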
private:
  //
  bool doVirial, doEnergy;
  // PME grid definition
  PmeGrid pmeGrid;
  // PME pencil type
  int pmePencilType;
  // Neighboring pencil bounds, [-1,1]
  int ylo, yhi, zlo, zhi;
  // Size of the neighboring pencil grid, maximum value 3. yNBlocks = yhi - ylo + 1
  int yNBlocks, zNBlocks;
  // Number of home patches for this device
  int numHomePatches;
  // Pencil location for this device
  int pencilIndexY, pencilIndexZ;

  // Number of neighbors expected to provide atoms including self
  int numNeighborsExpected;

  // Number of stray atoms
  int numStrayAtoms;

  // Node locks
  CmiNodeLock lock_numHomePatchesMerged;
  CmiNodeLock lock_numPencils;
  CmiNodeLock lock_numNeighborsRecv;
  CmiNodeLock lock_recvAtoms;

  int atomI, forceI;

  //----------------------------------------------------------------------------------
  // Bookkeeping
  // NOTE: We keep two copies of pmeAtomStorage and homePatchIndexList so that forces can be
  //       merged while the next patch of atoms is already being received
  //----------------------------------------------------------------------------------
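  // Sketch of the intended double buffering (inferred from this header; the actual
  // control flow lives in ComputePmeCUDAMgr.C): index atomI selects the copy that is
  // currently being filled with incoming atoms, forceI selects the copy whose forces
  // are being merged and sent back to the patches, and the two indices are swapped
  // once a complete set of atoms has been received.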
  // Storage for each pencil on the yNBlocks x zNBlocks grid
  std::vector< PmeAtomStorage* > pmeAtomStorage[2];
  std::vector<bool> pmeAtomStorageAllocatedHere;

  // Size numHomePatches:
  // Tells how many pencils have contributed to each home patch
  std::vector<int> numPencils[2];

  // Pencil location
  struct PencilLocation {
    // Pencil index
    int pp;
    // Patch location in the pencil
    int pencilPatchIndex;
    PencilLocation(int pp, int pencilPatchIndex) : pp(pp), pencilPatchIndex(pencilPatchIndex) {}
  };

  // Size numHomePatches
  std::vector< std::vector<PencilLocation> > plList[2];

  // Size numHomePatches
  std::vector< PmeForceMsg* > homePatchForceMsgs[2];

  // // Size numHomePatches
  // std::vector<int> numHomeAtoms[2];

  std::vector< std::vector<int> > homePatchIndexList[2];

  // Number of neighbors from which we have received atoms
  int numNeighborsRecv;

  // Number of home patches we have received atoms from
  int numHomePatchesRecv;

  // Number of home patches we have merged forces for
  int numHomePatchesMerged;

  // Size yNBlocks*zNBlocks
  std::vector< PmeForcePencilMsg* > neighborForcePencilMsgs;
  // std::vector< PmeForcePencil > neighborForcePencils;

  // Size yNBlocks*zNBlocks
  std::vector<int> neighborPatchIndex;
  //----------------------------------------------------------------------------------

  // CUDA stream
  cudaStream_t stream;
  bool streamCreated;
  // Device ID
  int deviceID;
  // Charge spreading and force gathering
  PmeRealSpaceCompute* pmeRealSpaceCompute;
  // Host memory force array
  int forceCapacity;
  CudaForce* force;

  // Proxy for the manager
  CProxy_ComputePmeCUDAMgr mgrProxy;

  // Atom filer proxy
  CProxy_PmeAtomFiler pmeAtomFiler;

  // Pencil proxies
  CProxy_CudaPmePencilXYZ pmePencilXYZ;
  CProxy_CudaPmePencilXY pmePencilXY;
  CProxy_CudaPmePencilX pmePencilX;

  // For event tracing
  double beforeWalltime;
};

class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
public:
  ComputePmeCUDAMgr_SDAG_CODE;
  ComputePmeCUDAMgr();
  ComputePmeCUDAMgr(CkMigrateMessage *);
  ~ComputePmeCUDAMgr();
  void setupPencils();
  void initialize(CkQdMsg *msg);
  void initialize_pencils(CkQdMsg *msg);
  void activate_pencils(CkQdMsg *msg);
  PmeGrid getPmeGrid() {return pmeGrid;}
  int getNode(int i, int j);
  int getDevice(int i, int j);
  int getDevicePencilY(int i, int j);
  int getDevicePencilZ(int i, int j);
  int getDeviceIDPencilX(int i, int j);
  int getDeviceIDPencilY(int i, int j);
  int getDeviceIDPencilZ(int i, int j);
  void recvPencils(CProxy_CudaPmePencilXYZ xyz);
  void recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z);
  void recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z);
  void recvSelfEnergy(PmeSelfEnergyMsg *msg);

  void createDevicesAndAtomFiler();
  void recvDevices(RecvDeviceMsg* msg);
  void recvAtomFiler(CProxy_PmeAtomFiler filer);
  void skip();
  void recvAtoms(PmeAtomMsg *msg);
  void getHomePencil(PatchID patchID, int& homey, int& homez);
  int getHomeNode(PatchID patchID);

  bool isPmePe(int pe);
  bool isPmeNode(int node);
  bool isPmeDevice(int deviceID);

  static ComputePmeCUDAMgr* Object() {
    CProxy_ComputePmeCUDAMgr mgrProxy(CkpvAccess(BOCclass_group).computePmeCUDAMgr);
    return mgrProxy.ckLocalBranch();
  }
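
  // Example use (sketch): any PE on a node can obtain the node-local manager via
  //   ComputePmeCUDAMgr* mgr = ComputePmeCUDAMgr::Object();
  //   if (mgr->isPmeNode(CkMyNode())) { /* this node hosts PME pencils/devices */ }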
protected:

private:
  void restrictToMaxPMEPencils();

  // ---------------------------------------------
  // For .ci file
  // Counter for determining numDevicesMax
  int numNodesContributed;
  int numDevicesMax;

  // Number of home patches for each device on this manager
  std::vector<int> numHomePatchesList;

  // Counter for "registerPatchDone"
  int numTotalPatches;
  // ---------------------------------------------

  // PME pencil type: 1=column, 2=slab, 3=box
  int pmePencilType;
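  // A sketch of how the type maps onto the pencil proxies below (inferred from the
  // recvPencils() overloads above): 1=column uses pmePencilX/Y/Z, 2=slab uses
  // pmePencilXY + pmePencilZ, 3=box uses the single pmePencilXYZ.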

  PmeGrid pmeGrid;

  // Number of CUDA devices on this node that are used for PME computation
  int numDevices;

  std::vector<int> xPes;
  std::vector<int> yPes;
  std::vector<int> zPes;

  // List of pencil coordinates (i,j) for each device held by this node
  struct IJ {
    int i, j;
  };
  std::vector<IJ> ijPencilX;
  std::vector<IJ> ijPencilY;
  std::vector<IJ> ijPencilZ;

  struct NodeDevice {
    int node;
    int device;
  };
  std::vector<NodeDevice> nodeDeviceList;

  // Atom filer proxy
  CProxy_PmeAtomFiler pmeAtomFiler;

  // Device proxies
  std::vector<CProxy_ComputePmeCUDADevice> deviceProxy;

  // Extra devices
  struct ExtraDevice {
    int deviceID;
    cudaStream_t stream;
  };
  std::vector<ExtraDevice> extraDevices;

  // Pencil proxies
  CProxy_CudaPmePencilXYZ pmePencilXYZ;
  CProxy_CudaPmePencilXY pmePencilXY;
  CProxy_CudaPmePencilX pmePencilX;
  CProxy_CudaPmePencilY pmePencilY;
  CProxy_CudaPmePencilZ pmePencilZ;

};
#else // NAMD_CUDA
class ComputePmeCUDAMgr : public CBase_ComputePmeCUDAMgr {
};
#endif // NAMD_CUDA

#endif // COMPUTEPMECUDAMGR_H
