NAMD
ComputeCUDAMgr.C
#include "NamdTypes.h"
#include "common.h"
#include "Node.h"
#include "ComputeCUDAMgr.h"
#include "PatchData.h"
#include "DeviceCUDA.h"
#include "CudaUtils.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif
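// Per-thread pointer to this rank's DeviceCUDA object (defined in DeviceCUDA.C)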
extern __thread DeviceCUDA *deviceCUDA;

//
// Class constructor
//
ComputeCUDAMgr::ComputeCUDAMgr() {
  // __sdag_init();
  numDevices = 0;
  // numNodesContributed = 0;
  // numDevicesMax = 0;
  cudaPmeOneDevice = NULL; // XXX is this needed?
  cudaGlobalMasterObject = nullptr;
}

//
// Migration constructor; ComputeCUDAMgr cannot be migrated
//
ComputeCUDAMgr::ComputeCUDAMgr(CkMigrateMessage *) {
  // __sdag_init();
  NAMD_bug("ComputeCUDAMgr cannot be migrated");
  numDevices = 0;
  // numNodesContributed = 0;
  // numDevicesMax = 0;
  cudaPmeOneDevice = NULL; // XXX is this needed?
  cudaGlobalMasterObject = nullptr;
}

//
// Class destructor
//
ComputeCUDAMgr::~ComputeCUDAMgr() {
  for (int i=0;i < numDevices;i++) {
    if (cudaNonbondedTablesList[i] != NULL) delete cudaNonbondedTablesList[i];
    if (cudaComputeNonbondedList[i] != NULL) delete cudaComputeNonbondedList[i];
#ifdef BONDED_CUDA
    if (computeBondedCUDAList[i] != NULL) delete computeBondedCUDAList[i];
#endif
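    // Free the per-device COM accumulation buffers allocated in initialize()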
    if (curSMDCOM != NULL && curSMDCOM[i] != NULL) cudaFree(curSMDCOM[i]);
    if (curGrp1COM != NULL && curGrp1COM[i] != NULL) cudaFree(curGrp1COM[i]);
    if (curGrp2COM != NULL && curGrp2COM[i] != NULL) cudaFree(curGrp2COM[i]);
  }
  if (curSMDCOM != NULL) cudaFree(curSMDCOM);
  if (curGrp1COM != NULL) cudaFree(curGrp1COM);
  if (curGrp2COM != NULL) cudaFree(curGrp2COM);
}

//
// Initialize manager
// This gets called on rank 0 of each node
//
void ComputeCUDAMgr::initialize(CkQdMsg *msg) {
  if (msg != NULL) delete msg;

  numDevices = deviceCUDA->getDeviceCount();
#ifdef NODEGROUP_FORCE_REGISTER
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData *pdata = cpdata.ckLocalBranch();
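  // Count the compute devices, plus one extra slot if a GPU is reserved exclusively for PME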
  int ndevs = deviceCUDA->getNumDevice() + 1*deviceCUDA->isGpuReservedPme();
  pdata->devData.resize(numDevices);

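  // Host arrays of per-device pointers. Force and position arrays are sized
  // ndevs so a reserved PME device (if any) is included; integration-only
  // data is sized for the compute devices alone.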
  {
    // Pointers to SOA integration data
    allocate_host<bool*>(&(pdata->h_devHasForces), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fb_x), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fb_y), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fb_z), ndevs);

    allocate_host<double*>(&(pdata->h_soa_fn_x), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fn_y), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fn_z), ndevs);

    allocate_host<double*>(&(pdata->h_soa_fs_x), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fs_y), ndevs);
    allocate_host<double*>(&(pdata->h_soa_fs_z), ndevs);

    allocate_host<double*>(&(pdata->h_soa_pos_x), ndevs);
    allocate_host<double*>(&(pdata->h_soa_pos_y), ndevs);
    allocate_host<double*>(&(pdata->h_soa_pos_z), ndevs);

    allocate_host<double*>(&(pdata->h_soa_vel_x), deviceCUDA->getNumDevice());
    allocate_host<double*>(&(pdata->h_soa_vel_y), deviceCUDA->getNumDevice());
    allocate_host<double*>(&(pdata->h_soa_vel_z), deviceCUDA->getNumDevice());

    allocate_host<float*> (&(pdata->h_soa_charge), deviceCUDA->getNumDevice());

    allocate_host<int*> (&(pdata->h_soa_id), deviceCUDA->getNumDevice());
    allocate_host<int*> (&(pdata->h_soa_vdwType), deviceCUDA->getNumDevice());
    allocate_host<int*> (&(pdata->h_soa_sortOrder), deviceCUDA->getNumDevice());
    allocate_host<int*> (&(pdata->h_soa_unsortOrder), deviceCUDA->getNumDevice());
    allocate_host<double3*>(&(pdata->h_soa_patchCenter), deviceCUDA->getNumDevice());
    allocate_host<int4*> (&(pdata->h_soa_migrationDestination), deviceCUDA->getNumDevice());
    allocate_host<int*> (&(pdata->h_soa_sortSoluteIndex), deviceCUDA->getNumDevice());

    allocate_host<int*> (&(pdata->h_soa_partition), deviceCUDA->getNumDevice());

    allocate_host<FullAtom*>(&(pdata->h_atomdata_AoS), deviceCUDA->getNumDevice());
    allocate_host<CudaLocalRecord*>(&(pdata->h_peer_record), deviceCUDA->getNumDevice());

    allocate_host<int*>(&(pdata->h_tupleCount.bond), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.angle), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.dihedral), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.improper), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.exclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleCount.crossterm), deviceCUDA->getNumDevice());

    allocate_host<int*>(&(pdata->h_tupleOffset.bond), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.angle), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.dihedral), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.improper), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.exclusion), deviceCUDA->getNumDevice());
    allocate_host<int*>(&(pdata->h_tupleOffset.crossterm), deviceCUDA->getNumDevice());

    allocate_host<CudaBondStage*>(&(pdata->h_tupleDataStage.bond), deviceCUDA->getNumDevice());
    allocate_host<CudaAngleStage*>(&(pdata->h_tupleDataStage.angle), deviceCUDA->getNumDevice());
    allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.dihedral), deviceCUDA->getNumDevice());
    allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.improper), deviceCUDA->getNumDevice());
    allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.modifiedExclusion), deviceCUDA->getNumDevice());
    allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.exclusion), deviceCUDA->getNumDevice());
    allocate_host<CudaCrosstermStage*>(&(pdata->h_tupleDataStage.crossterm), deviceCUDA->getNumDevice());
  }

  // Allocate the work queues
  allocate_host<unsigned int*>(&(pdata->d_queues), ndevs);
  allocate_host<unsigned int>(&(pdata->d_queueCounters), ndevs);

  cudaCheck(cudaMemset(pdata->d_queueCounters, 0, sizeof(unsigned int)*ndevs));

  pdata->migrationFlagPerDevice.resize(deviceCUDA->getNumDevice());

  pdata->tupleReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
  pdata->atomReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
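  // Node-wide atomic high-water marks for per-tuple-type counts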
  pdata->maxNumBonds.store(0);
  pdata->maxNumAngles.store(0);
  pdata->maxNumDihedrals.store(0);
  pdata->maxNumImpropers.store(0);
  pdata->maxNumModifiedExclusions.store(0);
  pdata->maxNumExclusions.store(0);
  pdata->maxNumCrossterms.store(0);
  pdata->devicePatchMapFlag.resize(CkNumPes(), 0);
#ifdef NAMD_NCCL_ALLREDUCE
  // Allocate NCCL-related data
  deviceCUDA->setupNcclUniqueId();
  // The unique id can now be registered in patchData
  pdata->ncclId = deviceCUDA->getNcclUniqueId(); // registered in ngroup
#endif
  // Allocate global data for multi-GPU (mGpuOn) shared-memory accumulation:
  // one pointer per device, each referencing a numDevices-element buffer
  SimParameters *simParams = Node::Object()->simParameters;
  if (simParams->SMDOn && numDevices > 1) {
    allocate_device<double3*>(&curSMDCOM, sizeof(double3*)*numDevices);
  } else {
    curSMDCOM = NULL;
  }
  if (simParams->groupRestraintsOn) {
    // as for SMD, but we need numGroups buffers for type1 and type2
    allocate_host<double3**>(&curGrp1COM, sizeof(double3**)*simParams->groupRestraintsCount);
    allocate_host<double3**>(&curGrp2COM, sizeof(double3**)*simParams->groupRestraintsCount);
    for (int i=0;i<simParams->groupRestraintsCount;i++) {
      allocate_device<double3*>(&curGrp1COM[i], sizeof(double3*)*numDevices);
      allocate_device<double3*>(&curGrp2COM[i], sizeof(double3*)*numDevices);
    }
  } else {
    curGrp1COM = NULL;
    curGrp2COM = NULL;
  }
#endif

  // Create pointers to devices
  cudaNonbondedTablesList.resize(numDevices, NULL);
  cudaComputeNonbondedList.resize(numDevices, NULL);
#ifdef BONDED_CUDA
  computeBondedCUDAList.resize(numDevices, NULL);
#endif
  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
  cudaPmeOneDevice = NULL;

  // Create CUDA non-bonded tables for all devices that are used for computation
  for (int i=0;i < deviceCUDA->getNumDevice();i++) {
    int deviceID = deviceCUDA->getDeviceIDbyRank(i);
    cudaNonbondedTablesList[deviceID] = new CudaNonbondedTables(deviceID);
  }
}

//
// Update nonbonded tables
// Should be called only on rank 0 of each node
//
void ComputeCUDAMgr::update() {
  if ( CkMyRank() ) NAMD_bug("ComputeCUDAMgr::update() should be called only by rank 0");
  for (int i=0; i < deviceCUDA->getNumDevice(); i++) {
    int deviceID = deviceCUDA->getDeviceIDbyRank(i);
    // calls update function from CudaNonbondedTables
    cudaNonbondedTablesList[deviceID]->updateTables();
  }
}

ComputeCUDAMgr* ComputeCUDAMgr::getComputeCUDAMgr() {
  // Get pointer to ComputeCUDAMgr on this node
  CProxy_ComputeCUDAMgr computeCUDAMgrProxy = CkpvAccess(BOCclass_group).computeCUDAMgr;
  ComputeCUDAMgr* computeCUDAMgr = computeCUDAMgrProxy.ckLocalBranch();
  if (computeCUDAMgr == NULL)
    NAMD_bug("getComputeCUDAMgr, unable to locate local branch of BOC entry ComputeCUDAMgr");
  return computeCUDAMgr;
}

CudaPmeOneDevice* ComputeCUDAMgr::createCudaPmeOneDevice() {
  // initialize pmeGrid from simParams
  SimParameters *simParams = Node::Object()->simParameters;
  PmeGrid pmeGrid;
  pmeGrid.K1 = simParams->PMEGridSizeX;
  pmeGrid.K2 = simParams->PMEGridSizeY;
  pmeGrid.K3 = simParams->PMEGridSizeZ;
  pmeGrid.order = simParams->PMEInterpOrder;
  pmeGrid.dim2 = pmeGrid.K2;
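  // pad the last dimension for in-place real-to-complex FFT:
  // K3/2 + 1 complex values occupy 2*(K3/2 + 1) reals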
  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
  // override settings for PME pencils
  pmeGrid.xBlocks = 1;
  pmeGrid.yBlocks = 1;
  pmeGrid.zBlocks = 1;
  pmeGrid.block1 = pmeGrid.K1;
  pmeGrid.block2 = pmeGrid.K2;
  pmeGrid.block3 = pmeGrid.K3;
  // use shared deviceID class
  int deviceID = 0;
  int deviceIndex = 0;
#ifdef NODEGROUP_FORCE_REGISTER
  deviceID = deviceCUDA->getPmeDevice();
  deviceIndex = deviceCUDA->getPmeDeviceIndex();
#endif
  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
  cudaPmeOneDevice = new CudaPmeOneDevice(pmeGrid, deviceID, deviceIndex);
  return cudaPmeOneDevice;
}

CudaPmeOneDevice* ComputeCUDAMgr::getCudaPmeOneDevice() {
  return cudaPmeOneDevice;
}

//
// Creates CudaComputeNonbonded object
//
CudaComputeNonbonded* ComputeCUDAMgr::createCudaComputeNonbonded(ComputeID c) {
  int deviceID = deviceCUDA->getDeviceID();
  if (cudaComputeNonbondedList.at(deviceID) != NULL)
    NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded called twice");
  if (cudaNonbondedTablesList.at(deviceID) == NULL)
    NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded, non-bonded CUDA tables not created");
  //bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn && !Node::Object()->simParameters->CUDASOAintegrate;
  bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn;
  cudaComputeNonbondedList[deviceID] = new CudaComputeNonbonded(c, deviceID, *cudaNonbondedTablesList[deviceID], doStreaming);
  return cudaComputeNonbondedList[deviceID];
}

//
// Returns CudaComputeNonbonded for this Pe
//
CudaComputeNonbonded* ComputeCUDAMgr::getCudaComputeNonbonded() {
  // Get device ID for this Pe
  int deviceID = deviceCUDA->getDeviceID();
  CudaComputeNonbonded* p = cudaComputeNonbondedList[deviceID];
  if (p == NULL)
    NAMD_bug("ComputeCUDAMgr::getCudaComputeNonbonded(), device not created yet");
  return p;
}

#ifdef BONDED_CUDA
//
// Creates ComputeBondedCUDA object
//
ComputeBondedCUDA* ComputeCUDAMgr::createComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr) {
  int deviceID = deviceCUDA->getDeviceID();
  if (computeBondedCUDAList.at(deviceID) != NULL)
    NAMD_bug("ComputeCUDAMgr::createComputeBondedCUDA called twice");
  if (cudaNonbondedTablesList.at(deviceID) == NULL)
    NAMD_bug("ComputeCUDAMgr::createComputeBondedCUDA, non-bonded CUDA tables not created");
  computeBondedCUDAList[deviceID] = new ComputeBondedCUDA(c, computeMgr, deviceID, *cudaNonbondedTablesList[deviceID]);
  return computeBondedCUDAList[deviceID];
}

//
// Returns ComputeBondedCUDA for this Pe
//
ComputeBondedCUDA* ComputeCUDAMgr::getComputeBondedCUDA() {
  // Get device ID for this Pe
  int deviceID = deviceCUDA->getDeviceID();
  ComputeBondedCUDA* p = computeBondedCUDAList[deviceID];
  if (p == NULL)
    NAMD_bug("ComputeCUDAMgr::getComputeBondedCUDA(), device not created yet");
  return p;
}
#endif // BONDED_CUDA

std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::getCudaGlobalMaster() {
  return cudaGlobalMasterObject;
}

std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::createCudaGlobalMaster() {
  iout << iINFO << "Creating CUDAGlobalMaster on PE " << CkMyPe() << '\n' << endi;
  if (cudaGlobalMasterObject) {
    return cudaGlobalMasterObject;
  }
  const int deviceID = deviceCUDA->getGlobalDevice();
  SimParameters *simParams = Node::Object()->simParameters;
  cudaGlobalMasterObject = std::make_shared<CudaGlobalMasterServer>(deviceID, simParams->cudaGlobalProfilingFreq);
  return cudaGlobalMasterObject;
}

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)

#include "ComputeCUDAMgr.def.h"