NAMD
ComputeCUDAMgr.C
Go to the documentation of this file.
1 #include "NamdTypes.h"
2 #include "common.h"
3 #include "Node.h"
4 #include "ComputeCUDAMgr.h"
5 #include "PatchData.h"
6 #include "DeviceCUDA.h"
7 #include "CudaUtils.h"
9 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
10 #ifdef WIN32
11 #define __thread __declspec(thread)
12 #endif
13 extern __thread DeviceCUDA *deviceCUDA;
14 
15 //
16 // Class constructor
17 //
 // NOTE(review): the constructor signature line is missing from this
 // extract; these lines are the tail of the default constructor body.
 // __sdag_init();
 // Device count is filled in later by initialize().
 numDevices = 0;
 // numNodesContributed = 0;
 // numDevicesMax = 0;
 cudaPmeOneDevice = NULL; // XXX is this needed?
#ifdef NAMD_CUDA
 // Shared CudaGlobalMasterServer; created on demand by createCudaGlobalMaster().
 cudaGlobalMasterObject = nullptr;
#endif // NAMD_CUDA
}
28 
29 //
30 // Class constructor
31 //
32 ComputeCUDAMgr::ComputeCUDAMgr(CkMigrateMessage *) {
33  // __sdag_init();
34  NAMD_bug("ComputeCUDAMgr cannot be migrated");
35  numDevices = 0;
36  // numNodesContributed = 0;
37  // numDevicesMax = 0;
38  cudaPmeOneDevice = NULL; // XXX is this needed?
39 #ifdef NAMD_CUDA
40  cudaGlobalMasterObject = nullptr;
41 #endif // NAMD_CUDA
42 }
43 
44 //
45 // Class destructor
46 //
 // NOTE(review): the destructor signature line is missing from this
 // extract; this is the destructor body.
 // Release the per-device objects created by initialize()/create* calls.
 for (int i=0;i < numDevices;i++) {
 if (cudaNonbondedTablesList[i] != NULL) delete cudaNonbondedTablesList[i];
 if (cudaComputeNonbondedList[i] != NULL) delete cudaComputeNonbondedList[i];
#ifdef BONDED_CUDA
 if (computeBondedCUDAList[i] != NULL) delete computeBondedCUDAList[i];
#endif
 }
 // delete of a NULL pointer is well-defined, so no guard is needed here.
 delete cudaPmeOneDevice;
}
57 
58 //
59 // Initialize manager
60 // This gets called on rank 0 of each node
61 //
// Initialize the manager: query device count, allocate the node-wide
// host-side pointer tables held in PatchData, and create per-device
// CudaNonbondedTables. Called on rank 0 of each node; msg is consumed.
void ComputeCUDAMgr::initialize(CkQdMsg *msg) {
 if (msg != NULL) delete msg;

 numDevices = deviceCUDA->getDeviceCount();
#ifdef NODEGROUP_FORCE_REGISTER
 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
 PatchData *pdata = cpdata.ckLocalBranch();
 // Number of compute devices, plus one extra slot when a device is
 // reserved exclusively for PME (isGpuReservedPme() contributes 1).
 int ndevs = deviceCUDA->getNumDevice() + 1*deviceCUDA->isGpuReservedPme();
 pdata->devData.resize(numDevices);

 {
 // Pointers to SOA integration data
 // Force and position pointer tables are sized ndevs (PME device included);
 // the remaining tables are sized by getNumDevice() (compute devices only).
 allocate_host<bool*>(&(pdata->h_devHasForces),ndevs);
 allocate_host<double*>(&(pdata->h_soa_fb_x), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fb_y), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fb_z), ndevs);

 allocate_host<double*>(&(pdata->h_soa_fn_x), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fn_y), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fn_z), ndevs);

 allocate_host<double*>(&(pdata->h_soa_fs_x), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fs_y), ndevs);
 allocate_host<double*>(&(pdata->h_soa_fs_z), ndevs);

 allocate_host<double*>(&(pdata->h_soa_pos_x), ndevs);
 allocate_host<double*>(&(pdata->h_soa_pos_y), ndevs);
 allocate_host<double*>(&(pdata->h_soa_pos_z), ndevs);

 allocate_host<double*>(&(pdata->h_soa_vel_x), deviceCUDA->getNumDevice());
 allocate_host<double*>(&(pdata->h_soa_vel_y), deviceCUDA->getNumDevice());
 allocate_host<double*>(&(pdata->h_soa_vel_z), deviceCUDA->getNumDevice());

 allocate_host<float*> (&(pdata->h_soa_charge), deviceCUDA->getNumDevice());

 // Per-atom bookkeeping tables (IDs, sorting, migration).
 allocate_host<int*> (&(pdata->h_soa_id), deviceCUDA->getNumDevice());
 allocate_host<int*> (&(pdata->h_soa_vdwType), deviceCUDA->getNumDevice());
 allocate_host<int*> (&(pdata->h_soa_sortOrder), deviceCUDA->getNumDevice());
 allocate_host<int*> (&(pdata->h_soa_unsortOrder), deviceCUDA->getNumDevice());
 allocate_host<double3*>(&(pdata->h_soa_patchCenter), deviceCUDA->getNumDevice());
 allocate_host<int4*> (&(pdata->h_soa_migrationDestination), deviceCUDA->getNumDevice());
 allocate_host<int*> (&(pdata->h_soa_sortSoluteIndex), deviceCUDA->getNumDevice());

 allocate_host<int*> (&(pdata->h_soa_partition), deviceCUDA->getNumDevice());

 allocate_host<FullAtom*>(&(pdata->h_atomdata_AoS), deviceCUDA->getNumDevice());
 allocate_host<CudaLocalRecord*>(&(pdata->h_peer_record), deviceCUDA->getNumDevice());

 // Per-device tuple counts for each bonded-term type.
 allocate_host<int*>(&(pdata->h_tupleCount.bond), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.angle), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.dihedral), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.improper), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.modifiedExclusion), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.exclusion), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleCount.crossterm), deviceCUDA->getNumDevice());

 // Per-device tuple offsets, parallel to h_tupleCount above.
 allocate_host<int*>(&(pdata->h_tupleOffset.bond), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.angle), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.dihedral), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.improper), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.modifiedExclusion), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.exclusion), deviceCUDA->getNumDevice());
 allocate_host<int*>(&(pdata->h_tupleOffset.crossterm), deviceCUDA->getNumDevice());

 // Per-device staging buffers for tuple data (note: impropers reuse the
 // dihedral staging type).
 allocate_host<CudaBondStage*>(&(pdata->h_tupleDataStage.bond), deviceCUDA->getNumDevice());
 allocate_host<CudaAngleStage*>(&(pdata->h_tupleDataStage.angle), deviceCUDA->getNumDevice());
 allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.dihedral), deviceCUDA->getNumDevice());
 allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.improper), deviceCUDA->getNumDevice());
 allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.modifiedExclusion), deviceCUDA->getNumDevice());
 allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.exclusion), deviceCUDA->getNumDevice());
 allocate_host<CudaCrosstermStage*>(&(pdata->h_tupleDataStage.crossterm), deviceCUDA->getNumDevice());
 }

 // Allocate the work queues
 allocate_host<unsigned int*>(&(pdata->d_queues), ndevs);
 allocate_host<unsigned int>(&(pdata->d_queueCounters), ndevs);

 // Zero the queue counters before any device enqueues work.
 cudaCheck(cudaMemset(pdata->d_queueCounters, 0, sizeof(unsigned int)*ndevs));

 // Per-device flags used to coordinate migration and reallocation.
 pdata->migrationFlagPerDevice.resize(deviceCUDA->getNumDevice());

 pdata->tupleReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
 pdata->atomReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
 // Atomic running maxima of tuple counts, reset to zero at startup.
 pdata->maxNumBonds.store(0);
 pdata->maxNumAngles.store(0);
 pdata->maxNumDihedrals.store(0);
 pdata->maxNumImpropers.store(0);
 pdata->maxNumModifiedExclusions.store(0);
 pdata->maxNumExclusions.store(0);
 pdata->maxNumCrossterms.store(0);
 pdata->devicePatchMapFlag.resize(CkNumPes(), 0);
#ifdef NAMD_NCCL_ALLREDUCE
 // Allocate NCCL-related stuff
 deviceCUDA->setupNcclUniqueId();
 // After I do this, I can go ahead and register it in patchData
 pdata->ncclId = deviceCUDA->getNcclUniqueId(); // registered in ngroup
#endif
#endif

 // Create pointers to devices
 cudaNonbondedTablesList.resize(numDevices, NULL);
 cudaComputeNonbondedList.resize(numDevices, NULL);
#ifdef BONDED_CUDA
 computeBondedCUDAList.resize(numDevices, NULL);
#endif
 // Discard any PME object from a previous initialization.
 if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
 cudaPmeOneDevice = NULL;

 // Create CUDA non-bonded tables for all devices that are used for computation
 for (int i=0;i < deviceCUDA->getNumDevice();i++) {
 int deviceID = deviceCUDA->getDeviceIDbyRank(i);
 cudaNonbondedTablesList[deviceID] = new CudaNonbondedTables(deviceID);
 }



}
179 
180 //
181 // Update nonbonded tables
182 // Should be called only on rank 0 of each node
183 //
 // NOTE(review): the function signature line is missing from this extract;
 // this is the body of ComputeCUDAMgr::update().
 // Refresh the non-bonded tables on every compute device; rank-0-only.
 if ( CkMyRank() ) NAMD_bug("ComputeCUDAMgr::update() should be called only by rank 0");
 for (int i=0; i < deviceCUDA->getNumDevice(); i++) {
 int deviceID = deviceCUDA->getDeviceIDbyRank(i);
 // calls update function from CudaNonbondedTables
 cudaNonbondedTablesList[deviceID]->updateTables();
 }
}
192 
 // NOTE(review): the function signature line is missing from this extract;
 // this is the body of the static accessor ComputeCUDAMgr::getComputeCUDAMgr().
 // Get pointer to ComputeCUDAMgr on this node
 CProxy_ComputeCUDAMgr computeCUDAMgrProxy = CkpvAccess(BOCclass_group).computeCUDAMgr;
 ComputeCUDAMgr* computeCUDAMgr = computeCUDAMgrProxy.ckLocalBranch();
 if (computeCUDAMgr == NULL)
 NAMD_bug("getComputeCUDAMgr, unable to locate local branch of BOC entry ComputeCUDAMgr");
 return computeCUDAMgr;
}
201 
 // NOTE(review): the function signature and the local definition of
 // 'simParams' are missing from this extract; this is the body of
 // ComputeCUDAMgr::createCudaPmeOneDevice().
 // initialize pmeGrid from simParams
 PmeGrid pmeGrid;
 pmeGrid.K1 = simParams->PMEGridSizeX;
 pmeGrid.K2 = simParams->PMEGridSizeY;
 pmeGrid.K3 = simParams->PMEGridSizeZ;
 pmeGrid.order = simParams->PMEInterpOrder;
 pmeGrid.dim2 = pmeGrid.K2;
 // Last dimension padded to 2*(K3/2 + 1) — presumably the in-place
 // real-to-complex FFT layout (K3/2+1 complex values); confirm in PmeBase.
 pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
 // override settings for PME pencils
 // Single-device PME: one block spanning the whole grid in each dimension.
 pmeGrid.xBlocks = 1;
 pmeGrid.yBlocks = 1;
 pmeGrid.zBlocks = 1;
 pmeGrid.block1 = pmeGrid.K1;
 pmeGrid.block2 = pmeGrid.K2;
 pmeGrid.block3 = pmeGrid.K3;
 // use shared deviceID class
 int deviceID = 0;
 int deviceIndex = 0;
#ifdef NODEGROUP_FORCE_REGISTER
 deviceID = deviceCUDA->getPmeDevice();
 deviceIndex = deviceCUDA->getPmeDeviceIndex();
#endif
 // Replace any previously created PME object.
 if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
 cudaPmeOneDevice = new CudaPmeOneDevice(pmeGrid, deviceID, deviceIndex);
 return cudaPmeOneDevice;
}
230 
 // Accessor body (signature line missing from this extract): returns the
 // PME object, which may still be NULL if createCudaPmeOneDevice() has
 // not been called.
 return cudaPmeOneDevice;
}
234 
235 //
236 // Creates CudaComputeNonbonded object
237 //
 // NOTE(review): the function signature and the line defining 'doStreaming'
 // are missing from this extract; this is the body of
 // ComputeCUDAMgr::createCudaComputeNonbonded().
 // One object per device: creating twice or creating before the non-bonded
 // tables exist is a fatal internal error.
 int deviceID = deviceCUDA->getDeviceID();
 if (cudaComputeNonbondedList.at(deviceID) != NULL)
 NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded called twice");
 if (cudaNonbondedTablesList.at(deviceID) == NULL)
 NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded, non-bonded CUDA tables not created");
 //bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn && !Node::Object()->simParameters->CUDASOAintegrate;
 cudaComputeNonbondedList[deviceID] = new CudaComputeNonbonded(c, deviceID, *cudaNonbondedTablesList[deviceID], doStreaming);
 return cudaComputeNonbondedList[deviceID];
}
249 
250 //
251 // Returns CudaComputeNonbonded for this Pe
252 //
 // NOTE(review): the function signature line is missing from this extract;
 // this is the body of ComputeCUDAMgr::getCudaComputeNonbonded().
 // Get device ID for this Pe
 int deviceID = deviceCUDA->getDeviceID();
 CudaComputeNonbonded* p = cudaComputeNonbondedList[deviceID];
 if (p == NULL)
 NAMD_bug("ComputeCUDAMgr::getCudaComputeNonbonded(), device not created yet");
 return p;
}
261 
262 #ifdef BONDED_CUDA
263 //
264 // Creates ComputeBondedCUDA object
265 //
266 ComputeBondedCUDA* ComputeCUDAMgr::createComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr) {
267  int deviceID = deviceCUDA->getDeviceID();
268  if (computeBondedCUDAList.at(deviceID) != NULL)
269  NAMD_bug("ComputeCUDAMgr::createComputeBondedCUDA called twice");
270  if (cudaNonbondedTablesList.at(deviceID) == NULL)
271  NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded, non-bonded CUDA tables not created");
272  computeBondedCUDAList[deviceID] = new ComputeBondedCUDA(c, computeMgr, deviceID, *cudaNonbondedTablesList[deviceID]);
273  return computeBondedCUDAList[deviceID];
274 }
275 
276 //
277 // Returns ComputeBondedCUDA for this Pe
278 //
279 ComputeBondedCUDA* ComputeCUDAMgr::getComputeBondedCUDA() {
280  // Get device ID for this Pe
281  int deviceID = deviceCUDA->getDeviceID();
282  ComputeBondedCUDA* p = computeBondedCUDAList[deviceID];
283  if (p == NULL)
284  NAMD_bug("ComputeCUDAMgr::getComputeBondedCUDA(), device not created yet");
285  return p;
286 }
287 #endif // BONDED_CUDA
288 
289 #ifdef NAMD_CUDA
// Returns the node's CudaGlobalMasterServer. The shared_ptr is empty until
// createCudaGlobalMaster() has been called (constructor sets it to nullptr).
std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::getCudaGlobalMaster() {
 return cudaGlobalMasterObject;
}
293 
// Lazily creates the node's CudaGlobalMasterServer on the global device;
// returns the existing instance unchanged if one was already created.
std::shared_ptr<CudaGlobalMasterServer> ComputeCUDAMgr::createCudaGlobalMaster() {
 iout << iINFO << "Creating CUDAGlobalMaster on PE " << CkMyPe() << '\n' << endi;
 // Idempotent: reuse the existing instance if present.
 if (cudaGlobalMasterObject) {
 return cudaGlobalMasterObject;
 }
 const int deviceID = deviceCUDA->getGlobalDevice();
 // NOTE(review): 'simParams' is not defined in the visible lines — a source
 // line is missing from this extract here; presumably it is fetched from
 // Node::Object()->simParameters. Confirm against the full file.
 cudaGlobalMasterObject = std::make_shared<CudaGlobalMasterServer>(deviceID, simParams->cudaGlobalProfilingFreq);
 return cudaGlobalMasterObject;
}
304 #endif // NAMD_CUDA
305 
306 #endif // defined(NAMD_CUDA) || defined(NAMD_HIP)
307 
308 #include "ComputeCUDAMgr.def.h"
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
int zBlocks
Definition: PmeBase.h:25
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
int getDeviceCount()
Definition: DeviceCUDA.h:124
std::shared_ptr< CudaGlobalMasterServer > getCudaGlobalMaster()
int dim3
Definition: PmeBase.h:22
int32 ComputeID
Definition: NamdTypes.h:278
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
void initialize(CkQdMsg *msg)
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int getPmeDevice()
Definition: DeviceCUDA.h:165
int block1
Definition: PmeBase.h:24
int getNumDevice()
Definition: DeviceCUDA.h:125
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int getGlobalDevice() const
Definition: DeviceCUDA.h:171
int yBlocks
Definition: PmeBase.h:25
int getPmeDeviceIndex()
Definition: DeviceCUDA.h:167
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static ComputeCUDAMgr * getComputeCUDAMgr()
int block3
Definition: PmeBase.h:24
std::shared_ptr< CudaGlobalMasterServer > createCudaGlobalMaster()
bool isGpuReservedPme()
Definition: DeviceCUDA.h:164
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:129
int K3
Definition: PmeBase.h:21
int getDeviceIDbyRank(int rank)
Definition: DeviceCUDA.h:145
int getNoStreaming()
Definition: DeviceCUDA.h:130
CudaComputeNonbonded * getCudaComputeNonbonded()
CudaComputeNonbonded * createCudaComputeNonbonded(ComputeID c)
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int xBlocks
Definition: PmeBase.h:25
CudaPmeOneDevice * createCudaPmeOneDevice()
CudaPmeOneDevice * getCudaPmeOneDevice()