NAMD
ComputeSMDCUDA.C
Go to the documentation of this file.
1 #include "ComputeSMDCUDA.h"
2 #include "ComputeSMDCUDAKernel.h"
3 #include "SimParameters.h"
4 #include "PDB.h"
5 #include "PDBData.h"
6 #include "Node.h"
7 #include "Molecule.h"
8 #include "InfoStream.h"
9 #include "ComputeCUDAMgr.h"
10 
11 #define MIN_DEBUG_LEVEL 3
12 //#define DEBUGM
13 #include "Debug.h"
14 #ifdef DEBUGM
15 // need these to use deviceCUDA in debugging output
16 #include "DeviceCUDA.h"
17 extern __thread DeviceCUDA *deviceCUDA;
18 #endif
19 
20 #ifdef NODEGROUP_FORCE_REGISTER
21 
22 ComputeSMDCUDA::ComputeSMDCUDA(
23  std::vector<HomePatch*> &patchList,
24  double springConstant,
25  double transverseSpringConstant,
26  double velocity,
27  double3 direction,
28  int outputFrequency,
29  int firstTimeStep,
30  const char* filename,
31  bool isMasterDevice,
32  int numAtoms,
33  int numDevices,
34  int deviceIndex,
35  bool _mGpuOn ){
36  DebugM(3, "ComputeSMDCUDA\n" << endi);
37  // I could use an initializer list, but I don't like them
38  this->patchList = &patchList;
39  this->springConstant = springConstant;
40  this->transverseSpringConstant = transverseSpringConstant;
41  this->velocity = velocity;
42  this->direction = direction;
43  this->outputFrequency = outputFrequency;
44  this->firstTimeStep = firstTimeStep;
45  this->filename = filename;
46  this->isMasterDevice = isMasterDevice;
47  this->numAtoms = numAtoms;
48  this->mGpuOn =_mGpuOn;
49  this->numDevices = numDevices;
50  this->deviceIndex = deviceIndex;
52 
53  smdAtomsGlobalIndex.clear();
54  // I need to save the global index of atoms. That way I can quickly rebuild the SMD index vector
55  allocate_host<double3>(&curCOM, 1);
56  parseAtoms();
57 
58  smdAtomsSOAIndex.resize(this->numSMDAtoms);
59  allocate_device<unsigned int>(&d_tbcatomic, 1);
60  allocate_device<double3>(&d_curCOM, 1);
61  allocate_device<int>(&d_smdAtomsSOAIndex, this->numSMDAtoms);
62 
63  if(mGpuOn)
64  {// each SMD needs its own peer array
65  allocate_device<double3>(&d_peerCOM, sizeof(double3));
66 #ifdef DEBUGM
67  allocate_host<double3>(&h_peerCOM, sizeof(double3));
68 #endif
69  }
70 
71  // set the current COM value to {0, 0, 0}
72  curCOM->x = 0.0;
73  curCOM->y = 0.0;
74  curCOM->z = 0.0;
75 
76  copy_HtoD<double3>(curCOM, d_curCOM, 1);
77  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int)));
78 
79 }
80 
81 ComputeSMDCUDA::~ComputeSMDCUDA(){
82  DebugM(3, "~ComputeSMDCUDA\n" << endi);
83  deallocate_host<double3>(&curCOM);
84  deallocate_device<unsigned int>(&d_tbcatomic);
85  deallocate_device<int>(&d_smdAtomsSOAIndex);
86 }
87 
88 // This builds the global vector index - swiped from GlobalMasterSMD.C
89 void ComputeSMDCUDA::parseAtoms(){
90  DebugM(3, "parseAtoms\n" << endi);
91  PDB smdpdb(filename);
92  origCOM.x = origCOM.y = origCOM.z = 0;
93  Molecule *mol = Node::Object()->molecule; // to get masses
94  int numPDBAtoms = smdpdb.num_atoms();
95  if(numPDBAtoms < 1 ) NAMD_die("No Atoms found in SMDFile\n");
96 
97  BigReal imass = 0;
98 
99  if (numPDBAtoms != this->numAtoms){
100  fprintf(stderr, "Error, wrong numPDB (%d vs %d)\n",numPDBAtoms, this->numAtoms);
101  NAMD_die("The number of atoms in SMDFile must be equal to the total number of atoms in the structure!\n");
102  }
103 
104  // Would this work on PDB atoms? Is the data replicated for everyone?
105  for(int i = 0; i < numPDBAtoms; i++){
106  // MEMOPT obviously doesn't work with CUDASOA, so we can just use this
107  PDBAtom *atom = smdpdb.atom(i);
108  if(atom->occupancy()){ // It's a SMD atom! Add it to the list
109  smdAtomsGlobalIndex.push_back(i);
110 
111  // compute the center of mass
112  BigReal mass = mol->atommass(i);
113  origCOM.x += atom->xcoor()*mass;
114  origCOM.y += atom->ycoor()*mass;
115  origCOM.z += atom->zcoor()*mass;
116  imass += mass;
117  }
118  }
119 
120  inv_group_mass = 1.0 / imass;
121  origCOM.x *= inv_group_mass;
122  origCOM.y *= inv_group_mass;
123  origCOM.z *= inv_group_mass;
124 
125  if (imass == 0) // we didn't find any!
126  NAMD_die("SMDFile contained no SMD atoms (atoms w/ nonzero occupancy)\n");
127 
128  this->numSMDAtoms = smdAtomsGlobalIndex.size();
129 }
130 
131 void ComputeSMDCUDA::updateAtoms(
132  std::vector<AtomMap*> &atomMapsList,
133  std::vector<CudaLocalRecord> &localRecords,
134  const int* h_globalToLocalID) {
135  DebugM(3, "[" << CkMyPe() << "]" <<" updateAtoms full "<< smdAtomsGlobalIndex.size()<< " SOA "<< smdAtomsSOAIndex.size()<< "\n" << endi);
136  smdAtomsSOAIndex.clear();
137 #ifdef DEBUGM
138  smdAtomsSOAtoGlobalLocalMap.clear();
139 #endif
140  for(int i = 0 ; i < this->numSMDAtoms; i++){
141  int gid = smdAtomsGlobalIndex[i];
142  LocalID lid;
143  // Search for a valid localID in all atoms
144  for(int j = 0 ; j < atomMapsList.size(); j++){
145  lid = atomMapsList[j]->localID(gid);
146  if( lid.pid != -1) break;
147  }
148  //JM NOTE: Fields of lid need to be != -1, bc the atom needs to be somewhere
149  // otherwise we have a bug
150  if(lid.pid == -1){
151  if(!mGpuOn)
152  {
153  NAMD_bug(" LocalAtomID not found in patchMap");
154  }
155  }
156  else{
157 
158  int soaPid = h_globalToLocalID[lid.pid]; // Converts global patch ID to its local position in our SOA data structures
159  int soaIndex = localRecords[soaPid].bufferOffset + lid.index;
160  smdAtomsSOAIndex.push_back(soaIndex);
161 #ifdef DEBUGM
162  // so we can reverse map from the sorted SOA list to decomp independent
163  smdAtomsSOAtoGlobalLocalMap[soaIndex]=gid;
164 #endif
165  }
166  }
167  int numLocalSMDAtoms=smdAtomsSOAIndex.size();
168  DebugM(3, "[" << CkMyPe() << "]" << " updateAtoms " << numLocalSMDAtoms << "\n" << endi);
169  if(numLocalSMDAtoms>0)
170  { // only a device involved in the computation will have all the results
171  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr(); // last one wins
172  cudaMgr->reducerSMDDevice.store(deviceIndex);
173  }
174  // Sort vector for better coalesce memory access
175  std::sort(smdAtomsSOAIndex.begin(), smdAtomsSOAIndex.end());
176  copy_HtoD<int>(smdAtomsSOAIndex.data(), d_smdAtomsSOAIndex, numLocalSMDAtoms);
177 }
178 
179 void ComputeSMDCUDA::computeCOMMGpu(
180  const Lattice lat,
181  const float * d_mass,
182  const double* d_pos_x,
183  const double* d_pos_y,
184  const double* d_pos_z,
185  const char3* d_transform,
186  cudaStream_t stream)
187 {
188  DebugM(3, "[" << CkMyPe() << "]" << " computeCOMMGpu " << this->smdAtomsSOAIndex.size() << " 1st "<< this->smdAtomsSOAIndex[0]<< "\n" << endi);
190  computeCOMSMDMgpu(this->smdAtomsSOAIndex.size(),lat,
191  d_mass, d_pos_x, d_pos_y, d_pos_z,
192  d_transform, this->d_smdAtomsSOAIndex,
193  d_peerCOM, cudaMgr->curSMDCOM,
194  this->d_tbcatomic, numDevices, deviceIndex, stream);
195 #ifdef DEBUGM
196  cudaCheck(cudaStreamSynchronize(stream));
197  copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
198  DebugM(3, "deviceIndex "<< deviceIndex << " COM " <<h_peerCOM[0].x*inv_group_mass<<", "
199  <<h_peerCOM[0].y*inv_group_mass<<", "
200  <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);
201 
202 #endif
203 }
204 
205 void ComputeSMDCUDA::doForce(
206  const int timeStep,
207  const Lattice &lat,
208  bool doEnergy,
209  const float* d_mass,
210  const double* d_pos_x,
211  const double* d_pos_y,
212  const double* d_pos_z,
213  const char3* d_transform,
214  double* d_f_normal_x,
215  double* d_f_normal_y,
216  double* d_f_normal_z,
217  cudaTensor* d_extVirial,
218  double* h_extEnergy,
219  double3* h_extForce,
220  cudaTensor* h_extVirial,
221  cudaStream_t stream
222  )
223 {
224  DebugM(3, "[" << CkMyPe() << "]" << " doForce " << this->smdAtomsSOAIndex.size() <<"\n" << endi);
226 
227  bool doOutput = ((timeStep % this->outputFrequency) == 0);
228  if(mGpuOn)
229  { // only the reducerDevice does output and energy
230  doOutput&=cudaMgr->reducerSMDDevice == deviceIndex;
231  }
232 
233 #ifdef DEBUGM
234  cudaCheck(cudaStreamSynchronize(stream));
235  copy_DtoH_sync<double3>(d_peerCOM, h_peerCOM, 1);
236  DebugM(3, " deviceIndex "<< deviceIndex << " com " <<h_peerCOM[0].x*inv_group_mass<<", "
237  <<h_peerCOM[0].y*inv_group_mass<<", "
238  <<h_peerCOM[0].z*inv_group_mass<<"\n"<<endi);
239 
240 #endif
241 
242 
243  int bvalue;
244 
245  computeSMDForce(
246  lat,
247  this->inv_group_mass,
248  this->springConstant,
249  this->transverseSpringConstant,
250  this->velocity,
251  this->direction,
252  doEnergy || doOutput,
253  timeStep,
254  mGpuOn,
255  this->origCOM,
256  d_mass,
257  d_pos_x,
258  d_pos_y,
259  d_pos_z,
260  d_transform,
261  d_f_normal_x,
262  d_f_normal_y,
263  d_f_normal_z,
264  this->smdAtomsSOAIndex.size(),
265  this->d_smdAtomsSOAIndex,
266  this->d_curCOM,
267  this->curCOM,
268  cudaMgr->curSMDCOM,
269  d_extVirial,
270  h_extEnergy,
271  h_extForce,
272  h_extVirial,
273  this->d_tbcatomic,
274  numDevices,
275  deviceIndex,
276  stream
277  );
278  if(doOutput){
279  cudaCheck(cudaStreamSynchronize(stream));
280 #ifdef DEBUGM
281  Vector f(h_extForce->x, h_extForce->y, h_extForce->z);
282 #endif
283  DebugM(3, "[" << CkMyPe() << "]" << " co force " << f*PNPERKCALMOL <<"\n" << endi);
284  if(cudaMgr->reducerSMDDevice == deviceIndex)
285  {
286  // only one device does the output
287  outputStep(timeStep, curCOM, h_extForce);
288  }
289  }
290 }
291 
292 void ComputeSMDCUDA::outputStep(const int timeStep, double3* curCOM, double3* extForce)
293 {
294  Vector p(curCOM->x, curCOM->y, curCOM->z);
295  Vector f(extForce->x, extForce->y, extForce->z);
296  if(timeStep % (100*this->outputFrequency) == 0) {
297  iout << "SMDTITLE: TS CURRENT_POSITION FORCE\n" << endi;
298  }
299  iout << "SMD " << timeStep << ' ' << p << ' ' << f*PNPERKCALMOL << '\n' << endi;
300 
301 }
302 void ComputeSMDCUDA::outputCOM(std::string comment)
303 {
304 #ifdef DEBUGM
306  Vector p(curCOM->x, curCOM->y, curCOM->z);
307  // Vector g(cudaMgr->curSMDCOM[0].x*inv_group_mass, cudaMgr->curSMDCOM[0].y*inv_group_mass, cudaMgr->curSMDCOM[0].z*inv_group_mass);
308  // DebugM(3, comment << " origCOM : "<< this->origCOM <<" cur : "<<p<< " G : "<< g << "\n");
309 #endif
310 }
311 
312 
313 void ComputeSMDCUDA::initPeerCOM(double3** d_peerPoolCOM, cudaStream_t stream){
314  DebugM(3, "[" << CkMyPe() << "]" << " initPeerCOM\n" << endi);
315  initPeerCOMmgpu(numDevices, deviceIndex, d_peerPoolCOM, d_peerCOM, stream);
316 }
317 
318 
319 void ComputeSMDCUDA::dump(std::string comment,
320  const int step,
321  const int numAtoms,
322  const double* d_pos_x,
323  const double* d_pos_y,
324  const double* d_pos_z,
325  const float* d_mass
326  )
327 {
328 
329 #ifdef DEBUGM
330  DebugM(3, "[" << CkMyPe() << "]" << " dump " << comment<<" "<< smdAtomsSOAIndex.size() <<"\n" << endi);
332  // copy data from GPU to local buffers for output
333 
334  // output GID X Y Z MASS as these can be collected for decomp
335  // independent comparisons
336  float* mass;
337  double *pos_x;
338  double *pos_y;
339  double *pos_z;
340  allocate_host<double>(&pos_x, numAtoms);
341  allocate_host<double>(&pos_y, numAtoms);
342  allocate_host<double>(&pos_z, numAtoms);
343  allocate_host<float>(&mass, numAtoms);
344  copy_DtoH_sync<double>(d_pos_x, pos_x, numAtoms);
345  copy_DtoH_sync<double>(d_pos_y, pos_y, numAtoms);
346  copy_DtoH_sync<double>(d_pos_z, pos_z, numAtoms);
347  copy_DtoH_sync<float>(d_mass, mass, numAtoms);
348  std::ofstream ofs;
349  std::string name("smd-");
350  name+=std::to_string(CkMyPe());
351  name+=".dat";
352  ofs.open(name, std::ofstream::out | std::ofstream::app);
353  for(int i=0; i<smdAtomsSOAIndex.size(); i++)
354  {
355  int soaid=smdAtomsSOAIndex[i];
356  ofs <<step<<"- "<< smdAtomsSOAtoGlobalLocalMap[soaid]<<":"<<pos_x[soaid] <<","<<pos_y[soaid]<<","<<pos_z[soaid]<<","<<mass[soaid]<< "\n";
357  }
358  ofs.close();
359 #endif
360 }
361 
362 #endif // NODEGROUP_FORCE_REGISTER
static Node * Object()
Definition: Node.h:86
Definition: PDB.h:36
Definition: Vector.h:72
#define DebugM(x, y)
Definition: Debug.h:75
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
BigReal zcoor(void)
Definition: PDBData.C:433
#define iout
Definition: InfoStream.h:51
Molecule stores the structural information for the system.
Definition: Molecule.h:174
std::atomic< int > reducerSMDDevice
int32 index
Definition: NamdTypes.h:300
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static ComputeCUDAMgr * getComputeCUDAMgr()
BigReal ycoor(void)
Definition: PDBData.C:429
void NAMD_die(const char *err_msg)
Definition: common.C:147
PatchID pid
Definition: NamdTypes.h:299
Real atommass(int anum) const
Definition: Molecule.h:1114
BigReal xcoor(void)
Definition: PDBData.C:425
#define PNPERKCALMOL
Definition: common.h:59
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
BigReal occupancy(void)
Definition: PDBData.C:444
Molecule * molecule
Definition: Node.h:179
double BigReal
Definition: common.h:123