NAMD
CudaNonbondedTables.C
Go to the documentation of this file.
1 #include "Molecule.h"
2 #include "ComputeNonbondedUtil.h"
3 #include "LJTable.h"
4 #include "CudaUtils.h"
5 #include "CudaNonbondedTables.h"
6 #include "Node.h"
7 #include "Parameters.h"
8 #include "SimParameters.h"
9 #include "InfoStream.h"
10 
11 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
12 
CudaNonbondedTables::CudaNonbondedTables(const int deviceID) : deviceID(deviceID) {

  // Start from a clean slate: all device table pointers null, counters zero.
  vdwCoefTable = NULL;
  vdwCoefTableWidth = 0;
  forceTable = NULL;
  energyTable = NULL;
  exclusionTable = NULL;
  r2_table = NULL;
  modifiedExclusionForceTable = NULL;
  modifiedExclusionEnergyTable = NULL;
  drudeNbTholeTijTable = NULL;
  drudeNbthole = false;
  numPotentialNbtholeTerms = 0;

#if !defined(USE_TABLE_ARRAYS)
  // Texture object handles exist only on the texture-fetch code path.
  forceTableTex = 0;
  energyTableTex = 0;
  drudeNbTholeTijTableTex = 0;
  modifiedExclusionForceTableTex = 0;
  modifiedExclusionEnergyTableTex = 0;
#endif

  // Build all lookup tables on the device this object is bound to.
  cudaCheck(cudaSetDevice(deviceID));
  buildForceAndEnergyTables(FORCE_ENERGY_TABLE_SIZE);
  buildVdwCoefTable();
}
42 
//
// Destructor: releases every device-side table allocated by the build
// routines, plus the CUDA arrays and texture objects on the texture path.
// NOTE(review): the destructor signature line was missing from this copy of
// the file and has been restored here.
//
CudaNonbondedTables::~CudaNonbondedTables() {
  cudaCheck(cudaSetDevice(deviceID));
  if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable);
  // exclusionVdwCoefTable is allocated alongside vdwCoefTable in
  // buildVdwCoefTable(); previously it was never freed (device memory leak).
  if (exclusionVdwCoefTable != NULL) deallocate_device<float2>(&exclusionVdwCoefTable);
  if (exclusionTable != NULL) deallocate_device<float4>(&exclusionTable);
  if (r2_table != NULL) deallocate_device<float>(&r2_table);
  if (drudeNbTholeTijTable != NULL) deallocate_device(&drudeNbTholeTijTable);

#if !defined(USE_TABLE_ARRAYS)
  cudaCheck(cudaFreeArray(forceArray));
  cudaCheck(cudaFreeArray(energyArray));
  cudaCheck(cudaFreeArray(modifiedExclusionForceArray));
  cudaCheck(cudaFreeArray(modifiedExclusionEnergyArray));

  cudaCheck(cudaDestroyTextureObject(forceTableTex));
  cudaCheck(cudaDestroyTextureObject(energyTableTex));
  cudaCheck(cudaDestroyTextureObject(modifiedExclusionForceTableTex));
  cudaCheck(cudaDestroyTextureObject(modifiedExclusionEnergyTableTex));
  // These two are created unconditionally in buildVdwCoefTable(); destroying
  // them here resolves the old TODO about the missing destroy calls.
  cudaCheck(cudaDestroyTextureObject(vdwCoefTableTex));
  cudaCheck(cudaDestroyTextureObject(exclusionVdwCoefTableTex));
  if (drudeNbthole) cudaCheck(cudaDestroyTextureObject(drudeNbTholeTijTableTex));
#endif

  if (forceTable != NULL) deallocate_device<float4>(&forceTable);
  if (energyTable != NULL) deallocate_device<float4>(&energyTable);
  if (modifiedExclusionForceTable != NULL) deallocate_device<float4>(&modifiedExclusionForceTable);
  if (modifiedExclusionEnergyTable != NULL) deallocate_device<float4>(&modifiedExclusionEnergyTable);
}
69 
70 //This method effectively replaces bindTextureObject, which was a workaround to limitations on early GPU architectures.
//
// Copies a host table into device memory, allocating the device buffer on
// first use. This replaces the old bindTextureObject workaround that existed
// only for limitations of early GPU architectures.
//
template <typename T>
void copyTable(int size, T* h_table, T*& d_table, bool update=false) {
  const bool firstUse = !update;
  if (firstUse) {
    allocate_device<T>(&d_table, size);
  }
  copy_HtoD_sync<T>(h_table, d_table, size);
}
80 
#ifndef USE_TABLE_ARRAYS
//
// Copies a host table to device memory and wraps the device buffer in a
// linear texture object with float channels. On update, the stale texture
// object is destroyed and re-created over the already-allocated buffer.
//
template <typename T>
void bindTextureObject(int size, T* h_table, T*& d_table, cudaTextureObject_t& tex, bool update=false) {
  if (update) {
    cudaCheck(cudaDestroyTextureObject(tex));
  } else {
    allocate_device<T>(&d_table, size);
  }
  copy_HtoD_sync<T>(h_table, d_table, size);

  // Describe the device buffer as a linear resource; enable one 32-bit float
  // channel per float contained in T (up to float4).
  const int channelBits = sizeof(float)*8;
  cudaResourceDesc rd;
  memset(&rd, 0, sizeof(rd));
  rd.resType = cudaResourceTypeLinear;
  rd.res.linear.devPtr = d_table;
  rd.res.linear.desc.f = cudaChannelFormatKindFloat;
  rd.res.linear.desc.x = channelBits;
  if (sizeof(T) >= sizeof(float)*2) rd.res.linear.desc.y = channelBits;
  if (sizeof(T) >= sizeof(float)*3) rd.res.linear.desc.z = channelBits;
  if (sizeof(T) >= sizeof(float)*4) rd.res.linear.desc.w = channelBits;
  rd.res.linear.sizeInBytes = size*sizeof(T);

  // Element-type reads: texels are returned unfiltered, as stored.
  cudaTextureDesc td;
  memset(&td, 0, sizeof(td));
  td.readMode = cudaReadModeElementType;

  cudaCheck(cudaCreateTextureObject(&tex, &rd, &td, NULL));
}
#endif
114 
115 
116 //
117 // Builds the VdW Lennard-Jones coefficient table. Swiped from ComputeNonbondedCUDA.C
118 // NOTE: Can only be called once
119 //
//
// Builds the VdW Lennard-Jones coefficient table. Swiped from ComputeNonbondedCUDA.C
// NOTE: Can only be called once
// NOTE(review): the line declaring simParams was missing from this copy of
// the file and has been restored here (the function reads simParams->drudeOn
// and simParams->drudeNbtholeCut below).
//
void CudaNonbondedTables::buildVdwCoefTable(bool update/*=false*/) {
  SimParameters *simParams = Node::Object()->simParameters;
  drudeNbthole = simParams->drudeOn && (simParams->drudeNbtholeCut > 0.0);

  const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
  const int dim = ljTable->get_table_dim();

  // round dim up to odd multiple of WARPSIZE/2
  int tsize = (((dim+(WARPSIZE/2)+(WARPSIZE-1))/WARPSIZE)*WARPSIZE)-(WARPSIZE/2);
  if ( tsize < dim ) NAMD_bug("CudaNonbondedTables::buildVdwCoefTable bad tsize");

  float2 *h_vdwCoefTable = new float2[tsize*tsize];
  float2 *h_exclusionVdwCoefTable = new float2[tsize*tsize];

  // Pack the (A,B) LJ coefficients row by row; the exclusion table stores the
  // difference to the next table entry, scaled like the main table.
  float2 *row = h_vdwCoefTable;
  float2 *exclusionRow = h_exclusionVdwCoefTable;
  for ( int i=0; i<dim; ++i, row += tsize, exclusionRow += tsize ) {
    for ( int j=0; j<dim; ++j ) {
      const LJTable::TableEntry *e = ljTable->table_val(i,j);
      row[j].x = e->A * ComputeNonbondedUtil::scaling;
      row[j].y = e->B * ComputeNonbondedUtil::scaling;
      exclusionRow[j].x = ((e+1)->A - e->A)* ComputeNonbondedUtil::scaling;
      exclusionRow[j].y = ((e+1)->B - e->B)* ComputeNonbondedUtil::scaling;
    }
  }

  if (drudeNbthole) {
    // (typo fix: "paramter" -> "parameter" in the log message)
    iout << iINFO << "Building Drude NbThole correction parameter table.\n" << endi;
    numPotentialNbtholeTerms = 0;
    std::vector<float> h_drudeNbTholeTijTable(tsize * tsize);
    float *tholeTijRow = h_drudeNbTholeTijTable.data();
    const Parameters *parameters = Node::Object()->parameters;
    const NbtholePairValue * const nbthole_array = parameters->nbthole_array;
    // Copy and store the NBTHOLE tij parameters into GPU memory
    for (int i = 0; i < dim; ++i, tholeTijRow += tsize) {
      for (int j = 0; j < dim; ++j) {
        bool found = false;
        // According to Benoit, the physical meaning of tij is the distance at
        // which the overlap of the electronic clouds start to happen, so I use
        // the negative value to indicate that the tij is not available for a
        // specific VDW type (i,j).
        float tij = -1.0;
        for (int k = 0; k < parameters->NumNbtholePairParams; ++k) {
          // Pair parameters are symmetric in (ind1, ind2).
          if (((nbthole_array[k].ind1 == i) && (nbthole_array[k].ind2 == j)) ||
              ((nbthole_array[k].ind2 == i) && (nbthole_array[k].ind1 == j))) {
            found = true;
            tij = nbthole_array[k].tholeij;
            if (tij < 0) {
              const std::string error =
                "Negative NbThole thole_ij parameter is not allowed in GPU, "
                "please check thole_ij between atom types " +
                parameters->atom_type_name_str(nbthole_array[k].ind1) +
                " and " +
                parameters->atom_type_name_str(nbthole_array[k].ind2) +
                ".";
              NAMD_die(error.c_str());
            }
            break;
          }
        }
        if (found) {
          ++numPotentialNbtholeTerms;
        }
        tholeTijRow[j] = tij;
      }
    }
    // Each (i,j)/(j,i) pair is counted twice above, hence the division by 2.
    iout << iINFO << "There are potentially " << numPotentialNbtholeTerms / 2 << " NbThole correction interactions.\n" << endi;
#if !defined(USE_TABLE_ARRAYS)
    bindTextureObject<float>(
      tsize*tsize, h_drudeNbTholeTijTable.data(), drudeNbTholeTijTable,
      drudeNbTholeTijTableTex, update);
#else
    copyTable<float>(tsize*tsize, h_drudeNbTholeTijTable.data(), drudeNbTholeTijTable, update);
#endif
  }

  vdwCoefTableWidth = tsize;
#if !defined(USE_TABLE_ARRAYS)
  bindTextureObject<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, vdwCoefTableTex, update);
  bindTextureObject<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, exclusionVdwCoefTableTex, update);
#else
  copyTable<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, update);
  copyTable<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, update);
#endif

  delete [] h_vdwCoefTable;
  delete [] h_exclusionVdwCoefTable;

  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
  }
}
212 
// Update tables from newer CPU values
// NOTE(review): the function signature line was missing from this copy of the
// file and has been restored here; the body (buildVdwCoefTable(true)) makes
// this the update entry point for the VdW coefficient tables.
void CudaNonbondedTables::updateVdwCoefTable() {
  buildVdwCoefTable(true);
}
217 
218 //
219 // Builds force and energy tables. Swiped from ComputeNonbondedCUDA.C
220 //
//
// Builds force and energy tables. Swiped from ComputeNonbondedCUDA.C
//
// Evaluates the CPU-side cubic spline table 'src_table' at the GPU table's
// sample distances and writes force (2*gradient) and energy values, scaled by
// 'prefac', into strided destinations. 'flip' reads the four spline
// coefficients in reversed order (used by callers passing slow_table).
// Element 0 is patched up after the loop: zero force, energy clamped to the
// value at index 1.
//
template <typename T>
void buildForceAndEnergyTable(const int tableSize, const double* r2list, const BigReal* src_table, const bool flip,
  const BigReal prefac, const int dst_stride, T* dst_force, T* dst_energy) {

  // (removed an unused local copy of ComputeNonbondedUtil::r2_delta)
  const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);

  // Endianness probe: select the 32-bit half of each double in r2list that
  // holds the high-order (exponent) bits, so the exponent can be read as an
  // integer to index the CPU spline table below.
  union { double f; int32 i[2]; } byte_order_test;
  byte_order_test.f = 1.0; // should occupy high-order bits only
  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );

  for ( int i=1; i<tableSize; ++i ) {
    // GPU table is sampled in 1/r: larger i -> smaller distance.
    double r = ((double) tableSize) / ( (double) i + 0.5 );
    int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0

    // Beyond the cutoff both force and energy are zero.
    if ( r > ComputeNonbondedUtil::cutoff ) {
      dst_force[i*dst_stride] = 0.;
      dst_energy[i*dst_stride] = 0.;
      continue;
    }

    // Offset from the CPU table grid point where the spline is evaluated.
    BigReal diffa = r2list[i] - ComputeNonbondedUtil::r2_table[table_i];

    BigReal table_a, table_b, table_c, table_d;
    if (flip) {
      table_a = src_table[4*table_i+3];
      table_b = src_table[4*table_i+2];
      table_c = src_table[4*table_i+1];
      table_d = src_table[4*table_i];
    } else {
      table_a = src_table[4*table_i];
      table_b = src_table[4*table_i+1];
      table_c = src_table[4*table_i+2];
      table_d = src_table[4*table_i+3];
    }

    // Spline derivative (force) and value (energy) at diffa.
    BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
    dst_force[i*dst_stride] = prefac * 2. * grad;
    BigReal ener = table_a + diffa * ( ( table_d * diffa + table_c ) * diffa + table_b);
    dst_energy[i*dst_stride] = prefac * ener;
  }

  dst_force[0] = 0.;
  dst_energy[0] = dst_energy[1*dst_stride];
}
267 
#if !defined(USE_TABLE_ARRAYS)
//
// Copies a host table into device memory AND into a texture:
//  - NAMD_CUDA: a cudaArray backing a normalized, linearly-filtered 1D
//    texture (hardware interpolation between table entries).
//  - HIP path: a linear texture read with tex1Dfetch; the buffer is one
//    texel wider so index clamping is cheap.
// The raw device copy is also kept in *d_table for non-texture access.
//
template <typename T>
void bindTextureObject(int tableSize, int tableWidth, T* h_table,
  cudaArray_t& array, cudaTextureObject_t& tableTex, T** d_table) {

#if defined(NAMD_CUDA)
  allocate_device_T((void **)d_table, tableSize, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);

  // One float channel per table column (up to 4).
  cudaChannelFormatDesc desc;
  memset(&desc, 0, sizeof(desc));
  desc.x = sizeof(T)*8;
  if (tableWidth >= 2) desc.y = sizeof(T)*8;
  if (tableWidth >= 3) desc.z = sizeof(T)*8;
  if (tableWidth >= 4) desc.w = sizeof(T)*8;
  desc.f = cudaChannelFormatKindFloat;
  cudaCheck(cudaMallocArray(&array, &desc, tableSize, 0));
  // cudaMemcpyToArray is deprecated (CUDA 10.1+); cudaMemcpy2DToArray with
  // height 1 is the documented replacement for this 1D array copy.
  {
    const size_t rowBytes = tableSize*sizeof(T)*tableWidth;
    cudaCheck(cudaMemcpy2DToArray(array, 0, 0, h_table, rowBytes, rowBytes, 1, cudaMemcpyHostToDevice));
  }

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = array;

  // Normalized coordinates + linear filtering: kernels sample the table as a
  // continuous function on [0,1].
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.normalizedCoords = 1;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#else
  // tex1Dfetch is used in kernels, so create a linear texture
  // The texture is 1 texel wider to simplify (and optimize) index clamping for tex1Dfetch
  allocate_device_T((void **)d_table, tableSize + 1, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);
  // Duplicate the last row into the extra texel.
  copy_HtoD_T(h_table + tableWidth * (tableSize - 1), *d_table + tableWidth * tableSize, 1, sizeof(T)*tableWidth);

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = *d_table;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(T)*8;
  if (tableWidth >= 2) resDesc.res.linear.desc.y = sizeof(T)*8;
  if (tableWidth >= 3) resDesc.res.linear.desc.z = sizeof(T)*8;
  if (tableWidth >= 4) resDesc.res.linear.desc.w = sizeof(T)*8;
  resDesc.res.linear.sizeInBytes = (tableSize + 1)*sizeof(T)*tableWidth;

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#endif
}
#endif
325 
//
// Builds the GPU force and energy tables (non-bonded, modified-exclusion,
// and exclusion) from the CPU-side spline tables. Swiped from
// ComputeNonbondedCUDA.C.
//
void CudaNonbondedTables::buildForceAndEnergyTables(int tableSize) {
  forceAndEnergyTableSize = tableSize;

  // Build r2list: squared sample distances matching the GPU 1/r sampling.
  // (removed unused local copies of r2_delta_exp / r2_delta_expc)
  const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;

  double* r2list = new double[tableSize]; // double to match cpu code
  for ( int i=1; i<tableSize; ++i ) {
    double r = ((double) tableSize) / ( (double) i + 0.5 );
    r2list[i] = r*r + r2_delta;
  }

  // Non-bonded: interleave fast (electrostatic), -vdwB, vdwA, and
  // slow-correction columns into one float4-per-entry table.
  {
    float* t = new float[tableSize * 4];
    float* et = new float[tableSize * 4];

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, 1.0,
      4, &t[0], &et[0]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
      4, &t[1], &et[1]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
      4, &t[2], &et[2]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::scor_table, false, 1.0,
      4, &t[3], &et[3]);

#if !defined(USE_TABLE_ARRAYS)
    bindTextureObject<float>(tableSize, 4, t, forceArray, forceTableTex, (float **)&forceTable);
    bindTextureObject<float>(tableSize, 4, et, energyArray, energyTableTex, (float **)&energyTable);
#else
    copyTable<float4>(tableSize, (float4*)t, forceTable);
    copyTable<float4>(tableSize, (float4*)et, energyTable);
#endif
    delete [] t;
    delete [] et;
  }

  // Modified exclusions: same layout, but fast/vdwB columns are negated and
  // the slow table is read with flipped coefficient order.
  {
    float* t = new float[tableSize * 4];
    float* et = new float[tableSize * 4];

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, -1.0,
      4, &t[0], &et[0]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
      4, &t[1], &et[1]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
      4, &t[2], &et[2]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::slow_table, true, -1.0,
      4, &t[3], &et[3]);
#ifndef USE_TABLE_ARRAYS
    bindTextureObject<float>(tableSize, 4, t, modifiedExclusionForceArray, modifiedExclusionForceTableTex, (float **)&modifiedExclusionForceTable);
    bindTextureObject<float>(tableSize, 4, et, modifiedExclusionEnergyArray, modifiedExclusionEnergyTableTex, (float **)&modifiedExclusionEnergyTable);
#else
    copyTable<float4>(tableSize, (float4*)t, modifiedExclusionForceTable);
    copyTable<float4>(tableSize, (float4*)et, modifiedExclusionEnergyTable);
#endif
    delete [] t;
    delete [] et;
  }

  // Exclusions
  {
    // Combine the full electrostatic table with its long-range correction.
    // NOTE(review): the loop body below was missing from this copy of the
    // file and has been restored from the tables it references.
    BigReal* corr_full_table = new BigReal[ComputeNonbondedUtil::table_length*4];
    for (int i=0;i < ComputeNonbondedUtil::table_length*4;i++) {
      corr_full_table[i] = ComputeNonbondedUtil::full_table[i] + ComputeNonbondedUtil::corr_table[i];
    }

    // Store the spline coefficients pre-multiplied by the factors needed for
    // derivative evaluation (6d, 4c, 2b, a).
    float4* h_exclusionTable = new float4[ComputeNonbondedUtil::table_length];
    float* h_r2_table = new float[ComputeNonbondedUtil::table_length];
    for (int i=0;i < ComputeNonbondedUtil::table_length;i++) {
      h_exclusionTable[i].x = 6.0*corr_full_table[4*i + 3];
      h_exclusionTable[i].y = 4.0*corr_full_table[4*i + 2];
      h_exclusionTable[i].z = 2.0*corr_full_table[4*i + 1];
      h_exclusionTable[i].w = 1.0*corr_full_table[4*i + 0];
      h_r2_table[i] = ComputeNonbondedUtil::r2_table[i];
    }

    copyTable<float>(ComputeNonbondedUtil::table_length, h_r2_table, r2_table);
    copyTable<float4>(ComputeNonbondedUtil::table_length, h_exclusionTable, exclusionTable);

    delete [] h_exclusionTable;
    delete [] h_r2_table;
    delete [] corr_full_table;  // was leaked previously
  }

  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Updated CUDA force table with %d elements.\n", tableSize);
  }

  delete [] r2list;
}
427 
428 #endif // NAMD_CUDA
429 
static Node * Object()
Definition: Node.h:86
static BigReal * fast_table
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
static BigReal * scor_table
void allocate_device_T(void **pp, const size_t len, const size_t sizeofT)
Definition: CudaUtils.C:97
SimParameters * simParameters
Definition: Node.h:181
void deallocate_device(T **pp)
Definition: CudaUtils.h:333
int32_t int32
Definition: common.h:38
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
static BigReal * vdwa_table
std::string atom_type_name_str(Index a) const
Definition: Parameters.C:9381
void bindTextureObject(int size, T *h_table, T *&d_table, cudaTextureObject_t &tex, bool update=false)
#define iout
Definition: InfoStream.h:51
#define WARPSIZE
Definition: CudaUtils.h:17
static BigReal * full_table
int NumNbtholePairParams
Definition: Parameters.h:339
void buildForceAndEnergyTable(const int tableSize, const double *r2list, const BigReal *src_table, const bool flip, const BigReal prefac, const int dst_stride, T *dst_force, T *dst_energy)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int get_table_dim() const
Definition: LJTable.h:44
#define FORCE_ENERGY_TABLE_SIZE
Definition: CudaUtils.h:40
const TableEntry * table_val(unsigned int i, unsigned int j) const
Definition: LJTable.h:35
void NAMD_die(const char *err_msg)
Definition: common.C:147
NbtholePairValue * nbthole_array
Definition: Parameters.h:317
Parameters * parameters
Definition: Node.h:180
static BigReal * slow_table
#define simParams
Definition: Output.C:131
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, const size_t sizeofT)
Definition: CudaUtils.C:224
static BigReal * vdwb_table
static const LJTable * ljTable
static BigReal * corr_table
void copyTable(int size, T *h_table, T *&d_table, bool update=false)
CudaNonbondedTables(const int deviceID)
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
double BigReal
Definition: common.h:123