NAMD
CudaNonbondedTables.C
Go to the documentation of this file.
1 #include "charm++.h"
2 #include "NamdTypes.h"
3 #include "ComputeNonbondedUtil.h"
4 #include "LJTable.h"
5 #include "CudaUtils.h"
6 #include "CudaNonbondedTables.h"
7 
8 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
9 
//
// Constructor: selects the given CUDA device, zero-initializes all device
// pointers / texture handles, then builds the force/energy interpolation
// tables and the VdW coefficient table used by the nonbonded kernels.
//
CudaNonbondedTables::CudaNonbondedTables(const int deviceID) : deviceID(deviceID) {

  // Device pointers start NULL so the destructor can free them conditionally.
  vdwCoefTable = NULL;
  vdwCoefTableWidth = 0;

  forceTableTex = 0;
  forceTable = NULL;
  energyTableTex = 0;
  energyTable = NULL;

  exclusionTable = NULL;
  r2_table = NULL;

  modifiedExclusionForceTableTex = 0;
  modifiedExclusionEnergyTableTex = 0;
  modifiedExclusionForceTable = NULL;
  modifiedExclusionEnergyTable = NULL;

  // All allocations and copies below must target this device.
  cudaCheck(cudaSetDevice(deviceID));
  buildForceAndEnergyTables(FORCE_ENERGY_TABLE_SIZE);
  buildVdwCoefTable();
}
32 
34  cudaCheck(cudaSetDevice(deviceID));
35  if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable);
36  if (exclusionTable != NULL) deallocate_device<float4>(&exclusionTable);
37  if (r2_table != NULL) deallocate_device<float>(&r2_table);
38 
39  cudaCheck(cudaFreeArray(forceArray));
40  cudaCheck(cudaFreeArray(energyArray));
41  cudaCheck(cudaFreeArray(modifiedExclusionForceArray));
42  cudaCheck(cudaFreeArray(modifiedExclusionEnergyArray));
43 
44  cudaCheck(cudaDestroyTextureObject(forceTableTex));
45  cudaCheck(cudaDestroyTextureObject(energyTableTex));
46  cudaCheck(cudaDestroyTextureObject(modifiedExclusionForceTableTex));
47  cudaCheck(cudaDestroyTextureObject(modifiedExclusionEnergyTableTex));
48 
49  if (forceTable != NULL) deallocate_device<float4>(&forceTable);
50  if (energyTable != NULL) deallocate_device<float4>(&energyTable);
51  if (modifiedExclusionForceTable != NULL) deallocate_device<float4>(&modifiedExclusionForceTable);
52  if (modifiedExclusionEnergyTable != NULL) deallocate_device<float4>(&modifiedExclusionEnergyTable);
53 }
54 
55 //This method effectively replaces bindTextureObject, which was a workaround to limitations on early GPU architectures.
56 template <typename T>
57 void copyTable(int size, T* h_table, T*& d_table, bool update=false) {
58  // Copy to device
59  if ( ! update) {
60  allocate_device<T>(&d_table, size);
61  }
62  copy_HtoD_sync<T>(h_table, d_table, size);
63 
64 }
65 
66 template <typename T>
67 void bindTextureObject(int size, T* h_table, T*& d_table, cudaTextureObject_t& tex, bool update=false) {
68  // Copy to device
69  if ( ! update) {
70  allocate_device<T>(&d_table, size);
71  }
72  else {
73  cudaCheck(cudaDestroyTextureObject(tex));
74  }
75  copy_HtoD_sync<T>(h_table, d_table, size);
76 
77  // Create texture object
78  cudaResourceDesc resDesc;
79  memset(&resDesc, 0, sizeof(resDesc));
80  resDesc.resType = cudaResourceTypeLinear;
81  resDesc.res.linear.devPtr = d_table;
82  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
83  resDesc.res.linear.desc.x = sizeof(float)*8; // bits per channel
84  if (sizeof(T) >= sizeof(float)*2) resDesc.res.linear.desc.y = sizeof(float)*8; // bits per channel
85  if (sizeof(T) >= sizeof(float)*3) resDesc.res.linear.desc.z = sizeof(float)*8; // bits per channel
86  if (sizeof(T) >= sizeof(float)*4) resDesc.res.linear.desc.w = sizeof(float)*8; // bits per channel
87  resDesc.res.linear.sizeInBytes = size*sizeof(T);
88 
89  cudaTextureDesc texDesc;
90  memset(&texDesc, 0, sizeof(texDesc));
91  texDesc.readMode = cudaReadModeElementType;
92  //texDesc.normalizedCoords = 0;
93 
94  cudaCheck(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
95 
96 }
97 
98 //
99 // Builds the VdW Lennard-Jones coefficient table. Swiped from ComputeNonbondedCUDA.C
100 // NOTE: Can only be called once
101 //
102 void CudaNonbondedTables::buildVdwCoefTable(bool update/*=false*/) {
103 
104  const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
105  const int dim = ljTable->get_table_dim();
106 
107  // round dim up to odd multiple of WARPSIZE/2
108  int tsize = (((dim+WARPSIZE/2+WARPSIZE-1)/WARPSIZE)*WARPSIZE)-WARPSIZE/2;
109  if ( tsize < dim ) NAMD_bug("CudaNonbondedTables::buildVdwCoefTable bad tsize");
110 
111  float2 *h_vdwCoefTable = new float2[tsize*tsize];
112  float2 *h_exclusionVdwCoefTable = new float2[tsize*tsize];
113  float2 *row = h_vdwCoefTable;
114  float2 *exclusionRow = h_exclusionVdwCoefTable;
115  for ( int i=0; i<dim; ++i, row += tsize, exclusionRow += tsize ) {
116  for ( int j=0; j<dim; ++j ) {
117  const LJTable::TableEntry *e = ljTable->table_val(i,j);
118  row[j].x = e->A * ComputeNonbondedUtil::scaling;
119  row[j].y = e->B * ComputeNonbondedUtil::scaling;
120  exclusionRow[j].x = ((e+1)->A - e->A)* ComputeNonbondedUtil::scaling;
121  exclusionRow[j].y = ((e+1)->B - e->B)* ComputeNonbondedUtil::scaling;
122  }
123  }
124 
125  vdwCoefTableWidth = tsize;
126 
127  bindTextureObject<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, vdwCoefTableTex, update);
128  bindTextureObject<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, exclusionVdwCoefTableTex, update);
129 
130  delete [] h_vdwCoefTable;
131  delete [] h_exclusionVdwCoefTable;
132 
133  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
134  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
135  }
136 }
137 
138 // Update tables from newer CPU values
140  buildVdwCoefTable(true);
141 }
142 
143 //
144 // Builds force and energy tables. Swiped from ComputeNonbondedCUDA.C
145 //
146 template <typename T>
147 void buildForceAndEnergyTable(const int tableSize, const double* r2list, const BigReal* src_table, const bool flip,
148  const BigReal prefac, const int dst_stride, T* dst_force, T* dst_energy) {
149 
150  const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
151  const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
152  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
153 
154  union { double f; int32 i[2]; } byte_order_test;
155  byte_order_test.f = 1.0; // should occupy high-order bits only
156  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
157 
158  for ( int i=1; i<tableSize; ++i ) {
159  double r = ((double) tableSize) / ( (double) i + 0.5 );
160  int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
161 
162  if ( r > ComputeNonbondedUtil::cutoff ) {
163  dst_force[i*dst_stride] = 0.;
164  dst_energy[i*dst_stride] = 0.;
165  continue;
166  }
167 
168  BigReal diffa = r2list[i] - ComputeNonbondedUtil::r2_table[table_i];
169 
170  BigReal table_a, table_b, table_c, table_d;
171  if (flip) {
172  table_a = src_table[4*table_i+3];
173  table_b = src_table[4*table_i+2];
174  table_c = src_table[4*table_i+1];
175  table_d = src_table[4*table_i];
176  } else {
177  table_a = src_table[4*table_i];
178  table_b = src_table[4*table_i+1];
179  table_c = src_table[4*table_i+2];
180  table_d = src_table[4*table_i+3];
181  }
182 
183  BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
184  dst_force[i*dst_stride] = prefac * 2. * grad;
185  BigReal ener = table_a + diffa * ( ( table_d * diffa + table_c ) * diffa + table_b);
186  dst_energy[i*dst_stride] = prefac * ener;
187  }
188 
189  dst_force[0] = 0.;
190  dst_energy[0] = dst_energy[1*dst_stride];
191 }
192 
// Copies h_table (tableSize entries, tableWidth floats each) to the device
// and creates a texture object over it. On the NAMD_CUDA path the texture
// is backed by a cudaArray with hardware linear filtering and normalized
// coordinates; on the HIP path a plain linear texture is used instead.
template <typename T>
void bindTextureObject(int tableSize, int tableWidth, T* h_table,
  cudaArray_t& array, cudaTextureObject_t& tableTex, T** d_table) {

#if defined(NAMD_CUDA)
  // Keep a raw device copy of the table alongside the cudaArray.
  allocate_device_T((void **)d_table, tableSize, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);

  // Channel format: one 32-bit float channel per table column (up to 4).
  cudaChannelFormatDesc desc;
  memset(&desc, 0, sizeof(desc));
  desc.x = sizeof(T)*8;
  if (tableWidth >= 2) desc.y = sizeof(T)*8;
  if (tableWidth >= 3) desc.z = sizeof(T)*8;
  if (tableWidth >= 4) desc.w = sizeof(T)*8;
  desc.f = cudaChannelFormatKindFloat;
  cudaCheck(cudaMallocArray(&array, &desc, tableSize, 0));
  // NOTE(review): cudaMemcpyToArray is deprecated since CUDA 10.1;
  // cudaMemcpy2DToArray is the suggested replacement — confirm against the
  // project's minimum supported CUDA version before changing.
  cudaCheck(cudaMemcpyToArray(array, 0, 0, h_table, tableSize*sizeof(T)*tableWidth, cudaMemcpyHostToDevice));

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = array;

  // Hardware linear interpolation with clamped, normalized coordinates.
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.normalizedCoords = 1;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#else
  // tex1Dfetch is used in kernels, so create a linear texture
  // The texture is 1 texel wider to simplify (and optimize) index clamping for tex1Dfetch
  allocate_device_T((void **)d_table, tableSize + 1, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);
  // Duplicate the last row into the extra texel so out-of-range fetches
  // clamp to the final table entry.
  copy_HtoD_T(h_table + tableWidth * (tableSize - 1), *d_table + tableWidth * tableSize, 1, sizeof(T)*tableWidth);

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = *d_table;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(T)*8;
  if (tableWidth >= 2) resDesc.res.linear.desc.y = sizeof(T)*8;
  if (tableWidth >= 3) resDesc.res.linear.desc.z = sizeof(T)*8;
  if (tableWidth >= 4) resDesc.res.linear.desc.w = sizeof(T)*8;
  resDesc.res.linear.sizeInBytes = (tableSize + 1)*sizeof(T)*tableWidth;

  // Element-type reads; no filtering on the linear-texture path.
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#endif
}
248 
249 void CudaNonbondedTables::buildForceAndEnergyTables(int tableSize) {
250  forceAndEnergyTableSize = tableSize;
251 
252  // Build r2list
253  const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
254  const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
255  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
256 
257  double* r2list = new double[tableSize]; // double to match cpu code
258  for ( int i=1; i<tableSize; ++i ) {
259  double r = ((double) tableSize) / ( (double) i + 0.5 );
260  r2list[i] = r*r + r2_delta;
261  }
262 
263  // Non-bonded
264  {
265  float* t = new float[tableSize * 4];
266  float* et = new float[tableSize * 4];
267 
268  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, 1.0,
269  4, &t[0], &et[0]);
270 
271  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
272  4, &t[1], &et[1]);
273 
274  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
275  4, &t[2], &et[2]);
276 
277  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::scor_table, false, 1.0,
278  4, &t[3], &et[3]);
279 
280  bindTextureObject<float>(tableSize, 4, t, forceArray, forceTableTex, (float **)&forceTable);
281  bindTextureObject<float>(tableSize, 4, et, energyArray, energyTableTex, (float **)&energyTable);
282  delete [] t;
283  delete [] et;
284  }
285 
286  // Modified exclusions
287  {
288  float* t = new float[tableSize*4];
289  float* et = new float[tableSize*4];
290 
291  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, -1.0,
292  4, &t[0], &et[0]);
293 
294  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
295  4, &t[1], &et[1]);
296 
297  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
298  4, &t[2], &et[2]);
299 
300  buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::slow_table, true, -1.0,
301  4, &t[3], &et[3]);
302  // delete [] flip_slow_table;
303 
304  bindTextureObject<float>(tableSize, 4, t, modifiedExclusionForceArray, modifiedExclusionForceTableTex, (float **)&modifiedExclusionForceTable);
305  bindTextureObject<float>(tableSize, 4, et, modifiedExclusionEnergyArray, modifiedExclusionEnergyTableTex, (float **)&modifiedExclusionEnergyTable);
306  delete [] t;
307  delete [] et;
308  }
309 
310  // Exclusions
311  {
312  BigReal* corr_full_table = new BigReal[ComputeNonbondedUtil::table_length*4];
313  for (int i=0;i < ComputeNonbondedUtil::table_length*4;i++) {
315  }
316 
317  float4* h_exclusionTable = new float4[ComputeNonbondedUtil::table_length];
318  float* h_r2_table = new float[ComputeNonbondedUtil::table_length];
319  for (int i=0;i < ComputeNonbondedUtil::table_length;i++) {
320  h_exclusionTable[i].x = 6.0*corr_full_table[4*i + 3];
321  h_exclusionTable[i].y = 4.0*corr_full_table[4*i + 2];
322  h_exclusionTable[i].z = 2.0*corr_full_table[4*i + 1];
323  h_exclusionTable[i].w = 1.0*corr_full_table[4*i + 0];
324  h_r2_table[i] = ComputeNonbondedUtil::r2_table[i];
325  }
326 
327  copyTable<float>(ComputeNonbondedUtil::table_length, h_r2_table, r2_table);
328  copyTable<float4>(ComputeNonbondedUtil::table_length, h_exclusionTable, exclusionTable);
329 
330  delete [] h_exclusionTable;
331  delete [] h_r2_table;
332 
333  }
334 
335  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
336  CkPrintf("Info: Updated CUDA force table with %d elements.\n", tableSize);
337  }
338 
339  delete [] r2list;
340 }
341 
342 #endif // NAMD_CUDA
static BigReal * fast_table
static BigReal * scor_table
short int32
Definition: dumpdcd.c:24
const BigReal A
void allocate_device_T(void **pp, const int len, const size_t sizeofT)
Definition: CudaUtils.C:75
static BigReal * vdwa_table
int get_table_dim() const
Definition: LJTable.h:44
void bindTextureObject(int size, T *h_table, T *&d_table, cudaTextureObject_t &tex, bool update=false)
#define WARPSIZE
Definition: CudaUtils.h:10
static BigReal * full_table
const TableEntry * table_val(unsigned int i, unsigned int j) const
Definition: LJTable.h:35
void buildForceAndEnergyTable(const int tableSize, const double *r2list, const BigReal *src_table, const bool flip, const BigReal prefac, const int dst_stride, T *dst_force, T *dst_energy)
void NAMD_bug(const char *err_msg)
Definition: common.C:129
#define FORCE_ENERGY_TABLE_SIZE
Definition: CudaUtils.h:19
void copy_HtoD_T(const void *h_array, void *d_array, int array_len, const size_t sizeofT)
Definition: CudaUtils.C:180
static BigReal * slow_table
static BigReal * vdwb_table
static const LJTable * ljTable
const BigReal B
static BigReal * corr_table
void copyTable(int size, T *h_table, T *&d_table, bool update=false)
CudaNonbondedTables(const int deviceID)
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
double BigReal
Definition: common.h:114