NAMD
CudaNonbondedTables.C
Go to the documentation of this file.
1 #include "charm++.h"
2 #include "NamdTypes.h"
3 #include "ComputeNonbondedUtil.h"
4 #include "LJTable.h"
5 #include "CudaUtils.h"
6 #include "CudaNonbondedTables.h"
7 
8 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
9 
//
// Constructor: selects the requested CUDA device and builds every nonbonded
// lookup table (force/energy interpolation tables and VdW coefficient tables).
//
CudaNonbondedTables::CudaNonbondedTables(const int deviceID) : deviceID(deviceID) {

  // Device-side table pointers start out unallocated; the build calls at the
  // end of the constructor fill them in.
  vdwCoefTable = NULL;
  vdwCoefTableWidth = 0;
  forceTable = NULL;
  energyTable = NULL;
  exclusionTable = NULL;
  r2_table = NULL;
  modifiedExclusionForceTable = NULL;
  modifiedExclusionEnergyTable = NULL;

#if !defined(USE_TABLE_ARRAYS)
  // Texture object handles, used only on the texture-fetch path.
  forceTableTex = 0;
  energyTableTex = 0;
  modifiedExclusionForceTableTex = 0;
  modifiedExclusionEnergyTableTex = 0;
#endif

  cudaCheck(cudaSetDevice(deviceID));
  buildForceAndEnergyTables(FORCE_ENERGY_TABLE_SIZE);
  buildVdwCoefTable();
}
35 
//
// Destructor: releases all device tables and texture objects created by the
// constructor. (NOTE(review): the signature line was missing from this
// extract and has been restored.)
//
CudaNonbondedTables::~CudaNonbondedTables() {
  cudaCheck(cudaSetDevice(deviceID));
  if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable);
  // exclusionVdwCoefTable is allocated by buildVdwCoefTable() alongside
  // vdwCoefTable but was previously never released here (leak fix).
  if (exclusionVdwCoefTable != NULL) deallocate_device<float2>(&exclusionVdwCoefTable);
  if (exclusionTable != NULL) deallocate_device<float4>(&exclusionTable);
  if (r2_table != NULL) deallocate_device<float>(&r2_table);

#if !defined(USE_TABLE_ARRAYS)
  cudaCheck(cudaFreeArray(forceArray));
  cudaCheck(cudaFreeArray(energyArray));
  cudaCheck(cudaFreeArray(modifiedExclusionForceArray));
  cudaCheck(cudaFreeArray(modifiedExclusionEnergyArray));

  cudaCheck(cudaDestroyTextureObject(forceTableTex));
  cudaCheck(cudaDestroyTextureObject(energyTableTex));
  cudaCheck(cudaDestroyTextureObject(modifiedExclusionForceTableTex));
  cudaCheck(cudaDestroyTextureObject(modifiedExclusionEnergyTableTex));
  // The VdW coefficient textures are created in buildVdwCoefTable(); they
  // were previously leaked (leak fix).
  cudaCheck(cudaDestroyTextureObject(vdwCoefTableTex));
  cudaCheck(cudaDestroyTextureObject(exclusionVdwCoefTableTex));
#endif

  if (forceTable != NULL) deallocate_device<float4>(&forceTable);
  if (energyTable != NULL) deallocate_device<float4>(&energyTable);
  if (modifiedExclusionForceTable != NULL) deallocate_device<float4>(&modifiedExclusionForceTable);
  if (modifiedExclusionEnergyTable != NULL) deallocate_device<float4>(&modifiedExclusionEnergyTable);
}
59 
//
// Copies a host table to the device, allocating device storage on first use.
// Effectively replaces bindTextureObject, which was a workaround to
// limitations on early GPU architectures. When update is true the existing
// device buffer is reused and only its contents are refreshed.
//
template <typename T>
void copyTable(int size, T* h_table, T*& d_table, bool update=false) {
  // First call: no device buffer exists yet, so create one of the right size.
  if (!update) allocate_device<T>(&d_table, size);
  // Synchronous host-to-device transfer of the full table.
  copy_HtoD_sync<T>(h_table, d_table, size);
}
70 
71 #ifndef USE_TABLE_ARRAYS
//
// Copies h_table to the device and wraps the device buffer in a linear
// texture object (cudaResourceTypeLinear, element-type reads).
// On update=true the device buffer is reused and the previous texture
// object is destroyed before a new one is created over the fresh contents.
//
template <typename T>
void bindTextureObject(int size, T* h_table, T*& d_table, cudaTextureObject_t& tex, bool update=false) {
  // Copy to device
  if ( ! update) {
  allocate_device<T>(&d_table, size);
  }
  else {
  // Re-binding: the old texture object would otherwise be leaked.
  cudaCheck(cudaDestroyTextureObject(tex));
  }
  copy_HtoD_sync<T>(h_table, d_table, size);

  // Create texture object
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = d_table;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  // Describe T as one to four 32-bit float channels, based on its size
  // (assumes T is composed of floats, e.g. float, float2, float4).
  resDesc.res.linear.desc.x = sizeof(float)*8; // bits per channel
  if (sizeof(T) >= sizeof(float)*2) resDesc.res.linear.desc.y = sizeof(float)*8; // bits per channel
  if (sizeof(T) >= sizeof(float)*3) resDesc.res.linear.desc.z = sizeof(float)*8; // bits per channel
  if (sizeof(T) >= sizeof(float)*4) resDesc.res.linear.desc.w = sizeof(float)*8; // bits per channel
  resDesc.res.linear.sizeInBytes = size*sizeof(T);

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  // Element-type reads: kernels fetch raw values, no filtering/normalization.
  texDesc.readMode = cudaReadModeElementType;
  //texDesc.normalizedCoords = 0;

  cudaCheck(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));

}
103 #endif
104 
105 
//
// Builds the VdW Lennard-Jones coefficient tables (normal plus the
// exclusion-correction deltas). Swiped from ComputeNonbondedCUDA.C
// Pass update=true to refresh the existing device buffers in place.
//
void CudaNonbondedTables::buildVdwCoefTable(bool update/*=false*/) {

  const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
  const int dim = ljTable->get_table_dim();

  // round dim up to odd multiple of WARPSIZE/2
  int tsize = (((dim+(WARPSIZE/2)+(WARPSIZE-1))/WARPSIZE)*WARPSIZE)-(WARPSIZE/2);
  if ( tsize < dim ) NAMD_bug("CudaNonbondedTables::buildVdwCoefTable bad tsize");

  // Value-initialize ("()") so the padding entries beyond 'dim' are zero;
  // previously uninitialized host memory was copied to the device.
  float2 *h_vdwCoefTable = new float2[tsize*tsize]();
  float2 *h_exclusionVdwCoefTable = new float2[tsize*tsize]();
  float2 *row = h_vdwCoefTable;
  float2 *exclusionRow = h_exclusionVdwCoefTable;
  for ( int i=0; i<dim; ++i, row += tsize, exclusionRow += tsize ) {
    for ( int j=0; j<dim; ++j ) {
      const LJTable::TableEntry *e = ljTable->table_val(i,j);
      // Pre-scaled A/B coefficients; the exclusion row stores the delta to
      // the adjacent entry (e+1) — presumably the modified-exclusion
      // parameters; verify against the LJTable layout.
      row[j].x = e->A * ComputeNonbondedUtil::scaling;
      row[j].y = e->B * ComputeNonbondedUtil::scaling;
      exclusionRow[j].x = ((e+1)->A - e->A)* ComputeNonbondedUtil::scaling;
      exclusionRow[j].y = ((e+1)->B - e->B)* ComputeNonbondedUtil::scaling;
    }
  }

  vdwCoefTableWidth = tsize;
#if !defined(USE_TABLE_ARRAYS)
  bindTextureObject<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, vdwCoefTableTex, update);
  bindTextureObject<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, exclusionVdwCoefTableTex, update);
#else
  copyTable<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, update);
  copyTable<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, update);
#endif

  delete [] h_vdwCoefTable;
  delete [] h_exclusionVdwCoefTable;

  // Print once per physical node only.
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
  }
}
149 
// Update tables from newer CPU values
// NOTE(review): the signature line (original line 151) was missing from this
// extract; the name below is reconstructed — confirm it against the
// declaration in CudaNonbondedTables.h before merging.
void CudaNonbondedTables::updateTables() {
  // Rebuild the VdW coefficient tables in place, reusing device buffers.
  buildVdwCoefTable(true);
}
154 
//
// Builds one strided force table and one strided energy table by evaluating
// a CPU cubic-spline table. Swiped from ComputeNonbondedCUDA.C
//
// tableSize  - number of output samples
// r2list     - precomputed r^2 (+ r2_delta) value per sample
// src_table  - spline coefficients, 4 per knot (read in reverse when flip)
// prefac     - scale factor applied to both outputs
// dst_stride - element stride between consecutive samples in dst arrays
//
template <typename T>
void buildForceAndEnergyTable(const int tableSize, const double* r2list, const BigReal* src_table, const bool flip,
  const BigReal prefac, const int dst_stride, T* dst_force, T* dst_energy) {

  // (removed unused local r2_delta — r2list already includes it)
  const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);

  // Pick the high-order 32-bit half of each double in r2list regardless of
  // host endianness: for 1.0 only the high half is nonzero, so i[0] nonzero
  // means i[0] is the high half (big-endian).
  union { double f; int32 i[2]; } byte_order_test;
  byte_order_test.f = 1.0; // should occupy high-order bits only
  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );

  for ( int i=1; i<tableSize; ++i ) {
    double r = ((double) tableSize) / ( (double) i + 0.5 );
    // Exponent/top-mantissa bits of r^2 select the spline knot, matching the
    // CPU-side table lookup.
    int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0

    // Beyond the cutoff both force and energy are identically zero.
    if ( r > ComputeNonbondedUtil::cutoff ) {
      dst_force[i*dst_stride] = 0.;
      dst_energy[i*dst_stride] = 0.;
      continue;
    }

    BigReal diffa = r2list[i] - ComputeNonbondedUtil::r2_table[table_i];

    // Spline coefficients; 'flip' reads them in reversed order (used by the
    // slow_table caller below).
    BigReal table_a, table_b, table_c, table_d;
    if (flip) {
      table_a = src_table[4*table_i+3];
      table_b = src_table[4*table_i+2];
      table_c = src_table[4*table_i+1];
      table_d = src_table[4*table_i];
    } else {
      table_a = src_table[4*table_i];
      table_b = src_table[4*table_i+1];
      table_c = src_table[4*table_i+2];
      table_d = src_table[4*table_i+3];
    }

    // Derivative of the cubic gives the force entry; the cubic itself gives
    // the energy entry.
    BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
    dst_force[i*dst_stride] = prefac * 2. * grad;
    BigReal ener = table_a + diffa * ( ( table_d * diffa + table_c ) * diffa + table_b);
    dst_energy[i*dst_stride] = prefac * ener;
  }

  // Sample 0 corresponds to r -> infinity in the i-loop mapping and is never
  // computed above; pin force to zero and clamp energy to the first sample.
  dst_force[0] = 0.;
  dst_energy[0] = dst_energy[1*dst_stride];
}
204 
205 #if !defined(USE_TABLE_ARRAYS)
//
// Copies a table to the device and creates a texture object over it.
// NAMD_CUDA path: cudaArray-backed texture with linear filtering and
// normalized coordinates. Other (HIP) path: linear texture for tex1Dfetch,
// padded by one texel to simplify index clamping.
// Assumes T's channels are 32-bit floats (tableWidth channels per texel).
//
template <typename T>
void bindTextureObject(int tableSize, int tableWidth, T* h_table,
  cudaArray_t& array, cudaTextureObject_t& tableTex, T** d_table) {

#if defined(NAMD_CUDA)
  // Plain device copy of the table (kept alongside the texture).
  allocate_device_T((void **)d_table, tableSize, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);

  cudaChannelFormatDesc desc;
  memset(&desc, 0, sizeof(desc));
  desc.x = sizeof(T)*8;  // bits per channel
  if (tableWidth >= 2) desc.y = sizeof(T)*8;
  if (tableWidth >= 3) desc.z = sizeof(T)*8;
  if (tableWidth >= 4) desc.w = sizeof(T)*8;
  desc.f = cudaChannelFormatKindFloat;
  cudaCheck(cudaMallocArray(&array, &desc, tableSize, 0));
  // cudaMemcpyToArray is deprecated (CUDA 10.1+); a one-row
  // cudaMemcpy2DToArray performs the same contiguous copy.
  {
    const size_t rowBytes = (size_t)tableSize*sizeof(T)*tableWidth;
    cudaCheck(cudaMemcpy2DToArray(array, 0, 0, h_table, rowBytes, rowBytes, 1, cudaMemcpyHostToDevice));
  }

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = array;

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeClamp;
  // Hardware-interpolated lookups over normalized [0,1) coordinates.
  texDesc.filterMode = cudaFilterModeLinear;
  texDesc.normalizedCoords = 1;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#else
  // tex1Dfetch is used in kernels, so create a linear texture
  // The texture is 1 texel wider to simplify (and optimize) index clamping for tex1Dfetch
  allocate_device_T((void **)d_table, tableSize + 1, sizeof(T)*tableWidth);
  copy_HtoD_T(h_table, *d_table, tableSize, sizeof(T)*tableWidth);
  // Replicate the last texel into the padding slot.
  copy_HtoD_T(h_table + tableWidth * (tableSize - 1), *d_table + tableWidth * tableSize, 1, sizeof(T)*tableWidth);

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = *d_table;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(T)*8;
  if (tableWidth >= 2) resDesc.res.linear.desc.y = sizeof(T)*8;
  if (tableWidth >= 3) resDesc.res.linear.desc.z = sizeof(T)*8;
  if (tableWidth >= 4) resDesc.res.linear.desc.w = sizeof(T)*8;
  resDesc.res.linear.sizeInBytes = (tableSize + 1)*sizeof(T)*tableWidth;

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;

  cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
#endif
}
261 #endif
262 
//
// Builds the nonbonded and modified-exclusion force/energy interpolation
// tables plus the exclusion table, and uploads them all to the device.
//
void CudaNonbondedTables::buildForceAndEnergyTables(int tableSize) {
  forceAndEnergyTableSize = tableSize;

  // Build r2list: squared distance (plus r2_delta) per table sample, matching
  // the CPU-side table construction.
  const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
  // (removed unused locals r2_delta_exp / r2_delta_expc — only the template
  // helper buildForceAndEnergyTable needs them)

  double* r2list = new double[tableSize]; // double to match cpu code
  for ( int i=1; i<tableSize; ++i ) {
    double r = ((double) tableSize) / ( (double) i + 0.5 );
    r2list[i] = r*r + r2_delta;
  }

  // Non-bonded: pack fast/vdwB/vdwA/scor into the xyzw channels.
  {
    float* t = new float[tableSize * 4];
    float* et = new float[tableSize * 4];

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, 1.0,
      4, &t[0], &et[0]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
      4, &t[1], &et[1]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
      4, &t[2], &et[2]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::scor_table, false, 1.0,
      4, &t[3], &et[3]);

#if !defined(USE_TABLE_ARRAYS)
    bindTextureObject<float>(tableSize, 4, t, forceArray, forceTableTex, (float **)&forceTable);
    bindTextureObject<float>(tableSize, 4, et, energyArray, energyTableTex, (float **)&energyTable);
#else
    copyTable<float4>(tableSize, (float4*)t, forceTable);
    copyTable<float4>(tableSize, (float4*)et, energyTable);
#endif
    delete [] t;
    delete [] et;
  }

  // Modified exclusions: same packing, slow table read flipped and negated.
  {
    float* t = new float[tableSize * 4];
    float* et = new float[tableSize * 4];

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::fast_table, false, -1.0,
      4, &t[0], &et[0]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwb_table, false, -1.0,
      4, &t[1], &et[1]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::vdwa_table, false, 1.0,
      4, &t[2], &et[2]);

    buildForceAndEnergyTable<float>(tableSize, r2list, ComputeNonbondedUtil::slow_table, true, -1.0,
      4, &t[3], &et[3]);
    // delete [] flip_slow_table;
#ifndef USE_TABLE_ARRAYS
    bindTextureObject<float>(tableSize, 4, t, modifiedExclusionForceArray, modifiedExclusionForceTableTex, (float **)&modifiedExclusionForceTable);
    bindTextureObject<float>(tableSize, 4, et, modifiedExclusionEnergyArray, modifiedExclusionEnergyTableTex, (float **)&modifiedExclusionEnergyTable);
#else
    copyTable<float4>(tableSize, (float4*)t, modifiedExclusionForceTable);
    copyTable<float4>(tableSize, (float4*)et, modifiedExclusionEnergyTable);
#endif
    delete [] t;
    delete [] et;
  }

  // Exclusions
  {
    BigReal* corr_full_table = new BigReal[ComputeNonbondedUtil::table_length*4];
    // NOTE(review): the loop body (original line 337) was missing from this
    // extract; restored as full - corr, consistent with the full_table /
    // corr_table references in this file's symbol list — confirm upstream.
    for (int i=0;i < ComputeNonbondedUtil::table_length*4;i++) {
      corr_full_table[i] = ComputeNonbondedUtil::full_table[i] - ComputeNonbondedUtil::corr_table[i];
    }

    float4* h_exclusionTable = new float4[ComputeNonbondedUtil::table_length];
    float* h_r2_table = new float[ComputeNonbondedUtil::table_length];
    for (int i=0;i < ComputeNonbondedUtil::table_length;i++) {
      // Spline coefficients pre-multiplied by their derivative factors,
      // stored high-order-first in xyzw.
      h_exclusionTable[i].x = 6.0*corr_full_table[4*i + 3];
      h_exclusionTable[i].y = 4.0*corr_full_table[4*i + 2];
      h_exclusionTable[i].z = 2.0*corr_full_table[4*i + 1];
      h_exclusionTable[i].w = 1.0*corr_full_table[4*i + 0];
      h_r2_table[i] = ComputeNonbondedUtil::r2_table[i];
    }

    copyTable<float>(ComputeNonbondedUtil::table_length, h_r2_table, r2_table);
    copyTable<float4>(ComputeNonbondedUtil::table_length, h_exclusionTable, exclusionTable);

    delete [] h_exclusionTable;
    delete [] h_r2_table;
    delete [] corr_full_table;  // was previously leaked
  }

  // Print once per physical node only.
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Updated CUDA force table with %d elements.\n", tableSize);
  }

  delete [] r2list;
}
364 
365 #endif // NAMD_CUDA
366 
static BigReal * fast_table
static BigReal * scor_table
void allocate_device_T(void **pp, const size_t len, const size_t sizeofT)
Definition: CudaUtils.C:97
int32_t int32
Definition: common.h:38
static BigReal * vdwa_table
void bindTextureObject(int size, T *h_table, T *&d_table, cudaTextureObject_t &tex, bool update=false)
#define WARPSIZE
Definition: CudaUtils.h:17
static BigReal * full_table
void buildForceAndEnergyTable(const int tableSize, const double *r2list, const BigReal *src_table, const bool flip, const BigReal prefac, const int dst_stride, T *dst_force, T *dst_energy)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int get_table_dim() const
Definition: LJTable.h:44
#define FORCE_ENERGY_TABLE_SIZE
Definition: CudaUtils.h:40
const TableEntry * table_val(unsigned int i, unsigned int j) const
Definition: LJTable.h:35
static BigReal * slow_table
void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, const size_t sizeofT)
Definition: CudaUtils.C:224
static BigReal * vdwb_table
static const LJTable * ljTable
static BigReal * corr_table
void copyTable(int size, T *h_table, T *&d_table, bool update=false)
CudaNonbondedTables(const int deviceID)
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
double BigReal
Definition: common.h:123