8 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// NOTE(review): fragment of the CudaNonbondedTables constructor — several
// intervening source lines are missing from this view. Members are
// zero-initialized here so the destructor can test each handle/pointer
// before releasing it.
13 vdwCoefTableWidth = 0;
// Device-side exclusion table pointer; allocated later by the build helpers.
20 exclusionTable = NULL;
// Texture-object handles: 0 is the "never created" sentinel checked before
// cudaDestroyTextureObject in the destructor.
23 modifiedExclusionForceTableTex = 0;
24 modifiedExclusionEnergyTableTex = 0;
// Raw device buffers backing the modified-exclusion textures.
25 modifiedExclusionForceTable = NULL;
26 modifiedExclusionEnergyTable = NULL;
// NOTE(review): fragment of ~CudaNonbondedTables — releases the device
// buffers, CUDA arrays and texture objects created by the build*/
// bindTextureObject helpers. Plain device pointers are NULL-guarded;
// the cudaFreeArray/cudaDestroyTextureObject calls appear unguarded in
// this view, which presumably relies on surrounding conditionals on the
// missing lines — confirm against the full file.
35 if (vdwCoefTable != NULL) deallocate_device<float2>(&vdwCoefTable);
36 if (exclusionTable != NULL) deallocate_device<float4>(&exclusionTable);
37 if (r2_table != NULL) deallocate_device<float>(&r2_table);
// cudaArray-backed storage for the modified-exclusion textures.
41 cudaCheck(cudaFreeArray(modifiedExclusionForceArray));
42 cudaCheck(cudaFreeArray(modifiedExclusionEnergyArray));
// Destroy texture objects before (or independently of) their backing
// memory; cudaCheck wraps the runtime-API error code.
44 cudaCheck(cudaDestroyTextureObject(forceTableTex));
45 cudaCheck(cudaDestroyTextureObject(energyTableTex));
46 cudaCheck(cudaDestroyTextureObject(modifiedExclusionForceTableTex));
47 cudaCheck(cudaDestroyTextureObject(modifiedExclusionEnergyTableTex));
// Linear device buffers behind the force/energy lookup tables.
49 if (forceTable != NULL) deallocate_device<float4>(&forceTable);
50 if (energyTable != NULL) deallocate_device<float4>(&energyTable);
51 if (modifiedExclusionForceTable != NULL) deallocate_device<float4>(&modifiedExclusionForceTable);
52 if (modifiedExclusionEnergyTable != NULL) deallocate_device<float4>(&modifiedExclusionEnergyTable);
// Copies a host table of `size` elements of T to a (possibly freshly
// allocated) device buffer `d_table` with a synchronous H->D copy.
// NOTE(review): interior lines are missing here — presumably the
// allocate_device call is skipped when `update` is true (buffer reuse);
// confirm against the full file.
57 void copyTable(
int size, T* h_table, T*& d_table,
bool update=
false) {
60 allocate_device<T>(&d_table, size);
62 copy_HtoD_sync<T>(h_table, d_table, size);
// Uploads a host table to device memory and wraps it in a texture object
// backed by a *linear* resource (cudaResourceTypeLinear), element-type
// read mode, one 32-bit float channel per float in T.
// NOTE(review): interior lines are missing; presumably allocation and
// texture creation are skipped on `update` == true — confirm.
67 void bindTextureObject(
int size, T* h_table, T*& d_table, cudaTextureObject_t& tex,
bool update=
false) {
70 allocate_device<T>(&d_table, size);
75 copy_HtoD_sync<T>(h_table, d_table, size);
// Describe the device buffer as a linear float resource.
78 cudaResourceDesc resDesc;
79 memset(&resDesc, 0,
sizeof(resDesc));
80 resDesc.resType = cudaResourceTypeLinear;
81 resDesc.res.linear.devPtr = d_table;
82 resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
// Enable one 32-bit channel (x, then y/z/w) per float contained in T,
// e.g. T = float2 -> x and y channels set.
83 resDesc.res.linear.desc.x =
sizeof(float)*8;
84 if (
sizeof(T) >=
sizeof(float)*2) resDesc.res.linear.desc.y =
sizeof(float)*8;
85 if (
sizeof(T) >=
sizeof(float)*3) resDesc.res.linear.desc.z =
sizeof(float)*8;
86 if (
sizeof(T) >=
sizeof(float)*4) resDesc.res.linear.desc.w =
sizeof(float)*8;
87 resDesc.res.linear.sizeInBytes = size*
sizeof(T);
// Element-type reads: no filtering/normalization for linear resources.
89 cudaTextureDesc texDesc;
90 memset(&texDesc, 0,
sizeof(texDesc));
91 texDesc.readMode = cudaReadModeElementType;
94 cudaCheck(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
// Builds the (padded) tsize x tsize Lennard-Jones coefficient tables on the
// device — the regular table and the exclusion variant — and binds each to a
// texture object. NOTE(review): the lines computing dim/tsize and filling the
// per-pair entries are missing from this view.
102 void CudaNonbondedTables::buildVdwCoefTable(
bool update) {
// Sanity check: padded table size must cover the atom-type dimension.
109 if ( tsize < dim )
NAMD_bug(
"CudaNonbondedTables::buildVdwCoefTable bad tsize");
112 float2 *h_exclusionVdwCoefTable =
new float2[tsize*tsize];
// Walk both tables row-by-row with stride tsize (rows are padded to tsize).
113 float2 *row = h_vdwCoefTable;
114 float2 *exclusionRow = h_exclusionVdwCoefTable;
115 for (
int i=0; i<dim; ++i, row += tsize, exclusionRow += tsize ) {
116 for (
int j=0; j<dim; ++j ) {
125 vdwCoefTableWidth = tsize;
// Upload both host tables and create their texture objects.
127 bindTextureObject<float2>(tsize*tsize, h_vdwCoefTable, vdwCoefTable, vdwCoefTableTex, update);
128 bindTextureObject<float2>(tsize*tsize, h_exclusionVdwCoefTable, exclusionVdwCoefTable, exclusionVdwCoefTableTex, update);
130 delete [] h_vdwCoefTable;
131 delete [] h_exclusionVdwCoefTable;
// Log once per physical node (only PEs with physical-node ID 0 print).
133 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
134 CkPrintf(
"Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
// NOTE(review): fragment — presumably the body of updateVdwCoefTable(),
// which simply rebuilds the LJ coefficient tables in update mode; the
// enclosing signature is not visible in this chunk.
140 buildVdwCoefTable(
true);
// Evaluates a cubic-spline source table (4 coefficients per entry) into
// strided force and energy output arrays, scaled by `prefac`.
// NOTE(review): the signature line and several interior lines (including the
// `flip` branch selector and the computation of `diffa`) are missing here;
// per the declaration elsewhere in the file the parameters include
// (tableSize, r2list, src_table, flip, prefac, dst_stride, dst_force,
// dst_energy) — confirm.
146 template <
typename T>
148 const BigReal prefac,
const int dst_stride, T* dst_force, T* dst_energy) {
152 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
// Runtime endianness probe: which 32-bit half of a double holds the
// exponent bits. Used below to pick the high word of each r2list entry.
154 union {
double f;
int32 i[2]; } byte_order_test;
155 byte_order_test.f = 1.0;
156 int32 *r2iilist = (
int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
// Entry 0 is filled from entry 1 after the loop; the loop starts at i=1.
158 for (
int i=1; i<tableSize; ++i ) {
159 double r = ((double) tableSize) / ( (double) i + 0.5 );
// Extract the exponent field of r^2 to index the source spline table.
160 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
// Out-of-range entries are zeroed (branch condition not visible here).
163 dst_force[i*dst_stride] = 0.;
164 dst_energy[i*dst_stride] = 0.;
// Spline coefficients; order depends on a branch (presumably `flip`)
// whose condition lines are not visible in this view.
170 BigReal table_a, table_b, table_c, table_d;
172 table_a = src_table[4*table_i+3];
173 table_b = src_table[4*table_i+2];
174 table_c = src_table[4*table_i+1];
175 table_d = src_table[4*table_i];
177 table_a = src_table[4*table_i];
178 table_b = src_table[4*table_i+1];
179 table_c = src_table[4*table_i+2];
180 table_d = src_table[4*table_i+3];
// Force = prefac * 2 * d(energy)/d(r^2); `diffa` is the fractional offset
// within the table interval (computed on a missing line).
183 BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
184 dst_force[i*dst_stride] = prefac * 2. * grad;
// Horner evaluation of the cubic: a + diffa*(b + diffa*(c + diffa*d)).
185 BigReal ener = table_a + diffa * ( ( table_d * diffa + table_c ) * diffa + table_b);
186 dst_energy[i*dst_stride] = prefac * ener;
// Clamp the r -> infinity entry to its neighbor to avoid a garbage value.
190 dst_energy[0] = dst_energy[1*dst_stride];
// Uploads a tableSize x tableWidth host table to the device and creates a
// texture object over it. Two compile-time paths: NAMD_CUDA uses a cudaArray
// with hardware linear filtering and normalized coordinates; the other path
// (presumably HIP) uses a linear resource with a duplicated last row so
// clamped fetches stay in range.
// NOTE(review): the signature line and some interior lines (allocation of
// *d_table, the #else/#endif markers) are missing from this view.
193 template <
typename T>
195 cudaArray_t& array, cudaTextureObject_t& tableTex, T** d_table) {
197 #if defined(NAMD_CUDA)
// Keep a plain device copy alongside the cudaArray (kernels may read both).
199 copy_HtoD_T(h_table, *d_table, tableSize,
sizeof(T)*tableWidth);
// Channel format: one 32-bit float channel per column, up to 4 (float4).
201 cudaChannelFormatDesc desc;
202 memset(&desc, 0,
sizeof(desc));
203 desc.x =
sizeof(T)*8;
204 if (tableWidth >= 2) desc.y =
sizeof(T)*8;
205 if (tableWidth >= 3) desc.z =
sizeof(T)*8;
206 if (tableWidth >= 4) desc.w =
sizeof(T)*8;
207 desc.f = cudaChannelFormatKindFloat;
208 cudaCheck(cudaMallocArray(&array, &desc, tableSize, 0));
// NOTE(review): cudaMemcpyToArray is deprecated in newer CUDA toolkits
// (cudaMemcpy2DToArray is the replacement) — cannot safely change in this
// fragment; flagging for the maintainer.
209 cudaCheck(cudaMemcpyToArray(array, 0, 0, h_table, tableSize*
sizeof(T)*tableWidth, cudaMemcpyHostToDevice));
211 cudaResourceDesc resDesc;
212 memset(&resDesc, 0,
sizeof(resDesc));
213 resDesc.resType = cudaResourceTypeArray;
214 resDesc.res.array.array = array;
// Clamped, linearly filtered, normalized-coordinate fetches: kernels can
// sample the table with a [0,1) coordinate and get free interpolation.
216 cudaTextureDesc texDesc;
217 memset(&texDesc, 0,
sizeof(texDesc));
218 texDesc.addressMode[0] = cudaAddressModeClamp;
219 texDesc.filterMode = cudaFilterModeLinear;
220 texDesc.normalizedCoords = 1;
222 cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
// Non-NAMD_CUDA path (presumably HIP): linear resource, element reads.
227 copy_HtoD_T(h_table, *d_table, tableSize,
sizeof(T)*tableWidth);
// Duplicate the final row at index tableSize so a fetch at the upper edge
// reads valid data (software stand-in for the clamp mode above).
228 copy_HtoD_T(h_table + tableWidth * (tableSize - 1), *d_table + tableWidth * tableSize, 1,
sizeof(T)*tableWidth);
230 cudaResourceDesc resDesc;
231 memset(&resDesc, 0,
sizeof(resDesc));
232 resDesc.resType = cudaResourceTypeLinear;
233 resDesc.res.linear.devPtr = *d_table;
234 resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
235 resDesc.res.linear.desc.x =
sizeof(T)*8;
236 if (tableWidth >= 2) resDesc.res.linear.desc.y =
sizeof(T)*8;
237 if (tableWidth >= 3) resDesc.res.linear.desc.z =
sizeof(T)*8;
238 if (tableWidth >= 4) resDesc.res.linear.desc.w =
sizeof(T)*8;
// +1 accounts for the duplicated last row copied above.
239 resDesc.res.linear.sizeInBytes = (tableSize + 1)*
sizeof(T)*tableWidth;
241 cudaTextureDesc texDesc;
242 memset(&texDesc, 0,
sizeof(texDesc));
243 texDesc.readMode = cudaReadModeElementType;
245 cudaCheck(cudaCreateTextureObject(&tableTex, &resDesc, &texDesc, NULL));
// Builds all per-distance lookup tables (force/energy, modified-exclusion
// force/energy, exclusion, r2) on the device and binds their textures.
// NOTE(review): large interior spans are missing here — the calls to
// buildForceAndEnergyTable<> that fill t/et, the h_exclusionTable loop
// header, and the final copyTable calls are not visible; comments below
// describe only what is shown.
249 void CudaNonbondedTables::buildForceAndEnergyTables(
int tableSize) {
250 forceAndEnergyTableSize = tableSize;
255 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
// Precompute r^2 + r2_delta for each table index; index 0 is left unset
// here (the spline builder also starts at i=1).
257 double* r2list =
new double[tableSize];
258 for (
int i=1; i<tableSize; ++i ) {
259 double r = ((double) tableSize) / ( (double) i + 0.5 );
260 r2list[i] = r*r + r2_delta;
// Host staging buffers: 4 floats per entry (float4 layout).
265 float* t =
new float[tableSize * 4];
266 float* et =
new float[tableSize * 4];
// Upload + bind the standard force/energy tables (cudaArray variant).
280 bindTextureObject<float>(tableSize, 4, t, forceArray, forceTableTex, (
float **)&forceTable);
281 bindTextureObject<float>(tableSize, 4, et, energyArray, energyTableTex, (
float **)&energyTable);
// Fresh staging buffers for the modified-exclusion tables.
288 float* t =
new float[tableSize*4];
289 float* et =
new float[tableSize*4];
304 bindTextureObject<float>(tableSize, 4, t, modifiedExclusionForceArray, modifiedExclusionForceTableTex, (
float **)&modifiedExclusionForceTable);
305 bindTextureObject<float>(tableSize, 4, et, modifiedExclusionEnergyArray, modifiedExclusionEnergyTableTex, (
float **)&modifiedExclusionEnergyTable);
// Pack the full-exclusion correction spline: coefficients pre-scaled by
// the derivative factors (6,4,2,1) so the kernel can evaluate the
// gradient with plain dot products.
320 h_exclusionTable[i].x = 6.0*corr_full_table[4*i + 3];
321 h_exclusionTable[i].y = 4.0*corr_full_table[4*i + 2];
322 h_exclusionTable[i].z = 2.0*corr_full_table[4*i + 1];
323 h_exclusionTable[i].w = 1.0*corr_full_table[4*i + 0];
330 delete [] h_exclusionTable;
331 delete [] h_r2_table;
// Log once per physical node (only PEs with physical-node ID 0 print).
335 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
336 CkPrintf(
"Info: Updated CUDA force table with %d elements.\n", tableSize);
static BigReal * fast_table
static BigReal * scor_table
void allocate_device_T(void **pp, const int len, const size_t sizeofT)
static BigReal * vdwa_table
int get_table_dim() const
void bindTextureObject(int size, T *h_table, T *&d_table, cudaTextureObject_t &tex, bool update=false)
static BigReal * full_table
static BigReal * r2_table
const TableEntry * table_val(unsigned int i, unsigned int j) const
void buildForceAndEnergyTable(const int tableSize, const double *r2list, const BigReal *src_table, const bool flip, const BigReal prefac, const int dst_stride, T *dst_force, T *dst_energy)
void NAMD_bug(const char *err_msg)
#define FORCE_ENERGY_TABLE_SIZE
void copy_HtoD_T(const void *h_array, void *d_array, int array_len, const size_t sizeofT)
static BigReal * slow_table
static BigReal * vdwb_table
static const LJTable * ljTable
static BigReal * corr_table
void copyTable(int size, T *h_table, T *&d_table, bool update=false)
CudaNonbondedTables(const int deviceID)