00001
00002 #include "ComputeNonbondedCUDAKernel.h"
00003 #include <stdio.h>
00004
00005 #ifdef NAMD_CUDA
00006
00007
00008 __constant__ unsigned int exclusions[MAX_EXCLUSIONS];
00009
00010 #define SET_EXCL(EXCL,BASE,DIFF) \
00011 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00012
00013 void cuda_bind_exclusions(const unsigned int *t, int n) {
00014
00015 #if 0
00016 unsigned int excl[MAX_EXCLUSIONS];
00017
00018 for ( int i=0; i<MAX_EXCLUSIONS; ++i ) { excl[i] = 0; }
00019
00020 SET_EXCL(excl,0,2);
00021 SET_EXCL(excl,0,3);
00022 SET_EXCL(excl,0,4);
00023
00024 cudaMemcpyToSymbol(exclusions, excl, MAX_EXCLUSIONS*sizeof(unsigned int), 0);
00025 #endif
00026
00027 cudaMemcpyToSymbol(exclusions, t, n*sizeof(unsigned int), 0);
00028 cuda_errcheck("memcpy to exclusions");
00029 }
00030
00031
00032 texture<float4, 1, cudaReadModeElementType> force_table;
00033
00034 void cuda_bind_force_table(const float4 *t) {
00035 static cudaArray *ct;
00036 if ( ! ct ) {
00037 cudaMallocArray(&ct, &force_table.channelDesc, FORCE_TABLE_SIZE, 1);
00038 cuda_errcheck("allocating force table");
00039 } cudaMemcpyToArray(ct, 0, 0, t, FORCE_TABLE_SIZE*sizeof(float4), cudaMemcpyHostToDevice);
00040 cuda_errcheck("memcpy to force table");
00041
00042 force_table.normalized = true;
00043 force_table.addressMode[0] = cudaAddressModeClamp;
00044 force_table.addressMode[1] = cudaAddressModeClamp;
00045 force_table.filterMode = cudaFilterModeLinear;
00046
00047 cudaBindTextureToArray(force_table, ct);
00048 cuda_errcheck("binding force table to texture");
00049 }
00050
00051 static int patch_pairs_size;
00052 static patch_pair *patch_pairs;
00053
00054 static int force_lists_size;
00055 static force_list *force_lists;
00056
00057 static int force_buffers_size;
00058 static float4 *force_buffers;
00059 static float4 *slow_force_buffers;
00060
00061 static int atoms_size;
00062 static atom *atoms;
00063 static atom_param *atom_params;
00064 static float4 *forces;
00065 static float4 *slow_forces;
00066
00067 static int patch_pairs_alloc;
00068 static int force_buffers_alloc;
00069 static int force_lists_alloc;
00070 static int atoms_alloc;
00071
00072 static int max_atoms_per_patch;
00073
00074
00075 cudaStream_t stream;
00076
00077 void cuda_init() {
00078 forces = 0;
00079 slow_forces = 0;
00080 atom_params = 0;
00081 atoms = 0;
00082 force_buffers = 0;
00083 slow_force_buffers = 0;
00084 force_lists = 0;
00085 patch_pairs = 0;
00086
00087 patch_pairs_alloc = 0;
00088 force_buffers_alloc = 0;
00089 force_lists_alloc = 0;
00090 atoms_alloc = 0;
00091
00092 cudaStreamCreate(&stream);
00093 cuda_errcheck("cudaStreamCreate");
00094 }
00095
00096 void cuda_bind_patch_pairs(const patch_pair *pp, int npp,
00097 const force_list *fl, int nfl,
00098 int atoms_size_p, int force_buffers_size_p,
00099 int max_atoms_per_patch_p) {
00100
00101 patch_pairs_size = npp;
00102 force_buffers_size = force_buffers_size_p;
00103 force_lists_size = nfl;
00104 atoms_size = atoms_size_p;
00105 max_atoms_per_patch = max_atoms_per_patch_p;
00106
00107 #if 0
00108 printf("%d %d %d %d %d %d %d %d\n",
00109 patch_pairs_size , patch_pairs_alloc ,
00110 force_buffers_size , force_buffers_alloc ,
00111 force_lists_size , force_lists_alloc ,
00112 atoms_size , atoms_alloc );
00113 #endif
00114
00115 if ( patch_pairs_size > patch_pairs_alloc ||
00116 force_buffers_size > force_buffers_alloc ||
00117 force_lists_size > force_lists_alloc ||
00118 atoms_size > atoms_alloc ) {
00119
00120 patch_pairs_alloc = (int) (1.2 * patch_pairs_size);
00121 force_buffers_alloc = (int) (1.2 * force_buffers_size);
00122 force_lists_alloc = (int) (1.2 * force_lists_size);
00123 atoms_alloc = (int) (1.2 * atoms_size);
00124
00125 if ( forces ) cudaFree(forces);
00126 if ( slow_forces ) cudaFree(slow_forces);
00127 if ( atom_params ) cudaFree(atom_params);
00128 if ( atoms ) cudaFree(atoms);
00129 if ( force_buffers ) cudaFree(force_buffers);
00130 if ( slow_force_buffers ) cudaFree(slow_force_buffers);
00131 if ( force_lists ) cudaFree(force_lists);
00132 if ( patch_pairs ) cudaFree(patch_pairs);
00133 cuda_errcheck("free everything");
00134
00135 #if 1
00136 int totalmem = patch_pairs_alloc * sizeof(patch_pair) +
00137 force_lists_alloc * sizeof(force_list) +
00138 2 * force_buffers_alloc * sizeof(float4) +
00139 atoms_alloc * sizeof(atom) +
00140 atoms_alloc * sizeof(atom_param) +
00141 2 * atoms_alloc * sizeof(float4);
00142
00143 #endif
00144
00145 cudaMalloc((void**) &patch_pairs, patch_pairs_alloc * sizeof(patch_pair));
00146 cudaMalloc((void**) &force_lists, force_lists_alloc * sizeof(force_list));
00147 cudaMalloc((void**) &force_buffers, force_buffers_alloc * sizeof(float4));
00148 cudaMalloc((void**) &slow_force_buffers, force_buffers_alloc * sizeof(float4));
00149 cudaMalloc((void**) &atoms, atoms_alloc * sizeof(atom));
00150 cudaMalloc((void**) &atom_params, atoms_alloc * sizeof(atom_param));
00151 cudaMalloc((void**) &forces, atoms_alloc * sizeof(float4));
00152 cudaMalloc((void**) &slow_forces, atoms_alloc * sizeof(float4));
00153 cuda_errcheck("malloc everything");
00154
00155 }
00156
00157 cudaMemcpy(patch_pairs, pp, npp * sizeof(patch_pair),
00158 cudaMemcpyHostToDevice);
00159 cuda_errcheck("memcpy to patch_pairs");
00160
00161 cudaMemcpy(force_lists, fl, nfl * sizeof(force_list),
00162 cudaMemcpyHostToDevice);
00163 cuda_errcheck("memcpy to force_lists");
00164 }
00165
00166 void cuda_bind_atom_params(const atom_param *t) {
00167 cudaMemcpyAsync(atom_params, t, atoms_size * sizeof(atom_param),
00168 cudaMemcpyHostToDevice, stream);
00169 cuda_errcheck("memcpy to atom_params");
00170 }
00171
00172 void cuda_bind_atoms(const atom *a) {
00173 cuda_errcheck("before memcpy to atoms");
00174 cudaMemcpyAsync(atoms, a, atoms_size * sizeof(atom),
00175 cudaMemcpyHostToDevice, stream);
00176 cuda_errcheck("memcpy to atoms");
00177 }
00178
00179 void cuda_load_forces(float4 *f, float4 *f_slow, int begin, int count) {
00180
00181 cudaMemcpyAsync(f+begin, forces+begin, count * sizeof(float4),
00182 cudaMemcpyDeviceToHost, stream);
00183 if ( f_slow ) {
00184 cudaMemcpyAsync(f_slow+begin, slow_forces+begin, count * sizeof(float4),
00185 cudaMemcpyDeviceToHost, stream);
00186 }
00187 cuda_errcheck("memcpy from forces");
00188 }
00189
00190
00191 #if 0
00192 __host__ __device__ static int3 patch_coords_from_id(
00193 dim3 PATCH_GRID, int id) {
00194
00195 return make_int3( id % PATCH_GRID.x,
00196 ( id / PATCH_GRID.x ) % PATCH_GRID.y,
00197 id / ( PATCH_GRID.x * PATCH_GRID.y ) );
00198 }
00199
00200 __host__ __device__ static int patch_id_from_coords(
00201 dim3 PATCH_GRID, int3 coords) {
00202
00203
00204 int x = (coords.x + 4 * PATCH_GRID.x) % PATCH_GRID.x;
00205 int y = (coords.y + 4 * PATCH_GRID.y) % PATCH_GRID.y;
00206 int z = (coords.z + 4 * PATCH_GRID.z) % PATCH_GRID.z;
00207
00208 return ( z * PATCH_GRID.y + y ) * PATCH_GRID.x + x;
00209 }
00210
00211 __host__ __device__ static int3 patch_offset_from_neighbor(int neighbor) {
00212
00213
00214 int3 coords = patch_coords_from_id(make_uint3(3,3,3), neighbor);
00215 return make_int3(coords.x - 1, coords.y - 1, coords.z - 1);
00216
00217 }
00218 #endif
00219
00220 #define BLOCK_SIZE 64
00221 #define SHARED_SIZE 16
00222
00223 __global__ static void dev_nonbonded(
00224 const patch_pair *patch_pairs,
00225 const atom *atoms,
00226 const atom_param *atom_params,
00227 float4 *force_buffers,
00228 float4 *slow_force_buffers,
00229 float3 lata, float3 latb, float3 latc,
00230 float cutoff2) {
00231
00232
00233
00234
00235 #define jpqs jpqu.d
00236 __shared__ union {
00237 atom d[SHARED_SIZE];
00238 unsigned int i[BLOCK_SIZE];
00239 float f[BLOCK_SIZE];
00240 } jpqu;
00241
00242 #define japs japu.d
00243 __shared__ union {
00244 atom_param d[SHARED_SIZE];
00245 unsigned int i[BLOCK_SIZE];
00246 } japu;
00247
00248 #define myPatchPair pp.pp
00249 __shared__ union { patch_pair pp; unsigned int i[12]; } pp;
00250
00251 if ( threadIdx.x < (sizeof(patch_pair)>>2) ) {
00252 unsigned int tmp = ((unsigned int*)patch_pairs)[
00253 (sizeof(patch_pair)>>2)*blockIdx.x+threadIdx.x];
00254 pp.i[threadIdx.x] = tmp;
00255 }
00256 __syncthreads();
00257
00258
00259 if ( threadIdx.x == 0 ) {
00260 float offx = myPatchPair.offset.x * lata.x
00261 + myPatchPair.offset.y * latb.x
00262 + myPatchPair.offset.z * latc.x;
00263 float offy = myPatchPair.offset.x * lata.y
00264 + myPatchPair.offset.y * latb.y
00265 + myPatchPair.offset.z * latc.y;
00266 float offz = myPatchPair.offset.x * lata.z
00267 + myPatchPair.offset.y * latb.z
00268 + myPatchPair.offset.z * latc.z;
00269 myPatchPair.offset.x = offx;
00270 myPatchPair.offset.y = offy;
00271 myPatchPair.offset.z = offz;
00272 }
00273 __syncthreads();
00274
00275 #if 0
00276
00277 if ( blockIdx.y ) {
00278
00279 if ( myPatchPair.patch1_force_start == myPatchPair.patch2_force_start ) {
00280 return;
00281 }
00282
00283 } else {
00284
00285 if ( threadIdx.x == 0 &&
00286 myPatchPair.patch1_force_start != myPatchPair.patch2_force_start ) {
00287
00288 #undef SWAP
00289 #define SWAP(FIELD1, FIELD2) { \
00290 unsigned int tmp1 = myPatchPair.FIELD1; \
00291 unsigned int tmp2 = myPatchPair.FIELD2; \
00292 if ( tmp1 != tmp2 ) { \
00293 myPatchPair.FIELD1 = tmp2; \
00294 myPatchPair.FIELD2 = tmp1; \
00295 } \
00296 }
00297
00298 SWAP(patch1_size, patch2_size)
00299 SWAP(patch1_force_size, patch2_force_size)
00300 SWAP(patch1_atom_start, patch2_atom_start)
00301 SWAP(patch1_force_start, patch2_force_start)
00302
00303 myPatchPair.offset.x *= -1.f;
00304 myPatchPair.offset.y *= -1.f;
00305 myPatchPair.offset.z *= -1.f;
00306
00307 }
00308
00309 __syncthreads();
00310
00311 }
00312 #endif
00313
00314 for ( int blocki = 0;
00315 blocki < myPatchPair.patch1_force_size;
00316 blocki += BLOCK_SIZE ) {
00317
00318 __syncthreads();
00319
00320 atom ipq;
00321 struct {
00322 float sqrt_epsilon;
00323 float half_sigma;
00324 int index; } iap;
00325
00326
00327 if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00328 int i = myPatchPair.patch1_atom_start + blocki + threadIdx.x;
00329 float4 tmpa = ((float4*)atoms)[i];
00330
00331 ipq.position.x = tmpa.x + myPatchPair.offset.x;
00332 ipq.position.y = tmpa.y + myPatchPair.offset.y;
00333 ipq.position.z = tmpa.z + myPatchPair.offset.z;
00334 ipq.charge = tmpa.w;
00335
00336 uint4 tmpap = ((uint4*)atom_params)[i];
00337
00338 jpqu.i[threadIdx.x] = tmpap.x;
00339 iap.sqrt_epsilon = jpqu.f[threadIdx.x];
00340 jpqu.i[threadIdx.x] = tmpap.y;
00341 iap.half_sigma = jpqu.f[threadIdx.x];
00342 iap.index = tmpap.z;
00343 }
00344
00345 float4 ife, ife_slow;
00346 ife.x = 0.f;
00347 ife.y = 0.f;
00348 ife.z = 0.f;
00349 ife.w = 0.f;
00350 ife_slow.x = 0.f;
00351 ife_slow.y = 0.f;
00352 ife_slow.z = 0.f;
00353 ife_slow.w = 0.f;
00354
00355 for ( int blockj = 0;
00356 blockj < myPatchPair.patch2_size;
00357 blockj += SHARED_SIZE ) {
00358
00359 int shared_size = myPatchPair.patch2_size - blockj;
00360 if ( shared_size > SHARED_SIZE ) shared_size = SHARED_SIZE;
00361
00362
00363
00364 __syncthreads();
00365
00366 if ( threadIdx.x < 4 * shared_size ) {
00367 int j = myPatchPair.patch2_atom_start + blockj;
00368 jpqu.i[threadIdx.x] = ((unsigned int *)(atoms + j))[threadIdx.x];
00369 japu.i[threadIdx.x] = ((unsigned int *)(atom_params + j))[threadIdx.x];
00370 }
00371 __syncthreads();
00372
00373
00374 if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00375
00376
00377 #define FORCE_INNER_LOOP(IPQ,IAP) \
00378 for ( int j = 0; j < shared_size; ++j ) { \
00379 \
00380 float tmpx = jpqs[j].position.x - IPQ.position.x; \
00381 float tmpy = jpqs[j].position.y - IPQ.position.y; \
00382 float tmpz = jpqs[j].position.z - IPQ.position.z; \
00383 float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz; \
00384 if ( r2 < cutoff2 ) { \
00385 float4 fi = tex1D(force_table, 1.f/sqrt(r2)); \
00386 bool excluded = false; \
00387 int indexdiff = (int)(IAP.index) - (int)(japs[j].index); \
00388 if ( abs(indexdiff) <= (int) japs[j].excl_maxdiff ) { \
00389 indexdiff += japs[j].excl_index; \
00390 excluded = ((exclusions[indexdiff>>5] & (1<<(indexdiff&31))) != 0); \
00391 } \
00392 float e = IAP.half_sigma + japs[j].half_sigma; \
00393 e *= e*e; \
00394 e *= e; \
00395 e *= ( e * fi.z + fi.y ); \
00396 e *= IAP.sqrt_epsilon * japs[j].sqrt_epsilon; \
00397 float e_slow = IPQ.charge * jpqs[j].charge; \
00398 e += e_slow * fi.x; \
00399 e_slow *= fi.w; \
00400 if ( ! excluded ) { \
00401 ife.w += r2 * e; \
00402 ife.x += tmpx * e; \
00403 ife.y += tmpy * e; \
00404 ife.z += tmpz * e; \
00405 ife_slow.w += r2 * e_slow; \
00406 ife_slow.x += tmpx * e_slow; \
00407 ife_slow.y += tmpy * e_slow; \
00408 ife_slow.z += tmpz * e_slow; \
00409 } \
00410 } \
00411 }
00412 FORCE_INNER_LOOP(ipq,iap)
00413
00414 }
00415 }
00416
00417 if ( blocki + threadIdx.x < myPatchPair.patch1_force_size ) {
00418 int i_out = myPatchPair.patch1_force_start + blocki + threadIdx.x;
00419 force_buffers[i_out] = ife;
00420 if ( slow_force_buffers ) {
00421 slow_force_buffers[i_out] = ife_slow;
00422 }
00423 }
00424
00425 }
00426
00427 }
00428
00429
00430 __global__ static void dev_sum_forces(
00431 const force_list *force_lists,
00432 const float4 *force_buffers,
00433 float4 *forces) {
00434
00435
00436
00437
00438 __shared__ force_list myForceList;
00439
00440 if ( threadIdx.x == 0 ) {
00441 myForceList = force_lists[blockIdx.x];
00442 }
00443 __syncthreads();
00444
00445 for ( int j = threadIdx.x; j < myForceList.patch_size; j += blockDim.x ) {
00446
00447 const float4 *fbuf = force_buffers + myForceList.force_list_start + j;
00448 float4 fout;
00449 fout.x = 0.f;
00450 fout.y = 0.f;
00451 fout.z = 0.f;
00452 fout.w = 0.f;
00453 for ( int i=0; i < myForceList.force_list_size; ++i ) {
00454 float4 f = *fbuf;
00455 fout.x += f.x;
00456 fout.y += f.y;
00457 fout.z += f.z;
00458 fout.w += f.w;
00459 fbuf += myForceList.patch_size;
00460 }
00461
00462 forces[myForceList.force_output_start + j] = fout;
00463
00464 }
00465 }
00466
00467
00468 void cuda_nonbonded_forces(float3 lata, float3 latb, float3 latc, float cutoff2,
00469 int cbegin, int ccount, int pbegin, int pcount, int doSlow) {
00470
00471 if ( ccount ) {
00472
00473 dev_nonbonded<<< ccount, BLOCK_SIZE, 0, stream
00474 >>>(patch_pairs+cbegin,atoms,atom_params,force_buffers,
00475 (doSlow?slow_force_buffers:0), lata, latb, latc, cutoff2);
00476 cuda_errcheck("dev_nonbonded");
00477 }
00478
00479 if ( pcount ) {
00480
00481 dev_sum_forces<<< pcount, 64, 0, stream
00482 >>>(force_lists+pbegin,force_buffers,forces);
00483 if ( doSlow ) {
00484 dev_sum_forces<<< pcount, 64, 0, stream
00485 >>>(force_lists+pbegin,slow_force_buffers,slow_forces);
00486 }
00487 cuda_errcheck("dev_sum_forces");
00488 }
00489
00490 }
00491
00492
00493 int cuda_stream_finished() {
00494 return ( cudaStreamQuery(stream) == cudaSuccess );
00495 }
00496
00497
00498 #endif // NAMD_CUDA
00499