00001
00002 #include "common.h"
00003 #include "charm++.h"
00004
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ComputeNonbondedCUDA.h"
00013 #include "ComputeNonbondedCUDAKernel.h"
00014 #include "ObjectArena.h"
00015
00016 #ifdef NAMD_CUDA
00017
00018 extern cudaStream_t stream;
00019
00020 void cuda_errcheck(const char *msg) {
00021 cudaError_t err;
00022 if ((err = cudaGetLastError()) != cudaSuccess) {
00023 char errmsg[1024];
00024 sprintf(errmsg,"CUDA error %s: %s", msg, cudaGetErrorString(err));
00025 NAMD_die(errmsg);
00026 }
00027 }
00028
00029 char *devicelist;
00030 static int usedevicelist;
00031 static int ignoresharing;
00032
00033 void cuda_getargs(char **argv) {
00034 devicelist = 0;
00035 usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00036 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00037 ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00038 }
00039
00040 static int shared_gpu;
00041 static int first_pe_sharing_gpu;
00042 static int next_pe_sharing_gpu;
00043
00044 static int gpu_is_mine;
00045
00046 void cuda_initialize() {
00047
00048 char host[128];
00049 #ifdef NOHOSTNAME
00050 sprintf(host,"unknown");
00051 #else
00052 gethostname(host, 128); host[127] = 0;
00053 #endif
00054
00055 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00056 int myRankInPhysicalNode;
00057 int numPesOnPhysicalNode;
00058 int *pesOnPhysicalNode;
00059 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00060 &pesOnPhysicalNode,&numPesOnPhysicalNode);
00061 {
00062 int i;
00063 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00064 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00065 i = numPesOnPhysicalNode;
00066 break;
00067 }
00068 if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00069 }
00070 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00071 CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00072 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00073 CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00074 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00075 }
00076 myRankInPhysicalNode = 0;
00077 numPesOnPhysicalNode = 1;
00078 pesOnPhysicalNode = new int[1];
00079 pesOnPhysicalNode[0] = CkMyPe();
00080 } else {
00081 myRankInPhysicalNode = i;
00082 }
00083 }
00084
00085
00086 int deviceCount = 0;
00087 cudaGetDeviceCount(&deviceCount);
00088 if ( deviceCount <= 0 ) {
00089 NAMD_die("No CUDA devices found.");
00090 }
00091
00092 int *devices;
00093 int ndevices = 0;
00094 int nexclusive = 0;
00095 if ( usedevicelist ) {
00096 devices = new int[strlen(devicelist)];
00097 int i = 0;
00098 while ( devicelist[i] ) {
00099 ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00100 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00101 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00102 }
00103 } else {
00104 if ( ! CkMyPe() ) {
00105 CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00106 }
00107 devices = new int[deviceCount];
00108 for ( int i=0; i<deviceCount; ++i ) {
00109 int dev = i % deviceCount;
00110 #if CUDA_VERSION >= 2020
00111 cudaDeviceProp deviceProp;
00112 cudaGetDeviceProperties(&deviceProp, dev);
00113 if ( deviceProp.computeMode != cudaComputeModeProhibited
00114 && deviceProp.multiProcessorCount > 2 ) {
00115 devices[ndevices++] = dev;
00116 }
00117 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00118 ++nexclusive;
00119 }
00120 #else
00121 devices[ndevices++] = dev;
00122 #endif
00123 }
00124 }
00125
00126 if ( ! ndevices ) {
00127 NAMD_die("All CUDA devices are in prohibited mode.");
00128 }
00129
00130 shared_gpu = 0;
00131 gpu_is_mine = 1;
00132 first_pe_sharing_gpu = CkMyPe();
00133 next_pe_sharing_gpu = CkMyPe();
00134
00135 if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) {
00136
00137 int dev;
00138 if ( numPesOnPhysicalNode > 1 ) {
00139 dev = devices[myRankInPhysicalNode % ndevices];
00140 if ( ! ignoresharing ) {
00141 for ( int i = (myRankInPhysicalNode + 1) % numPesOnPhysicalNode;
00142 i != myRankInPhysicalNode;
00143 i = (i + 1) % numPesOnPhysicalNode ) {
00144 if (devices[i % ndevices] == dev) {
00145 shared_gpu = 1;
00146 next_pe_sharing_gpu = pesOnPhysicalNode[i];
00147 break;
00148 }
00149 }
00150 }
00151 if ( shared_gpu ) {
00152 for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00153 if (devices[i % ndevices] == dev) {
00154 first_pe_sharing_gpu = pesOnPhysicalNode[i];
00155 break;
00156 }
00157 }
00158 CkPrintf("Pe %d sharing CUDA device %d first %d next %d\n",
00159 CkMyPe(), dev, first_pe_sharing_gpu, next_pe_sharing_gpu);
00160 }
00161 } else {
00162 dev = devices[CkMyPe() % ndevices];
00163 }
00164
00165
00166
00167 first_pe_sharing_gpu = CkMyPe();
00168 next_pe_sharing_gpu = CkMyPe();
00169
00170 gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() );
00171
00172 if ( dev >= deviceCount ) {
00173 char buf[256];
00174 sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00175 CkMyPe(), dev, host, deviceCount);
00176 NAMD_die(buf);
00177 }
00178
00179 cudaDeviceProp deviceProp;
00180 cudaGetDeviceProperties(&deviceProp, dev);
00181 CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %dMB Rev: %d.%d\n",
00182 CkMyPe(), myRankInPhysicalNode, dev, host,
00183 deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00184 deviceProp.major, deviceProp.minor);
00185
00186 cudaSetDevice(dev);
00187 cudaError_t err;
00188 if ((err = cudaGetLastError()) != cudaSuccess) {
00189 char errmsg[1024];
00190 sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00191 dev, CkMyPe(), cudaGetErrorString(err));
00192 NAMD_die(errmsg);
00193 }
00194
00195 }
00196
00197 if ( sizeof(patch_pair) & 15 ) NAMD_die("sizeof(patch_pair) % 16 != 0");
00198 if ( sizeof(force_list) & 15 ) NAMD_die("sizeof(force_list) % 16 != 0");
00199 if ( sizeof(atom) & 15 ) NAMD_die("sizeof(atom) % 16 != 0");
00200 if ( sizeof(atom_param) & 15 ) NAMD_die("sizeof(atom_param) % 16 != 0");
00201
00202 cuda_init();
00203
00204 }
00205
00206
00207 void build_cuda_force_table() {
00208 ComputeNonbondedCUDA::build_force_table();
00209 }
00210
00211 void ComputeNonbondedCUDA::build_force_table() {
00212
00213 float4 t[FORCE_TABLE_SIZE];
00214
00215 const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00216 const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00217
00218 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00219
00220 double r2list[FORCE_TABLE_SIZE];
00221 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00222 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00223 r2list[i] = r*r + r2_delta;
00224 }
00225
00226 union { double f; int32 i[2]; } byte_order_test;
00227 byte_order_test.f = 1.0;
00228 int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00229
00230 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00231 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00232 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
00233
00234 if ( r > cutoff ) {
00235 t[i].x = 0.;
00236 t[i].y = 0.;
00237 t[i].z = 0.;
00238 t[i].w = 0.;
00239 continue;
00240 }
00241
00242 BigReal diffa = r2list[i] - r2_table[table_i];
00243
00244
00245
00246 {
00247
00248 BigReal table_b = fast_table[4*table_i+1];
00249 BigReal table_c = fast_table[4*table_i+2];
00250 BigReal table_d = fast_table[4*table_i+3];
00251 BigReal grad =
00252 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00253 t[i].x = 2. * grad;
00254 }
00255
00256
00257
00258
00259 {
00260
00261 BigReal table_b = scor_table[4*table_i+1];
00262 BigReal table_c = scor_table[4*table_i+2];
00263 BigReal table_d = scor_table[4*table_i+3];
00264 BigReal grad =
00265 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00266 t[i].w = 2. * grad;
00267 }
00268
00269
00270
00271
00272 {
00273
00274 BigReal table_b = vdwb_table[4*table_i+1];
00275 BigReal table_c = vdwb_table[4*table_i+2];
00276 BigReal table_d = vdwb_table[4*table_i+3];
00277 BigReal grad =
00278 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00279 t[i].y = 2. * -1. * grad;
00280 }
00281
00282
00283
00284
00285 {
00286
00287 BigReal table_b = vdwa_table[4*table_i+1];
00288 BigReal table_c = vdwa_table[4*table_i+2];
00289 BigReal table_d = vdwa_table[4*table_i+3];
00290 BigReal grad =
00291 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00292 t[i].z = 2. * grad;
00293 }
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308 }
00309
00310 t[0].x = 0.f;
00311 t[0].y = 0.f;
00312 t[0].z = 0.f;
00313 t[0].w = 0.f;
00314
00315 cuda_bind_force_table(t);
00316
00317 if ( ! CkMyPe() ) {
00318 CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00319 }
00320 }
00321
00322 static ComputeNonbondedCUDA* cudaCompute = 0;
00323 static ComputeMgr *computeMgr = 0;
00324
00325 static ushort2 *exclusionsByAtom;
00326
00327 void ComputeNonbondedCUDA::build_exclusions() {
00328 Molecule *mol = Node::Object()->molecule;
00329 int natoms = mol->numAtoms;
00330 exclusionsByAtom = new ushort2[natoms];
00331
00332
00333
00334 ObjectArena<int32> listArena;
00335 ResizeArray<int32*> unique_lists;
00336 SortableResizeArray<int32> curList;
00337 int totalbits = 0;
00338 for ( int i=0; i<natoms; ++i ) {
00339 const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00340 curList.resize(0);
00341 int n = mol_list[0] + 1;
00342 curList.add(0);
00343 for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00344 curList.sort();
00345
00346 int j;
00347 for ( j=0; j<unique_lists.size(); ++j ) {
00348 if ( n != unique_lists[j][0] ) continue;
00349 int k;
00350 for ( k=0; k<n; ++k ) {
00351 if ( unique_lists[j][k+3] != curList[k] ) break;
00352 }
00353 if ( k == n ) break;
00354 }
00355 if ( j == unique_lists.size() ) {
00356 int32 *list = listArena.getNewArray(n+3);
00357 list[0] = n;
00358 int maxdiff = 0;
00359 maxdiff = -1 * curList[0];
00360 if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00361 list[1] = maxdiff;
00362 list[2] = totalbits + maxdiff;
00363 totalbits += 2*maxdiff + 1;
00364 for ( int k=0; k<n; ++k ) {
00365 list[k+3] = curList[k];
00366 }
00367 unique_lists.add(list);
00368 }
00369 exclusionsByAtom[i].x = unique_lists[j][1];
00370 exclusionsByAtom[i].y = unique_lists[j][2];
00371 }
00372
00373 if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00374
00375 if ( ! CkMyPe() ) {
00376 CkPrintf("Info: Found %d unique exclusion lists needing %d bytes\n",
00377 unique_lists.size(), totalbits / 8);
00378 }
00379
00380 #define SET_EXCL(EXCL,BASE,DIFF) \
00381 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00382
00383 unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00384 memset(exclusion_bits, 0, totalbits/8);
00385
00386 int base = 0;
00387 for ( int i=0; i<unique_lists.size(); ++i ) {
00388 base += unique_lists[i][1];
00389 if ( base != unique_lists[i][2] ) {
00390 NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00391 }
00392 int n = unique_lists[i][0];
00393 for ( int j=0; j<n; ++j ) {
00394 SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00395 }
00396 base += unique_lists[i][1] + 1;
00397 }
00398
00399 cuda_bind_exclusions(exclusion_bits, totalbits/32);
00400
00401 delete [] exclusion_bits;
00402 }
00403
00404
00405 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00406
00407 if ( ! cudaCompute ) NAMD_die("register_self called early");
00408
00409 cudaCompute->requirePatch(pid);
00410
00411 ComputeNonbondedCUDA::compute_record cr;
00412 cr.c = c;
00413 cr.pid[0] = pid; cr.pid[1] = pid;
00414 cr.offset = 0.;
00415 if ( cudaCompute->patchMap->node(pid) == CkMyPe() ) {
00416 cudaCompute->localComputeRecords.add(cr);
00417 } else {
00418 cudaCompute->remoteComputeRecords.add(cr);
00419 }
00420 }
00421
00422 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00423
00424 if ( ! cudaCompute ) NAMD_die("register_pair called early");
00425
00426 cudaCompute->requirePatch(pid[0]);
00427 cudaCompute->requirePatch(pid[1]);
00428
00429 ComputeNonbondedCUDA::compute_record cr, cr2;
00430 cr.c = c; cr2.c = c;
00431 cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
00432 cr2.pid[0] = pid[1]; cr2.pid[1] = pid[0];
00433
00434 int t1 = t[0];
00435 int t2 = t[1];
00436 Vector offset = cudaCompute->patchMap->center(pid[0])
00437 - cudaCompute->patchMap->center(pid[1]);
00438 offset.x += (t1%3-1) - (t2%3-1);
00439 offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00440 offset.z += (t1/9-1) - (t2/9-1);
00441 cr.offset = offset;
00442 cr2.offset = -1. * offset;
00443
00444 if ( cudaCompute->patchMap->node(pid[0]) == CkMyPe() ) {
00445 cudaCompute->localComputeRecords.add(cr);
00446 } else {
00447 cudaCompute->remoteComputeRecords.add(cr);
00448 }
00449 if ( cudaCompute->patchMap->node(pid[1]) == CkMyPe() ) {
00450 cudaCompute->localComputeRecords.add(cr2);
00451 } else {
00452 cudaCompute->remoteComputeRecords.add(cr2);
00453 }
00454 }
00455
00456 void unregister_cuda_compute(ComputeID c) {
00457
00458 NAMD_die("unregister_compute unimplemented");
00459
00460 }
00461
00462 static int atomsChanged = 0;
00463 static int computesChanged = 0;
00464
00465 static cudaEvent_t start_upload;
00466 static cudaEvent_t start_calc;
00467 static cudaEvent_t end_remote_calc;
00468 static cudaEvent_t end_remote_download;
00469 static cudaEvent_t end_local_calc;
00470 static cudaEvent_t end_local_download;
00471
00472 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr) : Compute(c) {
00473
00474 cudaCompute = this;
00475 computeMgr = mgr;
00476 patchMap = PatchMap::Object();
00477 atomMap = AtomMap::Object();
00478 reduction = 0;
00479 build_exclusions();
00480
00481 SimParameters *params = Node::Object()->simParameters;
00482 if (params->pressureProfileOn) {
00483 NAMD_die("pressure profile not supported in CUDA");
00484 }
00485
00486 atomsChanged = 1;
00487 computesChanged = 1;
00488 workStarted = 0;
00489 basePriority = PROXY_DATA_PRIORITY;
00490
00491 cudaEventCreate(&start_upload);
00492 cudaEventCreate(&start_calc);
00493 cudaEventCreate(&end_remote_calc);
00494 cudaEventCreate(&end_remote_download);
00495 cudaEventCreate(&end_local_calc);
00496 cudaEventCreate(&end_local_download);
00497 }
00498
00499
00500 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00501
00502 void ComputeNonbondedCUDA::requirePatch(int pid) {
00503
00504 if ( ! reduction ) {
00505 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00506 }
00507 computesChanged = 1;
00508 patch_record &pr = patchRecords.item(pid);
00509 if ( pr.refCount == 0 ) {
00510 pr.isLocal = ( patchMap->node(pid) == CkMyPe() );
00511 if ( pr.isLocal ) {
00512 localActivePatches.add(pid);
00513 } else {
00514 remoteActivePatches.add(pid);
00515 }
00516 activePatches.add(pid);
00517 setNumPatches(activePatches.size());
00518 pr.patchID = pid;
00519 pr.p = patchMap->patch(pid);
00520 pr.positionBox = pr.p->registerPositionPickup(cid);
00521 pr.forceBox = pr.p->registerForceDeposit(cid);
00522 pr.x = NULL;
00523 pr.xExt = NULL;
00524 pr.r = NULL;
00525 pr.f = NULL;
00526 }
00527 pr.refCount += 1;
00528 }
00529
00530 static ResizeArray<patch_pair> patch_pairs;
00531 static ResizeArray<force_list> force_lists;
00532
00533 static int num_atom_records_allocated;
00534 static atom_param* atom_params;
00535 static atom* atoms;
00536 static float4* forces;
00537 static float4* slow_forces;
00538
00539 static int cuda_timer_count;
00540 static double cuda_timer_total;
00541 static double kernel_time;
00542
00543 static int ccd_index_remote_download;
00544 static int ccd_index_local_download;
00545 #define CUDA_CONDITION CcdPERIODIC
00546
00547 void cuda_check_remote_progress(void *arg, double) {
00548 if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
00549
00550 WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00551 } else {
00552 ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION, cuda_check_remote_progress, arg);
00553 }
00554 }
00555
00556 void cuda_check_local_progress(void *arg, double) {
00557 if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
00558 kernel_time += CkWallTimer();
00559 WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00560 } else {
00561 ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION, cuda_check_local_progress, arg);
00562 }
00563 }
00564
00565 #if 0
00566
00567 void cuda_check_progress(void *arg, double) {
00568 if ( cuda_stream_finished() ) {
00569 kernel_time += CkWallTimer();
00570 CcdCallOnCondition(CUDA_CONDITION, ccd_index);
00571
00572 WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00573 }
00574 }
00575 #endif
00576
00577 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
00578
00579 static int kernel_launch_state = 0;
00580
00581 void ComputeNonbondedCUDA::doWork() {
00582
00583
00584
00585 if ( workStarted ) {
00586 if ( finishWork() ) {
00587 workStarted = 0;
00588 basePriority = PROXY_DATA_PRIORITY;
00589 } else {
00590 workStarted = 2;
00591 basePriority = COMPUTE_HOME_PRIORITY;
00592 if ( kernel_launch_state > 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00593 }
00594 return;
00595 }
00596 workStarted = 1;
00597 basePriority = COMPUTE_PROXY_PRIORITY;
00598
00599 Molecule *mol = Node::Object()->molecule;
00600 Parameters *params = Node::Object()->parameters;
00601 SimParameters *simParams = Node::Object()->simParameters;
00602
00603 if ( simParams->useFlexibleCell ) {
00604 NAMD_die("flexible-cell constant pressure not supported in CUDA");
00605 }
00606
00607 for ( int i=0; i<activePatches.size(); ++i ) {
00608 patch_record &pr = patchRecords[activePatches[i]];
00609 pr.x = pr.positionBox->open();
00610 pr.xExt = pr.p->getCompAtomExtInfo();
00611 }
00612
00613 if ( atomsChanged || computesChanged ) {
00614 int npatches = activePatches.size();
00615
00616 if ( computesChanged ) {
00617 computesChanged = 0;
00618
00619 int *ap = activePatches.begin();
00620 for ( int i=0; i<localActivePatches.size(); ++i ) {
00621 *(ap++) = localActivePatches[i];
00622 }
00623 for ( int i=0; i<remoteActivePatches.size(); ++i ) {
00624 *(ap++) = remoteActivePatches[i];
00625 }
00626
00627 int nlc = localComputeRecords.size();
00628 int nrc = remoteComputeRecords.size();
00629 computeRecords.resize(nlc+nrc);
00630 compute_record *cr = computeRecords.begin();
00631 for ( int i=0; i<nlc; ++i ) {
00632 *(cr++) = localComputeRecords[i];
00633 }
00634 for ( int i=0; i<nrc; ++i ) {
00635 *(cr++) = remoteComputeRecords[i];
00636 }
00637
00638 force_lists.resize(npatches);
00639 for ( int i=0; i<npatches; ++i ) {
00640 patchRecords[activePatches[i]].localIndex = i;
00641 force_lists[i].force_list_size = 0;
00642 }
00643
00644 int ncomputes = computeRecords.size();
00645 patch_pairs.resize(ncomputes);
00646 for ( int i=0; i<ncomputes; ++i ) {
00647 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00648 int lp1 = patchRecords[cr.pid[0]].localIndex;
00649 int lp2 = patchRecords[cr.pid[1]].localIndex;
00650 force_lists[lp1].force_list_size++;
00651 patch_pair &pp = patch_pairs[i];
00652 pp.offset.x = cr.offset.x;
00653 pp.offset.y = cr.offset.y;
00654 pp.offset.z = cr.offset.z;
00655 }
00656
00657 if ( simParams->outputCudaTiming ) {
00658 CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
00659 CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
00660 localComputeRecords.size(), remoteComputeRecords.size());
00661 }
00662 }
00663
00664 int istart = 0;
00665 int flstart = 0;
00666 int max_atoms_per_patch = 0;
00667 int i;
00668 for ( i=0; i<npatches; ++i ) {
00669 if ( i == localActivePatches.size() ) {
00670 num_local_atom_records = istart;
00671 }
00672 force_lists[i].force_list_start = flstart;
00673 force_lists[i].force_output_start = istart;
00674 patch_record &pr = patchRecords[activePatches[i]];
00675 pr.localStart = istart;
00676 int natoms = pr.p->getNumAtoms();
00677 int nfreeatoms = natoms;
00678 if ( fixedAtomsOn ) {
00679 const CompAtomExt *aExt = pr.xExt;
00680 for ( int j=0; j<natoms; ++j ) {
00681 if ( aExt[j].atomFixed ) --nfreeatoms;
00682 }
00683 }
00684 if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
00685 pr.numAtoms = natoms;
00686 pr.numFreeAtoms = nfreeatoms;
00687 if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
00688 if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
00689 force_lists[i].patch_size = nfreeatoms;
00690 flstart += nfreeatoms * force_lists[i].force_list_size;
00691 istart += natoms;
00692 force_lists[i].force_list_size = 0;
00693 }
00694 if ( i == localActivePatches.size() ) {
00695 num_local_atom_records = istart;
00696 }
00697 num_force_records = flstart;
00698 num_atom_records = istart;
00699 num_remote_atom_records = num_atom_records - num_local_atom_records;
00700 if ( num_atom_records > num_atom_records_allocated ) {
00701 if ( num_atom_records_allocated ) {
00702 cudaFreeHost(atom_params);
00703 cudaFreeHost(atoms);
00704 cudaFreeHost(forces);
00705 cudaFreeHost(slow_forces);
00706 }
00707 num_atom_records_allocated = 1.1 * num_atom_records + 1;
00708 cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
00709 cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
00710 cudaMallocHost((void**)&forces,sizeof(float4)*num_atom_records_allocated);
00711 cudaMallocHost((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated);
00712 }
00713
00714 #if 0
00715 if ( max_atoms_per_patch > MAX_ATOMS_PER_PATCH ) {
00716 char errstr[1024];
00717 sprintf(errstr,"Found patch with %d atoms; limit is %d for CUDA."
00718 " Try enabling twoAwayX, twoAwayY, and/or twoAwayZ to fix this.",
00719 max_atoms_per_patch, MAX_ATOMS_PER_PATCH);
00720 NAMD_die(errstr);
00721 }
00722 #endif
00723
00724 int ncomputes = computeRecords.size();
00725 for ( int i=0; i<ncomputes; ++i ) {
00726 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00727 int p1 = cr.pid[0];
00728 int p2 = cr.pid[1];
00729 int lp1 = patchRecords[p1].localIndex;
00730 int lp2 = patchRecords[p2].localIndex;
00731 patch_pair &pp = patch_pairs[i];
00732 pp.patch1_atom_start = patchRecords[p1].localStart;
00733 pp.patch1_force_start = force_lists[lp1].force_list_start +
00734 force_lists[lp1].patch_size * force_lists[lp1].force_list_size;
00735 pp.patch1_size = patchRecords[p1].numAtoms;
00736 pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
00737
00738
00739 pp.patch2_atom_start = patchRecords[p2].localStart;
00740
00741
00742 pp.patch2_size = patchRecords[p2].numAtoms;
00743
00744
00745 force_lists[lp1].force_list_size++;
00746
00747
00748
00749
00750
00751 }
00752
00753 #if 0
00754 CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
00755 patch_pairs.size(), force_lists.size(),
00756 num_atom_records, num_force_records,
00757 max_atoms_per_patch);
00758 #endif
00759
00760 int totalmem = patch_pairs.size() * sizeof(patch_pair) +
00761 force_lists.size() * sizeof(force_list) +
00762 num_force_records * sizeof(float4) +
00763 num_atom_records * sizeof(atom) +
00764 num_atom_records * sizeof(atom_param) +
00765 num_atom_records * sizeof(float4);
00766 int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
00767
00768
00769
00770
00771
00772 cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
00773 force_lists.begin(), force_lists.size(),
00774 num_atom_records, num_force_records,
00775 max_atoms_per_patch);
00776
00777 }
00778
00779 double charge_scaling = sqrt(COLOUMB * scaling * dielectric_1);
00780
00781
00782 for ( int i=0; i<activePatches.size(); ++i ) {
00783 patch_record &pr = patchRecords[activePatches[i]];
00784
00785 int start = pr.localStart;
00786 int n = pr.numAtoms;
00787 const CompAtom *a = pr.x;
00788 const CompAtomExt *aExt = pr.xExt;
00789 if ( atomsChanged ) {
00790 atom_param *ap = atom_params + start;
00791 int k = 0;
00792 for ( int j=0; j<n; ++j ) {
00793
00794 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00795 ap[k].index = aExt[j].id;
00796 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00797 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00798 int vdwtype = mol->atomvdwtype(aExt[j].id);
00799 Real sig, eps, sig14, eps14;
00800 params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00801 ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00802 ap[k].half_sigma = 0.5 * sig;
00803
00804 ++k;
00805 }
00806 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00807
00808 if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00809 ap[k].index = aExt[j].id;
00810 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00811 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00812 int vdwtype = mol->atomvdwtype(aExt[j].id);
00813 Real sig, eps, sig14, eps14;
00814 params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00815 ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00816 ap[k].half_sigma = 0.5 * sig;
00817
00818 ++k;
00819 }
00820 }
00821 {
00822 Vector center =
00823 pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
00824 atom *ap = atoms + start;
00825 int k = 0;
00826 for ( int j=0; j<n; ++j ) {
00827
00828 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00829 ap[k].position.x = a[j].position.x - center.x;
00830 ap[k].position.y = a[j].position.y - center.y;
00831 ap[k].position.z = a[j].position.z - center.z;
00832 ap[k].charge = charge_scaling * a[j].charge;
00833 ++k;
00834 }
00835 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00836
00837 if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00838 ap[k].position.x = a[j].position.x - center.x;
00839 ap[k].position.y = a[j].position.y - center.y;
00840 ap[k].position.z = a[j].position.z - center.z;
00841 ap[k].charge = charge_scaling * a[j].charge;
00842 ++k;
00843 }
00844 }
00845 }
00846
00847 kernel_time = -1. * CkWallTimer();
00848 #if 0
00849 kernel_launch_state = 3;
00850
00851 cudaEventRecord(start_upload, stream);
00852
00853 if ( atomsChanged ) {
00854 cuda_bind_atom_params(atom_params);
00855 }
00856
00857 XXX - THIS PATH NOT UPDATED FOR SLOW FORCES - DO NOT COMPILE
00858 cuda_bind_atoms(atoms);
00859 cudaEventRecord(start_calc, stream);
00860 cuda_nonbonded_forces(cutoff2,
00861 localComputeRecords.size(),remoteComputeRecords.size(),
00862 localActivePatches.size(),remoteActivePatches.size());
00863 cudaEventRecord(end_remote_calc, stream);
00864 cuda_load_forces(forces,num_local_atom_records,num_remote_atom_records);
00865 cudaEventRecord(end_remote_download, stream);
00866 cuda_nonbonded_forces(cutoff2,
00867 0,localComputeRecords.size(),
00868 0,localActivePatches.size());
00869 cudaEventRecord(end_local_calc, stream);
00870 cuda_load_forces(forces,0,num_local_atom_records);
00871 cudaEventRecord(end_local_download, stream);
00872
00873 if ( cuda_stream_finished() ) {
00874 CkPrintf("CUDA not overlapping with CPU work.\n");
00875 }
00876
00877
00878
00879 ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00880 #else
00881
00882 kernel_launch_state = 1;
00883 if ( gpu_is_mine ) recvYieldDevice(-1);
00884
00885 #endif
00886 }
00887
00888 static int ccd_index_remote_calc;
00889 void cuda_check_remote_calc(void *arg, double) {
00890
00891
00892 if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
00893
00894 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
00895
00896 } else {
00897 ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION, cuda_check_remote_calc, arg);
00898 }
00899 }
00900
00901 static int ccd_index_local_calc;
00902 void cuda_check_local_calc(void *arg, double) {
00903
00904
00905 if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
00906
00907 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
00908
00909 } else {
00910 ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION, cuda_check_local_calc, arg);
00911 }
00912 }
00913
00914
00915
00916 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
00917
00918
00919
00920
00921 Flags &flags = patchRecords[activePatches[0]].p->flags;
00922 int doSlow = flags.doFullElectrostatics;
00923
00924 Lattice &lattice = flags.lattice;
00925 float3 lata, latb, latc;
00926 lata.x = lattice.a().x;
00927 lata.y = lattice.a().y;
00928 lata.z = lattice.a().z;
00929 latb.x = lattice.b().x;
00930 latb.y = lattice.b().y;
00931 latb.z = lattice.b().z;
00932 latc.x = lattice.c().x;
00933 latc.y = lattice.c().y;
00934 latc.z = lattice.c().z;
00935
00936 switch ( kernel_launch_state ) {
00937 case 1:
00938 ++kernel_launch_state;
00939 gpu_is_mine = 0;
00940 cudaEventRecord(start_upload, stream);
00941
00942 if ( atomsChanged ) {
00943 cuda_bind_atom_params(atom_params);
00944 }
00945
00946 cuda_bind_atoms(atoms);
00947 cudaEventRecord(start_calc, stream);
00948 cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00949 localComputeRecords.size(),remoteComputeRecords.size(),
00950 localActivePatches.size(),remoteActivePatches.size(), doSlow);
00951 cudaEventRecord(end_remote_calc, stream);
00952 cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00953 num_local_atom_records,num_remote_atom_records);
00954 cudaEventRecord(end_remote_download, stream);
00955 ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00956 if ( shared_gpu ) {
00957 ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_calc,this);
00958 break;
00959 }
00960
00961 case 2:
00962 ++kernel_launch_state;
00963 gpu_is_mine = 0;
00964 cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00965 0,localComputeRecords.size(),
00966 0,localActivePatches.size(), doSlow);
00967 cudaEventRecord(end_local_calc, stream);
00968 cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00969 0,num_local_atom_records);
00970 cudaEventRecord(end_local_download, stream);
00971 if ( workStarted == 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00972 if ( shared_gpu ) {
00973 ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_calc,this);
00974 break;
00975 }
00976
00977 default:
00978 gpu_is_mine = 1;
00979 break;
00980 }
00981
00982 }
00983
00984
00985 int ComputeNonbondedCUDA::finishWork() {
00986
00987 cuda_errcheck("cuda stream completed");
00988
00989 Molecule *mol = Node::Object()->molecule;
00990 SimParameters *simParams = Node::Object()->simParameters;
00991
00992 Flags &flags = patchRecords[activePatches[0]].p->flags;
00993 int doSlow = flags.doFullElectrostatics;
00994
00995 ResizeArray<int> &patches( workStarted == 1 ?
00996 remoteActivePatches : localActivePatches );
00997
00998 for ( int i=0; i<patches.size(); ++i ) {
00999 patch_record &pr = patchRecords[patches[i]];
01000 pr.r = pr.forceBox->open();
01001 pr.f = pr.r->f[Results::nbond];
01002 }
01003
01004
01005 double virial = 0.;
01006 double virial_slow = 0.;
01007
01008 for ( int i=0; i<patches.size(); ++i ) {
01009 patch_record &pr = patchRecords[patches[i]];
01010 int start = pr.localStart;
01011 int n = pr.numAtoms;
01012 Force *f = pr.f;
01013 Force *f_slow = pr.r->f[Results::slow];
01014 const CompAtom *a = pr.x;
01015 const CompAtomExt *aExt = pr.xExt;
01016 float4 *af = forces + start;
01017 float4 *af_slow = slow_forces + start;
01018 int k = 0;
01019 for ( int j=0; j<n; ++j ) {
01020
01021 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01022 f[j].x += af[k].x;
01023 f[j].y += af[k].y;
01024 f[j].z += af[k].z;
01025
01026 virial += af[k].w;
01027 if ( doSlow ) {
01028 f_slow[j].x += af_slow[k].x;
01029 f_slow[j].y += af_slow[k].y;
01030 f_slow[j].z += af_slow[k].z;
01031 virial_slow += af_slow[k].w;
01032 }
01033 ++k;
01034 }
01035
01036 #if 0
01037
01038 if ( CkNumPes() == 1 ) {
01039 const CompAtomExt *aExt = pr.xExt;
01040 int k = 0;
01041 for ( int j=0; j<n; ++j ) {
01042
01043 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01044 int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01045 if ( af[k].w != excl_expected ) {
01046 CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01047 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01048 }
01049 ++k;
01050 }
01051 }
01052 #endif
01053
01054
01055 #if 0
01056 if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01057 CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01058 j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01059 af[j].w);
01060 }
01061 #endif
01062 pr.positionBox->close(&(pr.x));
01063 pr.forceBox->close(&(pr.r));
01064 }
01065
01066 virial *= (-1./6.);
01067 reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01068 reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01069 reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01070 if ( doSlow ) {
01071 virial_slow *= (-1./6.);
01072 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01073 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01074 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01075 }
01076
01077 if ( workStarted == 1 ) return 0;
01078
01079 atomsChanged = 0;
01080 reduction->submit();
01081
01082
01083
01084
01085
01086
01087
01088 float upload_ms, remote_calc_ms, remote_download_ms;
01089 float local_calc_ms, local_download_ms, total_ms;
01090 cuda_errcheck("before event timers");
01091 cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
01092 cuda_errcheck("event timer 1");
01093 cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_calc);
01094 cuda_errcheck("event timer 2");
01095 cudaEventElapsedTime(&remote_download_ms, end_remote_calc, end_remote_download);
01096 cuda_errcheck("event timer 3");
01097 cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_calc);
01098 cuda_errcheck("event timer 4");
01099 cudaEventElapsedTime(&local_download_ms, end_local_calc, end_local_download);
01100 cuda_errcheck("event timer 5");
01101 cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
01102 cuda_errcheck("event timer 6");
01103 cuda_errcheck("event timers");
01104
01105 cuda_timer_total += kernel_time;
01106 if ( simParams->outputCudaTiming &&
01107 cuda_timer_count % simParams->outputCudaTiming == 0 ) {
01108 cuda_timer_total /= (cuda_timer_count + 1);
01109 CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f %f %f\n",
01110 CkMyPe(), upload_ms, remote_calc_ms, remote_download_ms,
01111 local_calc_ms, local_download_ms, total_ms);
01112 CkPrintf("CUDA TIMING: %f ms/step on node %d\n",
01113 cuda_timer_total * 1.e3, CkMyPe());
01114 cuda_timer_count = 0;
01115 cuda_timer_total = 0;
01116 }
01117 cuda_timer_count++;
01118
01119 return 1;
01120 }
01121
01122
01123 #endif // NAMD_CUDA
01124