00001
00002 #include "common.h"
00003 #include "charm++.h"
00004
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDA.h"
00014 #include "ComputeNonbondedCUDAKernel.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include <algorithm>
00019
00020 #include "NamdTypes.h"
00021
00022
00023 #undef PRINT_GBIS
00024
00025 #ifdef NAMD_CUDA
00026
00027 #ifdef WIN32
00028 #define __thread __declspec(thread)
00029 #endif
00030
00031 extern __thread cudaStream_t stream;
00032 extern __thread cudaStream_t stream2;
00033
00034 void cuda_errcheck(const char *msg) {
00035 cudaError_t err;
00036 if ((err = cudaGetLastError()) != cudaSuccess) {
00037 char host[128];
00038 #ifdef NOHOSTNAME
00039 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041 gethostname(host, 128); host[127] = 0;
00042 #endif
00043 char devstr[128] = "";
00044 int devnum;
00045 if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046 sprintf(devstr, " device %d", devnum);
00047 }
00048 char errmsg[1024];
00049 sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050 NAMD_die(errmsg);
00051 }
00052 }
00053
00054 void cuda_die(const char *msg) {
00055 char host[128];
00056 #ifdef NOHOSTNAME
00057 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059 gethostname(host, 128); host[127] = 0;
00060 #endif
00061 char devstr[128] = "";
00062 int devnum;
00063 if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064 sprintf(devstr, " device %d", devnum);
00065 }
00066 char errmsg[1024];
00067 sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068 NAMD_die(errmsg);
00069 }
00070
00071 char *devicelist;
00072 static __thread int usedevicelist;
00073 static __thread int ignoresharing;
00074 static __thread int mergegrids;
00075
00076 void cuda_getargs(char **argv) {
00077 devicelist = 0;
00078 usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080 ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081 mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }
00083
00084 static __thread int shared_gpu;
00085 static __thread int first_pe_sharing_gpu;
00086 static __thread int next_pe_sharing_gpu;
00087 static __thread int devicePe;
00088 static __thread int numPesSharingDevice;
00089 static __thread int *pesSharingDevice;
00090
00091 static __thread int gpu_is_mine;
00092
00093 int cuda_device_pe() { return devicePe; }
00094
00095 bool cuda_device_shared_with_pe(int pe) {
00096 for ( int i=0; i<numPesSharingDevice; ++i ) {
00097 if ( pesSharingDevice[i] == pe ) return true;
00098 }
00099 return false;
00100 }
00101
00102 static inline bool sortop_bitreverse(int a, int b) {
00103 if ( a == b ) return 0;
00104 for ( int bit = 1; bit; bit *= 2 ) {
00105 if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106 }
00107 return 0;
00108 }
00109
00110
00111
00112
00113
00114
00115
00116
00117 #define CUDA_EVENT_ID_POLL_REMOTE 98
00118 #define CUDA_TRACE_POLL_REMOTE \
00119 traceUserEvent(CUDA_EVENT_ID_POLL_REMOTE)
00120 #define CUDA_EVENT_ID_POLL_LOCAL 99
00121 #define CUDA_TRACE_POLL_LOCAL \
00122 traceUserEvent(CUDA_EVENT_ID_POLL_LOCAL)
00123 #define CUDA_EVENT_ID_BASE 100
00124 #define CUDA_TRACE_REMOTE(START,END) \
00125 do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00126 CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)
00127 #define CUDA_TRACE_LOCAL(START,END) \
00128 do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00129 CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)
00130
00131 void cuda_register_user_events() {
00132
00133 traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
00134 traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);
00135
00136 #define REGISTER_DEVICE_EVENTS(DEV) \
00137 traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00138 traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00139
00140 REGISTER_DEVICE_EVENTS(0)
00141 REGISTER_DEVICE_EVENTS(1)
00142 REGISTER_DEVICE_EVENTS(2)
00143 REGISTER_DEVICE_EVENTS(3)
00144 REGISTER_DEVICE_EVENTS(4)
00145 REGISTER_DEVICE_EVENTS(5)
00146 REGISTER_DEVICE_EVENTS(6)
00147 REGISTER_DEVICE_EVENTS(7)
00148 REGISTER_DEVICE_EVENTS(8)
00149 REGISTER_DEVICE_EVENTS(9)
00150 REGISTER_DEVICE_EVENTS(10)
00151 REGISTER_DEVICE_EVENTS(11)
00152 REGISTER_DEVICE_EVENTS(12)
00153 REGISTER_DEVICE_EVENTS(13)
00154 REGISTER_DEVICE_EVENTS(14)
00155 REGISTER_DEVICE_EVENTS(15)
00156
00157 }
00158
00159 void cuda_initialize() {
00160
00161 if ( 0 == CkMyPe() ) cuda_register_user_events();
00162
00163 char host[128];
00164 #ifdef NOHOSTNAME
00165 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00166 #else
00167 gethostname(host, 128); host[127] = 0;
00168 #endif
00169
00170 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00171 int myRankInPhysicalNode;
00172 int numPesOnPhysicalNode;
00173 int *pesOnPhysicalNode;
00174 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00175 &pesOnPhysicalNode,&numPesOnPhysicalNode);
00176
00177 {
00178 int i;
00179 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00181 i = numPesOnPhysicalNode;
00182 break;
00183 }
00184 if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00185 }
00186 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00187 CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00188 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00189 CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00190 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00191 }
00192 myRankInPhysicalNode = 0;
00193 numPesOnPhysicalNode = 1;
00194 pesOnPhysicalNode = new int[1];
00195 pesOnPhysicalNode[0] = CkMyPe();
00196 } else {
00197 myRankInPhysicalNode = i;
00198 }
00199 }
00200
00201
00202 int deviceCount = 0;
00203 cudaGetDeviceCount(&deviceCount);
00204 cuda_errcheck("in cudaGetDeviceCount");
00205 if ( deviceCount <= 0 ) {
00206 cuda_die("No CUDA devices found.");
00207 }
00208
00209 int *devices;
00210 int ndevices = 0;
00211 int nexclusive = 0;
00212 if ( usedevicelist ) {
00213 devices = new int[strlen(devicelist)];
00214 int i = 0;
00215 while ( devicelist[i] ) {
00216 ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00217 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00218 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00219 }
00220 } else {
00221 if ( ! CkMyPe() ) {
00222 CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00223 }
00224 devices = new int[deviceCount];
00225 for ( int i=0; i<deviceCount; ++i ) {
00226 int dev = i % deviceCount;
00227 #if CUDA_VERSION >= 2020
00228 cudaDeviceProp deviceProp;
00229 cudaGetDeviceProperties(&deviceProp, dev);
00230 cuda_errcheck("in cudaGetDeviceProperties");
00231 if ( deviceProp.computeMode != cudaComputeModeProhibited
00232 && (deviceProp.major > 1 || deviceProp.minor >= 1)
00233 && deviceProp.canMapHostMemory
00234 && deviceProp.multiProcessorCount > 2 ) {
00235 devices[ndevices++] = dev;
00236 }
00237 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00238 ++nexclusive;
00239 }
00240 #else
00241 devices[ndevices++] = dev;
00242 #endif
00243 }
00244 }
00245
00246 if ( ! ndevices ) {
00247 cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, unable to map host memory, too small, or otherwise unusable.");
00248 }
00249
00250 shared_gpu = 0;
00251 gpu_is_mine = 1;
00252 first_pe_sharing_gpu = CkMyPe();
00253 next_pe_sharing_gpu = CkMyPe();
00254
00255 {
00256
00257 int dev;
00258 if ( numPesOnPhysicalNode > 1 ) {
00259 int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00260 dev = devices[myDeviceRank];
00261 devicePe = CkMyPe();
00262 if ( ignoresharing ) {
00263 pesSharingDevice = new int[1];
00264 pesSharingDevice[0] = CkMyPe();
00265 numPesSharingDevice = 1;
00266 } else {
00267 pesSharingDevice = new int[numPesOnPhysicalNode];
00268 devicePe = -1;
00269 numPesSharingDevice = 0;
00270 for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00271 if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00272 int thisPe = pesOnPhysicalNode[i];
00273 pesSharingDevice[numPesSharingDevice++] = thisPe;
00274 if ( devicePe < 1 ) devicePe = thisPe;
00275 if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00276 }
00277 }
00278 for ( int j = 0; j < ndevices; ++j ) {
00279 if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00280 }
00281 }
00282 if ( shared_gpu && devicePe == CkMyPe() ) {
00283 CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00284 }
00285 } else {
00286 dev = devices[CkMyPe() % ndevices];
00287 devicePe = CkMyPe();
00288 pesSharingDevice = new int[1];
00289 pesSharingDevice[0] = CkMyPe();
00290 numPesSharingDevice = 1;
00291 }
00292
00293 if ( devicePe != CkMyPe() ) {
00294 CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00295 CkMyPe(), myRankInPhysicalNode, devicePe);
00296 return;
00297 }
00298
00299
00300
00301 first_pe_sharing_gpu = CkMyPe();
00302 next_pe_sharing_gpu = CkMyPe();
00303
00304 gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() );
00305
00306 if ( dev >= deviceCount ) {
00307 char buf[256];
00308 sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00309 CkMyPe(), dev, host, deviceCount);
00310 NAMD_die(buf);
00311 }
00312
00313 cudaError_t err;
00314 cudaDeviceProp deviceProp;
00315 err = cudaGetDeviceProperties(&deviceProp, dev);
00316 if (err == cudaSuccess) {
00317 CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %dMB Rev: %d.%d\n",
00318 CkMyPe(), myRankInPhysicalNode, dev, host,
00319 deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00320 deviceProp.major, deviceProp.minor);
00321
00322 err = cudaSetDevice(dev);
00323 }
00324 if ( err != cudaSuccess) {
00325 char errmsg[1024];
00326 sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00327 dev, CkMyPe(), cudaGetErrorString(err));
00328 NAMD_die(errmsg);
00329 }
00330
00331 }
00332
00333 cudaSetDeviceFlags(cudaDeviceMapHost);
00334 cuda_errcheck("in cudaSetDeviceFlags");
00335
00336 int dev;
00337 if ( cudaGetDevice(&dev) == cudaSuccess ) {
00338 cudaDeviceProp deviceProp;
00339 cudaGetDeviceProperties(&deviceProp, dev);
00340 cuda_errcheck("in cudaGetDeviceProperties");
00341 if ( deviceProp.computeMode == cudaComputeModeProhibited )
00342 cuda_die("device in prohibited mode");
00343 if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00344 cuda_die("device not of compute capability 1.1 or higher");
00345 if ( ! deviceProp.canMapHostMemory )
00346 cuda_die("device cannot map host memory");
00347 }
00348
00349 if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00350 if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00351 if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00352 if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00353
00354 }
00355
00356 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00357 static __thread ComputeMgr *computeMgr = 0;
00358
00359 void send_build_cuda_force_table() {
00360 computeMgr->sendBuildCudaForceTable();
00361 }
00362
00363 void build_cuda_force_table() {
00364 if ( devicePe != CkMyPe() ) return;
00365 ComputeNonbondedCUDA::build_lj_table();
00366 ComputeNonbondedCUDA::build_force_table();
00367 }
00368
00369 void ComputeNonbondedCUDA::build_lj_table() {
00370
00371 const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00372 const int dim = ljTable->get_table_dim();
00373
00374
00375 int tsize = (((dim+16+31)/32)*32)-16;
00376 if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00377
00378 float2 *t = new float2[tsize*tsize];
00379 float2 *row = t;
00380 for ( int i=0; i<dim; ++i, row += tsize ) {
00381 for ( int j=0; j<dim; ++j ) {
00382 const LJTable::TableEntry *e = ljTable->table_val(i,j);
00383 row[j].x = e->A * scaling;
00384 row[j].y = e->B * scaling;
00385 }
00386 }
00387
00388 cuda_bind_lj_table(t,tsize);
00389 delete [] t;
00390
00391 if ( ! CkMyPe() ) {
00392 CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00393 }
00394 }
00395
00396 void ComputeNonbondedCUDA::build_force_table() {
00397
00398 float4 t[FORCE_TABLE_SIZE];
00399 float4 et[FORCE_TABLE_SIZE];
00400
00401 const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00402 const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00403
00404 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00405
00406 double r2list[FORCE_TABLE_SIZE];
00407 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00408 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00409 r2list[i] = r*r + r2_delta;
00410 }
00411
00412 union { double f; int32 i[2]; } byte_order_test;
00413 byte_order_test.f = 1.0;
00414 int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00415
00416 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00417 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00418 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
00419
00420 if ( r > cutoff ) {
00421 t[i].x = 0.;
00422 t[i].y = 0.;
00423 t[i].z = 0.;
00424 t[i].w = 0.;
00425 et[i].x = 0.;
00426 et[i].y = 0.;
00427 et[i].z = 0.;
00428 et[i].w = 0.;
00429 continue;
00430 }
00431
00432 BigReal diffa = r2list[i] - r2_table[table_i];
00433
00434
00435
00436 {
00437 BigReal table_a = fast_table[4*table_i];
00438 BigReal table_b = fast_table[4*table_i+1];
00439 BigReal table_c = fast_table[4*table_i+2];
00440 BigReal table_d = fast_table[4*table_i+3];
00441 BigReal grad =
00442 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00443 t[i].x = 2. * grad;
00444 BigReal ener = table_a + diffa *
00445 ( ( table_d * diffa + table_c ) * diffa + table_b);
00446 et[i].x = ener;
00447 }
00448
00449
00450
00451
00452 {
00453 BigReal table_a = scor_table[4*table_i];
00454 BigReal table_b = scor_table[4*table_i+1];
00455 BigReal table_c = scor_table[4*table_i+2];
00456 BigReal table_d = scor_table[4*table_i+3];
00457 BigReal grad =
00458 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00459 t[i].w = 2. * grad;
00460 BigReal ener = table_a + diffa *
00461 ( ( table_d * diffa + table_c ) * diffa + table_b);
00462 et[i].w = ener;
00463 }
00464
00465
00466
00467
00468 {
00469 BigReal table_a = vdwb_table[4*table_i];
00470 BigReal table_b = vdwb_table[4*table_i+1];
00471 BigReal table_c = vdwb_table[4*table_i+2];
00472 BigReal table_d = vdwb_table[4*table_i+3];
00473 BigReal grad =
00474 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00475 t[i].y = 2. * -1. * grad;
00476 BigReal ener = table_a + diffa *
00477 ( ( table_d * diffa + table_c ) * diffa + table_b);
00478 et[i].y = -1. * ener;
00479 }
00480
00481
00482
00483
00484 {
00485 BigReal table_a = vdwa_table[4*table_i];
00486 BigReal table_b = vdwa_table[4*table_i+1];
00487 BigReal table_c = vdwa_table[4*table_i+2];
00488 BigReal table_d = vdwa_table[4*table_i+3];
00489 BigReal grad =
00490 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00491 t[i].z = 2. * grad;
00492 BigReal ener = table_a + diffa *
00493 ( ( table_d * diffa + table_c ) * diffa + table_b);
00494 et[i].z = ener;
00495 }
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510 }
00511
00512 t[0].x = 0.f;
00513 t[0].y = 0.f;
00514 t[0].z = 0.f;
00515 t[0].w = 0.f;
00516 et[0].x = et[1].x;
00517 et[0].y = et[1].y;
00518 et[0].z = et[1].z;
00519 et[0].w = et[1].w;
00520
00521 cuda_bind_force_table(t,et);
00522
00523 if ( ! CkMyPe() ) {
00524 CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00525 }
00526 }
00527
00528 struct exlist_sortop {
00529 bool operator() (int32 *li, int32 *lj) {
00530 return ( li[1] < lj[1] );
00531 }
00532 };
00533
00534 static __thread int2 *exclusionsByAtom;
00535
00536 void ComputeNonbondedCUDA::build_exclusions() {
00537 Molecule *mol = Node::Object()->molecule;
00538
00539 #ifdef MEM_OPT_VERSION
00540 int natoms = mol->exclSigPoolSize;
00541 #else
00542 int natoms = mol->numAtoms;
00543 #endif
00544
00545 exclusionsByAtom = new int2[natoms];
00546
00547
00548
00549 ObjectArena<int32> listArena;
00550 ResizeArray<int32*> unique_lists;
00551 int32 **listsByAtom = new int32*[natoms];
00552 SortableResizeArray<int32> curList;
00553 for ( int i=0; i<natoms; ++i ) {
00554 curList.resize(0);
00555 curList.add(0);
00556 #ifdef MEM_OPT_VERSION
00557 const ExclusionSignature *sig = mol->exclSigPool + i;
00558 int n = sig->fullExclCnt;
00559 for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00560 n += 1;
00561 #else
00562 const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00563 int n = mol_list[0] + 1;
00564 for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00565 #endif
00566 curList.sort();
00567
00568 int j;
00569 for ( j=0; j<unique_lists.size(); ++j ) {
00570 if ( n != unique_lists[j][0] ) continue;
00571 int k;
00572 for ( k=0; k<n; ++k ) {
00573 if ( unique_lists[j][k+3] != curList[k] ) break;
00574 }
00575 if ( k == n ) break;
00576 }
00577 if ( j == unique_lists.size() ) {
00578 int32 *list = listArena.getNewArray(n+3);
00579 list[0] = n;
00580 int maxdiff = 0;
00581 maxdiff = -1 * curList[0];
00582 if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00583 list[1] = maxdiff;
00584 for ( int k=0; k<n; ++k ) {
00585 list[k+3] = curList[k];
00586 }
00587 unique_lists.add(list);
00588 }
00589 listsByAtom[i] = unique_lists[j];
00590 }
00591
00592 std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00593 long int totalbits = 0;
00594 int nlists = unique_lists.size();
00595 for ( int j=0; j<nlists; ++j ) {
00596 int32 *list = unique_lists[j];
00597 int maxdiff = list[1];
00598 list[2] = totalbits + maxdiff;
00599 totalbits += 2*maxdiff + 1;
00600 }
00601 for ( int i=0; i<natoms; ++i ) {
00602 exclusionsByAtom[i].x = listsByAtom[i][1];
00603 exclusionsByAtom[i].y = listsByAtom[i][2];
00604 }
00605 delete [] listsByAtom;
00606
00607 if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00608
00609 if ( ! CkMyPe() ) {
00610 long int bytesneeded = totalbits / 8;
00611 CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00612 unique_lists.size(), bytesneeded);
00613
00614 long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00615 if ( bytesneeded > bytesavail ) {
00616 char errmsg[512];
00617 sprintf(errmsg,"%ld bytes of CUDA memory needed for exclusions "
00618 "but only %ld bytes can be addressed with 32-bit int.",
00619 bytesneeded, bytesavail);
00620 NAMD_die(errmsg);
00621 }
00622 }
00623
00624 #define SET_EXCL(EXCL,BASE,DIFF) \
00625 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00626
00627 unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00628 memset(exclusion_bits, 0, totalbits/8);
00629
00630 long int base = 0;
00631 for ( int i=0; i<unique_lists.size(); ++i ) {
00632 base += unique_lists[i][1];
00633 if ( base != unique_lists[i][2] ) {
00634 NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00635 }
00636 int n = unique_lists[i][0];
00637 for ( int j=0; j<n; ++j ) {
00638 SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00639 }
00640 base += unique_lists[i][1] + 1;
00641 }
00642
00643 cuda_bind_exclusions(exclusion_bits, totalbits/32);
00644
00645 delete [] exclusion_bits;
00646 }
00647
00648
00649 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00650
00651 if ( ! cudaCompute ) NAMD_bug("register_self called early");
00652
00653 cudaCompute->requirePatch(pid);
00654
00655 ComputeNonbondedCUDA::compute_record cr;
00656 cr.c = c;
00657 cr.pid[0] = pid; cr.pid[1] = pid;
00658 cr.offset = 0.;
00659 if ( cudaCompute->patchRecords[pid].isLocal ) {
00660 cudaCompute->localComputeRecords.add(cr);
00661 } else {
00662 cudaCompute->remoteComputeRecords.add(cr);
00663 }
00664 }
00665
00666 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00667
00668 if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00669
00670 cudaCompute->requirePatch(pid[0]);
00671 cudaCompute->requirePatch(pid[1]);
00672
00673 ComputeNonbondedCUDA::compute_record cr, cr2;
00674 cr.c = c; cr2.c = c;
00675 cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
00676 cr2.pid[0] = pid[1]; cr2.pid[1] = pid[0];
00677
00678 int t1 = t[0];
00679 int t2 = t[1];
00680 Vector offset = cudaCompute->patchMap->center(pid[0])
00681 - cudaCompute->patchMap->center(pid[1]);
00682 offset.x += (t1%3-1) - (t2%3-1);
00683 offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00684 offset.z += (t1/9-1) - (t2/9-1);
00685 cr.offset = offset;
00686 cr2.offset = -1. * offset;
00687
00688 if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00689 cudaCompute->localComputeRecords.add(cr);
00690 } else {
00691 cudaCompute->remoteComputeRecords.add(cr);
00692 }
00693 if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00694 cudaCompute->localComputeRecords.add(cr2);
00695 } else {
00696 cudaCompute->remoteComputeRecords.add(cr2);
00697 }
00698 }
00699
00700 void unregister_cuda_compute(ComputeID c) {
00701
00702 NAMD_bug("unregister_compute unimplemented");
00703
00704 }
00705
00706 static __thread int atomsChanged = 0;
00707 static __thread int computesChanged = 0;
00708
00709 static __thread int pairlistsValid = 0;
00710 static __thread float pairlistTolerance = 0.;
00711 static __thread int usePairlists = 0;
00712 static __thread int savePairlists = 0;
00713 static __thread float plcutoff2 = 0;
00714
00715
00716 static __thread cudaEvent_t start_calc;
00717 static __thread cudaEvent_t end_remote_download;
00718 static __thread cudaEvent_t end_local_download;
00719
00720 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00721 static __thread ResizeArray<force_list> *force_lists_ptr;
00722 #define PATCH_PAIRS_REF ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
00723 #define FORCE_LISTS_REF ResizeArray<force_list> &force_lists(*force_lists_ptr);
00724
00725 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00726 ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00727 #ifdef PRINT_GBIS
00728 CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00729 #endif
00730
00731
00732 master = m ? m : this;
00733 cudaCompute = this;
00734 computeMgr = mgr;
00735 patchMap = PatchMap::Object();
00736 atomMap = AtomMap::Object();
00737 reduction = 0;
00738
00739 SimParameters *params = Node::Object()->simParameters;
00740 if (params->pressureProfileOn) {
00741 NAMD_die("pressure profile not supported in CUDA");
00742 }
00743
00744 atomsChanged = 1;
00745 computesChanged = 1;
00746 workStarted = 0;
00747 basePriority = PROXY_DATA_PRIORITY;
00748 localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00749
00750 if ( master != this ) {
00751 masterPe = master->masterPe;
00752 master->slaves[slaveIndex] = this;
00753 if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00754 NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00755 }
00756 registerPatches();
00757 return;
00758 }
00759 masterPe = CkMyPe();
00760
00761 cuda_init();
00762 build_exclusions();
00763
00764 cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00765 cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00766 cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00767
00768 patch_pairs_ptr = new ResizeArray<patch_pair>;
00769 force_lists_ptr = new ResizeArray<force_list>;
00770 }
00771
00772
00773 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00774
00775 void ComputeNonbondedCUDA::requirePatch(int pid) {
00776
00777 computesChanged = 1;
00778 patch_record &pr = patchRecords.item(pid);
00779 if ( pr.refCount == 0 ) {
00780 if ( mergegrids ) {
00781 pr.isLocal = 0;
00782 } else if ( CkNumNodes() < 2 ) {
00783 pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00784 patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00785 } else {
00786 pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00787 }
00788 if ( pr.isLocal ) {
00789 localActivePatches.add(pid);
00790 } else {
00791 remoteActivePatches.add(pid);
00792 }
00793 activePatches.add(pid);
00794
00795 pr.patchID = pid;
00796 pr.hostPe = -1;
00797
00798
00799
00800
00801 pr.x = NULL;
00802 pr.xExt = NULL;
00803 pr.r = NULL;
00804 pr.f = NULL;
00805 pr.intRad = NULL;
00806 pr.psiSum = NULL;
00807 pr.bornRad = NULL;
00808 pr.dEdaSum = NULL;
00809 pr.dHdrPrefix = NULL;
00810 }
00811 pr.refCount += 1;
00812 }
00813
00814 void ComputeNonbondedCUDA::registerPatches() {
00815
00816 SimParameters *simParams = Node::Object()->simParameters;
00817 int npatches = master->activePatches.size();
00818 int *pids = master->activePatches.begin();
00819 patch_record *recs = master->patchRecords.begin();
00820 for ( int i=0; i<npatches; ++i ) {
00821 int pid = pids[i];
00822 patch_record &pr = recs[pid];
00823 if ( pr.hostPe == CkMyPe() ) {
00824
00825
00826
00827 hostedPatches.add(pid);
00828 if ( pr.isLocal ) {
00829 localHostedPatches.add(pid);
00830 } else {
00831 remoteHostedPatches.add(pid);
00832 }
00833 ProxyMgr::Object()->createProxy(pid);
00834 pr.p = patchMap->patch(pid);
00835 pr.positionBox = pr.p->registerPositionPickup(this);
00836 pr.forceBox = pr.p->registerForceDeposit(this);
00837 if (simParams->GBISOn) {
00838 pr.intRadBox = pr.p->registerIntRadPickup(this);
00839 pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
00840 pr.bornRadBox = pr.p->registerBornRadPickup(this);
00841 pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
00842 pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(this);
00843 }
00844 }
00845 }
00846 if ( master == this ) setNumPatches(activePatches.size());
00847 else setNumPatches(hostedPatches.size());
00848 CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00849 }
00850
00851 void ComputeNonbondedCUDA::assignPatches() {
00852
00853 int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00854 int numPesOnNodeSharingDevice = 0;
00855 int masterIndex = -1;
00856 for ( int i=0; i<numPesSharingDevice; ++i ) {
00857 int pe = pesSharingDevice[i];
00858 if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00859 if ( CkNodeOf(pe) == CkMyNode() ) {
00860 pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00861 }
00862 }
00863
00864 int npatches = activePatches.size();
00865
00866 if ( npatches ) {
00867 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00868 }
00869
00870 int *count = new int[npatches];
00871 memset(count, 0, sizeof(int)*npatches);
00872 int *pcount = new int[numPesOnNodeSharingDevice];
00873 memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00874 int *rankpcount = new int[CkMyNodeSize()];
00875 memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00876 char *table = new char[npatches*numPesOnNodeSharingDevice];
00877 memset(table, 0, npatches*numPesOnNodeSharingDevice);
00878
00879 int unassignedpatches = npatches;
00880
00881 if ( 0 ) {
00882 for ( int i=0; i<npatches; ++i ) {
00883 int pid = activePatches[i];
00884 patch_record &pr = patchRecords[pid];
00885 pr.hostPe = CkMyPe();
00886 }
00887 unassignedpatches = 0;
00888 pcount[masterIndex] = npatches;
00889 } else
00890
00891
00892 for ( int i=0; i<npatches; ++i ) {
00893 int pid = activePatches[i];
00894 patch_record &pr = patchRecords[pid];
00895 int homePe = patchMap->node(pid);
00896 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00897 int pe = pesOnNodeSharingDevice[j];
00898 if ( pe == homePe ) {
00899 pr.hostPe = pe; --unassignedpatches;
00900 pcount[j] += 1;
00901 }
00902 if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00903 table[i*numPesOnNodeSharingDevice+j] = 1;
00904 }
00905 }
00906 if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00907 pr.hostPe = homePe; --unassignedpatches;
00908 rankpcount[CkRankOf(homePe)] += 1;
00909 }
00910 }
00911
00912 int assignj = 0;
00913 for ( int i=0; i<npatches; ++i ) {
00914 int pid = activePatches[i];
00915 patch_record &pr = patchRecords[pid];
00916 if ( pr.hostPe != -1 ) continue;
00917 int c = 0;
00918 int lastj;
00919 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00920 if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00921 }
00922 count[i] = c;
00923 if ( c == 1 ) {
00924 pr.hostPe = pesOnNodeSharingDevice[lastj];
00925 --unassignedpatches;
00926 pcount[lastj] += 1;
00927 }
00928 }
00929 while ( unassignedpatches ) {
00930 int i;
00931 for ( i=0; i<npatches; ++i ) {
00932 if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00933 int pid = activePatches[i];
00934 patch_record &pr = patchRecords[pid];
00935 if ( pr.hostPe != -1 ) continue;
00936 pr.hostPe = pesOnNodeSharingDevice[assignj];
00937 --unassignedpatches;
00938 pcount[assignj] += 1;
00939 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00940 break;
00941 }
00942 if ( i<npatches ) continue;
00943 for ( i=0; i<npatches; ++i ) {
00944 int pid = activePatches[i];
00945 patch_record &pr = patchRecords[pid];
00946 if ( pr.hostPe != -1 ) continue;
00947 if ( count[i] ) continue;
00948 pr.hostPe = pesOnNodeSharingDevice[assignj];
00949 --unassignedpatches;
00950 pcount[assignj] += 1;
00951 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00952 break;
00953 }
00954 if ( i<npatches ) continue;
00955 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00956 }
00957
00958 for ( int i=0; i<npatches; ++i ) {
00959 int pid = activePatches[i];
00960 patch_record &pr = patchRecords[pid];
00961
00962 }
00963
00964 slavePes = new int[CkMyNodeSize()];
00965 slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00966 numSlaves = 0;
00967 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00968 int pe = pesOnNodeSharingDevice[j];
00969 int rank = pe - CkNodeFirst(CkMyNode());
00970
00971
00972 if ( pe == CkMyPe() ) continue;
00973 if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00974 rankpcount[rank] = 0;
00975 slavePes[numSlaves] = pe;
00976 computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00977 ++numSlaves;
00978 }
00979 for ( int j=0; j<CkMyNodeSize(); ++j ) {
00980 int pe = CkNodeFirst(CkMyNode()) + j;
00981
00982
00983 if ( ! rankpcount[j] ) continue;
00984 if ( pe == CkMyPe() ) continue;
00985 slavePes[numSlaves] = pe;
00986 computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00987 ++numSlaves;
00988 }
00989 registerPatches();
00990
00991 delete [] pesOnNodeSharingDevice;
00992 delete [] count;
00993 delete [] pcount;
00994 delete [] rankpcount;
00995 delete [] table;
00996 }
00997
00998 static __thread int num_atom_records_allocated;
00999
01000 static __thread atom_param* atom_params;
01001 static __thread atom* atoms;
01002
01003
01004 static __thread int num_virials;
01005 static __thread int num_virials_allocated;
01006 static __thread float *virials;
01007 static __thread float *slow_virials;
01008 static __thread float *energy_gbis;
01009
01010
01011 static __thread float *intRad0H;
01012 static __thread float *intRadSH;
01013
01014 static __thread float *bornRadH;
01015
01016 static __thread float *dHdrPrefixH;
01017
01018 static __thread int cuda_timer_count;
01019 static __thread double cuda_timer_total;
01020 static __thread double kernel_time;
01021 static __thread double remote_submit_time;
01022 static __thread double local_submit_time;
01023
01024 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
01025
01026 #ifdef PRINT_GBIS
01027 #define GBISP(...) CkPrintf(__VA_ARGS__);
01028 #else
01029 #define GBISP(...)
01030 #endif
01031
01032 #define count_limit 1000000
01033 static __thread int check_remote_count;
01034 static __thread int check_local_count;
01035
01036 void cuda_check_remote_progress(void *arg, double walltime) {
01037
01038 CUDA_TRACE_POLL_REMOTE;
01039 cudaError_t err = cudaEventQuery(end_remote_download);
01040 if ( err == cudaSuccess ) {
01041 local_submit_time = walltime;
01042 CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01043 if ( mergegrids ) {
01044 kernel_time = local_submit_time - kernel_time;
01045 }
01046 ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01047 check_remote_count = 0;
01048 } else if ( err != cudaErrorNotReady ) {
01049 cuda_errcheck("in cuda_check_remote_progress");
01050 NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01051 } else if ( ++check_remote_count >= count_limit ) {
01052 char errmsg[256];
01053 sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01054 check_remote_count, walltime - remote_submit_time,
01055 ((ComputeNonbondedCUDA *) arg)->step);
01056 cuda_errcheck(errmsg);
01057 NAMD_die(errmsg);
01058 } else if ( check_local_count ) {
01059 NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01060 } else {
01061 CUDA_POLL(cuda_check_remote_progress, arg);
01062 }
01063 }
01064
01065 void cuda_check_local_progress(void *arg, double walltime) {
01066
01067 CUDA_TRACE_POLL_LOCAL;
01068 cudaError_t err = cudaEventQuery(end_local_download);
01069 if ( err == cudaSuccess ) {
01070 CUDA_TRACE_LOCAL(local_submit_time,walltime);
01071 kernel_time = walltime - kernel_time;
01072 ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01073 check_local_count = 0;
01074 } else if ( err != cudaErrorNotReady ) {
01075 cuda_errcheck("in cuda_check_local_progress");
01076 NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01077 } else if ( ++check_local_count >= count_limit ) {
01078 char errmsg[256];
01079 sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01080 check_local_count, walltime - local_submit_time,
01081 ((ComputeNonbondedCUDA *) arg)->step);
01082 cuda_errcheck(errmsg);
01083 NAMD_die(errmsg);
01084 } else if ( check_remote_count ) {
01085 NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01086 } else {
01087 CUDA_POLL(cuda_check_local_progress, arg);
01088 }
01089 }
01090
01091 #if 0
01092
01093 void cuda_check_progress(void *arg, double walltime) {
01094 if ( cuda_stream_finished() ) {
01095 kernel_time = walltime - kernel_time;
01096 CUDA_POLL(ccd_index);
01097
01098 WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01099 }
01100 }
01101 #endif
01102
01103 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
01104
01105 static __thread int kernel_launch_state = 0;
01106
01107 struct cr_sortop {
01108 const Lattice &l;
01109 cr_sortop(const Lattice &lattice) : l(lattice) { }
01110 bool operator() (ComputeNonbondedCUDA::compute_record i,
01111 ComputeNonbondedCUDA::compute_record j) {
01112 Vector a = l.a();
01113 Vector b = l.b();
01114 Vector c = l.c();
01115 BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01116 BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01117 return ( ri < rj );
01118 }
01119 };
01120
01121 int ComputeNonbondedCUDA::noWork() {
01122
01123 SimParameters *simParams = Node::Object()->simParameters;
01124 Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01125 lattice = flags.lattice;
01126 doSlow = flags.doFullElectrostatics;
01127 doEnergy = flags.doEnergy;
01128 step = flags.step;
01129
01130 if ( ! flags.doNonbonded ) {
01131 GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01132 for ( int i=0; i<hostedPatches.size(); ++i ) {
01133 patch_record &pr = master->patchRecords[hostedPatches[i]];
01134 pr.positionBox->skip();
01135 pr.forceBox->skip();
01136 if (simParams->GBISOn) {
01137 pr.intRadBox->skip();
01138 pr.psiSumBox->skip();
01139 pr.bornRadBox->skip();
01140 pr.dEdaSumBox->skip();
01141 pr.dHdrPrefixBox->skip();
01142 }
01143 }
01144 if ( master != this ) {
01145 computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01146 hostedPatches.size(),atomsChanged,sequence());
01147 }
01148 if ( reduction ) reduction->submit();
01149 return 1;
01150 }
01151
01152 for ( int i=0; i<hostedPatches.size(); ++i ) {
01153 patch_record &pr = master->patchRecords[hostedPatches[i]];
01154 if (!simParams->GBISOn || gbisPhase == 1) {
01155 GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01156 pr.x = pr.positionBox->open();
01157 pr.xExt = pr.p->getCompAtomExtInfo();
01158 }
01159
01160 if (simParams->GBISOn) {
01161 if (gbisPhase == 1) {
01162 GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01163 pr.intRad = pr.intRadBox->open();
01164 pr.psiSum = pr.psiSumBox->open();
01165 } else if (gbisPhase == 2) {
01166 GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01167 pr.bornRad = pr.bornRadBox->open();
01168 pr.dEdaSum = pr.dEdaSumBox->open();
01169 } else if (gbisPhase == 3) {
01170 GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01171 pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01172 }
01173 GBISP("opened GBIS boxes");
01174 }
01175 }
01176
01177 if ( master == this ) return 0;
01178
01179
01180 computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01181 hostedPatches.size(),atomsChanged,sequence());
01182
01183 workStarted = 1;
01184 basePriority = COMPUTE_PROXY_PRIORITY;
01185
01186 return 1;
01187 }
01188
01189 void ComputeNonbondedCUDA::doWork() {
01190 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
01191 CkMyPe(), sequence(), gbisPhase, workStarted);
01192
01193 PATCH_PAIRS_REF;
01194 FORCE_LISTS_REF;
01195
01196 if ( workStarted ) {
01197 if ( finishWork() ) {
01198 workStarted = 0;
01199 basePriority = PROXY_DATA_PRIORITY;
01200 } else {
01201 workStarted = 2;
01202 basePriority = PROXY_RESULTS_PRIORITY;
01203 if ( master == this && kernel_launch_state > 2 ) {
01204 cuda_check_local_progress(this,0.);
01205 }
01206 }
01207 return;
01208 }
01209
01210 workStarted = 1;
01211 basePriority = COMPUTE_PROXY_PRIORITY;
01212
01213 Molecule *mol = Node::Object()->molecule;
01214 Parameters *params = Node::Object()->parameters;
01215 SimParameters *simParams = Node::Object()->simParameters;
01216
01217
01218 if (!simParams->GBISOn || gbisPhase == 1) {
01219
01220
01221 if ( atomsChanged || computesChanged ) {
01222 int npatches = activePatches.size();
01223
01224 pairlistsValid = 0;
01225 pairlistTolerance = 0.;
01226
01227 if ( computesChanged ) {
01228 computesChanged = 0;
01229
01230 num_virials = npatches;
01231 if ( num_virials > num_virials_allocated ) {
01232 if ( num_virials_allocated ) {
01233 cudaFreeHost(virials);
01234 cudaFreeHost(energy_gbis);
01235 cuda_errcheck("in cudaFreeHost virials");
01236 }
01237 num_virials_allocated = 1.1 * num_virials + 1;
01238 cudaHostAlloc((void**)&virials,2*16*sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01239 cudaHostAlloc((void**)&energy_gbis,sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01240 for (int i = 0; i < num_virials_allocated; i++)
01241 energy_gbis[i] = 0.f;
01242 cuda_errcheck("in cudaHostAlloc virials");
01243 slow_virials = virials + 16*num_virials;
01244 }
01245
01246 int *ap = activePatches.begin();
01247 for ( int i=0; i<localActivePatches.size(); ++i ) {
01248 *(ap++) = localActivePatches[i];
01249 }
01250 for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01251 *(ap++) = remoteActivePatches[i];
01252 }
01253
01254
01255 cr_sortop so(lattice);
01256 std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01257 std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01258
01259 int nlc = localComputeRecords.size();
01260 int nrc = remoteComputeRecords.size();
01261 computeRecords.resize(nlc+nrc);
01262 compute_record *cr = computeRecords.begin();
01263 for ( int i=0; i<nlc; ++i ) {
01264 *(cr++) = localComputeRecords[i];
01265 }
01266 for ( int i=0; i<nrc; ++i ) {
01267 *(cr++) = remoteComputeRecords[i];
01268 }
01269
01270 force_lists.resize(npatches);
01271 for ( int i=0; i<npatches; ++i ) {
01272 patchRecords[activePatches[i]].localIndex = i;
01273 force_lists[i].force_list_size = 0;
01274 }
01275
01276 int ncomputes = computeRecords.size();
01277 patch_pairs.resize(ncomputes);
01278 for ( int i=0; i<ncomputes; ++i ) {
01279 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01280 int lp1 = patchRecords[cr.pid[0]].localIndex;
01281 int lp2 = patchRecords[cr.pid[1]].localIndex;
01282 force_lists[lp1].force_list_size++;
01283 patch_pair &pp = patch_pairs[i];
01284 pp.offset.x = cr.offset.x;
01285 pp.offset.y = cr.offset.y;
01286 pp.offset.z = cr.offset.z;
01287 }
01288
01289 for ( int i=0; i<ncomputes; ++i ) {
01290 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01291 int lp1 = patchRecords[cr.pid[0]].localIndex;
01292 patch_pair &pp = patch_pairs[i];
01293 pp.patch1_force_list_index = lp1;
01294 pp.patch1_force_list_size = force_lists[lp1].force_list_size;
01295 }
01296
01297 if ( 1 || simParams->outputCudaTiming ) {
01298 CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01299 CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01300 localComputeRecords.size(), remoteComputeRecords.size());
01301 }
01302 }
01303
01304 int istart = 0;
01305 int flstart = 0;
01306 int vlstart = 0;
01307 int max_atoms_per_patch = 0;
01308 int i;
01309 for ( i=0; i<npatches; ++i ) {
01310 if ( i == localActivePatches.size() ) {
01311 num_local_atom_records = istart;
01312 }
01313 force_lists[i].force_list_start = flstart;
01314 force_lists[i].force_output_start = istart;
01315 force_lists[i].atom_start = istart;
01316 force_lists[i].virial_list_start = vlstart;
01317 force_lists[i].virial_output_start = 16*i;
01318 patch_record &pr = patchRecords[activePatches[i]];
01319 pr.localStart = istart;
01320 int natoms = pr.p->getNumAtoms();
01321 int nfreeatoms = natoms;
01322 if ( fixedAtomsOn ) {
01323 const CompAtomExt *aExt = pr.xExt;
01324 for ( int j=0; j<natoms; ++j ) {
01325 if ( aExt[j].atomFixed ) --nfreeatoms;
01326 }
01327 }
01328 if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
01329 pr.numAtoms = natoms;
01330 pr.numFreeAtoms = nfreeatoms;
01331 force_lists[i].patch_size = nfreeatoms;
01332 if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
01333 if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
01334 force_lists[i].patch_stride = nfreeatoms;
01335 flstart += nfreeatoms * force_lists[i].force_list_size;
01336 vlstart += 16 * force_lists[i].force_list_size;
01337 istart += natoms;
01338 force_lists[i].force_list_size = 0;
01339 }
01340 if ( i == localActivePatches.size() ) {
01341 num_local_atom_records = istart;
01342 }
01343 num_force_records = flstart;
01344 num_atom_records = istart;
01345 num_remote_atom_records = num_atom_records - num_local_atom_records;
01346 if ( num_atom_records > num_atom_records_allocated ) {
01347 if ( num_atom_records_allocated ) {
01348
01349 cudaFreeHost(atom_params);
01350 cudaFreeHost(atoms);
01351 cudaFreeHost(forces);
01352 cudaFreeHost(slow_forces);
01353 if (simParams->GBISOn) {
01354 cudaFreeHost(intRad0H);
01355 cudaFreeHost(intRadSH);
01356 cudaFreeHost(psiSumH);
01357 cudaFreeHost(bornRadH);
01358 cudaFreeHost(dEdaSumH);
01359 cudaFreeHost(dHdrPrefixH);
01360 }
01361 cuda_errcheck("in cudaFreeHost");
01362 }
01363 num_atom_records_allocated = 1.1 * num_atom_records + 1;
01364
01365 cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
01366 cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
01367 cuda_errcheck("in cudaMallocHost atoms");
01368 cudaHostAlloc((void**)&forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01369 cudaHostAlloc((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01370
01371 if (simParams->GBISOn) {
01372 cudaMallocHost((void**)&intRad0H,sizeof(float)*num_atom_records_allocated);
01373 cudaMallocHost((void**)&intRadSH,sizeof(float)*num_atom_records_allocated);
01374 cudaHostAlloc((void**)&psiSumH, sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01375 cudaMallocHost((void**)&bornRadH,sizeof(float)*num_atom_records_allocated);
01376 cudaHostAlloc((void**)&dEdaSumH,sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01377 cudaMallocHost((void**)&dHdrPrefixH,sizeof(float)*num_atom_records_allocated);
01378 }
01379 cuda_errcheck("in cudaHostAlloc forces");
01380 }
01381
01382 int bfstart = 0;
01383 int ncomputes = computeRecords.size();
01384 for ( int i=0; i<ncomputes; ++i ) {
01385 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01386 int p1 = cr.pid[0];
01387 int p2 = cr.pid[1];
01388 int lp1 = patchRecords[p1].localIndex;
01389 int lp2 = patchRecords[p2].localIndex;
01390 patch_pair &pp = patch_pairs[i];
01391 pp.patch1_atom_start = patchRecords[p1].localStart;
01392 pp.patch1_force_start = force_lists[lp1].force_list_start +
01393 force_lists[lp1].patch_stride * force_lists[lp1].force_list_size;
01394 pp.virial_start = force_lists[lp1].virial_list_start +
01395 16 * force_lists[lp1].force_list_size;
01396 pp.patch1_size = patchRecords[p1].numAtoms;
01397 pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
01398 pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01399 pp.block_flags_start = bfstart;
01400 bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
01401
01402
01403 pp.patch2_atom_start = patchRecords[p2].localStart;
01404
01405
01406 pp.patch2_size = patchRecords[p2].numAtoms;
01407
01408
01409 force_lists[lp1].force_list_size++;
01410
01411
01412
01413
01414
01415 }
01416
01417 #if 0
01418 CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
01419 patch_pairs.size(), force_lists.size(),
01420 num_atom_records, num_force_records,
01421 max_atoms_per_patch);
01422 #endif
01423
01424 int totalmem = patch_pairs.size() * sizeof(patch_pair) +
01425 force_lists.size() * sizeof(force_list) +
01426 num_force_records * sizeof(float4) +
01427 num_atom_records * sizeof(atom) +
01428 num_atom_records * sizeof(atom_param) +
01429 num_atom_records * sizeof(float4);
01430 int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
01431
01432
01433
01434
01435
01436 cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01437 force_lists.begin(), force_lists.size(),
01438 num_atom_records, num_force_records,
01439 bfstart, max_atoms_per_patch);
01440
01441 }
01442
01443 double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01444
01445 Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01446 float maxAtomMovement = 0.;
01447 float maxPatchTolerance = 0.;
01448
01449 for ( int i=0; i<activePatches.size(); ++i ) {
01450 patch_record &pr = patchRecords[activePatches[i]];
01451
01452 float maxMove = pr.p->flags.maxAtomMovement;
01453 if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01454
01455 float maxTol = pr.p->flags.pairlistTolerance;
01456 if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01457
01458 int start = pr.localStart;
01459 int n = pr.numAtoms;
01460 const CompAtom *a = pr.x;
01461 const CompAtomExt *aExt = pr.xExt;
01462
01463 if ( atomsChanged ) {
01464 #if 0
01465 int nfree = pr.numFreeAtoms;
01466 if ( nfree != n ) {
01467 int k = 0;
01468 for ( int j=0; j<n; ++j ) {
01469
01470 if ( ! aExt[j].atomFixed ) ao[k++] = j;
01471 }
01472 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
01473
01474 if ( aExt[j].atomFixed ) ao[k++] = j;
01475 }
01476 } else {
01477 for ( int j=0; j<n; ++j ) {
01478 ao[j] = j;
01479 }
01480 }
01481
01482 sortAtomsForCUDA(ao,a,nfree,n);
01483 #endif
01484
01485 atom_param *ap = atom_params + start;
01486 for ( int k=0; k<n; ++k ) {
01487 int j = aExt[k].sortOrder;
01488 ap[k].vdw_type = a[j].vdwType;
01489 ap[k].index = aExt[j].id;
01490 #ifdef MEM_OPT_VERSION
01491 ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01492 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01493 #else
01494 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01495 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01496 #endif
01497 }
01498 }
01499 {
01500 #if 1
01501 const CudaAtom *ac = pr.p->getCudaAtomList();
01502 atom *ap = atoms + start;
01503 memcpy(ap, ac, sizeof(atom)*n);
01504 #else
01505 Vector center =
01506 pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01507 atom *ap = atoms + start;
01508 for ( int k=0; k<n; ++k ) {
01509 int j = aExt[k].sortOrder;
01510 ap[k].position.x = a[j].position.x - center.x;
01511 ap[k].position.y = a[j].position.y - center.y;
01512 ap[k].position.z = a[j].position.z - center.z;
01513 ap[k].charge = charge_scaling * a[j].charge;
01514 }
01515 #endif
01516 }
01517 }
01518
01519
01520
01521
01522
01523
01524 savePairlists = 0;
01525 usePairlists = 0;
01526 if ( flags.savePairlists ) {
01527 savePairlists = 1;
01528 usePairlists = 1;
01529 } else if ( flags.usePairlists ) {
01530 if ( ! pairlistsValid ||
01531 ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01532 reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01533 } else {
01534 usePairlists = 1;
01535 }
01536 }
01537 if ( ! usePairlists ) {
01538 pairlistsValid = 0;
01539 }
01540 float plcutoff = cutoff;
01541 if ( savePairlists ) {
01542 pairlistsValid = 1;
01543 pairlistTolerance = 2. * maxPatchTolerance;
01544 plcutoff += pairlistTolerance;
01545 }
01546 plcutoff2 = plcutoff * plcutoff;
01547
01548
01549
01550
01551 #if 0
01552
01553 if ( 1 ) {
01554 Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01555 Lattice &lattice = flags.lattice;
01556 float3 lata, latb, latc;
01557 lata.x = lattice.a().x;
01558 lata.y = lattice.a().y;
01559 lata.z = lattice.a().z;
01560 latb.x = lattice.b().x;
01561 latb.y = lattice.b().y;
01562 latb.z = lattice.b().z;
01563 latc.x = lattice.c().x;
01564 latc.y = lattice.c().y;
01565 latc.z = lattice.c().z;
01566
01567 int ncomputes = computeRecords.size();
01568 for ( int ic=0; ic<ncomputes; ++ic ) {
01569 patch_pair &pp = patch_pairs[ic];
01570 atom *a1 = atoms + pp.patch1_atom_start;
01571 int n1 = pp.patch1_size;
01572 atom *a2 = atoms + pp.patch2_atom_start;
01573 int n2 = pp.patch2_size;
01574 float offx = pp.offset.x * lata.x
01575 + pp.offset.y * latb.x
01576 + pp.offset.z * latc.x;
01577 float offy = pp.offset.x * lata.y
01578 + pp.offset.y * latb.y
01579 + pp.offset.z * latc.y;
01580 float offz = pp.offset.x * lata.z
01581 + pp.offset.y * latb.z
01582 + pp.offset.z * latc.z;
01583
01584 int atoms_tried = 0;
01585 int blocks_tried = 0;
01586 int atoms_used = 0;
01587 int blocks_used = 0;
01588 for ( int ii=0; ii<n1; ii+=32 ) {
01589 for ( int jj=0; jj<n2; jj+=16 ) {
01590 int block_used = 0;
01591 for ( int j=jj; j<jj+16 && j<n2; ++j ) {
01592 int atom_used = 0;
01593 for ( int i=ii; i<ii+32 && i<n1; ++i ) {
01594 float dx = offx + a1[i].position.x - a2[j].position.x;
01595 float dy = offy + a1[i].position.y - a2[j].position.y;
01596 float dz = offz + a1[i].position.z - a2[j].position.z;
01597 float r2 = dx*dx + dy*dy + dz*dz;
01598 if ( r2 < cutoff2 ) atom_used = 1;
01599 }
01600 ++atoms_tried;
01601 if ( atom_used ) { block_used = 1; ++atoms_used; }
01602 }
01603 ++blocks_tried;
01604 if ( block_used ) { ++blocks_used; }
01605 }
01606 }
01607 CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
01608 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01609 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01610 }
01611 }
01612 #endif
01613
01614 }
01615
01616
01617 if (simParams->GBISOn) {
01618
01619 for ( int i=0; i<activePatches.size(); ++i ) {
01620 patch_record &pr = master->patchRecords[activePatches[i]];
01621 GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01622 if (gbisPhase == 1) {
01623
01624 if (atomsChanged) {
01625 float *intRad0 = intRad0H + pr.localStart;
01626 float *intRadS = intRadSH + pr.localStart;
01627 for ( int k=0; k<pr.numAtoms; ++k ) {
01628 int j = pr.xExt[k].sortOrder;
01629 intRad0[k] = pr.intRad[2*j+0];
01630 intRadS[k] = pr.intRad[2*j+1];
01631 }
01632 }
01633 } else if (gbisPhase == 2) {
01634 float *bornRad = bornRadH + pr.localStart;
01635 for ( int k=0; k<pr.numAtoms; ++k ) {
01636 int j = pr.xExt[k].sortOrder;
01637 bornRad[k] = pr.bornRad[j];
01638 }
01639 } else if (gbisPhase == 3) {
01640 float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01641 for ( int k=0; k<pr.numAtoms; ++k ) {
01642 int j = pr.xExt[k].sortOrder;
01643 dHdrPrefix[k] = pr.dHdrPrefix[j];
01644 }
01645 }
01646 }
01647 }
01648
01649 kernel_time = CkWallTimer();
01650 kernel_launch_state = 1;
01651 if ( gpu_is_mine ) recvYieldDevice(-1);
01652 }
01653
01654 void cuda_check_remote_calc(void *arg, double ) {
01655
01656
01657 if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01658
01659 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01660
01661 } else {
01662 CUDA_POLL(cuda_check_remote_calc, arg);
01663 }
01664 }
01665
01666 void cuda_check_local_calc(void *arg, double ) {
01667
01668
01669 if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01670
01671 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01672
01673 } else {
01674 CUDA_POLL(cuda_check_local_calc, arg);
01675 }
01676 }
01677
01678
01679
01680 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01681 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01682 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01683 workStarted, gbisPhase, kernel_launch_state, pe)
01684
01685 float3 lata, latb, latc;
01686 lata.x = lattice.a().x;
01687 lata.y = lattice.a().y;
01688 lata.z = lattice.a().z;
01689 latb.x = lattice.b().x;
01690 latb.y = lattice.b().y;
01691 latb.z = lattice.b().z;
01692 latc.x = lattice.c().x;
01693 latc.y = lattice.c().y;
01694 latc.z = lattice.c().z;
01695 SimParameters *simParams = Node::Object()->simParameters;
01696
01697 switch ( kernel_launch_state ) {
01699
01700 case 1:
01701 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01702 ++kernel_launch_state;
01703 gpu_is_mine = 0;
01704 remote_submit_time = CkWallTimer();
01705
01706 if (!simParams->GBISOn || gbisPhase == 1) {
01707
01708 if ( atomsChanged ) {
01709 cuda_bind_atom_params(atom_params);
01710 }
01711 if ( simParams->GBISOn) {
01712 cuda_bind_GBIS_psiSum(psiSumH);
01713 if ( atomsChanged ) {
01714 cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01715 }
01716 }
01717 cuda_bind_atoms(atoms);
01718 cuda_bind_forces(forces, slow_forces);
01719 cuda_bind_virials(virials);
01720 cuda_bind_GBIS_energy(energy_gbis);
01721 cudaEventRecord(start_calc, stream);
01722
01723 cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01724 localComputeRecords.size(),remoteComputeRecords.size(),
01725 localActivePatches.size(),remoteActivePatches.size(),
01726 doSlow, doEnergy, usePairlists, savePairlists, stream);
01727 if (simParams->GBISOn) {
01728 cuda_GBIS_P1(
01729 localComputeRecords.size(),remoteComputeRecords.size(),
01730 localActivePatches.size(),remoteActivePatches.size(),
01731 simParams->alpha_cutoff-simParams->fsMax,
01732 simParams->coulomb_radius_offset,
01733 lata, latb, latc, stream);
01734 }
01735
01736
01737 cudaEventRecord(end_remote_download, stream);
01738 CUDA_POLL(cuda_check_remote_progress,this);
01739 if ( shared_gpu && ! mergegrids ) {
01740 CUDA_POLL(cuda_check_remote_calc,this);
01741 break;
01742 }
01743 }
01744 if (simParams->GBISOn) {
01745 if (gbisPhase == 1) {
01746
01747 } else if (gbisPhase == 2) {
01748 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01749
01750 cuda_bind_GBIS_bornRad(bornRadH);
01751 cuda_bind_GBIS_dEdaSum(dEdaSumH);
01752 cudaEventRecord(start_calc, stream);
01753 cuda_GBIS_P2(
01754 localComputeRecords.size(),remoteComputeRecords.size(),
01755 localActivePatches.size(),remoteActivePatches.size(),
01756 (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01757 simParams->nonbondedScaling, simParams->kappa,
01758 (simParams->switchingActive ? simParams->switchingDist : -1.0),
01759 simParams->dielectric, simParams->solvent_dielectric,
01760 lata, latb, latc,
01761 doEnergy, doSlow, stream
01762 );
01763 cudaEventRecord(end_remote_download, stream);
01764 CUDA_POLL(cuda_check_remote_progress,this);
01765 } else if (gbisPhase == 3) {
01766 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01767
01768 cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01769 cudaEventRecord(start_calc, stream);
01770 if (doSlow)
01771 cuda_GBIS_P3(
01772 localComputeRecords.size(),remoteComputeRecords.size(),
01773 localActivePatches.size(),remoteActivePatches.size(),
01774 (simParams->alpha_cutoff-simParams->fsMax),
01775 simParams->coulomb_radius_offset,
01776 simParams->nonbondedScaling,
01777 lata, latb, latc, stream
01778 );
01779 cudaEventRecord(end_remote_download, stream);
01780 CUDA_POLL(cuda_check_remote_progress,this);
01781 }
01782 }
01783
01785
01786 case 2:
01787 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01788 ++kernel_launch_state;
01789 gpu_is_mine = 0;
01790
01791 #if 0
01792 cudaStreamWaitEvent(stream2, start_calc, 0);
01793 #else
01794 #define stream2 stream
01795 #endif
01796
01797 if (!simParams->GBISOn || gbisPhase == 1) {
01798
01799 cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01800 0,localComputeRecords.size(),
01801 0,localActivePatches.size(),
01802 doSlow, doEnergy, usePairlists, savePairlists, stream2);
01803 if (simParams->GBISOn) {
01804 cuda_GBIS_P1(
01805 0,localComputeRecords.size(),
01806 0,localActivePatches.size(),
01807 simParams->alpha_cutoff-simParams->fsMax,
01808 simParams->coulomb_radius_offset,
01809 lata, latb, latc, stream2 );
01810 }
01811
01812
01813
01814 cudaEventRecord(end_local_download, stream2);
01815 if ( workStarted == 2 ) {
01816 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01817 cuda_check_local_progress\n", CkMyPe())
01818 CUDA_POLL(cuda_check_local_progress,this);
01819 }
01820 if ( shared_gpu && ! mergegrids ) {
01821 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01822 cuda_check_local_calc\n", CkMyPe())
01823 CUDA_POLL(cuda_check_local_calc,this);
01824 break;
01825 }
01826
01827 }
01828 if (simParams->GBISOn) {
01829 if (gbisPhase == 1) {
01830
01831 } else if (gbisPhase == 2) {
01832 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01833 cuda_GBIS_P2(
01834 0,localComputeRecords.size(),
01835 0,localActivePatches.size(),
01836 (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01837 simParams->nonbondedScaling, simParams->kappa,
01838 (simParams->switchingActive ? simParams->switchingDist : -1.0),
01839 simParams->dielectric, simParams->solvent_dielectric,
01840 lata, latb, latc,
01841 doEnergy, doSlow, stream2
01842 );
01843 cudaEventRecord(end_local_download, stream2);
01844 if ( workStarted == 2 ) {
01845 CUDA_POLL(cuda_check_local_progress,this);
01846 }
01847 } else if (gbisPhase == 3) {
01848 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01849 if (doSlow)
01850 cuda_GBIS_P3(
01851 0,localComputeRecords.size(),
01852 0,localActivePatches.size(),
01853 (simParams->alpha_cutoff-simParams->fsMax),
01854 simParams->coulomb_radius_offset,
01855 simParams->nonbondedScaling,
01856 lata, latb, latc, stream2
01857 );
01858 cudaEventRecord(end_local_download, stream2);
01859 if ( workStarted == 2 ) {
01860 CUDA_POLL(cuda_check_local_progress,this);
01861 }
01862 }
01863 }
01864
01865
01866 default:
01867 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01868 gpu_is_mine = 1;
01869 break;
01870 }
01871 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01872 }
01873
01874
01875 void ComputeNonbondedCUDA::messageFinishWork() {
01876
01877 cuda_errcheck("at cuda stream completed");
01878
01879 for ( int i = 0; i < numSlaves; ++i ) {
01880 computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01881 }
01882
01883 WorkDistrib::messageEnqueueWork(this);
01884
01885 }
01886
01887
01888 int ComputeNonbondedCUDA::finishWork() {
01889 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01890 CkMyPe(), workStarted, gbisPhase)
01891
01892 Molecule *mol = Node::Object()->molecule;
01893 SimParameters *simParams = Node::Object()->simParameters;
01894
01895 ResizeArray<int> &patches( workStarted == 1 ?
01896 remoteHostedPatches : localHostedPatches );
01897
01898
01899 if ( !simParams->GBISOn || gbisPhase == 1 ) {
01900 for ( int i=0; i<patches.size(); ++i ) {
01901 patch_record &pr = master->patchRecords[patches[i]];
01902 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01903 pr.r = pr.forceBox->open();
01904 }
01905 }
01906
01907
01908
01909
01910 for ( int i=0; i<patches.size(); ++i ) {
01911
01912 patch_record &pr = master->patchRecords[patches[i]];
01913
01914 int start = pr.localStart;
01915 const CompAtomExt *aExt = pr.xExt;
01916 if ( !simParams->GBISOn || gbisPhase == 3 ) {
01917 int nfree = pr.numFreeAtoms;
01918 pr.f = pr.r->f[Results::nbond];
01919 Force *f = pr.f;
01920 Force *f_slow = pr.r->f[Results::slow];
01921 const CompAtom *a = pr.x;
01922
01923 float4 *af = master->forces + start;
01924 float4 *af_slow = master->slow_forces + start;
01925
01926 for ( int k=0; k<nfree; ++k ) {
01927 int j = aExt[k].sortOrder;
01928 f[j].x += af[k].x;
01929 f[j].y += af[k].y;
01930 f[j].z += af[k].z;
01931
01932
01933 if ( doSlow ) {
01934 f_slow[j].x += af_slow[k].x;
01935 f_slow[j].y += af_slow[k].y;
01936 f_slow[j].z += af_slow[k].z;
01937
01938 }
01939 }
01940
01941 #if 1
01942
01943 if ( CkNumPes() == 1 ) {
01944 const CompAtomExt *aExt = pr.xExt;
01945 for ( int k=0; k<nfree; ++k ) {
01946 int j = aExt[k].sortOrder;
01947 #ifdef MEM_OPT_VERSION
01948 int excl_expected = mol->exclSigPool[aExt[j].exclId].fullExclCnt + 1;
01949 #else
01950 int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01951 #endif
01952 if ( af[k].w != excl_expected ) {
01953 CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01954 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01955 }
01956 }
01957 }
01958 #endif
01959 }
01960
01961 #if 0
01962 if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01963 CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01964 j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01965 af[j].w);
01966 }
01967 #endif
01968
01969
01970 if (simParams->GBISOn) {
01971 if (gbisPhase == 1) {
01972
01973 GBReal *psiSumMaster = master->psiSumH + start;
01974 for ( int k=0; k<pr.numAtoms; ++k ) {
01975 int j = aExt[k].sortOrder;
01976 pr.psiSum[j] += psiSumMaster[k];
01977 }
01978 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01979 pr.psiSumBox->close(&(pr.psiSum));
01980
01981 } else if (gbisPhase == 2) {
01982
01983 GBReal *dEdaSumMaster = master->dEdaSumH + start;
01984 for ( int k=0; k<pr.numAtoms; ++k ) {
01985 int j = aExt[k].sortOrder;
01986 pr.dEdaSum[j] += dEdaSumMaster[k];
01987 }
01988 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01989 pr.dEdaSumBox->close(&(pr.dEdaSum));
01990
01991 } else if (gbisPhase == 3) {
01992 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01993 pr.intRadBox->close(&(pr.intRad));
01994 pr.bornRadBox->close(&(pr.bornRad));
01995 pr.dHdrPrefixBox->close(&(pr.dHdrPrefix));
01996 pr.positionBox->close(&(pr.x));
01997 pr.forceBox->close(&(pr.r));
01998 }
01999 } else {
02000 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
02001 pr.positionBox->close(&(pr.x));
02002 pr.forceBox->close(&(pr.r));
02003 }
02004 }
02005
02006 #if 0
02007 virial *= (-1./6.);
02008 reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
02009 reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
02010 reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
02011 if ( doSlow ) {
02012 virial_slow *= (-1./6.);
02013 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
02014 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
02015 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02016 }
02017 #endif
02018
02019 if ( workStarted == 1 && ! mergegrids &&
02020 ( localHostedPatches.size() || master == this ) ) {
02021 GBISP("not finished, call again\n");
02022 return 0;
02023 }
02024
02025 if ( master != this ) {
02026 GBISP("finished\n");
02027 if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);
02028 atomsChanged = 0;
02029 return 1;
02030 }
02031
02032 cuda_timer_total += kernel_time;
02033
02034 if ( !simParams->GBISOn || gbisPhase == 3 ) {
02035
02036 {
02037 Tensor virial_tensor;
02038 BigReal energyv = 0.;
02039 BigReal energye = 0.;
02040 BigReal energys = 0.;
02041 for ( int i = 0; i < num_virials; ++i ) {
02042 virial_tensor.xx += virials[16*i];
02043 virial_tensor.xy += virials[16*i+1];
02044 virial_tensor.xz += virials[16*i+2];
02045 virial_tensor.yx += virials[16*i+3];
02046 virial_tensor.yy += virials[16*i+4];
02047 virial_tensor.yz += virials[16*i+5];
02048 virial_tensor.zx += virials[16*i+6];
02049 virial_tensor.zy += virials[16*i+7];
02050 virial_tensor.zz += virials[16*i+8];
02051 energyv += virials[16*i+9];
02052 energye += virials[16*i+10];
02053 energys += virials[16*i+11];
02054 if (simParams->GBISOn) {
02055 energye += energy_gbis[i];
02056 }
02057 }
02058 ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02059 if ( doEnergy ) {
02060 reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02061 reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02062 reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02063 }
02064 }
02065 if ( doSlow ) {
02066 Tensor virial_slow_tensor;
02067 for ( int i = 0; i < num_virials; ++i ) {
02068 virial_slow_tensor.xx += slow_virials[16*i];
02069 virial_slow_tensor.xy += slow_virials[16*i+1];
02070 virial_slow_tensor.xz += slow_virials[16*i+2];
02071 virial_slow_tensor.yx += slow_virials[16*i+3];
02072 virial_slow_tensor.yy += slow_virials[16*i+4];
02073 virial_slow_tensor.yz += slow_virials[16*i+5];
02074 virial_slow_tensor.zx += slow_virials[16*i+6];
02075 virial_slow_tensor.zy += slow_virials[16*i+7];
02076 virial_slow_tensor.zz += slow_virials[16*i+8];
02077 }
02078 ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02079 }
02080
02081 atomsChanged = 0;
02082 reduction->submit();
02083
02084 cuda_timer_count++;
02085 if ( simParams->outputCudaTiming &&
02086 step % simParams->outputCudaTiming == 0 ) {
02087
02088
02089
02090
02091
02092
02093
02094 #if 0
02095 float upload_ms, remote_calc_ms;
02096 float local_calc_ms, total_ms;
02097 cuda_errcheck("before event timers");
02098 cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02099 cuda_errcheck("in event timer 1");
02100 cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02101 cuda_errcheck("in event timer 2");
02102 cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02103 cuda_errcheck("in event timer 3");
02104 cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02105 cuda_errcheck("in event timer 4");
02106 cuda_errcheck("in event timers");
02107
02108 CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02109 CkMyPe(), upload_ms, remote_calc_ms,
02110 local_calc_ms, total_ms);
02111 #endif
02112
02113 if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02114 cuda_timer_total /= cuda_timer_count;
02115 CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
02116 step, cuda_timer_total * 1.e3, CkMyPe());
02117 }
02118 cuda_timer_count = 0;
02119 cuda_timer_total = 0;
02120 }
02121
02122 }
02123
02124
02125 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02126 if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);
02127
02128 GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02129 return 1;
02130 }
02131
02132
02133 #endif // NAMD_CUDA
02134