00001
00002 #include "common.h"
00003 #include "charm++.h"
00004
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDA.h"
00014 #include "ComputeNonbondedCUDAKernel.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include <algorithm>
00019
00020 #include "NamdTypes.h"
00021
00022
00023 #undef PRINT_GBIS
00024
00025 #ifdef NAMD_CUDA
00026
00027 #ifdef WIN32
00028 #define __thread __declspec(thread)
00029 #endif
00030
00031 extern __thread cudaStream_t stream;
00032 extern __thread cudaStream_t stream2;
00033
00034 void cuda_errcheck(const char *msg) {
00035 cudaError_t err;
00036 if ((err = cudaGetLastError()) != cudaSuccess) {
00037 char host[128];
00038 #ifdef NOHOSTNAME
00039 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041 gethostname(host, 128); host[127] = 0;
00042 #endif
00043 char devstr[128] = "";
00044 int devnum;
00045 if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046 sprintf(devstr, " device %d", devnum);
00047 }
00048 char errmsg[1024];
00049 sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050 NAMD_die(errmsg);
00051 }
00052 }
00053
00054 void cuda_die(const char *msg) {
00055 char host[128];
00056 #ifdef NOHOSTNAME
00057 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059 gethostname(host, 128); host[127] = 0;
00060 #endif
00061 char devstr[128] = "";
00062 int devnum;
00063 if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064 sprintf(devstr, " device %d", devnum);
00065 }
00066 char errmsg[1024];
00067 sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068 NAMD_die(errmsg);
00069 }
00070
00071 char *devicelist;
00072 static __thread int usedevicelist;
00073 static __thread int ignoresharing;
00074 static __thread int mergegrids;
00075
00076 void cuda_getargs(char **argv) {
00077 devicelist = 0;
00078 usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079 "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080 ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081 mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }
00083
00084 static __thread int shared_gpu;
00085 static __thread int first_pe_sharing_gpu;
00086 static __thread int next_pe_sharing_gpu;
00087 static __thread int devicePe;
00088 static __thread int numPesSharingDevice;
00089 static __thread int *pesSharingDevice;
00090
00091 static __thread int gpu_is_mine;
00092
00093 int cuda_device_pe() { return devicePe; }
00094
00095 bool cuda_device_shared_with_pe(int pe) {
00096 for ( int i=0; i<numPesSharingDevice; ++i ) {
00097 if ( pesSharingDevice[i] == pe ) return true;
00098 }
00099 return false;
00100 }
00101
00102 static inline bool sortop_bitreverse(int a, int b) {
00103 if ( a == b ) return 0;
00104 for ( int bit = 1; bit; bit *= 2 ) {
00105 if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106 }
00107 return 0;
00108 }
00109
00110
00111
00112
00113
00114
00115
00116
00117 #define CUDA_EVENT_ID_BASE 100
00118 #define CUDA_TRACE_REMOTE(START,END) \
00119 do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00120 CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)
00121 #define CUDA_TRACE_LOCAL(START,END) \
00122 do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00123 CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)
00124
00125 void cuda_register_user_events() {
00126
00127 #define REGISTER_DEVICE_EVENTS(DEV) \
00128 traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00129 traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00130
00131 REGISTER_DEVICE_EVENTS(0)
00132 REGISTER_DEVICE_EVENTS(1)
00133 REGISTER_DEVICE_EVENTS(2)
00134 REGISTER_DEVICE_EVENTS(3)
00135 REGISTER_DEVICE_EVENTS(4)
00136 REGISTER_DEVICE_EVENTS(5)
00137 REGISTER_DEVICE_EVENTS(6)
00138 REGISTER_DEVICE_EVENTS(7)
00139 REGISTER_DEVICE_EVENTS(8)
00140 REGISTER_DEVICE_EVENTS(9)
00141 REGISTER_DEVICE_EVENTS(10)
00142 REGISTER_DEVICE_EVENTS(11)
00143 REGISTER_DEVICE_EVENTS(12)
00144 REGISTER_DEVICE_EVENTS(13)
00145 REGISTER_DEVICE_EVENTS(14)
00146 REGISTER_DEVICE_EVENTS(15)
00147
00148 }
00149
00150 void cuda_initialize() {
00151
00152 if ( 0 == CkMyPe() ) cuda_register_user_events();
00153
00154 char host[128];
00155 #ifdef NOHOSTNAME
00156 sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00157 #else
00158 gethostname(host, 128); host[127] = 0;
00159 #endif
00160
00161 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00162 int myRankInPhysicalNode;
00163 int numPesOnPhysicalNode;
00164 int *pesOnPhysicalNode;
00165 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00166 &pesOnPhysicalNode,&numPesOnPhysicalNode);
00167
00168 {
00169 int i;
00170 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00171 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00172 i = numPesOnPhysicalNode;
00173 break;
00174 }
00175 if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00176 }
00177 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00178 CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00179 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180 CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00181 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00182 }
00183 myRankInPhysicalNode = 0;
00184 numPesOnPhysicalNode = 1;
00185 pesOnPhysicalNode = new int[1];
00186 pesOnPhysicalNode[0] = CkMyPe();
00187 } else {
00188 myRankInPhysicalNode = i;
00189 }
00190 }
00191
00192
00193 int deviceCount = 0;
00194 cudaGetDeviceCount(&deviceCount);
00195 cuda_errcheck("in cudaGetDeviceCount");
00196 if ( deviceCount <= 0 ) {
00197 cuda_die("No CUDA devices found.");
00198 }
00199
00200 int *devices;
00201 int ndevices = 0;
00202 int nexclusive = 0;
00203 if ( usedevicelist ) {
00204 devices = new int[strlen(devicelist)];
00205 int i = 0;
00206 while ( devicelist[i] ) {
00207 ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00208 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00209 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00210 }
00211 } else {
00212 if ( ! CkMyPe() ) {
00213 CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00214 }
00215 devices = new int[deviceCount];
00216 for ( int i=0; i<deviceCount; ++i ) {
00217 int dev = i % deviceCount;
00218 #if CUDA_VERSION >= 2020
00219 cudaDeviceProp deviceProp;
00220 cudaGetDeviceProperties(&deviceProp, dev);
00221 cuda_errcheck("in cudaGetDeviceProperties");
00222 if ( deviceProp.computeMode != cudaComputeModeProhibited
00223 && (deviceProp.major > 1 || deviceProp.minor >= 1)
00224 && deviceProp.canMapHostMemory
00225 && deviceProp.multiProcessorCount > 2 ) {
00226 devices[ndevices++] = dev;
00227 }
00228 if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00229 ++nexclusive;
00230 }
00231 #else
00232 devices[ndevices++] = dev;
00233 #endif
00234 }
00235 }
00236
00237 if ( ! ndevices ) {
00238 cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, or otherwise unusable.");
00239 }
00240
00241 shared_gpu = 0;
00242 gpu_is_mine = 1;
00243 first_pe_sharing_gpu = CkMyPe();
00244 next_pe_sharing_gpu = CkMyPe();
00245
00246 {
00247
00248 int dev;
00249 if ( numPesOnPhysicalNode > 1 ) {
00250 int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00251 dev = devices[myDeviceRank];
00252 devicePe = CkMyPe();
00253 if ( ignoresharing ) {
00254 pesSharingDevice = new int[1];
00255 pesSharingDevice[0] = CkMyPe();
00256 numPesSharingDevice = 1;
00257 } else {
00258 pesSharingDevice = new int[numPesOnPhysicalNode];
00259 devicePe = -1;
00260 numPesSharingDevice = 0;
00261 for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00262 if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00263 int thisPe = pesOnPhysicalNode[i];
00264 pesSharingDevice[numPesSharingDevice++] = thisPe;
00265 if ( devicePe < 1 ) devicePe = thisPe;
00266 if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00267 }
00268 }
00269 for ( int j = 0; j < ndevices; ++j ) {
00270 if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00271 }
00272 }
00273 if ( shared_gpu && devicePe == CkMyPe() ) {
00274 CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00275 }
00276 } else {
00277 dev = devices[CkMyPe() % ndevices];
00278 devicePe = CkMyPe();
00279 pesSharingDevice = new int[1];
00280 pesSharingDevice[0] = CkMyPe();
00281 numPesSharingDevice = 1;
00282 }
00283
00284 if ( devicePe != CkMyPe() ) {
00285 CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00286 CkMyPe(), myRankInPhysicalNode, devicePe);
00287 return;
00288 }
00289
00290
00291
00292 first_pe_sharing_gpu = CkMyPe();
00293 next_pe_sharing_gpu = CkMyPe();
00294
00295 gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() );
00296
00297 if ( dev >= deviceCount ) {
00298 char buf[256];
00299 sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00300 CkMyPe(), dev, host, deviceCount);
00301 NAMD_die(buf);
00302 }
00303
00304 cudaError_t err;
00305 cudaDeviceProp deviceProp;
00306 err = cudaGetDeviceProperties(&deviceProp, dev);
00307 if (err == cudaSuccess) {
00308 CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %dMB Rev: %d.%d\n",
00309 CkMyPe(), myRankInPhysicalNode, dev, host,
00310 deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00311 deviceProp.major, deviceProp.minor);
00312
00313 err = cudaSetDevice(dev);
00314 }
00315 if ( err != cudaSuccess) {
00316 char errmsg[1024];
00317 sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00318 dev, CkMyPe(), cudaGetErrorString(err));
00319 NAMD_die(errmsg);
00320 }
00321
00322 }
00323
00324 cudaSetDeviceFlags(cudaDeviceMapHost);
00325 cuda_errcheck("in cudaSetDeviceFlags");
00326
00327 int dev;
00328 if ( cudaGetDevice(&dev) == cudaSuccess ) {
00329 cudaDeviceProp deviceProp;
00330 cudaGetDeviceProperties(&deviceProp, dev);
00331 cuda_errcheck("in cudaGetDeviceProperties");
00332 if ( deviceProp.computeMode == cudaComputeModeProhibited )
00333 cuda_die("device in prohibited mode");
00334 if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00335 cuda_die("device not of compute capability 1.1 or higher");
00336 if ( ! deviceProp.canMapHostMemory )
00337 cuda_die("device cannot map host memory");
00338 }
00339
00340 if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00341 if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00342 if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00343 if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00344
00345 }
00346
00347 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00348 static __thread ComputeMgr *computeMgr = 0;
00349
00350 void send_build_cuda_force_table() {
00351 computeMgr->sendBuildCudaForceTable();
00352 }
00353
00354 void build_cuda_force_table() {
00355 if ( devicePe != CkMyPe() ) return;
00356 ComputeNonbondedCUDA::build_lj_table();
00357 ComputeNonbondedCUDA::build_force_table();
00358 }
00359
00360 void ComputeNonbondedCUDA::build_lj_table() {
00361
00362 const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00363 const int dim = ljTable->get_table_dim();
00364
00365
00366 int tsize = (((dim+16+31)/32)*32)-16;
00367 if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00368
00369 float2 *t = new float2[tsize*tsize];
00370 float2 *row = t;
00371 for ( int i=0; i<dim; ++i, row += tsize ) {
00372 for ( int j=0; j<dim; ++j ) {
00373 const LJTable::TableEntry *e = ljTable->table_val(i,j);
00374 row[j].x = e->A * scaling;
00375 row[j].y = e->B * scaling;
00376 }
00377 }
00378
00379 cuda_bind_lj_table(t,tsize);
00380 delete [] t;
00381
00382 if ( ! CkMyPe() ) {
00383 CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00384 }
00385 }
00386
00387 void ComputeNonbondedCUDA::build_force_table() {
00388
00389 float4 t[FORCE_TABLE_SIZE];
00390 float4 et[FORCE_TABLE_SIZE];
00391
00392 const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00393 const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00394
00395 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00396
00397 double r2list[FORCE_TABLE_SIZE];
00398 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00399 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00400 r2list[i] = r*r + r2_delta;
00401 }
00402
00403 union { double f; int32 i[2]; } byte_order_test;
00404 byte_order_test.f = 1.0;
00405 int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00406
00407 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00408 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00409 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
00410
00411 if ( r > cutoff ) {
00412 t[i].x = 0.;
00413 t[i].y = 0.;
00414 t[i].z = 0.;
00415 t[i].w = 0.;
00416 et[i].x = 0.;
00417 et[i].y = 0.;
00418 et[i].z = 0.;
00419 et[i].w = 0.;
00420 continue;
00421 }
00422
00423 BigReal diffa = r2list[i] - r2_table[table_i];
00424
00425
00426
00427 {
00428 BigReal table_a = fast_table[4*table_i];
00429 BigReal table_b = fast_table[4*table_i+1];
00430 BigReal table_c = fast_table[4*table_i+2];
00431 BigReal table_d = fast_table[4*table_i+3];
00432 BigReal grad =
00433 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00434 t[i].x = 2. * grad;
00435 BigReal ener = table_a + diffa *
00436 ( ( table_d * diffa + table_c ) * diffa + table_b);
00437 et[i].x = ener;
00438 }
00439
00440
00441
00442
00443 {
00444 BigReal table_a = scor_table[4*table_i];
00445 BigReal table_b = scor_table[4*table_i+1];
00446 BigReal table_c = scor_table[4*table_i+2];
00447 BigReal table_d = scor_table[4*table_i+3];
00448 BigReal grad =
00449 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00450 t[i].w = 2. * grad;
00451 BigReal ener = table_a + diffa *
00452 ( ( table_d * diffa + table_c ) * diffa + table_b);
00453 et[i].w = ener;
00454 }
00455
00456
00457
00458
00459 {
00460 BigReal table_a = vdwb_table[4*table_i];
00461 BigReal table_b = vdwb_table[4*table_i+1];
00462 BigReal table_c = vdwb_table[4*table_i+2];
00463 BigReal table_d = vdwb_table[4*table_i+3];
00464 BigReal grad =
00465 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00466 t[i].y = 2. * -1. * grad;
00467 BigReal ener = table_a + diffa *
00468 ( ( table_d * diffa + table_c ) * diffa + table_b);
00469 et[i].y = -1. * ener;
00470 }
00471
00472
00473
00474
00475 {
00476 BigReal table_a = vdwa_table[4*table_i];
00477 BigReal table_b = vdwa_table[4*table_i+1];
00478 BigReal table_c = vdwa_table[4*table_i+2];
00479 BigReal table_d = vdwa_table[4*table_i+3];
00480 BigReal grad =
00481 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00482 t[i].z = 2. * grad;
00483 BigReal ener = table_a + diffa *
00484 ( ( table_d * diffa + table_c ) * diffa + table_b);
00485 et[i].z = ener;
00486 }
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501 }
00502
00503 t[0].x = 0.f;
00504 t[0].y = 0.f;
00505 t[0].z = 0.f;
00506 t[0].w = 0.f;
00507 et[0].x = et[1].x;
00508 et[0].y = et[1].y;
00509 et[0].z = et[1].z;
00510 et[0].w = et[1].w;
00511
00512 cuda_bind_force_table(t,et);
00513
00514 if ( ! CkMyPe() ) {
00515 CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00516 }
00517 }
00518
00519 struct exlist_sortop {
00520 bool operator() (int32 *li, int32 *lj) {
00521 return ( li[1] < lj[1] );
00522 }
00523 };
00524
00525 static __thread int2 *exclusionsByAtom;
00526
00527 void ComputeNonbondedCUDA::build_exclusions() {
00528 Molecule *mol = Node::Object()->molecule;
00529
00530 #ifdef MEM_OPT_VERSION
00531 int natoms = mol->exclSigPoolSize;
00532 #else
00533 int natoms = mol->numAtoms;
00534 #endif
00535
00536 exclusionsByAtom = new int2[natoms];
00537
00538
00539
00540 ObjectArena<int32> listArena;
00541 ResizeArray<int32*> unique_lists;
00542 int32 **listsByAtom = new int32*[natoms];
00543 SortableResizeArray<int32> curList;
00544 for ( int i=0; i<natoms; ++i ) {
00545 curList.resize(0);
00546 curList.add(0);
00547 #ifdef MEM_OPT_VERSION
00548 const ExclusionSignature *sig = mol->exclSigPool + i;
00549 int n = sig->fullExclCnt;
00550 for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00551 n += 1;
00552 #else
00553 const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00554 int n = mol_list[0] + 1;
00555 for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00556 #endif
00557 curList.sort();
00558
00559 int j;
00560 for ( j=0; j<unique_lists.size(); ++j ) {
00561 if ( n != unique_lists[j][0] ) continue;
00562 int k;
00563 for ( k=0; k<n; ++k ) {
00564 if ( unique_lists[j][k+3] != curList[k] ) break;
00565 }
00566 if ( k == n ) break;
00567 }
00568 if ( j == unique_lists.size() ) {
00569 int32 *list = listArena.getNewArray(n+3);
00570 list[0] = n;
00571 int maxdiff = 0;
00572 maxdiff = -1 * curList[0];
00573 if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00574 list[1] = maxdiff;
00575 for ( int k=0; k<n; ++k ) {
00576 list[k+3] = curList[k];
00577 }
00578 unique_lists.add(list);
00579 }
00580 listsByAtom[i] = unique_lists[j];
00581 }
00582
00583 std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00584 long int totalbits = 0;
00585 int nlists = unique_lists.size();
00586 for ( int j=0; j<nlists; ++j ) {
00587 int32 *list = unique_lists[j];
00588 int maxdiff = list[1];
00589 list[2] = totalbits + maxdiff;
00590 totalbits += 2*maxdiff + 1;
00591 }
00592 for ( int i=0; i<natoms; ++i ) {
00593 exclusionsByAtom[i].x = listsByAtom[i][1];
00594 exclusionsByAtom[i].y = listsByAtom[i][2];
00595 }
00596 delete [] listsByAtom;
00597
00598 if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00599
00600 if ( ! CkMyPe() ) {
00601 long int bytesneeded = totalbits / 8;
00602 CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00603 unique_lists.size(), bytesneeded);
00604
00605 long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00606 if ( bytesneeded > bytesavail ) {
00607 char errmsg[512];
00608 sprintf(errmsg,"%ld bytes of CUDA memory needed for exclusions "
00609 "but only %ld bytes can be addressed with 32-bit int.",
00610 bytesneeded, bytesavail);
00611 NAMD_die(errmsg);
00612 }
00613 }
00614
00615 #define SET_EXCL(EXCL,BASE,DIFF) \
00616 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00617
00618 unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00619 memset(exclusion_bits, 0, totalbits/8);
00620
00621 long int base = 0;
00622 for ( int i=0; i<unique_lists.size(); ++i ) {
00623 base += unique_lists[i][1];
00624 if ( base != unique_lists[i][2] ) {
00625 NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00626 }
00627 int n = unique_lists[i][0];
00628 for ( int j=0; j<n; ++j ) {
00629 SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00630 }
00631 base += unique_lists[i][1] + 1;
00632 }
00633
00634 cuda_bind_exclusions(exclusion_bits, totalbits/32);
00635
00636 delete [] exclusion_bits;
00637 }
00638
00639
00640 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00641
00642 if ( ! cudaCompute ) NAMD_bug("register_self called early");
00643
00644 cudaCompute->requirePatch(pid);
00645
00646 ComputeNonbondedCUDA::compute_record cr;
00647 cr.c = c;
00648 cr.pid[0] = pid; cr.pid[1] = pid;
00649 cr.offset = 0.;
00650 if ( cudaCompute->patchRecords[pid].isLocal ) {
00651 cudaCompute->localComputeRecords.add(cr);
00652 } else {
00653 cudaCompute->remoteComputeRecords.add(cr);
00654 }
00655 }
00656
00657 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00658
00659 if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00660
00661 cudaCompute->requirePatch(pid[0]);
00662 cudaCompute->requirePatch(pid[1]);
00663
00664 ComputeNonbondedCUDA::compute_record cr, cr2;
00665 cr.c = c; cr2.c = c;
00666 cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
00667 cr2.pid[0] = pid[1]; cr2.pid[1] = pid[0];
00668
00669 int t1 = t[0];
00670 int t2 = t[1];
00671 Vector offset = cudaCompute->patchMap->center(pid[0])
00672 - cudaCompute->patchMap->center(pid[1]);
00673 offset.x += (t1%3-1) - (t2%3-1);
00674 offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00675 offset.z += (t1/9-1) - (t2/9-1);
00676 cr.offset = offset;
00677 cr2.offset = -1. * offset;
00678
00679 if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00680 cudaCompute->localComputeRecords.add(cr);
00681 } else {
00682 cudaCompute->remoteComputeRecords.add(cr);
00683 }
00684 if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00685 cudaCompute->localComputeRecords.add(cr2);
00686 } else {
00687 cudaCompute->remoteComputeRecords.add(cr2);
00688 }
00689 }
00690
00691 void unregister_cuda_compute(ComputeID c) {
00692
00693 NAMD_bug("unregister_compute unimplemented");
00694
00695 }
00696
00697 static __thread int atomsChanged = 0;
00698 static __thread int computesChanged = 0;
00699
00700 static __thread int pairlistsValid = 0;
00701 static __thread float pairlistTolerance = 0.;
00702 static __thread int usePairlists = 0;
00703 static __thread int savePairlists = 0;
00704 static __thread float plcutoff2 = 0;
00705
00706
00707 static __thread cudaEvent_t start_calc;
00708 static __thread cudaEvent_t end_remote_download;
00709 static __thread cudaEvent_t end_local_download;
00710
00711 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00712 static __thread ResizeArray<force_list> *force_lists_ptr;
00713 #define PATCH_PAIRS_REF ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
00714 #define FORCE_LISTS_REF ResizeArray<force_list> &force_lists(*force_lists_ptr);
00715
00716 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00717 ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00718 #ifdef PRINT_GBIS
00719 CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00720 #endif
00721
00722
00723 master = m ? m : this;
00724 cudaCompute = this;
00725 computeMgr = mgr;
00726 patchMap = PatchMap::Object();
00727 atomMap = AtomMap::Object();
00728 reduction = 0;
00729
00730 SimParameters *params = Node::Object()->simParameters;
00731 if (params->pressureProfileOn) {
00732 NAMD_die("pressure profile not supported in CUDA");
00733 }
00734
00735 atomsChanged = 1;
00736 computesChanged = 1;
00737 workStarted = 0;
00738 basePriority = PROXY_DATA_PRIORITY;
00739 localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00740
00741 if ( master != this ) {
00742 masterPe = master->masterPe;
00743 master->slaves[slaveIndex] = this;
00744 if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00745 NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00746 }
00747 registerPatches();
00748 return;
00749 }
00750 masterPe = CkMyPe();
00751
00752 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00753
00754 cuda_init();
00755 build_exclusions();
00756
00757 cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00758 cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00759 cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00760
00761 patch_pairs_ptr = new ResizeArray<patch_pair>;
00762 force_lists_ptr = new ResizeArray<force_list>;
00763 }
00764
00765
00766 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00767
00768 void ComputeNonbondedCUDA::requirePatch(int pid) {
00769
00770 computesChanged = 1;
00771 patch_record &pr = patchRecords.item(pid);
00772 if ( pr.refCount == 0 ) {
00773 if ( mergegrids ) {
00774 pr.isLocal = 0;
00775 } else if ( CkNumNodes() < 2 ) {
00776 pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00777 patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00778 } else {
00779 pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00780 }
00781 if ( pr.isLocal ) {
00782 localActivePatches.add(pid);
00783 } else {
00784 remoteActivePatches.add(pid);
00785 }
00786 activePatches.add(pid);
00787
00788 pr.patchID = pid;
00789 pr.hostPe = -1;
00790
00791
00792
00793
00794 pr.x = NULL;
00795 pr.xExt = NULL;
00796 pr.r = NULL;
00797 pr.f = NULL;
00798 pr.intRad = NULL;
00799 pr.psiSum = NULL;
00800 pr.bornRad = NULL;
00801 pr.dEdaSum = NULL;
00802 pr.dHdrPrefix = NULL;
00803 }
00804 pr.refCount += 1;
00805 }
00806
00807 void ComputeNonbondedCUDA::registerPatches() {
00808
00809 SimParameters *simParams = Node::Object()->simParameters;
00810 int npatches = master->activePatches.size();
00811 int *pids = master->activePatches.begin();
00812 patch_record *recs = master->patchRecords.begin();
00813 for ( int i=0; i<npatches; ++i ) {
00814 int pid = pids[i];
00815 patch_record &pr = recs[pid];
00816 if ( pr.hostPe == CkMyPe() ) {
00817
00818
00819
00820 hostedPatches.add(pid);
00821 if ( pr.isLocal ) {
00822 localHostedPatches.add(pid);
00823 } else {
00824 remoteHostedPatches.add(pid);
00825 }
00826 ProxyMgr::Object()->createProxy(pid);
00827 pr.p = patchMap->patch(pid);
00828 pr.positionBox = pr.p->registerPositionPickup(this);
00829 pr.forceBox = pr.p->registerForceDeposit(this);
00830 if (simParams->GBISOn) {
00831 pr.intRadBox = pr.p->registerIntRadPickup(this);
00832 pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
00833 pr.bornRadBox = pr.p->registerBornRadPickup(this);
00834 pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
00835 pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(this);
00836 }
00837 }
00838 }
00839 if ( master == this ) setNumPatches(activePatches.size());
00840 else setNumPatches(hostedPatches.size());
00841 CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00842 }
00843
00844 void ComputeNonbondedCUDA::assignPatches() {
00845
00846 int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00847 int numPesOnNodeSharingDevice = 0;
00848 int masterIndex = -1;
00849 for ( int i=0; i<numPesSharingDevice; ++i ) {
00850 int pe = pesSharingDevice[i];
00851 if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00852 if ( CkNodeOf(pe) == CkMyNode() ) {
00853 pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00854 }
00855 }
00856
00857 int npatches = activePatches.size();
00858 int *count = new int[npatches];
00859 memset(count, 0, sizeof(int)*npatches);
00860 int *pcount = new int[numPesOnNodeSharingDevice];
00861 memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00862 int *rankpcount = new int[CkMyNodeSize()];
00863 memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00864 char *table = new char[npatches*numPesOnNodeSharingDevice];
00865 memset(table, 0, npatches*numPesOnNodeSharingDevice);
00866
00867 int unassignedpatches = npatches;
00868
00869 if ( 0 ) {
00870 for ( int i=0; i<npatches; ++i ) {
00871 int pid = activePatches[i];
00872 patch_record &pr = patchRecords[pid];
00873 pr.hostPe = CkMyPe();
00874 }
00875 unassignedpatches = 0;
00876 pcount[masterIndex] = npatches;
00877 } else
00878
00879
00880 for ( int i=0; i<npatches; ++i ) {
00881 int pid = activePatches[i];
00882 patch_record &pr = patchRecords[pid];
00883 int homePe = patchMap->node(pid);
00884 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00885 int pe = pesOnNodeSharingDevice[j];
00886 if ( pe == homePe ) {
00887 pr.hostPe = pe; --unassignedpatches;
00888 pcount[j] += 1;
00889 }
00890 if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00891 table[i*numPesOnNodeSharingDevice+j] = 1;
00892 }
00893 }
00894 if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00895 pr.hostPe = homePe; --unassignedpatches;
00896 rankpcount[CkRankOf(homePe)] += 1;
00897 }
00898 }
00899
00900 int assignj = 0;
00901 for ( int i=0; i<npatches; ++i ) {
00902 int pid = activePatches[i];
00903 patch_record &pr = patchRecords[pid];
00904 if ( pr.hostPe != -1 ) continue;
00905 int c = 0;
00906 int lastj;
00907 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00908 if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00909 }
00910 count[i] = c;
00911 if ( c == 1 ) {
00912 pr.hostPe = pesOnNodeSharingDevice[lastj];
00913 --unassignedpatches;
00914 pcount[lastj] += 1;
00915 }
00916 }
00917 while ( unassignedpatches ) {
00918 int i;
00919 for ( i=0; i<npatches; ++i ) {
00920 if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00921 int pid = activePatches[i];
00922 patch_record &pr = patchRecords[pid];
00923 if ( pr.hostPe != -1 ) continue;
00924 pr.hostPe = pesOnNodeSharingDevice[assignj];
00925 --unassignedpatches;
00926 pcount[assignj] += 1;
00927 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00928 break;
00929 }
00930 if ( i<npatches ) continue;
00931 for ( i=0; i<npatches; ++i ) {
00932 int pid = activePatches[i];
00933 patch_record &pr = patchRecords[pid];
00934 if ( pr.hostPe != -1 ) continue;
00935 if ( count[i] ) continue;
00936 pr.hostPe = pesOnNodeSharingDevice[assignj];
00937 --unassignedpatches;
00938 pcount[assignj] += 1;
00939 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00940 break;
00941 }
00942 if ( i<npatches ) continue;
00943 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00944 }
00945
00946 for ( int i=0; i<npatches; ++i ) {
00947 int pid = activePatches[i];
00948 patch_record &pr = patchRecords[pid];
00949
00950 }
00951
00952 slavePes = new int[CkMyNodeSize()];
00953 slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00954 numSlaves = 0;
00955 for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00956 int pe = pesOnNodeSharingDevice[j];
00957 int rank = pe - CkNodeFirst(CkMyNode());
00958
00959
00960 if ( pe == CkMyPe() ) continue;
00961 if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00962 rankpcount[rank] = 0;
00963 slavePes[numSlaves] = pe;
00964 computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00965 ++numSlaves;
00966 }
00967 for ( int j=0; j<CkMyNodeSize(); ++j ) {
00968 int pe = CkNodeFirst(CkMyNode()) + j;
00969
00970
00971 if ( ! rankpcount[j] ) continue;
00972 if ( pe == CkMyPe() ) continue;
00973 slavePes[numSlaves] = pe;
00974 computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00975 ++numSlaves;
00976 }
00977 registerPatches();
00978
00979 delete [] pesOnNodeSharingDevice;
00980 delete [] count;
00981 delete [] pcount;
00982 delete [] rankpcount;
00983 delete [] table;
00984 }
00985
00986 static __thread int num_atom_records_allocated;
00987
00988 static __thread atom_param* atom_params;
00989 static __thread atom* atoms;
00990
00991
00992 static __thread int num_virials;
00993 static __thread int num_virials_allocated;
00994 static __thread float *virials;
00995 static __thread float *slow_virials;
00996 static __thread float *energy_gbis;
00997
00998
00999 static __thread float *intRad0H;
01000 static __thread float *intRadSH;
01001
01002 static __thread float *bornRadH;
01003
01004 static __thread float *dHdrPrefixH;
01005
01006 static __thread int cuda_timer_count;
01007 static __thread double cuda_timer_total;
01008 static __thread double kernel_time;
01009 static __thread double remote_submit_time;
01010 static __thread double local_submit_time;
01011
01012 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
01013
01014 #ifdef PRINT_GBIS
01015 #define GBISP(...) CkPrintf(__VA_ARGS__);
01016 #else
01017 #define GBISP(...)
01018 #endif
01019
01020 #define count_limit 1000000
01021 static __thread int check_remote_count;
01022 static __thread int check_local_count;
01023
01024 void cuda_check_remote_progress(void *arg, double) {
01025
01026 cudaError_t err = cudaEventQuery(end_remote_download);
01027 if ( err == cudaSuccess ) {
01028 local_submit_time = CkWallTimer();
01029 CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01030 ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01031 check_remote_count = 0;
01032 } else if ( err != cudaErrorNotReady ) {
01033 cuda_errcheck("in cuda_check_remote_progress");
01034 NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01035 } else if ( ++check_remote_count >= count_limit ) {
01036 char errmsg[256];
01037 sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01038 check_remote_count, CkWallTimer() - remote_submit_time,
01039 ((ComputeNonbondedCUDA *) arg)->step);
01040 cuda_errcheck(errmsg);
01041 NAMD_die(errmsg);
01042 } else if ( check_local_count ) {
01043 NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01044 } else {
01045 CUDA_POLL(cuda_check_remote_progress, arg);
01046 }
01047 }
01048
01049 void cuda_check_local_progress(void *arg, double) {
01050
01051 cudaError_t err = cudaEventQuery(end_local_download);
01052 if ( err == cudaSuccess ) {
01053 double wall_time = CkWallTimer();
01054 CUDA_TRACE_LOCAL(local_submit_time,wall_time);
01055 kernel_time = wall_time - kernel_time;
01056 ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01057 check_local_count = 0;
01058 } else if ( err != cudaErrorNotReady ) {
01059 cuda_errcheck("in cuda_check_local_progress");
01060 NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01061 } else if ( ++check_local_count >= count_limit ) {
01062 char errmsg[256];
01063 sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01064 check_local_count, CkWallTimer() - local_submit_time,
01065 ((ComputeNonbondedCUDA *) arg)->step);
01066 cuda_errcheck(errmsg);
01067 NAMD_die(errmsg);
01068 } else if ( check_remote_count ) {
01069 NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01070 } else {
01071 CUDA_POLL(cuda_check_local_progress, arg);
01072 }
01073 }
01074
01075 #if 0
01076
01077 void cuda_check_progress(void *arg, double) {
01078 if ( cuda_stream_finished() ) {
01079 kernel_time = CkWallTimer() - kernel_time;
01080 CUDA_POLL(ccd_index);
01081
01082 WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01083 }
01084 }
01085 #endif
01086
01087 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
01088
01089 static __thread int kernel_launch_state = 0;
01090
01091 struct cr_sortop {
01092 const Lattice &l;
01093 cr_sortop(const Lattice &lattice) : l(lattice) { }
01094 bool operator() (ComputeNonbondedCUDA::compute_record i,
01095 ComputeNonbondedCUDA::compute_record j) {
01096 Vector a = l.a();
01097 Vector b = l.b();
01098 Vector c = l.c();
01099 BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01100 BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01101 return ( ri < rj );
01102 }
01103 };
01104
01105 int ComputeNonbondedCUDA::noWork() {
01106
01107 SimParameters *simParams = Node::Object()->simParameters;
01108 Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01109 lattice = flags.lattice;
01110 doSlow = flags.doFullElectrostatics;
01111 doEnergy = flags.doEnergy;
01112 step = flags.step;
01113
01114 if ( ! flags.doNonbonded ) {
01115 GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01116 for ( int i=0; i<hostedPatches.size(); ++i ) {
01117 patch_record &pr = master->patchRecords[hostedPatches[i]];
01118 pr.positionBox->skip();
01119 pr.forceBox->skip();
01120 if (simParams->GBISOn) {
01121 pr.intRadBox->skip();
01122 pr.psiSumBox->skip();
01123 pr.bornRadBox->skip();
01124 pr.dEdaSumBox->skip();
01125 pr.dHdrPrefixBox->skip();
01126 }
01127 }
01128 if ( master != this ) {
01129 computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01130 hostedPatches.size(),atomsChanged,sequence());
01131 }
01132 if ( reduction ) reduction->submit();
01133 return 1;
01134 }
01135
01136 for ( int i=0; i<hostedPatches.size(); ++i ) {
01137 patch_record &pr = master->patchRecords[hostedPatches[i]];
01138 if (!simParams->GBISOn || gbisPhase == 1) {
01139 GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01140 pr.x = pr.positionBox->open();
01141 pr.xExt = pr.p->getCompAtomExtInfo();
01142 }
01143
01144 if (simParams->GBISOn) {
01145 if (gbisPhase == 1) {
01146 GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01147 pr.intRad = pr.intRadBox->open();
01148 pr.psiSum = pr.psiSumBox->open();
01149 } else if (gbisPhase == 2) {
01150 GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01151 pr.bornRad = pr.bornRadBox->open();
01152 pr.dEdaSum = pr.dEdaSumBox->open();
01153 } else if (gbisPhase == 3) {
01154 GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01155 pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01156 }
01157 GBISP("opened GBIS boxes");
01158 }
01159 }
01160
01161 if ( master == this ) return 0;
01162
01163
01164 computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01165 hostedPatches.size(),atomsChanged,sequence());
01166
01167 workStarted = 1;
01168 basePriority = COMPUTE_PROXY_PRIORITY;
01169
01170 return 1;
01171 }
01172
01173 void ComputeNonbondedCUDA::doWork() {
01174 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
01175 CkMyPe(), sequence(), gbisPhase, workStarted);
01176
01177 PATCH_PAIRS_REF;
01178 FORCE_LISTS_REF;
01179
01180 if ( workStarted ) {
01181 if ( finishWork() ) {
01182 workStarted = 0;
01183 basePriority = PROXY_DATA_PRIORITY;
01184 } else {
01185 workStarted = 2;
01186 basePriority = PROXY_RESULTS_PRIORITY;
01187 if ( master == this && kernel_launch_state > 2 ) {
01188 cuda_check_local_progress(this,0.);
01189 }
01190 }
01191 return;
01192 }
01193
01194 workStarted = 1;
01195 basePriority = COMPUTE_PROXY_PRIORITY;
01196
01197 Molecule *mol = Node::Object()->molecule;
01198 Parameters *params = Node::Object()->parameters;
01199 SimParameters *simParams = Node::Object()->simParameters;
01200
01201
01202 if (!simParams->GBISOn || gbisPhase == 1) {
01203
01204
01205 if ( atomsChanged || computesChanged ) {
01206 int npatches = activePatches.size();
01207
01208 pairlistsValid = 0;
01209 pairlistTolerance = 0.;
01210
01211 if ( computesChanged ) {
01212 computesChanged = 0;
01213
01214 num_virials = npatches;
01215 if ( num_virials > num_virials_allocated ) {
01216 if ( num_virials_allocated ) {
01217 cudaFreeHost(virials);
01218 cudaFreeHost(energy_gbis);
01219 cuda_errcheck("in cudaFreeHost virials");
01220 }
01221 num_virials_allocated = 1.1 * num_virials + 1;
01222 cudaHostAlloc((void**)&virials,2*16*sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01223 cudaHostAlloc((void**)&energy_gbis,sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01224 for (int i = 0; i < num_virials_allocated; i++)
01225 energy_gbis[i] = 0.f;
01226 cuda_errcheck("in cudaHostAlloc virials");
01227 slow_virials = virials + 16*num_virials;
01228 }
01229
01230 int *ap = activePatches.begin();
01231 for ( int i=0; i<localActivePatches.size(); ++i ) {
01232 *(ap++) = localActivePatches[i];
01233 }
01234 for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01235 *(ap++) = remoteActivePatches[i];
01236 }
01237
01238
01239 cr_sortop so(lattice);
01240 std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01241 std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01242
01243 int nlc = localComputeRecords.size();
01244 int nrc = remoteComputeRecords.size();
01245 computeRecords.resize(nlc+nrc);
01246 compute_record *cr = computeRecords.begin();
01247 for ( int i=0; i<nlc; ++i ) {
01248 *(cr++) = localComputeRecords[i];
01249 }
01250 for ( int i=0; i<nrc; ++i ) {
01251 *(cr++) = remoteComputeRecords[i];
01252 }
01253
01254 force_lists.resize(npatches);
01255 for ( int i=0; i<npatches; ++i ) {
01256 patchRecords[activePatches[i]].localIndex = i;
01257 force_lists[i].force_list_size = 0;
01258 }
01259
01260 int ncomputes = computeRecords.size();
01261 patch_pairs.resize(ncomputes);
01262 for ( int i=0; i<ncomputes; ++i ) {
01263 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01264 int lp1 = patchRecords[cr.pid[0]].localIndex;
01265 int lp2 = patchRecords[cr.pid[1]].localIndex;
01266 force_lists[lp1].force_list_size++;
01267 patch_pair &pp = patch_pairs[i];
01268 pp.offset.x = cr.offset.x;
01269 pp.offset.y = cr.offset.y;
01270 pp.offset.z = cr.offset.z;
01271 }
01272
01273 for ( int i=0; i<ncomputes; ++i ) {
01274 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01275 int lp1 = patchRecords[cr.pid[0]].localIndex;
01276 patch_pair &pp = patch_pairs[i];
01277 pp.patch1_force_list_index = lp1;
01278 pp.patch1_force_list_size = force_lists[lp1].force_list_size;
01279 }
01280
01281 if ( 1 || simParams->outputCudaTiming ) {
01282 CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01283 CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01284 localComputeRecords.size(), remoteComputeRecords.size());
01285 }
01286 }
01287
01288 int istart = 0;
01289 int flstart = 0;
01290 int vlstart = 0;
01291 int max_atoms_per_patch = 0;
01292 int i;
01293 for ( i=0; i<npatches; ++i ) {
01294 if ( i == localActivePatches.size() ) {
01295 num_local_atom_records = istart;
01296 }
01297 force_lists[i].force_list_start = flstart;
01298 force_lists[i].force_output_start = istart;
01299 force_lists[i].atom_start = istart;
01300 force_lists[i].virial_list_start = vlstart;
01301 force_lists[i].virial_output_start = 16*i;
01302 patch_record &pr = patchRecords[activePatches[i]];
01303 pr.localStart = istart;
01304 int natoms = pr.p->getNumAtoms();
01305 int nfreeatoms = natoms;
01306 if ( fixedAtomsOn ) {
01307 const CompAtomExt *aExt = pr.xExt;
01308 for ( int j=0; j<natoms; ++j ) {
01309 if ( aExt[j].atomFixed ) --nfreeatoms;
01310 }
01311 }
01312 if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
01313 pr.numAtoms = natoms;
01314 pr.numFreeAtoms = nfreeatoms;
01315 force_lists[i].patch_size = nfreeatoms;
01316 if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
01317 if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
01318 force_lists[i].patch_stride = nfreeatoms;
01319 flstart += nfreeatoms * force_lists[i].force_list_size;
01320 vlstart += 16 * force_lists[i].force_list_size;
01321 istart += natoms;
01322 force_lists[i].force_list_size = 0;
01323 }
01324 if ( i == localActivePatches.size() ) {
01325 num_local_atom_records = istart;
01326 }
01327 num_force_records = flstart;
01328 num_atom_records = istart;
01329 num_remote_atom_records = num_atom_records - num_local_atom_records;
01330 if ( num_atom_records > num_atom_records_allocated ) {
01331 if ( num_atom_records_allocated ) {
01332
01333 cudaFreeHost(atom_params);
01334 cudaFreeHost(atoms);
01335 cudaFreeHost(forces);
01336 cudaFreeHost(slow_forces);
01337 if (simParams->GBISOn) {
01338 cudaFreeHost(intRad0H);
01339 cudaFreeHost(intRadSH);
01340 cudaFreeHost(psiSumH);
01341 cudaFreeHost(bornRadH);
01342 cudaFreeHost(dEdaSumH);
01343 cudaFreeHost(dHdrPrefixH);
01344 }
01345 cuda_errcheck("in cudaFreeHost");
01346 }
01347 num_atom_records_allocated = 1.1 * num_atom_records + 1;
01348
01349 cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
01350 cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
01351 cuda_errcheck("in cudaMallocHost atoms");
01352 cudaHostAlloc((void**)&forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01353 cudaHostAlloc((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01354
01355 if (simParams->GBISOn) {
01356 cudaMallocHost((void**)&intRad0H,sizeof(float)*num_atom_records_allocated);
01357 cudaMallocHost((void**)&intRadSH,sizeof(float)*num_atom_records_allocated);
01358 cudaHostAlloc((void**)&psiSumH, sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01359 cudaMallocHost((void**)&bornRadH,sizeof(float)*num_atom_records_allocated);
01360 cudaHostAlloc((void**)&dEdaSumH,sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01361 cudaMallocHost((void**)&dHdrPrefixH,sizeof(float)*num_atom_records_allocated);
01362 }
01363 cuda_errcheck("in cudaHostAlloc forces");
01364 }
01365
01366 int bfstart = 0;
01367 int ncomputes = computeRecords.size();
01368 for ( int i=0; i<ncomputes; ++i ) {
01369 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01370 int p1 = cr.pid[0];
01371 int p2 = cr.pid[1];
01372 int lp1 = patchRecords[p1].localIndex;
01373 int lp2 = patchRecords[p2].localIndex;
01374 patch_pair &pp = patch_pairs[i];
01375 pp.patch1_atom_start = patchRecords[p1].localStart;
01376 pp.patch1_force_start = force_lists[lp1].force_list_start +
01377 force_lists[lp1].patch_stride * force_lists[lp1].force_list_size;
01378 pp.virial_start = force_lists[lp1].virial_list_start +
01379 16 * force_lists[lp1].force_list_size;
01380 pp.patch1_size = patchRecords[p1].numAtoms;
01381 pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
01382 pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01383 pp.block_flags_start = bfstart;
01384 bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
01385
01386
01387 pp.patch2_atom_start = patchRecords[p2].localStart;
01388
01389
01390 pp.patch2_size = patchRecords[p2].numAtoms;
01391
01392
01393 force_lists[lp1].force_list_size++;
01394
01395
01396
01397
01398
01399 }
01400
01401 #if 0
01402 CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
01403 patch_pairs.size(), force_lists.size(),
01404 num_atom_records, num_force_records,
01405 max_atoms_per_patch);
01406 #endif
01407
01408 int totalmem = patch_pairs.size() * sizeof(patch_pair) +
01409 force_lists.size() * sizeof(force_list) +
01410 num_force_records * sizeof(float4) +
01411 num_atom_records * sizeof(atom) +
01412 num_atom_records * sizeof(atom_param) +
01413 num_atom_records * sizeof(float4);
01414 int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
01415
01416
01417
01418
01419
01420 cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01421 force_lists.begin(), force_lists.size(),
01422 num_atom_records, num_force_records,
01423 bfstart, max_atoms_per_patch);
01424
01425 }
01426
01427 double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01428
01429 Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01430 float maxAtomMovement = 0.;
01431 float maxPatchTolerance = 0.;
01432
01433 for ( int i=0; i<activePatches.size(); ++i ) {
01434 patch_record &pr = patchRecords[activePatches[i]];
01435
01436 float maxMove = pr.p->flags.maxAtomMovement;
01437 if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01438
01439 float maxTol = pr.p->flags.pairlistTolerance;
01440 if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01441
01442 int start = pr.localStart;
01443 int n = pr.numAtoms;
01444 const CompAtom *a = pr.x;
01445 const CompAtomExt *aExt = pr.xExt;
01446
01447 if ( atomsChanged ) {
01448 #if 0
01449 int nfree = pr.numFreeAtoms;
01450 if ( nfree != n ) {
01451 int k = 0;
01452 for ( int j=0; j<n; ++j ) {
01453
01454 if ( ! aExt[j].atomFixed ) ao[k++] = j;
01455 }
01456 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
01457
01458 if ( aExt[j].atomFixed ) ao[k++] = j;
01459 }
01460 } else {
01461 for ( int j=0; j<n; ++j ) {
01462 ao[j] = j;
01463 }
01464 }
01465
01466 sortAtomsForCUDA(ao,a,nfree,n);
01467 #endif
01468
01469 atom_param *ap = atom_params + start;
01470 for ( int k=0; k<n; ++k ) {
01471 int j = aExt[k].sortOrder;
01472 ap[k].vdw_type = a[j].vdwType;
01473 ap[k].index = aExt[j].id;
01474 #ifdef MEM_OPT_VERSION
01475 ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01476 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01477 #else
01478 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01479 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01480 #endif
01481 }
01482 }
01483 {
01484 #if 1
01485 const CudaAtom *ac = pr.p->getCudaAtomList();
01486 atom *ap = atoms + start;
01487 memcpy(ap, ac, sizeof(atom)*n);
01488 #else
01489 Vector center =
01490 pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01491 atom *ap = atoms + start;
01492 for ( int k=0; k<n; ++k ) {
01493 int j = aExt[k].sortOrder;
01494 ap[k].position.x = a[j].position.x - center.x;
01495 ap[k].position.y = a[j].position.y - center.y;
01496 ap[k].position.z = a[j].position.z - center.z;
01497 ap[k].charge = charge_scaling * a[j].charge;
01498 }
01499 #endif
01500 }
01501 }
01502
01503
01504
01505
01506
01507
01508 savePairlists = 0;
01509 usePairlists = 0;
01510 if ( flags.savePairlists ) {
01511 savePairlists = 1;
01512 usePairlists = 1;
01513 } else if ( flags.usePairlists ) {
01514 if ( ! pairlistsValid ||
01515 ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01516 reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01517 } else {
01518 usePairlists = 1;
01519 }
01520 }
01521 if ( ! usePairlists ) {
01522 pairlistsValid = 0;
01523 }
01524 float plcutoff = cutoff;
01525 if ( savePairlists ) {
01526 pairlistsValid = 1;
01527 pairlistTolerance = 2. * maxPatchTolerance;
01528 plcutoff += pairlistTolerance;
01529 }
01530 plcutoff2 = plcutoff * plcutoff;
01531
01532
01533
01534
01535 #if 0
01536
01537 if ( 1 ) {
01538 Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01539 Lattice &lattice = flags.lattice;
01540 float3 lata, latb, latc;
01541 lata.x = lattice.a().x;
01542 lata.y = lattice.a().y;
01543 lata.z = lattice.a().z;
01544 latb.x = lattice.b().x;
01545 latb.y = lattice.b().y;
01546 latb.z = lattice.b().z;
01547 latc.x = lattice.c().x;
01548 latc.y = lattice.c().y;
01549 latc.z = lattice.c().z;
01550
01551 int ncomputes = computeRecords.size();
01552 for ( int ic=0; ic<ncomputes; ++ic ) {
01553 patch_pair &pp = patch_pairs[ic];
01554 atom *a1 = atoms + pp.patch1_atom_start;
01555 int n1 = pp.patch1_size;
01556 atom *a2 = atoms + pp.patch2_atom_start;
01557 int n2 = pp.patch2_size;
01558 float offx = pp.offset.x * lata.x
01559 + pp.offset.y * latb.x
01560 + pp.offset.z * latc.x;
01561 float offy = pp.offset.x * lata.y
01562 + pp.offset.y * latb.y
01563 + pp.offset.z * latc.y;
01564 float offz = pp.offset.x * lata.z
01565 + pp.offset.y * latb.z
01566 + pp.offset.z * latc.z;
01567
01568 int atoms_tried = 0;
01569 int blocks_tried = 0;
01570 int atoms_used = 0;
01571 int blocks_used = 0;
01572 for ( int ii=0; ii<n1; ii+=32 ) {
01573 for ( int jj=0; jj<n2; jj+=16 ) {
01574 int block_used = 0;
01575 for ( int j=jj; j<jj+16 && j<n2; ++j ) {
01576 int atom_used = 0;
01577 for ( int i=ii; i<ii+32 && i<n1; ++i ) {
01578 float dx = offx + a1[i].position.x - a2[j].position.x;
01579 float dy = offy + a1[i].position.y - a2[j].position.y;
01580 float dz = offz + a1[i].position.z - a2[j].position.z;
01581 float r2 = dx*dx + dy*dy + dz*dz;
01582 if ( r2 < cutoff2 ) atom_used = 1;
01583 }
01584 ++atoms_tried;
01585 if ( atom_used ) { block_used = 1; ++atoms_used; }
01586 }
01587 ++blocks_tried;
01588 if ( block_used ) { ++blocks_used; }
01589 }
01590 }
01591 CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
01592 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01593 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01594 }
01595 }
01596 #endif
01597
01598 }
01599
01600
01601 if (simParams->GBISOn) {
01602
01603 for ( int i=0; i<activePatches.size(); ++i ) {
01604 patch_record &pr = master->patchRecords[activePatches[i]];
01605 GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01606 if (gbisPhase == 1) {
01607
01608 if (atomsChanged) {
01609 float *intRad0 = intRad0H + pr.localStart;
01610 float *intRadS = intRadSH + pr.localStart;
01611 for ( int k=0; k<pr.numAtoms; ++k ) {
01612 int j = pr.xExt[k].sortOrder;
01613 intRad0[k] = pr.intRad[2*j+0];
01614 intRadS[k] = pr.intRad[2*j+1];
01615 }
01616 }
01617 } else if (gbisPhase == 2) {
01618 float *bornRad = bornRadH + pr.localStart;
01619 for ( int k=0; k<pr.numAtoms; ++k ) {
01620 int j = pr.xExt[k].sortOrder;
01621 bornRad[k] = pr.bornRad[j];
01622 }
01623 } else if (gbisPhase == 3) {
01624 float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01625 for ( int k=0; k<pr.numAtoms; ++k ) {
01626 int j = pr.xExt[k].sortOrder;
01627 dHdrPrefix[k] = pr.dHdrPrefix[j];
01628 }
01629 }
01630 }
01631 }
01632
01633 kernel_time = CkWallTimer();
01634 kernel_launch_state = 1;
01635 if ( gpu_is_mine ) recvYieldDevice(-1);
01636 }
01637
01638 void cuda_check_remote_calc(void *arg, double) {
01639
01640
01641 if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01642
01643 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01644
01645 } else {
01646 CUDA_POLL(cuda_check_remote_calc, arg);
01647 }
01648 }
01649
01650 void cuda_check_local_calc(void *arg, double) {
01651
01652
01653 if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01654
01655 computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01656
01657 } else {
01658 CUDA_POLL(cuda_check_local_calc, arg);
01659 }
01660 }
01661
01662
01663
01664 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01665 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01666 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01667 workStarted, gbisPhase, kernel_launch_state, pe)
01668
01669 float3 lata, latb, latc;
01670 lata.x = lattice.a().x;
01671 lata.y = lattice.a().y;
01672 lata.z = lattice.a().z;
01673 latb.x = lattice.b().x;
01674 latb.y = lattice.b().y;
01675 latb.z = lattice.b().z;
01676 latc.x = lattice.c().x;
01677 latc.y = lattice.c().y;
01678 latc.z = lattice.c().z;
01679 SimParameters *simParams = Node::Object()->simParameters;
01680
01681 switch ( kernel_launch_state ) {
01683
01684 case 1:
01685 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01686 ++kernel_launch_state;
01687 gpu_is_mine = 0;
01688 remote_submit_time = CkWallTimer();
01689
01690 if (!simParams->GBISOn || gbisPhase == 1) {
01691
01692 if ( atomsChanged ) {
01693 cuda_bind_atom_params(atom_params);
01694 }
01695 if ( simParams->GBISOn) {
01696 cuda_bind_GBIS_psiSum(psiSumH);
01697 if ( atomsChanged ) {
01698 cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01699 }
01700 }
01701 cuda_bind_atoms(atoms);
01702 cuda_bind_forces(forces, slow_forces);
01703 cuda_bind_virials(virials);
01704 cuda_bind_GBIS_energy(energy_gbis);
01705 cudaEventRecord(start_calc, stream);
01706
01707 cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01708 localComputeRecords.size(),remoteComputeRecords.size(),
01709 localActivePatches.size(),remoteActivePatches.size(),
01710 doSlow, doEnergy, usePairlists, savePairlists, stream);
01711 if (simParams->GBISOn) {
01712 cuda_GBIS_P1(
01713 localComputeRecords.size(),remoteComputeRecords.size(),
01714 localActivePatches.size(),remoteActivePatches.size(),
01715 simParams->alpha_cutoff-simParams->fsMax,
01716 simParams->coulomb_radius_offset,
01717 lata, latb, latc, stream);
01718 }
01719
01720
01721 cudaEventRecord(end_remote_download, stream);
01722 CUDA_POLL(cuda_check_remote_progress,this);
01723 if ( shared_gpu && ! mergegrids ) {
01724 CUDA_POLL(cuda_check_remote_calc,this);
01725 break;
01726 }
01727 }
01728 if (simParams->GBISOn) {
01729 if (gbisPhase == 1) {
01730
01731 } else if (gbisPhase == 2) {
01732 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01733
01734 cuda_bind_GBIS_bornRad(bornRadH);
01735 cuda_bind_GBIS_dEdaSum(dEdaSumH);
01736 cudaEventRecord(start_calc, stream);
01737 cuda_GBIS_P2(
01738 localComputeRecords.size(),remoteComputeRecords.size(),
01739 localActivePatches.size(),remoteActivePatches.size(),
01740 (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01741 simParams->nonbondedScaling, simParams->kappa,
01742 (simParams->switchingActive ? simParams->switchingDist : -1.0),
01743 simParams->dielectric, simParams->solvent_dielectric,
01744 lata, latb, latc,
01745 doEnergy, doSlow, stream
01746 );
01747 cudaEventRecord(end_remote_download, stream);
01748 CUDA_POLL(cuda_check_remote_progress,this);
01749 } else if (gbisPhase == 3) {
01750 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01751
01752 cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01753 cudaEventRecord(start_calc, stream);
01754 if (doSlow)
01755 cuda_GBIS_P3(
01756 localComputeRecords.size(),remoteComputeRecords.size(),
01757 localActivePatches.size(),remoteActivePatches.size(),
01758 (simParams->alpha_cutoff-simParams->fsMax),
01759 simParams->coulomb_radius_offset,
01760 simParams->nonbondedScaling,
01761 lata, latb, latc, stream
01762 );
01763 cudaEventRecord(end_remote_download, stream);
01764 CUDA_POLL(cuda_check_remote_progress,this);
01765 }
01766 }
01767
01769
01770 case 2:
01771 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01772 ++kernel_launch_state;
01773 gpu_is_mine = 0;
01774
01775 #if 0
01776 cudaStreamWaitEvent(stream2, start_calc, 0);
01777 #else
01778 #define stream2 stream
01779 #endif
01780
01781 if (!simParams->GBISOn || gbisPhase == 1) {
01782
01783 cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01784 0,localComputeRecords.size(),
01785 0,localActivePatches.size(),
01786 doSlow, doEnergy, usePairlists, savePairlists, stream2);
01787 if (simParams->GBISOn) {
01788 cuda_GBIS_P1(
01789 0,localComputeRecords.size(),
01790 0,localActivePatches.size(),
01791 simParams->alpha_cutoff-simParams->fsMax,
01792 simParams->coulomb_radius_offset,
01793 lata, latb, latc, stream2 );
01794 }
01795
01796
01797
01798 cudaEventRecord(end_local_download, stream2);
01799 if ( workStarted == 2 ) {
01800 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01801 cuda_check_local_progress\n", CkMyPe())
01802 CUDA_POLL(cuda_check_local_progress,this);
01803 }
01804 if ( shared_gpu && ! mergegrids ) {
01805 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01806 cuda_check_local_calc\n", CkMyPe())
01807 CUDA_POLL(cuda_check_local_calc,this);
01808 break;
01809 }
01810
01811 }
01812 if (simParams->GBISOn) {
01813 if (gbisPhase == 1) {
01814
01815 } else if (gbisPhase == 2) {
01816 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01817 cuda_GBIS_P2(
01818 0,localComputeRecords.size(),
01819 0,localActivePatches.size(),
01820 (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01821 simParams->nonbondedScaling, simParams->kappa,
01822 (simParams->switchingActive ? simParams->switchingDist : -1.0),
01823 simParams->dielectric, simParams->solvent_dielectric,
01824 lata, latb, latc,
01825 doEnergy, doSlow, stream2
01826 );
01827 cudaEventRecord(end_local_download, stream2);
01828 if ( workStarted == 2 ) {
01829 CUDA_POLL(cuda_check_local_progress,this);
01830 }
01831 } else if (gbisPhase == 3) {
01832 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01833 if (doSlow)
01834 cuda_GBIS_P3(
01835 0,localComputeRecords.size(),
01836 0,localActivePatches.size(),
01837 (simParams->alpha_cutoff-simParams->fsMax),
01838 simParams->coulomb_radius_offset,
01839 simParams->nonbondedScaling,
01840 lata, latb, latc, stream2
01841 );
01842 cudaEventRecord(end_local_download, stream2);
01843 if ( workStarted == 2 ) {
01844 CUDA_POLL(cuda_check_local_progress,this);
01845 }
01846 }
01847 }
01848
01849
01850 default:
01851 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01852 gpu_is_mine = 1;
01853 break;
01854 }
01855 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01856 }
01857
01858
01859 void ComputeNonbondedCUDA::messageFinishWork() {
01860
01861 cuda_errcheck("at cuda stream completed");
01862
01863 for ( int i = 0; i < numSlaves; ++i ) {
01864 computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01865 }
01866
01867 WorkDistrib::messageEnqueueWork(this);
01868
01869 }
01870
01871
01872 int ComputeNonbondedCUDA::finishWork() {
01873 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01874 CkMyPe(), workStarted, gbisPhase)
01875
01876 Molecule *mol = Node::Object()->molecule;
01877 SimParameters *simParams = Node::Object()->simParameters;
01878
01879 ResizeArray<int> &patches( workStarted == 1 ?
01880 remoteHostedPatches : localHostedPatches );
01881
01882
01883 if ( !simParams->GBISOn || gbisPhase == 1 ) {
01884 for ( int i=0; i<patches.size(); ++i ) {
01885 patch_record &pr = master->patchRecords[patches[i]];
01886 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01887 pr.r = pr.forceBox->open();
01888 }
01889 }
01890
01891
01892
01893
01894 for ( int i=0; i<patches.size(); ++i ) {
01895
01896 patch_record &pr = master->patchRecords[patches[i]];
01897
01898 int start = pr.localStart;
01899 const CompAtomExt *aExt = pr.xExt;
01900 if ( !simParams->GBISOn || gbisPhase == 3 ) {
01901 int nfree = pr.numFreeAtoms;
01902 pr.f = pr.r->f[Results::nbond];
01903 Force *f = pr.f;
01904 Force *f_slow = pr.r->f[Results::slow];
01905 const CompAtom *a = pr.x;
01906
01907 float4 *af = master->forces + start;
01908 float4 *af_slow = master->slow_forces + start;
01909
01910 for ( int k=0; k<nfree; ++k ) {
01911 int j = aExt[k].sortOrder;
01912 f[j].x += af[k].x;
01913 f[j].y += af[k].y;
01914 f[j].z += af[k].z;
01915
01916
01917 if ( doSlow ) {
01918 f_slow[j].x += af_slow[k].x;
01919 f_slow[j].y += af_slow[k].y;
01920 f_slow[j].z += af_slow[k].z;
01921
01922 }
01923 }
01924
01925 #if 1
01926
01927 if ( CkNumPes() == 1 ) {
01928 const CompAtomExt *aExt = pr.xExt;
01929 for ( int k=0; k<nfree; ++k ) {
01930 int j = aExt[k].sortOrder;
01931 #ifdef MEM_OPT_VERSION
01932 int excl_expected = mol->exclSigPool[aExt[j].exclId].fullExclCnt + 1;
01933 #else
01934 int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01935 #endif
01936 if ( af[k].w != excl_expected ) {
01937 CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01938 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01939 }
01940 }
01941 }
01942 #endif
01943 }
01944
01945 #if 0
01946 if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01947 CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01948 j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01949 af[j].w);
01950 }
01951 #endif
01952
01953
01954 if (simParams->GBISOn) {
01955 if (gbisPhase == 1) {
01956
01957 GBReal *psiSumMaster = master->psiSumH + start;
01958 for ( int k=0; k<pr.numAtoms; ++k ) {
01959 int j = aExt[k].sortOrder;
01960 pr.psiSum[j] += psiSumMaster[k];
01961 }
01962 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01963 pr.psiSumBox->close(&(pr.psiSum));
01964
01965 } else if (gbisPhase == 2) {
01966
01967 GBReal *dEdaSumMaster = master->dEdaSumH + start;
01968 for ( int k=0; k<pr.numAtoms; ++k ) {
01969 int j = aExt[k].sortOrder;
01970 pr.dEdaSum[j] += dEdaSumMaster[k];
01971 }
01972 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01973 pr.dEdaSumBox->close(&(pr.dEdaSum));
01974
01975 } else if (gbisPhase == 3) {
01976 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01977 pr.intRadBox->close(&(pr.intRad));
01978 pr.bornRadBox->close(&(pr.bornRad));
01979 pr.dHdrPrefixBox->close(&(pr.dHdrPrefix));
01980 pr.positionBox->close(&(pr.x));
01981 pr.forceBox->close(&(pr.r));
01982 }
01983 } else {
01984 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01985 pr.positionBox->close(&(pr.x));
01986 pr.forceBox->close(&(pr.r));
01987 }
01988 }
01989
01990 #if 0
01991 virial *= (-1./6.);
01992 reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01993 reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01994 reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01995 if ( doSlow ) {
01996 virial_slow *= (-1./6.);
01997 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01998 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01999 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02000 }
02001 #endif
02002
02003 if ( workStarted == 1 && ! mergegrids &&
02004 ( localHostedPatches.size() || master == this ) ) {
02005 GBISP("not finished, call again\n");
02006 return 0;
02007 }
02008
02009 if ( master != this ) {
02010 GBISP("finished\n");
02011 if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);
02012 atomsChanged = 0;
02013 return 1;
02014 }
02015
02016 cuda_timer_total += kernel_time;
02017
02018 if ( !simParams->GBISOn || gbisPhase == 3 ) {
02019
02020 {
02021 Tensor virial_tensor;
02022 BigReal energyv = 0.;
02023 BigReal energye = 0.;
02024 BigReal energys = 0.;
02025 for ( int i = 0; i < num_virials; ++i ) {
02026 virial_tensor.xx += virials[16*i];
02027 virial_tensor.xy += virials[16*i+1];
02028 virial_tensor.xz += virials[16*i+2];
02029 virial_tensor.yx += virials[16*i+3];
02030 virial_tensor.yy += virials[16*i+4];
02031 virial_tensor.yz += virials[16*i+5];
02032 virial_tensor.zx += virials[16*i+6];
02033 virial_tensor.zy += virials[16*i+7];
02034 virial_tensor.zz += virials[16*i+8];
02035 energyv += virials[16*i+9];
02036 energye += virials[16*i+10];
02037 energys += virials[16*i+11];
02038 if (simParams->GBISOn) {
02039 energye += energy_gbis[i];
02040 }
02041 }
02042 ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02043 if ( doEnergy ) {
02044 reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02045 reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02046 reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02047 }
02048 }
02049 if ( doSlow ) {
02050 Tensor virial_slow_tensor;
02051 for ( int i = 0; i < num_virials; ++i ) {
02052 virial_slow_tensor.xx += slow_virials[16*i];
02053 virial_slow_tensor.xy += slow_virials[16*i+1];
02054 virial_slow_tensor.xz += slow_virials[16*i+2];
02055 virial_slow_tensor.yx += slow_virials[16*i+3];
02056 virial_slow_tensor.yy += slow_virials[16*i+4];
02057 virial_slow_tensor.yz += slow_virials[16*i+5];
02058 virial_slow_tensor.zx += slow_virials[16*i+6];
02059 virial_slow_tensor.zy += slow_virials[16*i+7];
02060 virial_slow_tensor.zz += slow_virials[16*i+8];
02061 }
02062 ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02063 }
02064
02065 atomsChanged = 0;
02066 reduction->submit();
02067
02068 cuda_timer_count++;
02069 if ( simParams->outputCudaTiming &&
02070 step % simParams->outputCudaTiming == 0 ) {
02071
02072
02073
02074
02075
02076
02077
02078 #if 0
02079 float upload_ms, remote_calc_ms;
02080 float local_calc_ms, total_ms;
02081 cuda_errcheck("before event timers");
02082 cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02083 cuda_errcheck("in event timer 1");
02084 cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02085 cuda_errcheck("in event timer 2");
02086 cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02087 cuda_errcheck("in event timer 3");
02088 cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02089 cuda_errcheck("in event timer 4");
02090 cuda_errcheck("in event timers");
02091
02092 CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02093 CkMyPe(), upload_ms, remote_calc_ms,
02094 local_calc_ms, total_ms);
02095 #endif
02096
02097 if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02098 cuda_timer_total /= cuda_timer_count;
02099 CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
02100 step, cuda_timer_total * 1.e3, CkMyPe());
02101 }
02102 cuda_timer_count = 0;
02103 cuda_timer_total = 0;
02104 }
02105
02106 }
02107
02108
02109 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02110 if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);
02111
02112 GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02113 return 1;
02114 }
02115
02116
02117 #endif // NAMD_CUDA
02118