Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

ComputeNonbondedCUDA.C

Go to the documentation of this file.
00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDA.h"
00014 #include "ComputeNonbondedCUDAKernel.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include <algorithm>
00019 
00020 #include "NamdTypes.h"
00021 
00022 //#define PRINT_GBIS
00023 #undef PRINT_GBIS
00024 
00025 #ifdef NAMD_CUDA
00026 
00027 #ifdef WIN32
00028 #define __thread __declspec(thread)
00029 #endif
00030 
00031 extern __thread cudaStream_t stream;
00032 extern __thread cudaStream_t stream2;
00033 
00034 void cuda_errcheck(const char *msg) {
00035   cudaError_t err;
00036   if ((err = cudaGetLastError()) != cudaSuccess) {
00037     char host[128];
00038 #ifdef NOHOSTNAME
00039     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041     gethostname(host, 128);  host[127] = 0;
00042 #endif
00043     char devstr[128] = "";
00044     int devnum;
00045     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046       sprintf(devstr, " device %d", devnum);
00047     }
00048     char errmsg[1024];
00049     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050     NAMD_die(errmsg);
00051   }
00052 }
00053 
00054 void cuda_die(const char *msg) {
00055     char host[128];
00056 #ifdef NOHOSTNAME
00057     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059     gethostname(host, 128);  host[127] = 0;
00060 #endif
00061     char devstr[128] = "";
00062     int devnum;
00063     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064       sprintf(devstr, " device %d", devnum);
00065     }
00066     char errmsg[1024];
00067     sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068     NAMD_die(errmsg);
00069 }
00070 
00071 char *devicelist;
00072 static __thread int usedevicelist;
00073 static __thread int ignoresharing;
00074 static __thread int mergegrids;
00075 
00076 void cuda_getargs(char **argv) {
00077   devicelist = 0;
00078   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081   mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }
00083 
00084 static __thread int shared_gpu;
00085 static __thread int first_pe_sharing_gpu;
00086 static __thread int next_pe_sharing_gpu;
00087 static __thread int devicePe;
00088 static __thread int numPesSharingDevice;
00089 static __thread int *pesSharingDevice;
00090 
00091 static __thread int gpu_is_mine;
00092 
00093 int cuda_device_pe() { return devicePe; }
00094 
00095 bool cuda_device_shared_with_pe(int pe) {
00096   for ( int i=0; i<numPesSharingDevice; ++i ) {
00097     if ( pesSharingDevice[i] == pe ) return true;
00098   }
00099   return false;
00100 }
00101 
00102 static inline bool sortop_bitreverse(int a, int b) {
00103   if ( a == b ) return 0; 
00104   for ( int bit = 1; bit; bit *= 2 ) {
00105     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106   }
00107   return 0;
00108 }
00109 
00110 /*
00111 BASE
00112 2 types (remote & local)
00113 16 pes per node
00114 3 phases (1, 2, 3)
00115 */
00116 
00117 #define CUDA_EVENT_ID_POLL_REMOTE 98
00118 #define CUDA_TRACE_POLL_REMOTE \
00119   traceUserEvent(CUDA_EVENT_ID_POLL_REMOTE)
00120 #define CUDA_EVENT_ID_POLL_LOCAL 99
00121 #define CUDA_TRACE_POLL_LOCAL \
00122   traceUserEvent(CUDA_EVENT_ID_POLL_LOCAL)
00123 #define CUDA_EVENT_ID_BASE 100
00124 #define CUDA_TRACE_REMOTE(START,END) \
00125   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00126        CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)
00127 #define CUDA_TRACE_LOCAL(START,END) \
00128   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00129        CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)
00130 
00131 void cuda_register_user_events() {
00132 
00133   traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
00134   traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);
00135 
00136 #define REGISTER_DEVICE_EVENTS(DEV) \
00137   traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00138   traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00139 
00140   REGISTER_DEVICE_EVENTS(0)
00141   REGISTER_DEVICE_EVENTS(1)
00142   REGISTER_DEVICE_EVENTS(2)
00143   REGISTER_DEVICE_EVENTS(3)
00144   REGISTER_DEVICE_EVENTS(4)
00145   REGISTER_DEVICE_EVENTS(5)
00146   REGISTER_DEVICE_EVENTS(6)
00147   REGISTER_DEVICE_EVENTS(7)
00148   REGISTER_DEVICE_EVENTS(8)
00149   REGISTER_DEVICE_EVENTS(9)
00150   REGISTER_DEVICE_EVENTS(10)
00151   REGISTER_DEVICE_EVENTS(11)
00152   REGISTER_DEVICE_EVENTS(12)
00153   REGISTER_DEVICE_EVENTS(13)
00154   REGISTER_DEVICE_EVENTS(14)
00155   REGISTER_DEVICE_EVENTS(15)
00156 
00157 }
00158 
00159 void cuda_initialize() {
00160 
00161   if ( 0 == CkMyPe() ) cuda_register_user_events();
00162 
00163   char host[128];
00164 #ifdef NOHOSTNAME
00165   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00166 #else
00167   gethostname(host, 128);  host[127] = 0;
00168 #endif
00169 
00170   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00171   int myRankInPhysicalNode;
00172   int numPesOnPhysicalNode;
00173   int *pesOnPhysicalNode;
00174   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00175                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00176 
00177   {
00178     int i;
00179     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00181         i = numPesOnPhysicalNode;
00182         break;
00183       }
00184       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00185     }
00186     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00187       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00188       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00189         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00190           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00191       }
00192       myRankInPhysicalNode = 0;
00193       numPesOnPhysicalNode = 1;
00194       pesOnPhysicalNode = new int[1];
00195       pesOnPhysicalNode[0] = CkMyPe();
00196     } else {
00197       myRankInPhysicalNode = i;
00198     }
00199   }
00200   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00201 
00202   int deviceCount = 0;
00203   cudaGetDeviceCount(&deviceCount);
00204   cuda_errcheck("in cudaGetDeviceCount");
00205   if ( deviceCount <= 0 ) {
00206     cuda_die("No CUDA devices found.");
00207   }
00208 
00209   int *devices;
00210   int ndevices = 0;
00211   int nexclusive = 0;
00212   if ( usedevicelist ) {
00213     devices = new int[strlen(devicelist)];
00214     int i = 0;
00215     while ( devicelist[i] ) {
00216       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00217       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00218       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00219     }
00220   } else {
00221     if ( ! CkMyPe() ) {
00222       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00223     }
00224     devices = new int[deviceCount];
00225     for ( int i=0; i<deviceCount; ++i ) {
00226       int dev = i % deviceCount;
00227 #if CUDA_VERSION >= 2020
00228       cudaDeviceProp deviceProp;
00229       cudaGetDeviceProperties(&deviceProp, dev);
00230       cuda_errcheck("in cudaGetDeviceProperties");
00231       if ( deviceProp.computeMode != cudaComputeModeProhibited
00232            && (deviceProp.major > 1 || deviceProp.minor >= 1)
00233            && deviceProp.canMapHostMemory
00234            && deviceProp.multiProcessorCount > 2 ) {  // exclude weak cards
00235         devices[ndevices++] = dev;
00236       }
00237       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00238         ++nexclusive;
00239       }
00240 #else
00241       devices[ndevices++] = dev;
00242 #endif
00243     }
00244   }
00245 
00246   if ( ! ndevices ) {
00247     cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, unable to map host memory, too small, or otherwise unusable.");
00248   }
00249 
00250   shared_gpu = 0;
00251   gpu_is_mine = 1;
00252   first_pe_sharing_gpu = CkMyPe();
00253   next_pe_sharing_gpu = CkMyPe();
00254 
00255  /* if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) */ {
00256 
00257   int dev;
00258   if ( numPesOnPhysicalNode > 1 ) {
00259     int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00260     dev = devices[myDeviceRank];
00261     devicePe = CkMyPe();
00262     if ( ignoresharing ) {
00263       pesSharingDevice = new int[1];
00264       pesSharingDevice[0] = CkMyPe();
00265       numPesSharingDevice = 1;
00266     } else {
00267       pesSharingDevice = new int[numPesOnPhysicalNode];
00268       devicePe = -1;
00269       numPesSharingDevice = 0;
00270       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00271         if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00272           int thisPe = pesOnPhysicalNode[i];
00273           pesSharingDevice[numPesSharingDevice++] = thisPe;
00274           if ( devicePe < 1 ) devicePe = thisPe;
00275           if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00276         }
00277       }
00278       for ( int j = 0; j < ndevices; ++j ) {
00279         if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00280       }
00281     }
00282     if ( shared_gpu && devicePe == CkMyPe() ) {
00283       CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00284     }
00285   } else {  // in case phys node code is lying
00286     dev = devices[CkMyPe() % ndevices];
00287     devicePe = CkMyPe();
00288     pesSharingDevice = new int[1];
00289     pesSharingDevice[0] = CkMyPe();
00290     numPesSharingDevice = 1;
00291   }
00292 
00293   if ( devicePe != CkMyPe() ) {
00294     CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00295              CkMyPe(), myRankInPhysicalNode, devicePe);
00296     return;
00297   }
00298 
00299   // disable token-passing but don't submit local until remote finished
00300   // if shared_gpu is true, otherwise submit all work immediately
00301   first_pe_sharing_gpu = CkMyPe();
00302   next_pe_sharing_gpu = CkMyPe();
00303 
00304   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00305 
00306   if ( dev >= deviceCount ) {
00307     char buf[256];
00308     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00309                 CkMyPe(), dev, host, deviceCount);
00310     NAMD_die(buf);
00311   }
00312 
00313   cudaError_t err;
00314   cudaDeviceProp deviceProp;
00315   err = cudaGetDeviceProperties(&deviceProp, dev);
00316   if (err == cudaSuccess) {
00317     CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00318              CkMyPe(), myRankInPhysicalNode, dev, host,
00319              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00320              deviceProp.major, deviceProp.minor);
00321 
00322     err = cudaSetDevice(dev);
00323   }
00324   if ( err != cudaSuccess) {
00325     char errmsg[1024];
00326     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00327                         dev, CkMyPe(), cudaGetErrorString(err));
00328     NAMD_die(errmsg);
00329   }
00330 
00331  }  // just let CUDA pick a device for us
00332 
00333   cudaSetDeviceFlags(cudaDeviceMapHost);
00334   cuda_errcheck("in cudaSetDeviceFlags");
00335 
00336   int dev;
00337   if ( cudaGetDevice(&dev) == cudaSuccess ) {
00338     cudaDeviceProp deviceProp;
00339     cudaGetDeviceProperties(&deviceProp, dev);
00340     cuda_errcheck("in cudaGetDeviceProperties");
00341     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00342       cuda_die("device in prohibited mode");
00343     if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00344       cuda_die("device not of compute capability 1.1 or higher");
00345     if ( ! deviceProp.canMapHostMemory )
00346       cuda_die("device cannot map host memory");
00347   }
00348 
00349   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00350   if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00351   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00352   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00353 
00354 }
00355 
00356 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00357 static __thread ComputeMgr *computeMgr = 0;
00358 
00359 void send_build_cuda_force_table() {
00360   computeMgr->sendBuildCudaForceTable();
00361 }
00362 
00363 void build_cuda_force_table() {
00364   if ( devicePe != CkMyPe() ) return;
00365   ComputeNonbondedCUDA::build_lj_table();
00366   ComputeNonbondedCUDA::build_force_table();
00367 }
00368 
00369 void ComputeNonbondedCUDA::build_lj_table() {  // static
00370 
00371   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00372   const int dim = ljTable->get_table_dim();
00373 
00374   // round dim up to odd multiple of 16
00375   int tsize = (((dim+16+31)/32)*32)-16;
00376   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00377 
00378   float2 *t = new float2[tsize*tsize];
00379   float2 *row = t;
00380   for ( int i=0; i<dim; ++i, row += tsize ) {
00381     for ( int j=0; j<dim; ++j ) {
00382       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00383       row[j].x = e->A * scaling;
00384       row[j].y = e->B * scaling;
00385     }
00386   }
00387 
00388   cuda_bind_lj_table(t,tsize);
00389   delete [] t;
00390 
00391   if ( ! CkMyPe() ) {
00392     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00393   }
00394 }
00395 
00396 void ComputeNonbondedCUDA::build_force_table() {  // static
00397 
00398   float4 t[FORCE_TABLE_SIZE];
00399   float4 et[FORCE_TABLE_SIZE];  // energy table
00400 
00401   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00402   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00403   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00404   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00405 
00406   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00407   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00408     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00409     r2list[i] = r*r + r2_delta;
00410   }
00411 
00412   union { double f; int32 i[2]; } byte_order_test;
00413   byte_order_test.f = 1.0;  // should occupy high-order bits only
00414   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00415 
00416   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00417     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00418     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00419 
00420     if ( r > cutoff ) {
00421       t[i].x = 0.;
00422       t[i].y = 0.;
00423       t[i].z = 0.;
00424       t[i].w = 0.;
00425       et[i].x = 0.;
00426       et[i].y = 0.;
00427       et[i].z = 0.;
00428       et[i].w = 0.;
00429       continue;
00430     }
00431 
00432     BigReal diffa = r2list[i] - r2_table[table_i];
00433 
00434     // coulomb 1/r or fast force
00435     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00436     {
00437       BigReal table_a = fast_table[4*table_i];
00438       BigReal table_b = fast_table[4*table_i+1];
00439       BigReal table_c = fast_table[4*table_i+2];
00440       BigReal table_d = fast_table[4*table_i+3];
00441       BigReal grad =
00442                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00443       t[i].x = 2. * grad;
00444       BigReal ener = table_a + diffa *
00445                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00446       et[i].x = ener;
00447     }
00448 
00449 
00450     // pme correction for slow force
00451     // t[i].w = 0.;
00452     {
00453       BigReal table_a = scor_table[4*table_i];
00454       BigReal table_b = scor_table[4*table_i+1];
00455       BigReal table_c = scor_table[4*table_i+2];
00456       BigReal table_d = scor_table[4*table_i+3];
00457       BigReal grad =
00458                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00459       t[i].w = 2. * grad;
00460       BigReal ener = table_a + diffa *
00461                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00462       et[i].w = ener;
00463     }
00464 
00465 
00466     // vdw 1/r^6
00467     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00468     {
00469       BigReal table_a = vdwb_table[4*table_i];
00470       BigReal table_b = vdwb_table[4*table_i+1];
00471       BigReal table_c = vdwb_table[4*table_i+2];
00472       BigReal table_d = vdwb_table[4*table_i+3];
00473       BigReal grad =
00474                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00475       t[i].y = 2. * -1. * grad;
00476       BigReal ener = table_a + diffa *
00477                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00478       et[i].y = -1. * ener;
00479     }
00480 
00481 
00482     // vdw 1/r^12
00483     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00484     {
00485       BigReal table_a = vdwa_table[4*table_i];
00486       BigReal table_b = vdwa_table[4*table_i+1];
00487       BigReal table_c = vdwa_table[4*table_i+2];
00488       BigReal table_d = vdwa_table[4*table_i+3];
00489       BigReal grad =
00490                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00491       t[i].z = 2. * grad;
00492       BigReal ener = table_a + diffa *
00493                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00494       et[i].z = ener;
00495     }
00496 
00497     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00498     //   t[i].x, t[i].y, t[i].z, t[i].w);
00499 
00500 /*
00501     double r2 = r * r;
00502     double r4 = r2 * r2;
00503     double r8 = r4 * r4;
00504 
00505     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00506     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00507     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00508     t[i].w = 0.;
00509 */
00510   }
00511 
00512   t[0].x = 0.f;
00513   t[0].y = 0.f;
00514   t[0].z = 0.f;
00515   t[0].w = 0.f;
00516   et[0].x = et[1].x;
00517   et[0].y = et[1].y;
00518   et[0].z = et[1].z;
00519   et[0].w = et[1].w;
00520 
00521   cuda_bind_force_table(t,et);
00522 
00523   if ( ! CkMyPe() ) {
00524     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00525   }
00526 }
00527 
00528 struct exlist_sortop {
00529   bool operator() (int32 *li, int32 *lj) {
00530     return ( li[1] < lj[1] );
00531   }
00532 };
00533 
00534 static __thread int2 *exclusionsByAtom;
00535 
00536 void ComputeNonbondedCUDA::build_exclusions() {
00537   Molecule *mol = Node::Object()->molecule;
00538 
00539 #ifdef MEM_OPT_VERSION
00540   int natoms = mol->exclSigPoolSize;
00541 #else
00542   int natoms = mol->numAtoms; 
00543 #endif
00544 
00545   exclusionsByAtom = new int2[natoms];
00546 
00547   // create unique sorted lists
00548 
00549   ObjectArena<int32> listArena;
00550   ResizeArray<int32*> unique_lists;
00551   int32 **listsByAtom = new int32*[natoms];
00552   SortableResizeArray<int32> curList;
00553   for ( int i=0; i<natoms; ++i ) {
00554     curList.resize(0);
00555     curList.add(0);  // always excluded from self
00556 #ifdef MEM_OPT_VERSION
00557     const ExclusionSignature *sig = mol->exclSigPool + i;
00558     int n = sig->fullExclCnt;
00559     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00560     n += 1;
00561 #else
00562     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00563     int n = mol_list[0] + 1;
00564     for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00565 #endif
00566     curList.sort();
00567 
00568     int j;
00569     for ( j=0; j<unique_lists.size(); ++j ) {
00570       if ( n != unique_lists[j][0] ) continue;  // no match
00571       int k;
00572       for ( k=0; k<n; ++k ) {
00573         if ( unique_lists[j][k+3] != curList[k] ) break;
00574       }
00575       if ( k == n ) break;  // found match
00576     }
00577     if ( j == unique_lists.size() ) {  // no match
00578       int32 *list = listArena.getNewArray(n+3);
00579       list[0] = n;
00580       int maxdiff = 0;
00581       maxdiff = -1 * curList[0];
00582       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00583       list[1] = maxdiff;
00584       for ( int k=0; k<n; ++k ) {
00585         list[k+3] = curList[k];
00586       }
00587       unique_lists.add(list);
00588     }
00589     listsByAtom[i] = unique_lists[j];
00590   }
00591   // sort lists by maxdiff
00592   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00593   long int totalbits = 0;
00594   int nlists = unique_lists.size();
00595   for ( int j=0; j<nlists; ++j ) {
00596     int32 *list = unique_lists[j];
00597     int maxdiff = list[1];
00598     list[2] = totalbits + maxdiff;
00599     totalbits += 2*maxdiff + 1;
00600   }
00601   for ( int i=0; i<natoms; ++i ) {
00602     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00603     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00604   }
00605   delete [] listsByAtom;
00606 
00607   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00608 
00609   if ( ! CkMyPe() ) {
00610     long int bytesneeded = totalbits / 8;
00611     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00612                 unique_lists.size(), bytesneeded);
00613 
00614     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00615     if ( bytesneeded > bytesavail ) {
00616       char errmsg[512];
00617       sprintf(errmsg,"%ld bytes of CUDA memory needed for exclusions "
00618                      "but only %ld bytes can be addressed with 32-bit int.",
00619                      bytesneeded, bytesavail);
00620       NAMD_die(errmsg);
00621     }
00622   }
00623 
00624 #define SET_EXCL(EXCL,BASE,DIFF) \
00625          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00626 
00627   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00628   memset(exclusion_bits, 0, totalbits/8);
00629 
00630   long int base = 0;
00631   for ( int i=0; i<unique_lists.size(); ++i ) {
00632     base += unique_lists[i][1];
00633     if ( base != unique_lists[i][2] ) {
00634       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00635     }
00636     int n = unique_lists[i][0];
00637     for ( int j=0; j<n; ++j ) {
00638       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00639     }
00640     base += unique_lists[i][1] + 1;
00641   }
00642 
00643   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00644 
00645   delete [] exclusion_bits;
00646 }
00647 
00648 
00649 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00650 
00651   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00652 
00653   cudaCompute->requirePatch(pid);
00654 
00655   ComputeNonbondedCUDA::compute_record cr;
00656   cr.c = c;
00657   cr.pid[0] = pid;  cr.pid[1] = pid;
00658   cr.offset = 0.;
00659   if ( cudaCompute->patchRecords[pid].isLocal ) {
00660     cudaCompute->localComputeRecords.add(cr);
00661   } else {
00662     cudaCompute->remoteComputeRecords.add(cr);
00663   }
00664 }
00665 
00666 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00667 
00668   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00669  
00670   cudaCompute->requirePatch(pid[0]);
00671   cudaCompute->requirePatch(pid[1]);
00672 
00673   ComputeNonbondedCUDA::compute_record cr, cr2;
00674   cr.c = c;  cr2.c = c;
00675   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00676   cr2.pid[0] = pid[1];  cr2.pid[1] = pid[0];
00677 
00678   int t1 = t[0];
00679   int t2 = t[1];
00680   Vector offset = cudaCompute->patchMap->center(pid[0])
00681                 - cudaCompute->patchMap->center(pid[1]);
00682   offset.x += (t1%3-1) - (t2%3-1);
00683   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00684   offset.z += (t1/9-1) - (t2/9-1);
00685   cr.offset = offset;
00686   cr2.offset = -1. * offset;
00687     
00688   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00689     cudaCompute->localComputeRecords.add(cr);
00690   } else {
00691     cudaCompute->remoteComputeRecords.add(cr);
00692   }
00693   if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00694     cudaCompute->localComputeRecords.add(cr2);
00695   } else {
00696     cudaCompute->remoteComputeRecords.add(cr2);
00697   }
00698 }
00699 
00700 void unregister_cuda_compute(ComputeID c) {  // static
00701 
00702   NAMD_bug("unregister_compute unimplemented");
00703 
00704 }
00705 
00706 static __thread int atomsChanged = 0;
00707 static __thread int computesChanged = 0;
00708 
00709 static __thread int pairlistsValid = 0;
00710 static __thread float pairlistTolerance = 0.;
00711 static __thread int usePairlists = 0;
00712 static __thread int savePairlists = 0;
00713 static __thread float plcutoff2 = 0;
00714 
00715 // static __thread cudaEvent_t start_upload;
00716 static __thread cudaEvent_t start_calc;
00717 static __thread cudaEvent_t end_remote_download;
00718 static __thread cudaEvent_t end_local_download;
00719 
00720 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00721 static __thread ResizeArray<force_list> *force_lists_ptr;
00722 #define PATCH_PAIRS_REF ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
00723 #define FORCE_LISTS_REF ResizeArray<force_list> &force_lists(*force_lists_ptr);
00724 
00725 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00726                 ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00727 #ifdef PRINT_GBIS
00728    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00729 #endif
00730 
00731   // CkPrintf("create ComputeNonbondedCUDA\n");
00732   master = m ? m : this;
00733   cudaCompute = this;
00734   computeMgr = mgr;
00735   patchMap = PatchMap::Object();
00736   atomMap = AtomMap::Object();
00737   reduction = 0;
00738 
00739   SimParameters *params = Node::Object()->simParameters;
00740   if (params->pressureProfileOn) {
00741     NAMD_die("pressure profile not supported in CUDA");
00742   }
00743 
00744   atomsChanged = 1;
00745   computesChanged = 1;
00746   workStarted = 0;
00747   basePriority = PROXY_DATA_PRIORITY;
00748   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00749 
00750   if ( master != this ) { // I am slave
00751     masterPe = master->masterPe;
00752     master->slaves[slaveIndex] = this;
00753     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00754       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00755     }
00756     registerPatches();
00757     return;
00758   }
00759   masterPe = CkMyPe();
00760 
00761   cuda_init();
00762   build_exclusions();
00763   // cudaEventCreate(&start_upload);
00764   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00765   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00766   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00767 
00768   patch_pairs_ptr = new ResizeArray<patch_pair>;
00769   force_lists_ptr = new ResizeArray<force_list>;
00770 }
00771 
00772 
00773 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00774 
00775 void ComputeNonbondedCUDA::requirePatch(int pid) {
00776 
00777   computesChanged = 1;
00778   patch_record &pr = patchRecords.item(pid);
00779   if ( pr.refCount == 0 ) {
00780     if ( mergegrids ) {
00781       pr.isLocal = 0;
00782     } else if ( CkNumNodes() < 2 ) {
00783       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00784          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00785     } else {
00786       pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00787     }
00788     if ( pr.isLocal ) {
00789       localActivePatches.add(pid);
00790     } else {
00791       remoteActivePatches.add(pid);
00792     }
00793     activePatches.add(pid);
00794     // setNumPatches(activePatches.size());
00795     pr.patchID = pid;
00796     pr.hostPe = -1;
00797     // ProxyMgr::Object()->createProxy(pid);
00798     // pr.p = patchMap->patch(pid);
00799     // pr.positionBox = pr.p->registerPositionPickup(this);
00800     // pr.forceBox = pr.p->registerForceDeposit(this);
00801     pr.x = NULL;
00802     pr.xExt = NULL;
00803     pr.r = NULL;
00804     pr.f = NULL;
00805     pr.intRad      = NULL;
00806     pr.psiSum      = NULL;
00807     pr.bornRad     = NULL;
00808     pr.dEdaSum     = NULL;
00809     pr.dHdrPrefix  = NULL;
00810   }
00811   pr.refCount += 1;
00812 }
00813 
00814 void ComputeNonbondedCUDA::registerPatches() {
00815 
00816   SimParameters *simParams = Node::Object()->simParameters;
00817   int npatches = master->activePatches.size();
00818   int *pids = master->activePatches.begin();
00819   patch_record *recs = master->patchRecords.begin();
00820   for ( int i=0; i<npatches; ++i ) {
00821     int pid = pids[i];
00822     patch_record &pr = recs[pid];
00823     if ( pr.hostPe == CkMyPe() ) {
00824       //if ( ! reduction ) {
00825       //  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00826       //}
00827       hostedPatches.add(pid);
00828       if ( pr.isLocal ) {
00829         localHostedPatches.add(pid);
00830       } else {
00831         remoteHostedPatches.add(pid);
00832       }
00833       ProxyMgr::Object()->createProxy(pid);
00834       pr.p = patchMap->patch(pid);
00835       pr.positionBox = pr.p->registerPositionPickup(this);
00836       pr.forceBox = pr.p->registerForceDeposit(this);
00837       if (simParams->GBISOn) {
00838         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00839         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00840         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00841         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00842         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00843       }
00844     }
00845   }
00846   if ( master == this ) setNumPatches(activePatches.size());
00847   else setNumPatches(hostedPatches.size());
00848   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00849 }
00850 
00851 void ComputeNonbondedCUDA::assignPatches() {
00852 
00853   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00854   int numPesOnNodeSharingDevice = 0;
00855   int masterIndex = -1;
00856   for ( int i=0; i<numPesSharingDevice; ++i ) {
00857     int pe = pesSharingDevice[i];
00858     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00859     if ( CkNodeOf(pe) == CkMyNode() ) {
00860       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00861     }
00862   }
00863 
00864   int npatches = activePatches.size();
00865 
00866   if ( npatches ) {
00867     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00868   }
00869 
00870   int *count = new int[npatches];
00871   memset(count, 0, sizeof(int)*npatches);
00872   int *pcount = new int[numPesOnNodeSharingDevice];
00873   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00874   int *rankpcount = new int[CkMyNodeSize()];
00875   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00876   char *table = new char[npatches*numPesOnNodeSharingDevice];
00877   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00878 
00879   int unassignedpatches = npatches;
00880 
00881   if ( 0 ) { // assign all to device pe
00882     for ( int i=0; i<npatches; ++i ) {
00883       int pid = activePatches[i];
00884       patch_record &pr = patchRecords[pid];
00885       pr.hostPe = CkMyPe();
00886     }
00887     unassignedpatches = 0;
00888     pcount[masterIndex] = npatches;
00889   } else 
00890 
00891   // assign if home pe and build table of natural proxies
00892   for ( int i=0; i<npatches; ++i ) {
00893     int pid = activePatches[i];
00894     patch_record &pr = patchRecords[pid];
00895     int homePe = patchMap->node(pid);
00896     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00897       int pe = pesOnNodeSharingDevice[j];
00898       if ( pe == homePe ) {
00899         pr.hostPe = pe;  --unassignedpatches;
00900         pcount[j] += 1;
00901       }
00902       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00903         table[i*numPesOnNodeSharingDevice+j] = 1;
00904       }
00905     }
00906     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00907       pr.hostPe = homePe;  --unassignedpatches;
00908       rankpcount[CkRankOf(homePe)] += 1;
00909     }
00910   }
00911   // assign if only one pe has a required proxy
00912   int assignj = 0;
00913   for ( int i=0; i<npatches; ++i ) {
00914     int pid = activePatches[i];
00915     patch_record &pr = patchRecords[pid];
00916     if ( pr.hostPe != -1 ) continue;
00917     int c = 0;
00918     int lastj;
00919     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00920       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00921     }
00922     count[i] = c;
00923     if ( c == 1 ) {
00924       pr.hostPe = pesOnNodeSharingDevice[lastj];
00925       --unassignedpatches;
00926       pcount[lastj] += 1;
00927     }
00928   }
00929   while ( unassignedpatches ) {
00930     int i;
00931     for ( i=0; i<npatches; ++i ) {
00932       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00933       int pid = activePatches[i];
00934       patch_record &pr = patchRecords[pid];
00935       if ( pr.hostPe != -1 ) continue;
00936       pr.hostPe = pesOnNodeSharingDevice[assignj];
00937       --unassignedpatches;
00938       pcount[assignj] += 1;
00939       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00940       break;
00941     }
00942     if ( i<npatches ) continue;  // start search again
00943     for ( i=0; i<npatches; ++i ) {
00944       int pid = activePatches[i];
00945       patch_record &pr = patchRecords[pid];
00946       if ( pr.hostPe != -1 ) continue;
00947       if ( count[i] ) continue;
00948       pr.hostPe = pesOnNodeSharingDevice[assignj];
00949       --unassignedpatches;
00950       pcount[assignj] += 1;
00951       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00952       break;
00953     }
00954     if ( i<npatches ) continue;  // start search again
00955     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00956   }
00957 
00958   for ( int i=0; i<npatches; ++i ) {
00959     int pid = activePatches[i];
00960     patch_record &pr = patchRecords[pid];
00961     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00962   }
00963 
00964   slavePes = new int[CkMyNodeSize()];
00965   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00966   numSlaves = 0;
00967   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00968     int pe = pesOnNodeSharingDevice[j];
00969     int rank = pe - CkNodeFirst(CkMyNode());
00970     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00971     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00972     if ( pe == CkMyPe() ) continue;
00973     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00974     rankpcount[rank] = 0;  // skip in rank loop below
00975     slavePes[numSlaves] = pe;
00976     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00977     ++numSlaves;
00978   }
00979   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00980     int pe = CkNodeFirst(CkMyNode()) + j;
00981     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00982     //          CkMyPe(),j,pe,rankpcount[j]);
00983     if ( ! rankpcount[j] ) continue;
00984     if ( pe == CkMyPe() ) continue;
00985     slavePes[numSlaves] = pe;
00986     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00987     ++numSlaves;
00988   }
00989   registerPatches();
00990 
00991   delete [] pesOnNodeSharingDevice;
00992   delete [] count;
00993   delete [] pcount;
00994   delete [] rankpcount;
00995   delete [] table;
00996 }
00997 
00998 static __thread int num_atom_records_allocated;
00999 // static __thread int* atom_order;
01000 static __thread atom_param* atom_params;
01001 static __thread atom* atoms;
01002 // static __thread float4* forces;
01003 // static __thread float4* slow_forces;
01004 static __thread int num_virials;
01005 static __thread int num_virials_allocated;
01006 static __thread float *virials;
01007 static __thread float *slow_virials;
01008 static __thread float *energy_gbis;
01009 
01010 //GBIS host pointers
01011 static __thread float *intRad0H;
01012 static __thread float *intRadSH;
01013 //static __thread GBReal *psiSumH; //moved into class
01014 static __thread float *bornRadH;
01015 //static __thread GBReal *dEdaSumH; //moved into class
01016 static __thread float *dHdrPrefixH;
01017 
01018 static __thread int cuda_timer_count;
01019 static __thread double cuda_timer_total;
01020 static __thread double kernel_time;
01021 static __thread double remote_submit_time;
01022 static __thread double local_submit_time;
01023 
01024 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
01025 
01026 #ifdef PRINT_GBIS
01027 #define GBISP(...) CkPrintf(__VA_ARGS__);
01028 #else
01029 #define GBISP(...)
01030 #endif
01031 
01032 #define count_limit 1000000
01033 static __thread int check_remote_count;
01034 static __thread int check_local_count;
01035 
01036 void cuda_check_remote_progress(void *arg, double walltime) {
01037 
01038   CUDA_TRACE_POLL_REMOTE;
01039   cudaError_t err = cudaEventQuery(end_remote_download);
01040   if ( err == cudaSuccess ) {
01041     local_submit_time = walltime;
01042     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01043     if ( mergegrids ) {  // no local
01044       kernel_time = local_submit_time - kernel_time;
01045     }
01046     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01047     check_remote_count = 0;
01048   } else if ( err != cudaErrorNotReady ) {
01049     cuda_errcheck("in cuda_check_remote_progress");
01050     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01051   } else if ( ++check_remote_count >= count_limit ) {
01052     char errmsg[256];
01053     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01054             check_remote_count, walltime - remote_submit_time,
01055             ((ComputeNonbondedCUDA *) arg)->step);
01056     cuda_errcheck(errmsg);
01057     NAMD_die(errmsg);
01058   } else if ( check_local_count ) {
01059     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01060   } else {
01061     CUDA_POLL(cuda_check_remote_progress, arg);
01062   }
01063 }
01064 
01065 void cuda_check_local_progress(void *arg, double walltime) {
01066 
01067   CUDA_TRACE_POLL_LOCAL;
01068   cudaError_t err = cudaEventQuery(end_local_download);
01069   if ( err == cudaSuccess ) {
01070     CUDA_TRACE_LOCAL(local_submit_time,walltime);
01071     kernel_time = walltime - kernel_time;
01072     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01073     check_local_count = 0;
01074   } else if ( err != cudaErrorNotReady ) {
01075     cuda_errcheck("in cuda_check_local_progress");
01076     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01077   } else if ( ++check_local_count >= count_limit ) {
01078     char errmsg[256];
01079     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01080             check_local_count, walltime - local_submit_time,
01081             ((ComputeNonbondedCUDA *) arg)->step);
01082     cuda_errcheck(errmsg);
01083     NAMD_die(errmsg);
01084   } else if ( check_remote_count ) {
01085     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01086   } else {
01087     CUDA_POLL(cuda_check_local_progress, arg);
01088   }
01089 }
01090 
01091 #if 0
01092 // don't use this one unless timer is part of stream, above is better
01093 void cuda_check_progress(void *arg, double walltime) {
01094   if ( cuda_stream_finished() ) {
01095     kernel_time = walltime - kernel_time;
01096     CUDA_POLL(ccd_index);
01097     // ((ComputeNonbondedCUDA *) arg)->finishWork();
01098     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01099   }
01100 }
01101 #endif
01102 
01103 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
01104 
01105 static __thread int kernel_launch_state = 0;
01106 
01107 struct cr_sortop {
01108   const Lattice &l;
01109   cr_sortop(const Lattice &lattice) : l(lattice) { }
01110   bool operator() (ComputeNonbondedCUDA::compute_record i,
01111                         ComputeNonbondedCUDA::compute_record j) {
01112     Vector a = l.a();
01113     Vector b = l.b();
01114     Vector c = l.c();
01115     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01116     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01117     return ( ri < rj );
01118   }
01119 };
01120 
01121 int ComputeNonbondedCUDA::noWork() {
01122 
01123   SimParameters *simParams = Node::Object()->simParameters;
01124   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01125   lattice = flags.lattice;
01126   doSlow = flags.doFullElectrostatics;
01127   doEnergy = flags.doEnergy;
01128   step = flags.step;
01129 
01130   if ( ! flags.doNonbonded ) {
01131 GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01132     for ( int i=0; i<hostedPatches.size(); ++i ) {
01133       patch_record &pr = master->patchRecords[hostedPatches[i]];
01134       pr.positionBox->skip();
01135       pr.forceBox->skip();
01136       if (simParams->GBISOn) {
01137         pr.intRadBox->skip();
01138         pr.psiSumBox->skip();
01139         pr.bornRadBox->skip();
01140         pr.dEdaSumBox->skip();
01141         pr.dHdrPrefixBox->skip();
01142       }
01143     }
01144     if ( master != this ) {
01145       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01146                         hostedPatches.size(),atomsChanged,sequence());
01147     }
01148     if ( reduction ) reduction->submit();
01149     return 1;
01150   }
01151 
01152   for ( int i=0; i<hostedPatches.size(); ++i ) {
01153     patch_record &pr = master->patchRecords[hostedPatches[i]];
01154     if (!simParams->GBISOn || gbisPhase == 1) {
01155 GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01156       pr.x = pr.positionBox->open();
01157       pr.xExt = pr.p->getCompAtomExtInfo();
01158     }
01159 
01160     if (simParams->GBISOn) {
01161       if (gbisPhase == 1) {
01162 GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01163         pr.intRad     = pr.intRadBox->open();
01164         pr.psiSum     = pr.psiSumBox->open();
01165       } else if (gbisPhase == 2) {
01166 GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01167         pr.bornRad    = pr.bornRadBox->open();
01168         pr.dEdaSum    = pr.dEdaSumBox->open();
01169       } else if (gbisPhase == 3) {
01170 GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01171         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01172       }
01173 GBISP("opened GBIS boxes");
01174     }
01175   }
01176 
01177   if ( master == this ) return 0; //work to do, enqueue as usual
01178 
01179   // message masterPe
01180   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01181                         hostedPatches.size(),atomsChanged,sequence());
01182 
01183   workStarted = 1;
01184   basePriority = COMPUTE_PROXY_PRIORITY;
01185 
01186   return 1;
01187 }
01188 
01189 void ComputeNonbondedCUDA::doWork() {
01190 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
01191 CkMyPe(), sequence(), gbisPhase, workStarted);
01192 
01193   PATCH_PAIRS_REF;
01194   FORCE_LISTS_REF;
01195 
01196   if ( workStarted ) { //if work already started, check if finished
01197     if ( finishWork() ) {  // finished
01198       workStarted = 0;
01199       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01200     } else {  // need to call again
01201       workStarted = 2;
01202       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01203       if ( master == this && kernel_launch_state > 2 ) {
01204         cuda_check_local_progress(this,0.);  // launches polling
01205       }
01206     }
01207     return;
01208   }
01209 
01210   workStarted = 1;
01211   basePriority = COMPUTE_PROXY_PRIORITY;
01212 
01213   Molecule *mol = Node::Object()->molecule;
01214   Parameters *params = Node::Object()->parameters;
01215   SimParameters *simParams = Node::Object()->simParameters;
01216 
01217   //execute only during GBIS phase 1, or if not using GBIS
01218   if (!simParams->GBISOn || gbisPhase == 1) {
01219 
01220   //bind new patches to GPU
01221  if ( atomsChanged || computesChanged ) {
01222   int npatches = activePatches.size();
01223 
01224   pairlistsValid = 0;
01225   pairlistTolerance = 0.;
01226 
01227   if ( computesChanged ) {
01228     computesChanged = 0;
01229 
01230     num_virials = npatches;
01231     if ( num_virials > num_virials_allocated ) {
01232       if ( num_virials_allocated ) {
01233         cudaFreeHost(virials);
01234         cudaFreeHost(energy_gbis);
01235         cuda_errcheck("in cudaFreeHost virials");
01236       }
01237       num_virials_allocated = 1.1 * num_virials + 1;
01238       cudaHostAlloc((void**)&virials,2*16*sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01239       cudaHostAlloc((void**)&energy_gbis,sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01240       for (int i  = 0; i < num_virials_allocated; i++)
01241         energy_gbis[i] = 0.f;
01242       cuda_errcheck("in cudaHostAlloc virials");
01243       slow_virials = virials + 16*num_virials;
01244     }
01245 
01246     int *ap = activePatches.begin();
01247     for ( int i=0; i<localActivePatches.size(); ++i ) {
01248       *(ap++) = localActivePatches[i];
01249     }
01250     for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01251       *(ap++) = remoteActivePatches[i];
01252     }
01253 
01254     // sort computes by distance between patches
01255     cr_sortop so(lattice);
01256     std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01257     std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01258  
01259     int nlc = localComputeRecords.size();
01260     int nrc = remoteComputeRecords.size();
01261     computeRecords.resize(nlc+nrc);
01262     compute_record *cr = computeRecords.begin();
01263     for ( int i=0; i<nlc; ++i ) {
01264       *(cr++) = localComputeRecords[i];
01265     }
01266     for ( int i=0; i<nrc; ++i ) {
01267       *(cr++) = remoteComputeRecords[i];
01268     }
01269 
01270     force_lists.resize(npatches);
01271     for ( int i=0; i<npatches; ++i ) {
01272       patchRecords[activePatches[i]].localIndex = i;
01273       force_lists[i].force_list_size = 0;
01274     }
01275 
01276     int ncomputes = computeRecords.size();
01277     patch_pairs.resize(ncomputes);
01278     for ( int i=0; i<ncomputes; ++i ) {
01279       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01280       int lp1 = patchRecords[cr.pid[0]].localIndex;
01281       int lp2 = patchRecords[cr.pid[1]].localIndex;
01282       force_lists[lp1].force_list_size++;
01283       patch_pair &pp = patch_pairs[i];
01284       pp.offset.x = cr.offset.x;
01285       pp.offset.y = cr.offset.y;
01286       pp.offset.z = cr.offset.z;
01287     }
01288 
01289     for ( int i=0; i<ncomputes; ++i ) {
01290       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01291       int lp1 = patchRecords[cr.pid[0]].localIndex;
01292       patch_pair &pp = patch_pairs[i];
01293       pp.patch1_force_list_index = lp1;
01294       pp.patch1_force_list_size = force_lists[lp1].force_list_size;
01295     }
01296 
01297    if ( 1 || simParams->outputCudaTiming ) {
01298     CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01299         CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01300         localComputeRecords.size(), remoteComputeRecords.size());
01301    }
01302   }  // computesChanged
01303 
01304   int istart = 0;
01305   int flstart = 0;
01306   int vlstart = 0;
01307   int max_atoms_per_patch = 0;
01308   int i;
01309   for ( i=0; i<npatches; ++i ) {
01310     if ( i == localActivePatches.size() ) {
01311        num_local_atom_records = istart;
01312     }
01313     force_lists[i].force_list_start = flstart;
01314     force_lists[i].force_output_start = istart;
01315     force_lists[i].atom_start = istart;
01316     force_lists[i].virial_list_start = vlstart;
01317     force_lists[i].virial_output_start = 16*i;
01318     patch_record &pr = patchRecords[activePatches[i]];
01319     pr.localStart = istart;
01320     int natoms = pr.p->getNumAtoms();
01321     int nfreeatoms = natoms;
01322     if ( fixedAtomsOn ) {
01323       const CompAtomExt *aExt = pr.xExt;
01324       for ( int j=0; j<natoms; ++j ) {
01325         if ( aExt[j].atomFixed ) --nfreeatoms;
01326       }
01327     }
01328     if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
01329     pr.numAtoms = natoms;
01330     pr.numFreeAtoms = nfreeatoms;
01331     force_lists[i].patch_size = nfreeatoms;
01332     if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
01333     if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
01334     force_lists[i].patch_stride = nfreeatoms;
01335     flstart += nfreeatoms * force_lists[i].force_list_size;
01336     vlstart += 16 * force_lists[i].force_list_size;
01337     istart += natoms;  // already rounded up
01338     force_lists[i].force_list_size = 0;  // rebuild below
01339   }
01340   if ( i == localActivePatches.size() ) {
01341      num_local_atom_records = istart;
01342   }
01343   num_force_records = flstart;
01344   num_atom_records = istart;
01345   num_remote_atom_records = num_atom_records - num_local_atom_records;
01346   if ( num_atom_records > num_atom_records_allocated ) {
01347     if ( num_atom_records_allocated ) {
01348       // delete [] atom_order;
01349       cudaFreeHost(atom_params);
01350       cudaFreeHost(atoms);
01351       cudaFreeHost(forces);
01352       cudaFreeHost(slow_forces);
01353       if (simParams->GBISOn) {
01354         cudaFreeHost(intRad0H);//6 GBIS arrays
01355         cudaFreeHost(intRadSH);
01356         cudaFreeHost(psiSumH);
01357         cudaFreeHost(bornRadH);
01358         cudaFreeHost(dEdaSumH);
01359         cudaFreeHost(dHdrPrefixH);
01360       }
01361       cuda_errcheck("in cudaFreeHost");
01362     }
01363     num_atom_records_allocated = 1.1 * num_atom_records + 1;
01364     // atom_order = new int[num_atom_records_allocated];
01365     cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
01366     cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
01367     cuda_errcheck("in cudaMallocHost atoms");
01368     cudaHostAlloc((void**)&forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01369     cudaHostAlloc((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01370     //allocate GBIS memory
01371     if (simParams->GBISOn) {
01372   cudaMallocHost((void**)&intRad0H,sizeof(float)*num_atom_records_allocated);
01373   cudaMallocHost((void**)&intRadSH,sizeof(float)*num_atom_records_allocated);
01374   cudaHostAlloc((void**)&psiSumH, sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01375   cudaMallocHost((void**)&bornRadH,sizeof(float)*num_atom_records_allocated);
01376   cudaHostAlloc((void**)&dEdaSumH,sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01377   cudaMallocHost((void**)&dHdrPrefixH,sizeof(float)*num_atom_records_allocated);
01378     }
01379     cuda_errcheck("in cudaHostAlloc forces");
01380   }
01381 
01382   int bfstart = 0;
01383   int ncomputes = computeRecords.size();
01384   for ( int i=0; i<ncomputes; ++i ) {
01385     ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01386     int p1 = cr.pid[0];
01387     int p2 = cr.pid[1];
01388     int lp1 = patchRecords[p1].localIndex;
01389     int lp2 = patchRecords[p2].localIndex;
01390     patch_pair &pp = patch_pairs[i];
01391     pp.patch1_atom_start = patchRecords[p1].localStart;
01392     pp.patch1_force_start = force_lists[lp1].force_list_start +
01393         force_lists[lp1].patch_stride * force_lists[lp1].force_list_size;
01394     pp.virial_start = force_lists[lp1].virial_list_start +
01395                                  16 * force_lists[lp1].force_list_size;
01396     pp.patch1_size = patchRecords[p1].numAtoms;
01397     pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
01398     pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01399     pp.block_flags_start = bfstart;
01400     bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
01401     // istart += pp.patch1_size;
01402     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
01403     pp.patch2_atom_start = patchRecords[p2].localStart;
01404     // pp.patch2_force_start = force_lists[lp2].force_list_start +
01405 //      force_lists[lp2].patch_stride * force_lists[lp2].force_list_size;
01406     pp.patch2_size = patchRecords[p2].numAtoms;
01407     // pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01408     // this must happen at end to get self computes right
01409     force_lists[lp1].force_list_size++;
01410     // if ( lp1 != lp2 || cr.t[0] != cr.t[1] ) {
01411     //   force_lists[lp2].force_list_size++;
01412     // }
01413     // istart += pp.patch2_size;
01414     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
01415   } //for ncomputes
01416 
01417 #if 0
01418   CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
01419         patch_pairs.size(), force_lists.size(),
01420         num_atom_records, num_force_records,
01421         max_atoms_per_patch);
01422 #endif
01423 
01424   int totalmem = patch_pairs.size() * sizeof(patch_pair) +
01425                 force_lists.size() * sizeof(force_list) +
01426                 num_force_records * sizeof(float4) +
01427                 num_atom_records * sizeof(atom) +
01428                 num_atom_records * sizeof(atom_param) +
01429                 num_atom_records * sizeof(float4);
01430   int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
01431 /*
01432   CkPrintf("Pe %d allocating %d MB of GPU memory, will copy %d kB per step\n",
01433                         CkMyPe(), totalmem >> 20, totalcopy >> 10);
01434 */
01435 
01436   cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01437                         force_lists.begin(), force_lists.size(),
01438                         num_atom_records, num_force_records,
01439                         bfstart, max_atoms_per_patch);
01440 
01441  }  // atomsChanged || computesChanged
01442 
01443   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01444 
01445   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01446   float maxAtomMovement = 0.;
01447   float maxPatchTolerance = 0.;
01448 
01449   for ( int i=0; i<activePatches.size(); ++i ) {
01450     patch_record &pr = patchRecords[activePatches[i]];
01451 
01452     float maxMove = pr.p->flags.maxAtomMovement;
01453     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01454 
01455     float maxTol = pr.p->flags.pairlistTolerance;
01456     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01457 
01458     int start = pr.localStart;
01459     int n = pr.numAtoms;
01460     const CompAtom *a = pr.x;
01461     const CompAtomExt *aExt = pr.xExt;
01462     // int *ao = atom_order + start;
01463     if ( atomsChanged ) {
01464 #if 0
01465       int nfree = pr.numFreeAtoms;
01466       if ( nfree != n ) {
01467         int k = 0;
01468         for ( int j=0; j<n; ++j ) {
01469           // put free atoms first
01470           if ( ! aExt[j].atomFixed ) ao[k++] = j;
01471         }
01472         if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
01473           // put fixed atoms at end
01474           if ( aExt[j].atomFixed ) ao[k++] = j;
01475         }
01476       } else {
01477         for ( int j=0; j<n; ++j ) {
01478           ao[j] = j;
01479         }
01480       }
01481 
01482       sortAtomsForCUDA(ao,a,nfree,n);
01483 #endif
01484 
01485       atom_param *ap = atom_params + start;
01486       for ( int k=0; k<n; ++k ) {
01487         int j = aExt[k].sortOrder;
01488         ap[k].vdw_type = a[j].vdwType;
01489         ap[k].index = aExt[j].id;
01490 #ifdef MEM_OPT_VERSION
01491         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01492         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01493 #else
01494         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01495         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01496 #endif
01497       }
01498     }
01499     {
01500 #if 1
01501       const CudaAtom *ac = pr.p->getCudaAtomList();
01502       atom *ap = atoms + start;
01503       memcpy(ap, ac, sizeof(atom)*n);
01504 #else
01505       Vector center =
01506         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01507       atom *ap = atoms + start;
01508       for ( int k=0; k<n; ++k ) {
01509         int j = aExt[k].sortOrder;
01510         ap[k].position.x = a[j].position.x - center.x;
01511         ap[k].position.y = a[j].position.y - center.y;
01512         ap[k].position.z = a[j].position.z - center.z;
01513         ap[k].charge = charge_scaling * a[j].charge;
01514       }
01515 #endif
01516     }
01517   }
01518 //GBISP("finished active patches\n")
01519 
01520   //CkPrintf("maxMovement = %f  maxTolerance = %f  save = %d  use = %d\n",
01521   //  maxAtomMovement, maxPatchTolerance,
01522   //  flags.savePairlists, flags.usePairlists);
01523 
01524   savePairlists = 0;
01525   usePairlists = 0;
01526   if ( flags.savePairlists ) {
01527     savePairlists = 1;
01528     usePairlists = 1;
01529   } else if ( flags.usePairlists ) {
01530     if ( ! pairlistsValid ||
01531          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01532       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01533     } else {
01534       usePairlists = 1;
01535     }
01536   }
01537   if ( ! usePairlists ) {
01538     pairlistsValid = 0;
01539   }
01540   float plcutoff = cutoff;
01541   if ( savePairlists ) {
01542     pairlistsValid = 1;
01543     pairlistTolerance = 2. * maxPatchTolerance;
01544     plcutoff += pairlistTolerance;
01545   }
01546   plcutoff2 = plcutoff * plcutoff;
01547 
01548   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01549   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01550 
01551 #if 0
01552   // calculate warp divergence
01553   if ( 1 ) {
01554     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01555     Lattice &lattice = flags.lattice;
01556     float3 lata, latb, latc;
01557     lata.x = lattice.a().x;
01558     lata.y = lattice.a().y;
01559     lata.z = lattice.a().z;
01560     latb.x = lattice.b().x;
01561     latb.y = lattice.b().y;
01562     latb.z = lattice.b().z;
01563     latc.x = lattice.c().x;
01564     latc.y = lattice.c().y;
01565     latc.z = lattice.c().z;
01566 
01567     int ncomputes = computeRecords.size();
01568     for ( int ic=0; ic<ncomputes; ++ic ) {
01569       patch_pair &pp = patch_pairs[ic];
01570       atom *a1 = atoms + pp.patch1_atom_start;
01571       int n1 = pp.patch1_size;
01572       atom *a2 = atoms + pp.patch2_atom_start;
01573       int n2 = pp.patch2_size;
01574       float offx = pp.offset.x * lata.x
01575                + pp.offset.y * latb.x
01576                + pp.offset.z * latc.x;
01577       float offy = pp.offset.x * lata.y
01578                + pp.offset.y * latb.y
01579                + pp.offset.z * latc.y;
01580       float offz = pp.offset.x * lata.z
01581                + pp.offset.y * latb.z
01582                + pp.offset.z * latc.z;
01583       // CkPrintf("%f %f %f\n", offx, offy, offz);
01584       int atoms_tried = 0;
01585       int blocks_tried = 0;
01586       int atoms_used = 0;
01587       int blocks_used = 0;
01588       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01589         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01590           int block_used = 0;
01591           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01592             int atom_used = 0;
01593             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01594               float dx = offx + a1[i].position.x - a2[j].position.x;
01595               float dy = offy + a1[i].position.y - a2[j].position.y;
01596               float dz = offz + a1[i].position.z - a2[j].position.z;
01597               float r2 = dx*dx + dy*dy + dz*dz;
01598               if ( r2 < cutoff2 ) atom_used = 1;
01599             }
01600             ++atoms_tried;
01601             if ( atom_used ) { block_used = 1; ++atoms_used; }
01602           }
01603           ++blocks_tried;
01604           if ( block_used ) { ++blocks_used; }
01605         }
01606       }
01607       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01608                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01609                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01610     }
01611   }
01612 #endif
01613 
01614   } // !GBISOn || gbisPhase == 1
01615 
01616   //Do GBIS
01617   if (simParams->GBISOn) {
01618     //open GBIS boxes depending on phase
01619     for ( int i=0; i<activePatches.size(); ++i ) {
01620       patch_record &pr = master->patchRecords[activePatches[i]];
01621 GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01622         if (gbisPhase == 1) {
01623           //Copy GBIS intRadius to Host
01624           if (atomsChanged) {
01625             float *intRad0 = intRad0H + pr.localStart;
01626             float *intRadS = intRadSH + pr.localStart;
01627             for ( int k=0; k<pr.numAtoms; ++k ) {
01628               int j = pr.xExt[k].sortOrder;
01629               intRad0[k] = pr.intRad[2*j+0];
01630               intRadS[k] = pr.intRad[2*j+1];
01631             }
01632           }
01633         } else if (gbisPhase == 2) {
01634           float *bornRad = bornRadH + pr.localStart;
01635           for ( int k=0; k<pr.numAtoms; ++k ) {
01636             int j = pr.xExt[k].sortOrder;
01637             bornRad[k] = pr.bornRad[j];
01638           }
01639         } else if (gbisPhase == 3) {
01640           float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01641           for ( int k=0; k<pr.numAtoms; ++k ) {
01642             int j = pr.xExt[k].sortOrder;
01643             dHdrPrefix[k] = pr.dHdrPrefix[j];
01644           }
01645       } // end phases
01646     } // end for patches
01647   } // if GBISOn
01648 
01649   kernel_time = CkWallTimer();
01650   kernel_launch_state = 1;
01651   if ( gpu_is_mine ) recvYieldDevice(-1);
01652 }
01653 
01654 void cuda_check_remote_calc(void *arg, double /* walltime */) {
01655   // in theory we only need end_remote_calc, but overlap isn't reliable
01656   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01657   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01658 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01659     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01660 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01661   } else {
01662     CUDA_POLL(cuda_check_remote_calc, arg);
01663   }
01664 }
01665 
01666 void cuda_check_local_calc(void *arg, double /* walltime */) {
01667   // in theory we only need end_local_calc, but overlap isn't reliable
01668   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01669   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01670 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01671     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01672 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01673   } else {
01674     CUDA_POLL(cuda_check_local_calc, arg);
01675   }
01676 }
01677 
01678 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01679 
01680 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01681 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01682 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01683 workStarted, gbisPhase, kernel_launch_state, pe)
01684 
01685   float3 lata, latb, latc;
01686   lata.x = lattice.a().x;
01687   lata.y = lattice.a().y;
01688   lata.z = lattice.a().z;
01689   latb.x = lattice.b().x;
01690   latb.y = lattice.b().y;
01691   latb.z = lattice.b().z;
01692   latc.x = lattice.c().x;
01693   latc.y = lattice.c().y;
01694   latc.z = lattice.c().z;
01695   SimParameters *simParams = Node::Object()->simParameters;
01696 
01697   switch ( kernel_launch_state ) {
01699 // Remote
01700   case 1:
01701 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01702     ++kernel_launch_state;
01703     gpu_is_mine = 0;
01704     remote_submit_time = CkWallTimer();
01705 
01706     if (!simParams->GBISOn || gbisPhase == 1) {
01707     // cudaEventRecord(start_upload, stream);
01708     if ( atomsChanged ) {
01709       cuda_bind_atom_params(atom_params);
01710     }
01711     if ( simParams->GBISOn) {
01712       cuda_bind_GBIS_psiSum(psiSumH);
01713       if ( atomsChanged ) {
01714         cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01715       }
01716     }
01717     cuda_bind_atoms(atoms);
01718     cuda_bind_forces(forces, slow_forces);
01719     cuda_bind_virials(virials);
01720     cuda_bind_GBIS_energy(energy_gbis);
01721     cudaEventRecord(start_calc, stream);
01722     //call CUDA Kernels
01723     cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01724         localComputeRecords.size(),remoteComputeRecords.size(),
01725         localActivePatches.size(),remoteActivePatches.size(),
01726         doSlow, doEnergy, usePairlists, savePairlists, stream);
01727     if (simParams->GBISOn) {
01728       cuda_GBIS_P1(
01729         localComputeRecords.size(),remoteComputeRecords.size(),
01730         localActivePatches.size(),remoteActivePatches.size(),
01731         simParams->alpha_cutoff-simParams->fsMax,
01732         simParams->coulomb_radius_offset,
01733         lata, latb, latc, stream);
01734     }
01735     //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01736     //    num_local_atom_records,num_remote_atom_records);
01737     cudaEventRecord(end_remote_download, stream);
01738     CUDA_POLL(cuda_check_remote_progress,this);
01739     if ( shared_gpu && ! mergegrids ) {
01740       CUDA_POLL(cuda_check_remote_calc,this);
01741       break;
01742     }
01743     } // !GBIS or gbisPhase==1
01744     if (simParams->GBISOn) {
01745       if (gbisPhase == 1) {
01746         //GBIS P1 Kernel launched in previous code block
01747       } else if (gbisPhase == 2) {
01748 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01749         // cudaEventRecord(start_upload, stream);
01750         cuda_bind_GBIS_bornRad(bornRadH);
01751         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01752         cudaEventRecord(start_calc, stream);
01753         cuda_GBIS_P2(
01754           localComputeRecords.size(),remoteComputeRecords.size(),
01755           localActivePatches.size(),remoteActivePatches.size(),
01756           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01757           simParams->nonbondedScaling, simParams->kappa,
01758           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01759           simParams->dielectric, simParams->solvent_dielectric,
01760           lata, latb, latc,
01761           doEnergy, doSlow, stream
01762           );
01763         cudaEventRecord(end_remote_download, stream);
01764         CUDA_POLL(cuda_check_remote_progress,this);
01765       } else if (gbisPhase == 3) {
01766 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01767         // cudaEventRecord(start_upload, stream);
01768         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01769         cudaEventRecord(start_calc, stream);
01770         if (doSlow)
01771         cuda_GBIS_P3(
01772           localComputeRecords.size(),remoteComputeRecords.size(),
01773           localActivePatches.size(),remoteActivePatches.size(),
01774           (simParams->alpha_cutoff-simParams->fsMax),
01775           simParams->coulomb_radius_offset,
01776           simParams->nonbondedScaling,
01777           lata, latb, latc, stream
01778           );
01779         cudaEventRecord(end_remote_download, stream);
01780         CUDA_POLL(cuda_check_remote_progress,this);
01781       }
01782     }
01783 
01785 // Local
01786   case 2:
01787 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01788     ++kernel_launch_state;
01789     gpu_is_mine = 0;
01790 
01791 #if 0
01792     cudaStreamWaitEvent(stream2, start_calc, 0);
01793 #else
01794 #define stream2 stream
01795 #endif
01796 
01797     if (!simParams->GBISOn || gbisPhase == 1) {
01798 
01799       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01800               0,localComputeRecords.size(),
01801               0,localActivePatches.size(),
01802               doSlow, doEnergy, usePairlists, savePairlists, stream2);
01803       if (simParams->GBISOn) {
01804         cuda_GBIS_P1(
01805           0,localComputeRecords.size(),
01806           0,localActivePatches.size(),
01807           simParams->alpha_cutoff-simParams->fsMax,
01808           simParams->coulomb_radius_offset,
01809           lata, latb, latc, stream2 );
01810       }
01811     //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01812     //    0,num_local_atom_records);
01813     //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01814     cudaEventRecord(end_local_download, stream2);
01815     if ( workStarted == 2 ) {
01816 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01817 cuda_check_local_progress\n", CkMyPe())
01818       CUDA_POLL(cuda_check_local_progress,this);
01819     }
01820     if ( shared_gpu && ! mergegrids ) {
01821 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01822 cuda_check_local_calc\n", CkMyPe())
01823       CUDA_POLL(cuda_check_local_calc,this);
01824       break;
01825     }
01826 
01827     } // !GBIS or gbisPhase==1
01828     if (simParams->GBISOn) {
01829       if (gbisPhase == 1) {
01830         //GBIS P1 Kernel launched in previous code block
01831       } else if (gbisPhase == 2) {
01832 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01833         cuda_GBIS_P2(
01834           0,localComputeRecords.size(),
01835           0,localActivePatches.size(),
01836           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01837           simParams->nonbondedScaling, simParams->kappa,
01838           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01839           simParams->dielectric, simParams->solvent_dielectric,
01840           lata, latb, latc,
01841           doEnergy, doSlow, stream2
01842           );
01843         cudaEventRecord(end_local_download, stream2);
01844         if ( workStarted == 2 ) {
01845           CUDA_POLL(cuda_check_local_progress,this);
01846         }
01847       } else if (gbisPhase == 3) {
01848 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01849         if (doSlow)
01850         cuda_GBIS_P3(
01851           0,localComputeRecords.size(),
01852           0,localActivePatches.size(),
01853           (simParams->alpha_cutoff-simParams->fsMax),
01854           simParams->coulomb_radius_offset,
01855           simParams->nonbondedScaling,
01856           lata, latb, latc, stream2
01857           );
01858         cudaEventRecord(end_local_download, stream2);
01859         if ( workStarted == 2 ) {
01860           CUDA_POLL(cuda_check_local_progress,this);
01861         }
01862       } // phases
01863     } // GBISOn
01864 
01865 
01866   default:
01867 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01868     gpu_is_mine = 1;
01869     break;
01870   } // switch
01871 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01872 }
01873 
01874 
01875 void ComputeNonbondedCUDA::messageFinishWork() {
01876 
01877   cuda_errcheck("at cuda stream completed");
01878 
01879   for ( int i = 0; i < numSlaves; ++i ) {
01880     computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01881   }
01882 
01883   WorkDistrib::messageEnqueueWork(this);
01884 
01885 }
01886 
01887 //dtanner
01888 int ComputeNonbondedCUDA::finishWork() {
01889 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01890 CkMyPe(), workStarted, gbisPhase)
01891 
01892   Molecule *mol = Node::Object()->molecule;
01893   SimParameters *simParams = Node::Object()->simParameters;
01894 
01895   ResizeArray<int> &patches( workStarted == 1 ?
01896                                 remoteHostedPatches : localHostedPatches );
01897 
01898   //call once at beginning of each phase/step
01899   if ( !simParams->GBISOn || gbisPhase == 1 ) {
01900     for ( int i=0; i<patches.size(); ++i ) {
01901       patch_record &pr = master->patchRecords[patches[i]];
01902 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01903       pr.r = pr.forceBox->open();
01904     }
01905   } // !GBISOn || gbisPhase==1
01906 
01907   // long long int wcount = 0;
01908   // double virial = 0.;
01909   // double virial_slow = 0.;
01910   for ( int i=0; i<patches.size(); ++i ) {
01911     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01912     patch_record &pr = master->patchRecords[patches[i]];
01913 
01914     int start = pr.localStart;
01915     const CompAtomExt *aExt = pr.xExt;
01916     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01917     int nfree = pr.numFreeAtoms;
01918     pr.f = pr.r->f[Results::nbond];
01919     Force *f = pr.f;
01920     Force *f_slow = pr.r->f[Results::slow];
01921     const CompAtom *a = pr.x;
01922     // int *ao = atom_order + start;
01923     float4 *af = master->forces + start;
01924     float4 *af_slow = master->slow_forces + start;
01925     // only free atoms return forces
01926     for ( int k=0; k<nfree; ++k ) {
01927       int j = aExt[k].sortOrder;
01928       f[j].x += af[k].x;
01929       f[j].y += af[k].y;
01930       f[j].z += af[k].z;
01931       // wcount += af[k].w;
01932       // virial += af[k].w;
01933       if ( doSlow ) {
01934         f_slow[j].x += af_slow[k].x;
01935         f_slow[j].y += af_slow[k].y;
01936         f_slow[j].z += af_slow[k].z;
01937         // virial_slow += af_slow[k].w;
01938       }
01939     }
01940 
01941 #if 1
01942     // check exclusions reported as w
01943     if ( CkNumPes() == 1 ) {
01944       const CompAtomExt *aExt = pr.xExt;
01945       for ( int k=0; k<nfree; ++k ) {
01946         int j = aExt[k].sortOrder;
01947 #ifdef MEM_OPT_VERSION
01948         int excl_expected = mol->exclSigPool[aExt[j].exclId].fullExclCnt + 1;
01949 #else
01950         int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01951 #endif
01952         if ( af[k].w != excl_expected ) {
01953           CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01954                 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01955         }
01956       }
01957     }
01958 #endif
01959   } // !GBISOn || gbisPhase == 3
01960 
01961 #if 0
01962     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01963       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01964         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01965         af[j].w);
01966     }
01967 #endif
01968 
01969     //Close Boxes depending on Phase
01970     if (simParams->GBISOn) {
01971       if (gbisPhase == 1) {
01972         //Copy dEdaSum from Host to Patch Box
01973         GBReal *psiSumMaster = master->psiSumH + start;
01974         for ( int k=0; k<pr.numAtoms; ++k ) {
01975           int j = aExt[k].sortOrder;
01976           pr.psiSum[j] += psiSumMaster[k];
01977         }
01978 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01979         pr.psiSumBox->close(&(pr.psiSum));
01980 
01981       } else if (gbisPhase == 2) {
01982         //Copy dEdaSum from Host to Patch Box
01983         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01984         for ( int k=0; k<pr.numAtoms; ++k ) {
01985           int j = aExt[k].sortOrder;
01986           pr.dEdaSum[j] += dEdaSumMaster[k];
01987         }
01988 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01989         pr.dEdaSumBox->close(&(pr.dEdaSum));
01990 
01991       } else if (gbisPhase == 3) {
01992 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01993         pr.intRadBox->close(&(pr.intRad)); //box 6
01994         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01995         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01996         pr.positionBox->close(&(pr.x)); //box 0
01997         pr.forceBox->close(&(pr.r));
01998       } //end phases
01999     } else { //not GBIS
02000 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
02001       pr.positionBox->close(&(pr.x));
02002       pr.forceBox->close(&(pr.r));
02003     }
02004   }// end for
02005 
02006 #if 0
02007   virial *= (-1./6.);
02008   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
02009   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
02010   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
02011   if ( doSlow ) {
02012     virial_slow *= (-1./6.);
02013     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
02014     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
02015     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02016   }
02017 #endif
02018 
02019   if ( workStarted == 1 && ! mergegrids &&
02020        ( localHostedPatches.size() || master == this ) ) {
02021     GBISP("not finished, call again\n");
02022     return 0;  // not finished, call again
02023   }
02024 
02025   if ( master != this ) {  // finished
02026     GBISP("finished\n");
02027     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02028     atomsChanged = 0;
02029     return 1;
02030   }
02031 
02032   cuda_timer_total += kernel_time;
02033 
02034   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02035 
02036   {
02037     Tensor virial_tensor;
02038     BigReal energyv = 0.;
02039     BigReal energye = 0.;
02040     BigReal energys = 0.;
02041     for ( int i = 0; i < num_virials; ++i ) {
02042       virial_tensor.xx += virials[16*i];
02043       virial_tensor.xy += virials[16*i+1];
02044       virial_tensor.xz += virials[16*i+2];
02045       virial_tensor.yx += virials[16*i+3];
02046       virial_tensor.yy += virials[16*i+4];
02047       virial_tensor.yz += virials[16*i+5];
02048       virial_tensor.zx += virials[16*i+6];
02049       virial_tensor.zy += virials[16*i+7];
02050       virial_tensor.zz += virials[16*i+8];
02051       energyv += virials[16*i+9];
02052       energye += virials[16*i+10];
02053       energys += virials[16*i+11];
02054       if (simParams->GBISOn) {
02055         energye += energy_gbis[i];
02056       }
02057     }
02058     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02059     if ( doEnergy ) {
02060       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02061       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02062       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02063     }
02064   }
02065   if ( doSlow ) {
02066     Tensor virial_slow_tensor;
02067     for ( int i = 0; i < num_virials; ++i ) {
02068       virial_slow_tensor.xx += slow_virials[16*i];
02069       virial_slow_tensor.xy += slow_virials[16*i+1];
02070       virial_slow_tensor.xz += slow_virials[16*i+2];
02071       virial_slow_tensor.yx += slow_virials[16*i+3];
02072       virial_slow_tensor.yy += slow_virials[16*i+4];
02073       virial_slow_tensor.yz += slow_virials[16*i+5];
02074       virial_slow_tensor.zx += slow_virials[16*i+6];
02075       virial_slow_tensor.zy += slow_virials[16*i+7];
02076       virial_slow_tensor.zz += slow_virials[16*i+8];
02077     }
02078     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02079   }
02080 
02081   atomsChanged = 0;
02082   reduction->submit();
02083 
02084   cuda_timer_count++;
02085   if ( simParams->outputCudaTiming &&
02086         step % simParams->outputCudaTiming == 0 ) {
02087 
02088   // int natoms = mol->numAtoms; 
02089   // double wpa = wcount;  wpa /= natoms;
02090 
02091   // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02092         //      kernel_time * 1.e3, time * 1.e3, wpa);
02093 
02094 #if 0
02095   float upload_ms, remote_calc_ms;
02096   float local_calc_ms, total_ms;
02097   cuda_errcheck("before event timers");
02098   cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02099   cuda_errcheck("in event timer 1");
02100   cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02101   cuda_errcheck("in event timer 2");
02102   cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02103   cuda_errcheck("in event timer 3");
02104   cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02105   cuda_errcheck("in event timer 4");
02106   cuda_errcheck("in event timers");
02107 
02108     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02109         CkMyPe(), upload_ms, remote_calc_ms,
02110                         local_calc_ms, total_ms);
02111 #endif
02112 
02113     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02114       cuda_timer_total /= cuda_timer_count;
02115       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02116                 step, cuda_timer_total * 1.e3, CkMyPe());
02117     }
02118     cuda_timer_count = 0;
02119     cuda_timer_total = 0;
02120   }
02121 
02122   } // !GBISOn || gbisPhase==3  
02123 
02124   // Next GBIS Phase
02125 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02126     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02127 
02128   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02129   return 1;  // finished and ready for next step
02130 }
02131 
02132 
02133 #endif  // NAMD_CUDA
02134 

Generated on Sat May 25 04:07:16 2013 for NAMD by  doxygen 1.3.9.1