Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputeNonbondedCUDA.C

Go to the documentation of this file.
00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDA.h"
00014 #include "ComputeNonbondedCUDAKernel.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include <algorithm>
00019 
00020 #include "NamdTypes.h"
00021 
00022 //#define PRINT_GBIS
00023 #undef PRINT_GBIS
00024 
00025 #ifdef NAMD_CUDA
00026 
00027 #ifdef WIN32
00028 #define __thread __declspec(thread)
00029 #endif
00030 
00031 extern __thread cudaStream_t stream;
00032 extern __thread cudaStream_t stream2;
00033 
00034 void cuda_errcheck(const char *msg) {
00035   cudaError_t err;
00036   if ((err = cudaGetLastError()) != cudaSuccess) {
00037     char host[128];
00038 #ifdef NOHOSTNAME
00039     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041     gethostname(host, 128);  host[127] = 0;
00042 #endif
00043     char devstr[128] = "";
00044     int devnum;
00045     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046       sprintf(devstr, " device %d", devnum);
00047     }
00048     char errmsg[1024];
00049     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050     NAMD_die(errmsg);
00051   }
00052 }
00053 
00054 void cuda_die(const char *msg) {
00055     char host[128];
00056 #ifdef NOHOSTNAME
00057     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059     gethostname(host, 128);  host[127] = 0;
00060 #endif
00061     char devstr[128] = "";
00062     int devnum;
00063     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064       sprintf(devstr, " device %d", devnum);
00065     }
00066     char errmsg[1024];
00067     sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068     NAMD_die(errmsg);
00069 }
00070 
00071 char *devicelist;
00072 static __thread int usedevicelist;
00073 static __thread int ignoresharing;
00074 static __thread int mergegrids;
00075 
00076 void cuda_getargs(char **argv) {
00077   devicelist = 0;
00078   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081   mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }
00083 
00084 static __thread int shared_gpu;
00085 static __thread int first_pe_sharing_gpu;
00086 static __thread int next_pe_sharing_gpu;
00087 static __thread int devicePe;
00088 static __thread int numPesSharingDevice;
00089 static __thread int *pesSharingDevice;
00090 
00091 static __thread int gpu_is_mine;
00092 
00093 int cuda_device_pe() { return devicePe; }
00094 
00095 bool cuda_device_shared_with_pe(int pe) {
00096   for ( int i=0; i<numPesSharingDevice; ++i ) {
00097     if ( pesSharingDevice[i] == pe ) return true;
00098   }
00099   return false;
00100 }
00101 
00102 static inline bool sortop_bitreverse(int a, int b) {
00103   if ( a == b ) return 0; 
00104   for ( int bit = 1; bit; bit *= 2 ) {
00105     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106   }
00107   return 0;
00108 }
00109 
00110 /*
00111 BASE
00112 2 types (remote & local)
00113 16 pes per node
00114 3 phases (1, 2, 3)
00115 */
00116 
00117 #define CUDA_EVENT_ID_BASE 100
00118 #define CUDA_TRACE_REMOTE(START,END) \
00119   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00120        CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)
00121 #define CUDA_TRACE_LOCAL(START,END) \
00122   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00123        CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)
00124 
00125 void cuda_register_user_events() {
00126 
00127 #define REGISTER_DEVICE_EVENTS(DEV) \
00128   traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00129   traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00130 
00131   REGISTER_DEVICE_EVENTS(0)
00132   REGISTER_DEVICE_EVENTS(1)
00133   REGISTER_DEVICE_EVENTS(2)
00134   REGISTER_DEVICE_EVENTS(3)
00135   REGISTER_DEVICE_EVENTS(4)
00136   REGISTER_DEVICE_EVENTS(5)
00137   REGISTER_DEVICE_EVENTS(6)
00138   REGISTER_DEVICE_EVENTS(7)
00139   REGISTER_DEVICE_EVENTS(8)
00140   REGISTER_DEVICE_EVENTS(9)
00141   REGISTER_DEVICE_EVENTS(10)
00142   REGISTER_DEVICE_EVENTS(11)
00143   REGISTER_DEVICE_EVENTS(12)
00144   REGISTER_DEVICE_EVENTS(13)
00145   REGISTER_DEVICE_EVENTS(14)
00146   REGISTER_DEVICE_EVENTS(15)
00147 
00148 }
00149 
00150 void cuda_initialize() {
00151 
00152   if ( 0 == CkMyPe() ) cuda_register_user_events();
00153 
00154   char host[128];
00155 #ifdef NOHOSTNAME
00156   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00157 #else
00158   gethostname(host, 128);  host[127] = 0;
00159 #endif
00160 
00161   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00162   int myRankInPhysicalNode;
00163   int numPesOnPhysicalNode;
00164   int *pesOnPhysicalNode;
00165   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00166                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00167 
00168   {
00169     int i;
00170     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00171       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00172         i = numPesOnPhysicalNode;
00173         break;
00174       }
00175       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00176     }
00177     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00178       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00179       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00181           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00182       }
00183       myRankInPhysicalNode = 0;
00184       numPesOnPhysicalNode = 1;
00185       pesOnPhysicalNode = new int[1];
00186       pesOnPhysicalNode[0] = CkMyPe();
00187     } else {
00188       myRankInPhysicalNode = i;
00189     }
00190   }
00191   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00192 
00193   int deviceCount = 0;
00194   cudaGetDeviceCount(&deviceCount);
00195   cuda_errcheck("in cudaGetDeviceCount");
00196   if ( deviceCount <= 0 ) {
00197     cuda_die("No CUDA devices found.");
00198   }
00199 
00200   int *devices;
00201   int ndevices = 0;
00202   int nexclusive = 0;
00203   if ( usedevicelist ) {
00204     devices = new int[strlen(devicelist)];
00205     int i = 0;
00206     while ( devicelist[i] ) {
00207       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00208       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00209       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00210     }
00211   } else {
00212     if ( ! CkMyPe() ) {
00213       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00214     }
00215     devices = new int[deviceCount];
00216     for ( int i=0; i<deviceCount; ++i ) {
00217       int dev = i % deviceCount;
00218 #if CUDA_VERSION >= 2020
00219       cudaDeviceProp deviceProp;
00220       cudaGetDeviceProperties(&deviceProp, dev);
00221       cuda_errcheck("in cudaGetDeviceProperties");
00222       if ( deviceProp.computeMode != cudaComputeModeProhibited
00223            && (deviceProp.major > 1 || deviceProp.minor >= 1)
00224            && deviceProp.canMapHostMemory
00225            && deviceProp.multiProcessorCount > 2 ) {  // exclude weak cards
00226         devices[ndevices++] = dev;
00227       }
00228       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00229         ++nexclusive;
00230       }
00231 #else
00232       devices[ndevices++] = dev;
00233 #endif
00234     }
00235   }
00236 
00237   if ( ! ndevices ) {
00238     cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, or otherwise unusable.");
00239   }
00240 
00241   shared_gpu = 0;
00242   gpu_is_mine = 1;
00243   first_pe_sharing_gpu = CkMyPe();
00244   next_pe_sharing_gpu = CkMyPe();
00245 
00246  /* if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) */ {
00247 
00248   int dev;
00249   if ( numPesOnPhysicalNode > 1 ) {
00250     int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00251     dev = devices[myDeviceRank];
00252     devicePe = CkMyPe();
00253     if ( ignoresharing ) {
00254       pesSharingDevice = new int[1];
00255       pesSharingDevice[0] = CkMyPe();
00256       numPesSharingDevice = 1;
00257     } else {
00258       pesSharingDevice = new int[numPesOnPhysicalNode];
00259       devicePe = -1;
00260       numPesSharingDevice = 0;
00261       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00262         if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00263           int thisPe = pesOnPhysicalNode[i];
00264           pesSharingDevice[numPesSharingDevice++] = thisPe;
00265           if ( devicePe < 1 ) devicePe = thisPe;
00266           if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00267         }
00268       }
00269       for ( int j = 0; j < ndevices; ++j ) {
00270         if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00271       }
00272     }
00273     if ( shared_gpu && devicePe == CkMyPe() ) {
00274       CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00275     }
00276   } else {  // in case phys node code is lying
00277     dev = devices[CkMyPe() % ndevices];
00278     devicePe = CkMyPe();
00279     pesSharingDevice = new int[1];
00280     pesSharingDevice[0] = CkMyPe();
00281     numPesSharingDevice = 1;
00282   }
00283 
00284   if ( devicePe != CkMyPe() ) {
00285     CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00286              CkMyPe(), myRankInPhysicalNode, devicePe);
00287     return;
00288   }
00289 
00290   // disable token-passing but don't submit local until remote finished
00291   // if shared_gpu is true, otherwise submit all work immediately
00292   first_pe_sharing_gpu = CkMyPe();
00293   next_pe_sharing_gpu = CkMyPe();
00294 
00295   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00296 
00297   if ( dev >= deviceCount ) {
00298     char buf[256];
00299     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00300                 CkMyPe(), dev, host, deviceCount);
00301     NAMD_die(buf);
00302   }
00303 
00304   cudaError_t err;
00305   cudaDeviceProp deviceProp;
00306   err = cudaGetDeviceProperties(&deviceProp, dev);
00307   if (err == cudaSuccess) {
00308     CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00309              CkMyPe(), myRankInPhysicalNode, dev, host,
00310              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00311              deviceProp.major, deviceProp.minor);
00312 
00313     err = cudaSetDevice(dev);
00314   }
00315   if ( err != cudaSuccess) {
00316     char errmsg[1024];
00317     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00318                         dev, CkMyPe(), cudaGetErrorString(err));
00319     NAMD_die(errmsg);
00320   }
00321 
00322  }  // just let CUDA pick a device for us
00323 
00324   cudaSetDeviceFlags(cudaDeviceMapHost);
00325   cuda_errcheck("in cudaSetDeviceFlags");
00326 
00327   int dev;
00328   if ( cudaGetDevice(&dev) == cudaSuccess ) {
00329     cudaDeviceProp deviceProp;
00330     cudaGetDeviceProperties(&deviceProp, dev);
00331     cuda_errcheck("in cudaGetDeviceProperties");
00332     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00333       cuda_die("device in prohibited mode");
00334     if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00335       cuda_die("device not of compute capability 1.1 or higher");
00336     if ( ! deviceProp.canMapHostMemory )
00337       cuda_die("device cannot map host memory");
00338   }
00339 
00340   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00341   if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00342   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00343   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00344 
00345 }
00346 
00347 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00348 static __thread ComputeMgr *computeMgr = 0;
00349 
00350 void send_build_cuda_force_table() {
00351   computeMgr->sendBuildCudaForceTable();
00352 }
00353 
00354 void build_cuda_force_table() {
00355   if ( devicePe != CkMyPe() ) return;
00356   ComputeNonbondedCUDA::build_lj_table();
00357   ComputeNonbondedCUDA::build_force_table();
00358 }
00359 
00360 void ComputeNonbondedCUDA::build_lj_table() {  // static
00361 
00362   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00363   const int dim = ljTable->get_table_dim();
00364 
00365   // round dim up to odd multiple of 16
00366   int tsize = (((dim+16+31)/32)*32)-16;
00367   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00368 
00369   float2 *t = new float2[tsize*tsize];
00370   float2 *row = t;
00371   for ( int i=0; i<dim; ++i, row += tsize ) {
00372     for ( int j=0; j<dim; ++j ) {
00373       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00374       row[j].x = e->A * scaling;
00375       row[j].y = e->B * scaling;
00376     }
00377   }
00378 
00379   cuda_bind_lj_table(t,tsize);
00380   delete [] t;
00381 
00382   if ( ! CkMyPe() ) {
00383     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00384   }
00385 }
00386 
00387 void ComputeNonbondedCUDA::build_force_table() {  // static
00388 
00389   float4 t[FORCE_TABLE_SIZE];
00390   float4 et[FORCE_TABLE_SIZE];  // energy table
00391 
00392   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00393   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00394   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00395   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00396 
00397   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00398   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00399     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00400     r2list[i] = r*r + r2_delta;
00401   }
00402 
00403   union { double f; int32 i[2]; } byte_order_test;
00404   byte_order_test.f = 1.0;  // should occupy high-order bits only
00405   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00406 
00407   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00408     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00409     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00410 
00411     if ( r > cutoff ) {
00412       t[i].x = 0.;
00413       t[i].y = 0.;
00414       t[i].z = 0.;
00415       t[i].w = 0.;
00416       et[i].x = 0.;
00417       et[i].y = 0.;
00418       et[i].z = 0.;
00419       et[i].w = 0.;
00420       continue;
00421     }
00422 
00423     BigReal diffa = r2list[i] - r2_table[table_i];
00424 
00425     // coulomb 1/r or fast force
00426     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00427     {
00428       BigReal table_a = fast_table[4*table_i];
00429       BigReal table_b = fast_table[4*table_i+1];
00430       BigReal table_c = fast_table[4*table_i+2];
00431       BigReal table_d = fast_table[4*table_i+3];
00432       BigReal grad =
00433                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00434       t[i].x = 2. * grad;
00435       BigReal ener = table_a + diffa *
00436                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00437       et[i].x = ener;
00438     }
00439 
00440 
00441     // pme correction for slow force
00442     // t[i].w = 0.;
00443     {
00444       BigReal table_a = scor_table[4*table_i];
00445       BigReal table_b = scor_table[4*table_i+1];
00446       BigReal table_c = scor_table[4*table_i+2];
00447       BigReal table_d = scor_table[4*table_i+3];
00448       BigReal grad =
00449                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00450       t[i].w = 2. * grad;
00451       BigReal ener = table_a + diffa *
00452                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00453       et[i].w = ener;
00454     }
00455 
00456 
00457     // vdw 1/r^6
00458     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00459     {
00460       BigReal table_a = vdwb_table[4*table_i];
00461       BigReal table_b = vdwb_table[4*table_i+1];
00462       BigReal table_c = vdwb_table[4*table_i+2];
00463       BigReal table_d = vdwb_table[4*table_i+3];
00464       BigReal grad =
00465                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00466       t[i].y = 2. * -1. * grad;
00467       BigReal ener = table_a + diffa *
00468                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00469       et[i].y = -1. * ener;
00470     }
00471 
00472 
00473     // vdw 1/r^12
00474     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00475     {
00476       BigReal table_a = vdwa_table[4*table_i];
00477       BigReal table_b = vdwa_table[4*table_i+1];
00478       BigReal table_c = vdwa_table[4*table_i+2];
00479       BigReal table_d = vdwa_table[4*table_i+3];
00480       BigReal grad =
00481                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00482       t[i].z = 2. * grad;
00483       BigReal ener = table_a + diffa *
00484                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00485       et[i].z = ener;
00486     }
00487 
00488     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00489     //   t[i].x, t[i].y, t[i].z, t[i].w);
00490 
00491 /*
00492     double r2 = r * r;
00493     double r4 = r2 * r2;
00494     double r8 = r4 * r4;
00495 
00496     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00497     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00498     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00499     t[i].w = 0.;
00500 */
00501   }
00502 
00503   t[0].x = 0.f;
00504   t[0].y = 0.f;
00505   t[0].z = 0.f;
00506   t[0].w = 0.f;
00507   et[0].x = et[1].x;
00508   et[0].y = et[1].y;
00509   et[0].z = et[1].z;
00510   et[0].w = et[1].w;
00511 
00512   cuda_bind_force_table(t,et);
00513 
00514   if ( ! CkMyPe() ) {
00515     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00516   }
00517 }
00518 
00519 struct exlist_sortop {
00520   bool operator() (int32 *li, int32 *lj) {
00521     return ( li[1] < lj[1] );
00522   }
00523 };
00524 
00525 static __thread int2 *exclusionsByAtom;
00526 
00527 void ComputeNonbondedCUDA::build_exclusions() {
00528   Molecule *mol = Node::Object()->molecule;
00529 
00530 #ifdef MEM_OPT_VERSION
00531   int natoms = mol->exclSigPoolSize;
00532 #else
00533   int natoms = mol->numAtoms; 
00534 #endif
00535 
00536   exclusionsByAtom = new int2[natoms];
00537 
00538   // create unique sorted lists
00539 
00540   ObjectArena<int32> listArena;
00541   ResizeArray<int32*> unique_lists;
00542   int32 **listsByAtom = new int32*[natoms];
00543   SortableResizeArray<int32> curList;
00544   for ( int i=0; i<natoms; ++i ) {
00545     curList.resize(0);
00546     curList.add(0);  // always excluded from self
00547 #ifdef MEM_OPT_VERSION
00548     const ExclusionSignature *sig = mol->exclSigPool + i;
00549     int n = sig->fullExclCnt;
00550     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00551     n += 1;
00552 #else
00553     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00554     int n = mol_list[0] + 1;
00555     for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00556 #endif
00557     curList.sort();
00558 
00559     int j;
00560     for ( j=0; j<unique_lists.size(); ++j ) {
00561       if ( n != unique_lists[j][0] ) continue;  // no match
00562       int k;
00563       for ( k=0; k<n; ++k ) {
00564         if ( unique_lists[j][k+3] != curList[k] ) break;
00565       }
00566       if ( k == n ) break;  // found match
00567     }
00568     if ( j == unique_lists.size() ) {  // no match
00569       int32 *list = listArena.getNewArray(n+3);
00570       list[0] = n;
00571       int maxdiff = 0;
00572       maxdiff = -1 * curList[0];
00573       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00574       list[1] = maxdiff;
00575       for ( int k=0; k<n; ++k ) {
00576         list[k+3] = curList[k];
00577       }
00578       unique_lists.add(list);
00579     }
00580     listsByAtom[i] = unique_lists[j];
00581   }
00582   // sort lists by maxdiff
00583   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00584   long int totalbits = 0;
00585   int nlists = unique_lists.size();
00586   for ( int j=0; j<nlists; ++j ) {
00587     int32 *list = unique_lists[j];
00588     int maxdiff = list[1];
00589     list[2] = totalbits + maxdiff;
00590     totalbits += 2*maxdiff + 1;
00591   }
00592   for ( int i=0; i<natoms; ++i ) {
00593     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00594     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00595   }
00596   delete [] listsByAtom;
00597 
00598   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00599 
00600   if ( ! CkMyPe() ) {
00601     long int bytesneeded = totalbits / 8;
00602     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00603                 unique_lists.size(), bytesneeded);
00604 
00605     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00606     if ( bytesneeded > bytesavail ) {
00607       char errmsg[512];
00608       sprintf(errmsg,"%ld bytes of CUDA memory needed for exclusions "
00609                      "but only %ld bytes can be addressed with 32-bit int.",
00610                      bytesneeded, bytesavail);
00611       NAMD_die(errmsg);
00612     }
00613   }
00614 
00615 #define SET_EXCL(EXCL,BASE,DIFF) \
00616          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00617 
00618   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00619   memset(exclusion_bits, 0, totalbits/8);
00620 
00621   long int base = 0;
00622   for ( int i=0; i<unique_lists.size(); ++i ) {
00623     base += unique_lists[i][1];
00624     if ( base != unique_lists[i][2] ) {
00625       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00626     }
00627     int n = unique_lists[i][0];
00628     for ( int j=0; j<n; ++j ) {
00629       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00630     }
00631     base += unique_lists[i][1] + 1;
00632   }
00633 
00634   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00635 
00636   delete [] exclusion_bits;
00637 }
00638 
00639 
00640 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00641 
00642   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00643 
00644   cudaCompute->requirePatch(pid);
00645 
00646   ComputeNonbondedCUDA::compute_record cr;
00647   cr.c = c;
00648   cr.pid[0] = pid;  cr.pid[1] = pid;
00649   cr.offset = 0.;
00650   if ( cudaCompute->patchRecords[pid].isLocal ) {
00651     cudaCompute->localComputeRecords.add(cr);
00652   } else {
00653     cudaCompute->remoteComputeRecords.add(cr);
00654   }
00655 }
00656 
00657 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00658 
00659   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00660  
00661   cudaCompute->requirePatch(pid[0]);
00662   cudaCompute->requirePatch(pid[1]);
00663 
00664   ComputeNonbondedCUDA::compute_record cr, cr2;
00665   cr.c = c;  cr2.c = c;
00666   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00667   cr2.pid[0] = pid[1];  cr2.pid[1] = pid[0];
00668 
00669   int t1 = t[0];
00670   int t2 = t[1];
00671   Vector offset = cudaCompute->patchMap->center(pid[0])
00672                 - cudaCompute->patchMap->center(pid[1]);
00673   offset.x += (t1%3-1) - (t2%3-1);
00674   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00675   offset.z += (t1/9-1) - (t2/9-1);
00676   cr.offset = offset;
00677   cr2.offset = -1. * offset;
00678     
00679   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00680     cudaCompute->localComputeRecords.add(cr);
00681   } else {
00682     cudaCompute->remoteComputeRecords.add(cr);
00683   }
00684   if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00685     cudaCompute->localComputeRecords.add(cr2);
00686   } else {
00687     cudaCompute->remoteComputeRecords.add(cr2);
00688   }
00689 }
00690 
00691 void unregister_cuda_compute(ComputeID c) {  // static
00692 
00693   NAMD_bug("unregister_compute unimplemented");
00694 
00695 }
00696 
00697 static __thread int atomsChanged = 0;
00698 static __thread int computesChanged = 0;
00699 
00700 static __thread int pairlistsValid = 0;
00701 static __thread float pairlistTolerance = 0.;
00702 static __thread int usePairlists = 0;
00703 static __thread int savePairlists = 0;
00704 static __thread float plcutoff2 = 0;
00705 
00706 // static __thread cudaEvent_t start_upload;
00707 static __thread cudaEvent_t start_calc;
00708 static __thread cudaEvent_t end_remote_download;
00709 static __thread cudaEvent_t end_local_download;
00710 
00711 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00712 static __thread ResizeArray<force_list> *force_lists_ptr;
00713 #define PATCH_PAIRS_REF ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
00714 #define FORCE_LISTS_REF ResizeArray<force_list> &force_lists(*force_lists_ptr);
00715 
00716 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00717                 ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00718 #ifdef PRINT_GBIS
00719    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00720 #endif
00721 
00722   // CkPrintf("create ComputeNonbondedCUDA\n");
00723   master = m ? m : this;
00724   cudaCompute = this;
00725   computeMgr = mgr;
00726   patchMap = PatchMap::Object();
00727   atomMap = AtomMap::Object();
00728   reduction = 0;
00729 
00730   SimParameters *params = Node::Object()->simParameters;
00731   if (params->pressureProfileOn) {
00732     NAMD_die("pressure profile not supported in CUDA");
00733   }
00734 
00735   atomsChanged = 1;
00736   computesChanged = 1;
00737   workStarted = 0;
00738   basePriority = PROXY_DATA_PRIORITY;
00739   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00740 
00741   if ( master != this ) { // I am slave
00742     masterPe = master->masterPe;
00743     master->slaves[slaveIndex] = this;
00744     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00745       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00746     }
00747     registerPatches();
00748     return;
00749   }
00750   masterPe = CkMyPe();
00751 
00752   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00753 
00754   cuda_init();
00755   build_exclusions();
00756   // cudaEventCreate(&start_upload);
00757   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00758   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00759   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00760 
00761   patch_pairs_ptr = new ResizeArray<patch_pair>;
00762   force_lists_ptr = new ResizeArray<force_list>;
00763 }
00764 
00765 
00766 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00767 
00768 void ComputeNonbondedCUDA::requirePatch(int pid) {
00769 
00770   computesChanged = 1;
00771   patch_record &pr = patchRecords.item(pid);
00772   if ( pr.refCount == 0 ) {
00773     if ( mergegrids ) {
00774       pr.isLocal = 0;
00775     } else if ( CkNumNodes() < 2 ) {
00776       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00777          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00778     } else {
00779       pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00780     }
00781     if ( pr.isLocal ) {
00782       localActivePatches.add(pid);
00783     } else {
00784       remoteActivePatches.add(pid);
00785     }
00786     activePatches.add(pid);
00787     // setNumPatches(activePatches.size());
00788     pr.patchID = pid;
00789     pr.hostPe = -1;
00790     // ProxyMgr::Object()->createProxy(pid);
00791     // pr.p = patchMap->patch(pid);
00792     // pr.positionBox = pr.p->registerPositionPickup(this);
00793     // pr.forceBox = pr.p->registerForceDeposit(this);
00794     pr.x = NULL;
00795     pr.xExt = NULL;
00796     pr.r = NULL;
00797     pr.f = NULL;
00798     pr.intRad      = NULL;
00799     pr.psiSum      = NULL;
00800     pr.bornRad     = NULL;
00801     pr.dEdaSum     = NULL;
00802     pr.dHdrPrefix  = NULL;
00803   }
00804   pr.refCount += 1;
00805 }
00806 
00807 void ComputeNonbondedCUDA::registerPatches() {
00808 
00809   SimParameters *simParams = Node::Object()->simParameters;
00810   int npatches = master->activePatches.size();
00811   int *pids = master->activePatches.begin();
00812   patch_record *recs = master->patchRecords.begin();
00813   for ( int i=0; i<npatches; ++i ) {
00814     int pid = pids[i];
00815     patch_record &pr = recs[pid];
00816     if ( pr.hostPe == CkMyPe() ) {
00817       //if ( ! reduction ) {
00818       //  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00819       //}
00820       hostedPatches.add(pid);
00821       if ( pr.isLocal ) {
00822         localHostedPatches.add(pid);
00823       } else {
00824         remoteHostedPatches.add(pid);
00825       }
00826       ProxyMgr::Object()->createProxy(pid);
00827       pr.p = patchMap->patch(pid);
00828       pr.positionBox = pr.p->registerPositionPickup(this);
00829       pr.forceBox = pr.p->registerForceDeposit(this);
00830       if (simParams->GBISOn) {
00831         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00832         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00833         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00834         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00835         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00836       }
00837     }
00838   }
00839   if ( master == this ) setNumPatches(activePatches.size());
00840   else setNumPatches(hostedPatches.size());
00841   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00842 }
00843 
00844 void ComputeNonbondedCUDA::assignPatches() {
00845 
00846   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00847   int numPesOnNodeSharingDevice = 0;
00848   int masterIndex = -1;
00849   for ( int i=0; i<numPesSharingDevice; ++i ) {
00850     int pe = pesSharingDevice[i];
00851     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00852     if ( CkNodeOf(pe) == CkMyNode() ) {
00853       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00854     }
00855   }
00856 
00857   int npatches = activePatches.size();
00858   int *count = new int[npatches];
00859   memset(count, 0, sizeof(int)*npatches);
00860   int *pcount = new int[numPesOnNodeSharingDevice];
00861   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00862   int *rankpcount = new int[CkMyNodeSize()];
00863   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00864   char *table = new char[npatches*numPesOnNodeSharingDevice];
00865   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00866 
00867   int unassignedpatches = npatches;
00868 
00869   if ( 0 ) { // assign all to device pe
00870     for ( int i=0; i<npatches; ++i ) {
00871       int pid = activePatches[i];
00872       patch_record &pr = patchRecords[pid];
00873       pr.hostPe = CkMyPe();
00874     }
00875     unassignedpatches = 0;
00876     pcount[masterIndex] = npatches;
00877   } else 
00878 
00879   // assign if home pe and build table of natural proxies
00880   for ( int i=0; i<npatches; ++i ) {
00881     int pid = activePatches[i];
00882     patch_record &pr = patchRecords[pid];
00883     int homePe = patchMap->node(pid);
00884     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00885       int pe = pesOnNodeSharingDevice[j];
00886       if ( pe == homePe ) {
00887         pr.hostPe = pe;  --unassignedpatches;
00888         pcount[j] += 1;
00889       }
00890       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00891         table[i*numPesOnNodeSharingDevice+j] = 1;
00892       }
00893     }
00894     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00895       pr.hostPe = homePe;  --unassignedpatches;
00896       rankpcount[CkRankOf(homePe)] += 1;
00897     }
00898   }
00899   // assign if only one pe has a required proxy
00900   int assignj = 0;
00901   for ( int i=0; i<npatches; ++i ) {
00902     int pid = activePatches[i];
00903     patch_record &pr = patchRecords[pid];
00904     if ( pr.hostPe != -1 ) continue;
00905     int c = 0;
00906     int lastj;
00907     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00908       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00909     }
00910     count[i] = c;
00911     if ( c == 1 ) {
00912       pr.hostPe = pesOnNodeSharingDevice[lastj];
00913       --unassignedpatches;
00914       pcount[lastj] += 1;
00915     }
00916   }
00917   while ( unassignedpatches ) {
00918     int i;
00919     for ( i=0; i<npatches; ++i ) {
00920       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00921       int pid = activePatches[i];
00922       patch_record &pr = patchRecords[pid];
00923       if ( pr.hostPe != -1 ) continue;
00924       pr.hostPe = pesOnNodeSharingDevice[assignj];
00925       --unassignedpatches;
00926       pcount[assignj] += 1;
00927       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00928       break;
00929     }
00930     if ( i<npatches ) continue;  // start search again
00931     for ( i=0; i<npatches; ++i ) {
00932       int pid = activePatches[i];
00933       patch_record &pr = patchRecords[pid];
00934       if ( pr.hostPe != -1 ) continue;
00935       if ( count[i] ) continue;
00936       pr.hostPe = pesOnNodeSharingDevice[assignj];
00937       --unassignedpatches;
00938       pcount[assignj] += 1;
00939       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00940       break;
00941     }
00942     if ( i<npatches ) continue;  // start search again
00943     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00944   }
00945 
00946   for ( int i=0; i<npatches; ++i ) {
00947     int pid = activePatches[i];
00948     patch_record &pr = patchRecords[pid];
00949     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00950   }
00951 
00952   slavePes = new int[CkMyNodeSize()];
00953   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00954   numSlaves = 0;
00955   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00956     int pe = pesOnNodeSharingDevice[j];
00957     int rank = pe - CkNodeFirst(CkMyNode());
00958     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00959     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00960     if ( pe == CkMyPe() ) continue;
00961     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00962     rankpcount[rank] = 0;  // skip in rank loop below
00963     slavePes[numSlaves] = pe;
00964     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00965     ++numSlaves;
00966   }
00967   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00968     int pe = CkNodeFirst(CkMyNode()) + j;
00969     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00970     //          CkMyPe(),j,pe,rankpcount[j]);
00971     if ( ! rankpcount[j] ) continue;
00972     if ( pe == CkMyPe() ) continue;
00973     slavePes[numSlaves] = pe;
00974     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00975     ++numSlaves;
00976   }
00977   registerPatches();
00978 
00979   delete [] pesOnNodeSharingDevice;
00980   delete [] count;
00981   delete [] pcount;
00982   delete [] rankpcount;
00983   delete [] table;
00984 }
00985 
00986 static __thread int num_atom_records_allocated;
00987 // static __thread int* atom_order;
00988 static __thread atom_param* atom_params;
00989 static __thread atom* atoms;
00990 // static __thread float4* forces;
00991 // static __thread float4* slow_forces;
00992 static __thread int num_virials;
00993 static __thread int num_virials_allocated;
00994 static __thread float *virials;
00995 static __thread float *slow_virials;
00996 static __thread float *energy_gbis;
00997 
00998 //GBIS host pointers
00999 static __thread float *intRad0H;
01000 static __thread float *intRadSH;
01001 //static __thread GBReal *psiSumH; //moved into class
01002 static __thread float *bornRadH;
01003 //static __thread GBReal *dEdaSumH; //moved into class
01004 static __thread float *dHdrPrefixH;
01005 
01006 static __thread int cuda_timer_count;
01007 static __thread double cuda_timer_total;
01008 static __thread double kernel_time;
01009 static __thread double remote_submit_time;
01010 static __thread double local_submit_time;
01011 
01012 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
01013 
01014 #ifdef PRINT_GBIS
01015 #define GBISP(...) CkPrintf(__VA_ARGS__);
01016 #else
01017 #define GBISP(...)
01018 #endif
01019 
01020 #define count_limit 1000000
01021 static __thread int check_remote_count;
01022 static __thread int check_local_count;
01023 
01024 void cuda_check_remote_progress(void *arg, double) {
01025 
01026   cudaError_t err = cudaEventQuery(end_remote_download);
01027   if ( err == cudaSuccess ) {
01028     local_submit_time = CkWallTimer();
01029     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01030     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01031     check_remote_count = 0;
01032   } else if ( err != cudaErrorNotReady ) {
01033     cuda_errcheck("in cuda_check_remote_progress");
01034     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01035   } else if ( ++check_remote_count >= count_limit ) {
01036     char errmsg[256];
01037     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01038             check_remote_count, CkWallTimer() - remote_submit_time,
01039             ((ComputeNonbondedCUDA *) arg)->step);
01040     cuda_errcheck(errmsg);
01041     NAMD_die(errmsg);
01042   } else if ( check_local_count ) {
01043     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01044   } else {
01045     CUDA_POLL(cuda_check_remote_progress, arg);
01046   }
01047 }
01048 
01049 void cuda_check_local_progress(void *arg, double) {
01050 
01051   cudaError_t err = cudaEventQuery(end_local_download);
01052   if ( err == cudaSuccess ) {
01053     double wall_time = CkWallTimer();
01054     CUDA_TRACE_LOCAL(local_submit_time,wall_time);
01055     kernel_time = wall_time - kernel_time;
01056     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01057     check_local_count = 0;
01058   } else if ( err != cudaErrorNotReady ) {
01059     cuda_errcheck("in cuda_check_local_progress");
01060     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01061   } else if ( ++check_local_count >= count_limit ) {
01062     char errmsg[256];
01063     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01064             check_local_count, CkWallTimer() - local_submit_time,
01065             ((ComputeNonbondedCUDA *) arg)->step);
01066     cuda_errcheck(errmsg);
01067     NAMD_die(errmsg);
01068   } else if ( check_remote_count ) {
01069     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01070   } else {
01071     CUDA_POLL(cuda_check_local_progress, arg);
01072   }
01073 }
01074 
01075 #if 0
01076 // don't use this one unless timer is part of stream, above is better
01077 void cuda_check_progress(void *arg, double) {
01078   if ( cuda_stream_finished() ) {
01079     kernel_time = CkWallTimer() - kernel_time;
01080     CUDA_POLL(ccd_index);
01081     // ((ComputeNonbondedCUDA *) arg)->finishWork();
01082     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01083   }
01084 }
01085 #endif
01086 
01087 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
01088 
01089 static __thread int kernel_launch_state = 0;
01090 
01091 struct cr_sortop {
01092   const Lattice &l;
01093   cr_sortop(const Lattice &lattice) : l(lattice) { }
01094   bool operator() (ComputeNonbondedCUDA::compute_record i,
01095                         ComputeNonbondedCUDA::compute_record j) {
01096     Vector a = l.a();
01097     Vector b = l.b();
01098     Vector c = l.c();
01099     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01100     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01101     return ( ri < rj );
01102   }
01103 };
01104 
01105 int ComputeNonbondedCUDA::noWork() {
01106 
01107   SimParameters *simParams = Node::Object()->simParameters;
01108   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01109   lattice = flags.lattice;
01110   doSlow = flags.doFullElectrostatics;
01111   doEnergy = flags.doEnergy;
01112   step = flags.step;
01113 
01114   if ( ! flags.doNonbonded ) {
01115 GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01116     for ( int i=0; i<hostedPatches.size(); ++i ) {
01117       patch_record &pr = master->patchRecords[hostedPatches[i]];
01118       pr.positionBox->skip();
01119       pr.forceBox->skip();
01120       if (simParams->GBISOn) {
01121         pr.intRadBox->skip();
01122         pr.psiSumBox->skip();
01123         pr.bornRadBox->skip();
01124         pr.dEdaSumBox->skip();
01125         pr.dHdrPrefixBox->skip();
01126       }
01127     }
01128     if ( master != this ) {
01129       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01130                         hostedPatches.size(),atomsChanged,sequence());
01131     }
01132     if ( reduction ) reduction->submit();
01133     return 1;
01134   }
01135 
01136   for ( int i=0; i<hostedPatches.size(); ++i ) {
01137     patch_record &pr = master->patchRecords[hostedPatches[i]];
01138     if (!simParams->GBISOn || gbisPhase == 1) {
01139 GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01140       pr.x = pr.positionBox->open();
01141       pr.xExt = pr.p->getCompAtomExtInfo();
01142     }
01143 
01144     if (simParams->GBISOn) {
01145       if (gbisPhase == 1) {
01146 GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01147         pr.intRad     = pr.intRadBox->open();
01148         pr.psiSum     = pr.psiSumBox->open();
01149       } else if (gbisPhase == 2) {
01150 GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01151         pr.bornRad    = pr.bornRadBox->open();
01152         pr.dEdaSum    = pr.dEdaSumBox->open();
01153       } else if (gbisPhase == 3) {
01154 GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01155         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01156       }
01157 GBISP("opened GBIS boxes");
01158     }
01159   }
01160 
01161   if ( master == this ) return 0; //work to do, enqueue as usual
01162 
01163   // message masterPe
01164   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01165                         hostedPatches.size(),atomsChanged,sequence());
01166 
01167   workStarted = 1;
01168   basePriority = COMPUTE_PROXY_PRIORITY;
01169 
01170   return 1;
01171 }
01172 
01173 void ComputeNonbondedCUDA::doWork() {
01174 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
01175 CkMyPe(), sequence(), gbisPhase, workStarted);
01176 
01177   PATCH_PAIRS_REF;
01178   FORCE_LISTS_REF;
01179 
01180   if ( workStarted ) { //if work already started, check if finished
01181     if ( finishWork() ) {  // finished
01182       workStarted = 0;
01183       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01184     } else {  // need to call again
01185       workStarted = 2;
01186       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01187       if ( master == this && kernel_launch_state > 2 ) {
01188         cuda_check_local_progress(this,0.);  // launches polling
01189       }
01190     }
01191     return;
01192   }
01193 
01194   workStarted = 1;
01195   basePriority = COMPUTE_PROXY_PRIORITY;
01196 
01197   Molecule *mol = Node::Object()->molecule;
01198   Parameters *params = Node::Object()->parameters;
01199   SimParameters *simParams = Node::Object()->simParameters;
01200 
01201   //execute only during GBIS phase 1, or if not using GBIS
01202   if (!simParams->GBISOn || gbisPhase == 1) {
01203 
01204   //bind new patches to GPU
01205  if ( atomsChanged || computesChanged ) {
01206   int npatches = activePatches.size();
01207 
01208   pairlistsValid = 0;
01209   pairlistTolerance = 0.;
01210 
01211   if ( computesChanged ) {
01212     computesChanged = 0;
01213 
01214     num_virials = npatches;
01215     if ( num_virials > num_virials_allocated ) {
01216       if ( num_virials_allocated ) {
01217         cudaFreeHost(virials);
01218         cudaFreeHost(energy_gbis);
01219         cuda_errcheck("in cudaFreeHost virials");
01220       }
01221       num_virials_allocated = 1.1 * num_virials + 1;
01222       cudaHostAlloc((void**)&virials,2*16*sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01223       cudaHostAlloc((void**)&energy_gbis,sizeof(float)*num_virials_allocated,cudaHostAllocMapped);
01224       for (int i  = 0; i < num_virials_allocated; i++)
01225         energy_gbis[i] = 0.f;
01226       cuda_errcheck("in cudaHostAlloc virials");
01227       slow_virials = virials + 16*num_virials;
01228     }
01229 
01230     int *ap = activePatches.begin();
01231     for ( int i=0; i<localActivePatches.size(); ++i ) {
01232       *(ap++) = localActivePatches[i];
01233     }
01234     for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01235       *(ap++) = remoteActivePatches[i];
01236     }
01237 
01238     // sort computes by distance between patches
01239     cr_sortop so(lattice);
01240     std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01241     std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01242  
01243     int nlc = localComputeRecords.size();
01244     int nrc = remoteComputeRecords.size();
01245     computeRecords.resize(nlc+nrc);
01246     compute_record *cr = computeRecords.begin();
01247     for ( int i=0; i<nlc; ++i ) {
01248       *(cr++) = localComputeRecords[i];
01249     }
01250     for ( int i=0; i<nrc; ++i ) {
01251       *(cr++) = remoteComputeRecords[i];
01252     }
01253 
01254     force_lists.resize(npatches);
01255     for ( int i=0; i<npatches; ++i ) {
01256       patchRecords[activePatches[i]].localIndex = i;
01257       force_lists[i].force_list_size = 0;
01258     }
01259 
01260     int ncomputes = computeRecords.size();
01261     patch_pairs.resize(ncomputes);
01262     for ( int i=0; i<ncomputes; ++i ) {
01263       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01264       int lp1 = patchRecords[cr.pid[0]].localIndex;
01265       int lp2 = patchRecords[cr.pid[1]].localIndex;
01266       force_lists[lp1].force_list_size++;
01267       patch_pair &pp = patch_pairs[i];
01268       pp.offset.x = cr.offset.x;
01269       pp.offset.y = cr.offset.y;
01270       pp.offset.z = cr.offset.z;
01271     }
01272 
01273     for ( int i=0; i<ncomputes; ++i ) {
01274       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01275       int lp1 = patchRecords[cr.pid[0]].localIndex;
01276       patch_pair &pp = patch_pairs[i];
01277       pp.patch1_force_list_index = lp1;
01278       pp.patch1_force_list_size = force_lists[lp1].force_list_size;
01279     }
01280 
01281    if ( 1 || simParams->outputCudaTiming ) {
01282     CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01283         CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01284         localComputeRecords.size(), remoteComputeRecords.size());
01285    }
01286   }  // computesChanged
01287 
01288   int istart = 0;
01289   int flstart = 0;
01290   int vlstart = 0;
01291   int max_atoms_per_patch = 0;
01292   int i;
01293   for ( i=0; i<npatches; ++i ) {
01294     if ( i == localActivePatches.size() ) {
01295        num_local_atom_records = istart;
01296     }
01297     force_lists[i].force_list_start = flstart;
01298     force_lists[i].force_output_start = istart;
01299     force_lists[i].atom_start = istart;
01300     force_lists[i].virial_list_start = vlstart;
01301     force_lists[i].virial_output_start = 16*i;
01302     patch_record &pr = patchRecords[activePatches[i]];
01303     pr.localStart = istart;
01304     int natoms = pr.p->getNumAtoms();
01305     int nfreeatoms = natoms;
01306     if ( fixedAtomsOn ) {
01307       const CompAtomExt *aExt = pr.xExt;
01308       for ( int j=0; j<natoms; ++j ) {
01309         if ( aExt[j].atomFixed ) --nfreeatoms;
01310       }
01311     }
01312     if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
01313     pr.numAtoms = natoms;
01314     pr.numFreeAtoms = nfreeatoms;
01315     force_lists[i].patch_size = nfreeatoms;
01316     if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
01317     if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
01318     force_lists[i].patch_stride = nfreeatoms;
01319     flstart += nfreeatoms * force_lists[i].force_list_size;
01320     vlstart += 16 * force_lists[i].force_list_size;
01321     istart += natoms;  // already rounded up
01322     force_lists[i].force_list_size = 0;  // rebuild below
01323   }
01324   if ( i == localActivePatches.size() ) {
01325      num_local_atom_records = istart;
01326   }
01327   num_force_records = flstart;
01328   num_atom_records = istart;
01329   num_remote_atom_records = num_atom_records - num_local_atom_records;
01330   if ( num_atom_records > num_atom_records_allocated ) {
01331     if ( num_atom_records_allocated ) {
01332       // delete [] atom_order;
01333       cudaFreeHost(atom_params);
01334       cudaFreeHost(atoms);
01335       cudaFreeHost(forces);
01336       cudaFreeHost(slow_forces);
01337       if (simParams->GBISOn) {
01338         cudaFreeHost(intRad0H);//6 GBIS arrays
01339         cudaFreeHost(intRadSH);
01340         cudaFreeHost(psiSumH);
01341         cudaFreeHost(bornRadH);
01342         cudaFreeHost(dEdaSumH);
01343         cudaFreeHost(dHdrPrefixH);
01344       }
01345       cuda_errcheck("in cudaFreeHost");
01346     }
01347     num_atom_records_allocated = 1.1 * num_atom_records + 1;
01348     // atom_order = new int[num_atom_records_allocated];
01349     cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
01350     cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
01351     cuda_errcheck("in cudaMallocHost atoms");
01352     cudaHostAlloc((void**)&forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01353     cudaHostAlloc((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated,cudaHostAllocMapped);
01354     //allocate GBIS memory
01355     if (simParams->GBISOn) {
01356   cudaMallocHost((void**)&intRad0H,sizeof(float)*num_atom_records_allocated);
01357   cudaMallocHost((void**)&intRadSH,sizeof(float)*num_atom_records_allocated);
01358   cudaHostAlloc((void**)&psiSumH, sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01359   cudaMallocHost((void**)&bornRadH,sizeof(float)*num_atom_records_allocated);
01360   cudaHostAlloc((void**)&dEdaSumH,sizeof(GBReal)*num_atom_records_allocated,cudaHostAllocMapped);
01361   cudaMallocHost((void**)&dHdrPrefixH,sizeof(float)*num_atom_records_allocated);
01362     }
01363     cuda_errcheck("in cudaHostAlloc forces");
01364   }
01365 
01366   int bfstart = 0;
01367   int ncomputes = computeRecords.size();
01368   for ( int i=0; i<ncomputes; ++i ) {
01369     ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01370     int p1 = cr.pid[0];
01371     int p2 = cr.pid[1];
01372     int lp1 = patchRecords[p1].localIndex;
01373     int lp2 = patchRecords[p2].localIndex;
01374     patch_pair &pp = patch_pairs[i];
01375     pp.patch1_atom_start = patchRecords[p1].localStart;
01376     pp.patch1_force_start = force_lists[lp1].force_list_start +
01377         force_lists[lp1].patch_stride * force_lists[lp1].force_list_size;
01378     pp.virial_start = force_lists[lp1].virial_list_start +
01379                                  16 * force_lists[lp1].force_list_size;
01380     pp.patch1_size = patchRecords[p1].numAtoms;
01381     pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
01382     pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01383     pp.block_flags_start = bfstart;
01384     bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
01385     // istart += pp.patch1_size;
01386     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
01387     pp.patch2_atom_start = patchRecords[p2].localStart;
01388     // pp.patch2_force_start = force_lists[lp2].force_list_start +
01389 //      force_lists[lp2].patch_stride * force_lists[lp2].force_list_size;
01390     pp.patch2_size = patchRecords[p2].numAtoms;
01391     // pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
01392     // this must happen at end to get self computes right
01393     force_lists[lp1].force_list_size++;
01394     // if ( lp1 != lp2 || cr.t[0] != cr.t[1] ) {
01395     //   force_lists[lp2].force_list_size++;
01396     // }
01397     // istart += pp.patch2_size;
01398     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
01399   } //for ncomputes
01400 
01401 #if 0
01402   CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
01403         patch_pairs.size(), force_lists.size(),
01404         num_atom_records, num_force_records,
01405         max_atoms_per_patch);
01406 #endif
01407 
01408   int totalmem = patch_pairs.size() * sizeof(patch_pair) +
01409                 force_lists.size() * sizeof(force_list) +
01410                 num_force_records * sizeof(float4) +
01411                 num_atom_records * sizeof(atom) +
01412                 num_atom_records * sizeof(atom_param) +
01413                 num_atom_records * sizeof(float4);
01414   int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
01415 /*
01416   CkPrintf("Pe %d allocating %d MB of GPU memory, will copy %d kB per step\n",
01417                         CkMyPe(), totalmem >> 20, totalcopy >> 10);
01418 */
01419 
01420   cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01421                         force_lists.begin(), force_lists.size(),
01422                         num_atom_records, num_force_records,
01423                         bfstart, max_atoms_per_patch);
01424 
01425  }  // atomsChanged || computesChanged
01426 
01427   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01428 
01429   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01430   float maxAtomMovement = 0.;
01431   float maxPatchTolerance = 0.;
01432 
01433   for ( int i=0; i<activePatches.size(); ++i ) {
01434     patch_record &pr = patchRecords[activePatches[i]];
01435 
01436     float maxMove = pr.p->flags.maxAtomMovement;
01437     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01438 
01439     float maxTol = pr.p->flags.pairlistTolerance;
01440     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01441 
01442     int start = pr.localStart;
01443     int n = pr.numAtoms;
01444     const CompAtom *a = pr.x;
01445     const CompAtomExt *aExt = pr.xExt;
01446     // int *ao = atom_order + start;
01447     if ( atomsChanged ) {
01448 #if 0
01449       int nfree = pr.numFreeAtoms;
01450       if ( nfree != n ) {
01451         int k = 0;
01452         for ( int j=0; j<n; ++j ) {
01453           // put free atoms first
01454           if ( ! aExt[j].atomFixed ) ao[k++] = j;
01455         }
01456         if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
01457           // put fixed atoms at end
01458           if ( aExt[j].atomFixed ) ao[k++] = j;
01459         }
01460       } else {
01461         for ( int j=0; j<n; ++j ) {
01462           ao[j] = j;
01463         }
01464       }
01465 
01466       sortAtomsForCUDA(ao,a,nfree,n);
01467 #endif
01468 
01469       atom_param *ap = atom_params + start;
01470       for ( int k=0; k<n; ++k ) {
01471         int j = aExt[k].sortOrder;
01472         ap[k].vdw_type = a[j].vdwType;
01473         ap[k].index = aExt[j].id;
01474 #ifdef MEM_OPT_VERSION
01475         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01476         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01477 #else
01478         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01479         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01480 #endif
01481       }
01482     }
01483     {
01484 #if 1
01485       const CudaAtom *ac = pr.p->getCudaAtomList();
01486       atom *ap = atoms + start;
01487       memcpy(ap, ac, sizeof(atom)*n);
01488 #else
01489       Vector center =
01490         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01491       atom *ap = atoms + start;
01492       for ( int k=0; k<n; ++k ) {
01493         int j = aExt[k].sortOrder;
01494         ap[k].position.x = a[j].position.x - center.x;
01495         ap[k].position.y = a[j].position.y - center.y;
01496         ap[k].position.z = a[j].position.z - center.z;
01497         ap[k].charge = charge_scaling * a[j].charge;
01498       }
01499 #endif
01500     }
01501   }
01502 //GBISP("finished active patches\n")
01503 
01504   //CkPrintf("maxMovement = %f  maxTolerance = %f  save = %d  use = %d\n",
01505   //  maxAtomMovement, maxPatchTolerance,
01506   //  flags.savePairlists, flags.usePairlists);
01507 
01508   savePairlists = 0;
01509   usePairlists = 0;
01510   if ( flags.savePairlists ) {
01511     savePairlists = 1;
01512     usePairlists = 1;
01513   } else if ( flags.usePairlists ) {
01514     if ( ! pairlistsValid ||
01515          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01516       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01517     } else {
01518       usePairlists = 1;
01519     }
01520   }
01521   if ( ! usePairlists ) {
01522     pairlistsValid = 0;
01523   }
01524   float plcutoff = cutoff;
01525   if ( savePairlists ) {
01526     pairlistsValid = 1;
01527     pairlistTolerance = 2. * maxPatchTolerance;
01528     plcutoff += pairlistTolerance;
01529   }
01530   plcutoff2 = plcutoff * plcutoff;
01531 
01532   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01533   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01534 
01535 #if 0
01536   // calculate warp divergence
01537   if ( 1 ) {
01538     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01539     Lattice &lattice = flags.lattice;
01540     float3 lata, latb, latc;
01541     lata.x = lattice.a().x;
01542     lata.y = lattice.a().y;
01543     lata.z = lattice.a().z;
01544     latb.x = lattice.b().x;
01545     latb.y = lattice.b().y;
01546     latb.z = lattice.b().z;
01547     latc.x = lattice.c().x;
01548     latc.y = lattice.c().y;
01549     latc.z = lattice.c().z;
01550 
01551     int ncomputes = computeRecords.size();
01552     for ( int ic=0; ic<ncomputes; ++ic ) {
01553       patch_pair &pp = patch_pairs[ic];
01554       atom *a1 = atoms + pp.patch1_atom_start;
01555       int n1 = pp.patch1_size;
01556       atom *a2 = atoms + pp.patch2_atom_start;
01557       int n2 = pp.patch2_size;
01558       float offx = pp.offset.x * lata.x
01559                + pp.offset.y * latb.x
01560                + pp.offset.z * latc.x;
01561       float offy = pp.offset.x * lata.y
01562                + pp.offset.y * latb.y
01563                + pp.offset.z * latc.y;
01564       float offz = pp.offset.x * lata.z
01565                + pp.offset.y * latb.z
01566                + pp.offset.z * latc.z;
01567       // CkPrintf("%f %f %f\n", offx, offy, offz);
01568       int atoms_tried = 0;
01569       int blocks_tried = 0;
01570       int atoms_used = 0;
01571       int blocks_used = 0;
01572       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01573         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01574           int block_used = 0;
01575           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01576             int atom_used = 0;
01577             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01578               float dx = offx + a1[i].position.x - a2[j].position.x;
01579               float dy = offy + a1[i].position.y - a2[j].position.y;
01580               float dz = offz + a1[i].position.z - a2[j].position.z;
01581               float r2 = dx*dx + dy*dy + dz*dz;
01582               if ( r2 < cutoff2 ) atom_used = 1;
01583             }
01584             ++atoms_tried;
01585             if ( atom_used ) { block_used = 1; ++atoms_used; }
01586           }
01587           ++blocks_tried;
01588           if ( block_used ) { ++blocks_used; }
01589         }
01590       }
01591       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01592                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01593                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01594     }
01595   }
01596 #endif
01597 
01598   } // !GBISOn || gbisPhase == 1
01599 
01600   //Do GBIS
01601   if (simParams->GBISOn) {
01602     //open GBIS boxes depending on phase
01603     for ( int i=0; i<activePatches.size(); ++i ) {
01604       patch_record &pr = master->patchRecords[activePatches[i]];
01605 GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01606         if (gbisPhase == 1) {
01607           //Copy GBIS intRadius to Host
01608           if (atomsChanged) {
01609             float *intRad0 = intRad0H + pr.localStart;
01610             float *intRadS = intRadSH + pr.localStart;
01611             for ( int k=0; k<pr.numAtoms; ++k ) {
01612               int j = pr.xExt[k].sortOrder;
01613               intRad0[k] = pr.intRad[2*j+0];
01614               intRadS[k] = pr.intRad[2*j+1];
01615             }
01616           }
01617         } else if (gbisPhase == 2) {
01618           float *bornRad = bornRadH + pr.localStart;
01619           for ( int k=0; k<pr.numAtoms; ++k ) {
01620             int j = pr.xExt[k].sortOrder;
01621             bornRad[k] = pr.bornRad[j];
01622           }
01623         } else if (gbisPhase == 3) {
01624           float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01625           for ( int k=0; k<pr.numAtoms; ++k ) {
01626             int j = pr.xExt[k].sortOrder;
01627             dHdrPrefix[k] = pr.dHdrPrefix[j];
01628           }
01629       } // end phases
01630     } // end for patches
01631   } // if GBISOn
01632 
01633   kernel_time = CkWallTimer();
01634   kernel_launch_state = 1;
01635   if ( gpu_is_mine ) recvYieldDevice(-1);
01636 }
01637 
01638 void cuda_check_remote_calc(void *arg, double) {
01639   // in theory we only need end_remote_calc, but overlap isn't reliable
01640   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01641   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01642 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01643     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01644 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01645   } else {
01646     CUDA_POLL(cuda_check_remote_calc, arg);
01647   }
01648 }
01649 
01650 void cuda_check_local_calc(void *arg, double) {
01651   // in theory we only need end_local_calc, but overlap isn't reliable
01652   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01653   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01654 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01655     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01656 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01657   } else {
01658     CUDA_POLL(cuda_check_local_calc, arg);
01659   }
01660 }
01661 
01662 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01663 
01664 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01665 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01666 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01667 workStarted, gbisPhase, kernel_launch_state, pe)
01668 
01669   float3 lata, latb, latc;
01670   lata.x = lattice.a().x;
01671   lata.y = lattice.a().y;
01672   lata.z = lattice.a().z;
01673   latb.x = lattice.b().x;
01674   latb.y = lattice.b().y;
01675   latb.z = lattice.b().z;
01676   latc.x = lattice.c().x;
01677   latc.y = lattice.c().y;
01678   latc.z = lattice.c().z;
01679   SimParameters *simParams = Node::Object()->simParameters;
01680 
01681   switch ( kernel_launch_state ) {
01683 // Remote
01684   case 1:
01685 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01686     ++kernel_launch_state;
01687     gpu_is_mine = 0;
01688     remote_submit_time = CkWallTimer();
01689 
01690     if (!simParams->GBISOn || gbisPhase == 1) {
01691     // cudaEventRecord(start_upload, stream);
01692     if ( atomsChanged ) {
01693       cuda_bind_atom_params(atom_params);
01694     }
01695     if ( simParams->GBISOn) {
01696       cuda_bind_GBIS_psiSum(psiSumH);
01697       if ( atomsChanged ) {
01698         cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01699       }
01700     }
01701     cuda_bind_atoms(atoms);
01702     cuda_bind_forces(forces, slow_forces);
01703     cuda_bind_virials(virials);
01704     cuda_bind_GBIS_energy(energy_gbis);
01705     cudaEventRecord(start_calc, stream);
01706     //call CUDA Kernels
01707     cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01708         localComputeRecords.size(),remoteComputeRecords.size(),
01709         localActivePatches.size(),remoteActivePatches.size(),
01710         doSlow, doEnergy, usePairlists, savePairlists, stream);
01711     if (simParams->GBISOn) {
01712       cuda_GBIS_P1(
01713         localComputeRecords.size(),remoteComputeRecords.size(),
01714         localActivePatches.size(),remoteActivePatches.size(),
01715         simParams->alpha_cutoff-simParams->fsMax,
01716         simParams->coulomb_radius_offset,
01717         lata, latb, latc, stream);
01718     }
01719     //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01720     //    num_local_atom_records,num_remote_atom_records);
01721     cudaEventRecord(end_remote_download, stream);
01722     CUDA_POLL(cuda_check_remote_progress,this);
01723     if ( shared_gpu && ! mergegrids ) {
01724       CUDA_POLL(cuda_check_remote_calc,this);
01725       break;
01726     }
01727     } // !GBIS or gbisPhase==1
01728     if (simParams->GBISOn) {
01729       if (gbisPhase == 1) {
01730         //GBIS P1 Kernel launched in previous code block
01731       } else if (gbisPhase == 2) {
01732 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01733         // cudaEventRecord(start_upload, stream);
01734         cuda_bind_GBIS_bornRad(bornRadH);
01735         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01736         cudaEventRecord(start_calc, stream);
01737         cuda_GBIS_P2(
01738           localComputeRecords.size(),remoteComputeRecords.size(),
01739           localActivePatches.size(),remoteActivePatches.size(),
01740           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01741           simParams->nonbondedScaling, simParams->kappa,
01742           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01743           simParams->dielectric, simParams->solvent_dielectric,
01744           lata, latb, latc,
01745           doEnergy, doSlow, stream
01746           );
01747         cudaEventRecord(end_remote_download, stream);
01748         CUDA_POLL(cuda_check_remote_progress,this);
01749       } else if (gbisPhase == 3) {
01750 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01751         // cudaEventRecord(start_upload, stream);
01752         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01753         cudaEventRecord(start_calc, stream);
01754         if (doSlow)
01755         cuda_GBIS_P3(
01756           localComputeRecords.size(),remoteComputeRecords.size(),
01757           localActivePatches.size(),remoteActivePatches.size(),
01758           (simParams->alpha_cutoff-simParams->fsMax),
01759           simParams->coulomb_radius_offset,
01760           simParams->nonbondedScaling,
01761           lata, latb, latc, stream
01762           );
01763         cudaEventRecord(end_remote_download, stream);
01764         CUDA_POLL(cuda_check_remote_progress,this);
01765       }
01766     }
01767 
01769 // Local
01770   case 2:
01771 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01772     ++kernel_launch_state;
01773     gpu_is_mine = 0;
01774 
01775 #if 0
01776     cudaStreamWaitEvent(stream2, start_calc, 0);
01777 #else
01778 #define stream2 stream
01779 #endif
01780 
01781     if (!simParams->GBISOn || gbisPhase == 1) {
01782 
01783       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01784               0,localComputeRecords.size(),
01785               0,localActivePatches.size(),
01786               doSlow, doEnergy, usePairlists, savePairlists, stream2);
01787       if (simParams->GBISOn) {
01788         cuda_GBIS_P1(
01789           0,localComputeRecords.size(),
01790           0,localActivePatches.size(),
01791           simParams->alpha_cutoff-simParams->fsMax,
01792           simParams->coulomb_radius_offset,
01793           lata, latb, latc, stream2 );
01794       }
01795     //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01796     //    0,num_local_atom_records);
01797     //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01798     cudaEventRecord(end_local_download, stream2);
01799     if ( workStarted == 2 ) {
01800 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01801 cuda_check_local_progress\n", CkMyPe())
01802       CUDA_POLL(cuda_check_local_progress,this);
01803     }
01804     if ( shared_gpu && ! mergegrids ) {
01805 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01806 cuda_check_local_calc\n", CkMyPe())
01807       CUDA_POLL(cuda_check_local_calc,this);
01808       break;
01809     }
01810 
01811     } // !GBIS or gbisPhase==1
01812     if (simParams->GBISOn) {
01813       if (gbisPhase == 1) {
01814         //GBIS P1 Kernel launched in previous code block
01815       } else if (gbisPhase == 2) {
01816 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01817         cuda_GBIS_P2(
01818           0,localComputeRecords.size(),
01819           0,localActivePatches.size(),
01820           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01821           simParams->nonbondedScaling, simParams->kappa,
01822           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01823           simParams->dielectric, simParams->solvent_dielectric,
01824           lata, latb, latc,
01825           doEnergy, doSlow, stream2
01826           );
01827         cudaEventRecord(end_local_download, stream2);
01828         if ( workStarted == 2 ) {
01829           CUDA_POLL(cuda_check_local_progress,this);
01830         }
01831       } else if (gbisPhase == 3) {
01832 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01833         if (doSlow)
01834         cuda_GBIS_P3(
01835           0,localComputeRecords.size(),
01836           0,localActivePatches.size(),
01837           (simParams->alpha_cutoff-simParams->fsMax),
01838           simParams->coulomb_radius_offset,
01839           simParams->nonbondedScaling,
01840           lata, latb, latc, stream2
01841           );
01842         cudaEventRecord(end_local_download, stream2);
01843         if ( workStarted == 2 ) {
01844           CUDA_POLL(cuda_check_local_progress,this);
01845         }
01846       } // phases
01847     } // GBISOn
01848 
01849 
01850   default:
01851 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01852     gpu_is_mine = 1;
01853     break;
01854   } // switch
01855 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01856 }
01857 
01858 
01859 void ComputeNonbondedCUDA::messageFinishWork() {
01860 
01861   cuda_errcheck("at cuda stream completed");
01862 
01863   for ( int i = 0; i < numSlaves; ++i ) {
01864     computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01865   }
01866 
01867   WorkDistrib::messageEnqueueWork(this);
01868 
01869 }
01870 
01871 //dtanner
01872 int ComputeNonbondedCUDA::finishWork() {
01873 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01874 CkMyPe(), workStarted, gbisPhase)
01875 
01876   Molecule *mol = Node::Object()->molecule;
01877   SimParameters *simParams = Node::Object()->simParameters;
01878 
01879   ResizeArray<int> &patches( workStarted == 1 ?
01880                                 remoteHostedPatches : localHostedPatches );
01881 
01882   //call once at beginning of each phase/step
01883   if ( !simParams->GBISOn || gbisPhase == 1 ) {
01884     for ( int i=0; i<patches.size(); ++i ) {
01885       patch_record &pr = master->patchRecords[patches[i]];
01886 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01887       pr.r = pr.forceBox->open();
01888     }
01889   } // !GBISOn || gbisPhase==1
01890 
01891   // long long int wcount = 0;
01892   // double virial = 0.;
01893   // double virial_slow = 0.;
01894   for ( int i=0; i<patches.size(); ++i ) {
01895     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01896     patch_record &pr = master->patchRecords[patches[i]];
01897 
01898     int start = pr.localStart;
01899     const CompAtomExt *aExt = pr.xExt;
01900     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01901     int nfree = pr.numFreeAtoms;
01902     pr.f = pr.r->f[Results::nbond];
01903     Force *f = pr.f;
01904     Force *f_slow = pr.r->f[Results::slow];
01905     const CompAtom *a = pr.x;
01906     // int *ao = atom_order + start;
01907     float4 *af = master->forces + start;
01908     float4 *af_slow = master->slow_forces + start;
01909     // only free atoms return forces
01910     for ( int k=0; k<nfree; ++k ) {
01911       int j = aExt[k].sortOrder;
01912       f[j].x += af[k].x;
01913       f[j].y += af[k].y;
01914       f[j].z += af[k].z;
01915       // wcount += af[k].w;
01916       // virial += af[k].w;
01917       if ( doSlow ) {
01918         f_slow[j].x += af_slow[k].x;
01919         f_slow[j].y += af_slow[k].y;
01920         f_slow[j].z += af_slow[k].z;
01921         // virial_slow += af_slow[k].w;
01922       }
01923     }
01924 
01925 #if 1
01926     // check exclusions reported as w
01927     if ( CkNumPes() == 1 ) {
01928       const CompAtomExt *aExt = pr.xExt;
01929       for ( int k=0; k<nfree; ++k ) {
01930         int j = aExt[k].sortOrder;
01931 #ifdef MEM_OPT_VERSION
01932         int excl_expected = mol->exclSigPool[aExt[j].exclId].fullExclCnt + 1;
01933 #else
01934         int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01935 #endif
01936         if ( af[k].w != excl_expected ) {
01937           CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01938                 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01939         }
01940       }
01941     }
01942 #endif
01943   } // !GBISOn || gbisPhase == 3
01944 
01945 #if 0
01946     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01947       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01948         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01949         af[j].w);
01950     }
01951 #endif
01952 
01953     //Close Boxes depending on Phase
01954     if (simParams->GBISOn) {
01955       if (gbisPhase == 1) {
01956         //Copy dEdaSum from Host to Patch Box
01957         GBReal *psiSumMaster = master->psiSumH + start;
01958         for ( int k=0; k<pr.numAtoms; ++k ) {
01959           int j = aExt[k].sortOrder;
01960           pr.psiSum[j] += psiSumMaster[k];
01961         }
01962 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01963         pr.psiSumBox->close(&(pr.psiSum));
01964 
01965       } else if (gbisPhase == 2) {
01966         //Copy dEdaSum from Host to Patch Box
01967         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01968         for ( int k=0; k<pr.numAtoms; ++k ) {
01969           int j = aExt[k].sortOrder;
01970           pr.dEdaSum[j] += dEdaSumMaster[k];
01971         }
01972 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01973         pr.dEdaSumBox->close(&(pr.dEdaSum));
01974 
01975       } else if (gbisPhase == 3) {
01976 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01977         pr.intRadBox->close(&(pr.intRad)); //box 6
01978         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01979         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01980         pr.positionBox->close(&(pr.x)); //box 0
01981         pr.forceBox->close(&(pr.r));
01982       } //end phases
01983     } else { //not GBIS
01984 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01985       pr.positionBox->close(&(pr.x));
01986       pr.forceBox->close(&(pr.r));
01987     }
01988   }// end for
01989 
01990 #if 0
01991   virial *= (-1./6.);
01992   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01993   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01994   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01995   if ( doSlow ) {
01996     virial_slow *= (-1./6.);
01997     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01998     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01999     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02000   }
02001 #endif
02002 
02003   if ( workStarted == 1 && ! mergegrids &&
02004        ( localHostedPatches.size() || master == this ) ) {
02005     GBISP("not finished, call again\n");
02006     return 0;  // not finished, call again
02007   }
02008 
02009   if ( master != this ) {  // finished
02010     GBISP("finished\n");
02011     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02012     atomsChanged = 0;
02013     return 1;
02014   }
02015 
02016   cuda_timer_total += kernel_time;
02017 
02018   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02019 
02020   {
02021     Tensor virial_tensor;
02022     BigReal energyv = 0.;
02023     BigReal energye = 0.;
02024     BigReal energys = 0.;
02025     for ( int i = 0; i < num_virials; ++i ) {
02026       virial_tensor.xx += virials[16*i];
02027       virial_tensor.xy += virials[16*i+1];
02028       virial_tensor.xz += virials[16*i+2];
02029       virial_tensor.yx += virials[16*i+3];
02030       virial_tensor.yy += virials[16*i+4];
02031       virial_tensor.yz += virials[16*i+5];
02032       virial_tensor.zx += virials[16*i+6];
02033       virial_tensor.zy += virials[16*i+7];
02034       virial_tensor.zz += virials[16*i+8];
02035       energyv += virials[16*i+9];
02036       energye += virials[16*i+10];
02037       energys += virials[16*i+11];
02038       if (simParams->GBISOn) {
02039         energye += energy_gbis[i];
02040       }
02041     }
02042     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02043     if ( doEnergy ) {
02044       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02045       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02046       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02047     }
02048   }
02049   if ( doSlow ) {
02050     Tensor virial_slow_tensor;
02051     for ( int i = 0; i < num_virials; ++i ) {
02052       virial_slow_tensor.xx += slow_virials[16*i];
02053       virial_slow_tensor.xy += slow_virials[16*i+1];
02054       virial_slow_tensor.xz += slow_virials[16*i+2];
02055       virial_slow_tensor.yx += slow_virials[16*i+3];
02056       virial_slow_tensor.yy += slow_virials[16*i+4];
02057       virial_slow_tensor.yz += slow_virials[16*i+5];
02058       virial_slow_tensor.zx += slow_virials[16*i+6];
02059       virial_slow_tensor.zy += slow_virials[16*i+7];
02060       virial_slow_tensor.zz += slow_virials[16*i+8];
02061     }
02062     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02063   }
02064 
02065   atomsChanged = 0;
02066   reduction->submit();
02067 
02068   cuda_timer_count++;
02069   if ( simParams->outputCudaTiming &&
02070         step % simParams->outputCudaTiming == 0 ) {
02071 
02072   // int natoms = mol->numAtoms; 
02073   // double wpa = wcount;  wpa /= natoms;
02074 
02075   // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02076         //      kernel_time * 1.e3, time * 1.e3, wpa);
02077 
02078 #if 0
02079   float upload_ms, remote_calc_ms;
02080   float local_calc_ms, total_ms;
02081   cuda_errcheck("before event timers");
02082   cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02083   cuda_errcheck("in event timer 1");
02084   cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02085   cuda_errcheck("in event timer 2");
02086   cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02087   cuda_errcheck("in event timer 3");
02088   cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02089   cuda_errcheck("in event timer 4");
02090   cuda_errcheck("in event timers");
02091 
02092     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02093         CkMyPe(), upload_ms, remote_calc_ms,
02094                         local_calc_ms, total_ms);
02095 #endif
02096 
02097     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02098       cuda_timer_total /= cuda_timer_count;
02099       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02100                 step, cuda_timer_total * 1.e3, CkMyPe());
02101     }
02102     cuda_timer_count = 0;
02103     cuda_timer_total = 0;
02104   }
02105 
02106   } // !GBISOn || gbisPhase==3  
02107 
02108   // Next GBIS Phase
02109 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02110     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02111 
02112   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02113   return 1;  // finished and ready for next step
02114 }
02115 
02116 
02117 #endif  // NAMD_CUDA
02118 

Generated on Fri May 25 04:07:14 2012 for NAMD by  doxygen 1.3.9.1