ComputeNonbondedCUDA.C

00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDAKernel.h"
00014 #include "ComputeNonbondedCUDA.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include "Priorities.h"
00019 #include <algorithm>
00020 
00021 #include "NamdTypes.h"
00022 #include "DeviceCUDA.h"
00023 #include "CudaUtils.h"
00024 
00025 //#define PRINT_GBIS
00026 #undef PRINT_GBIS
00027 
00028 #ifdef NAMD_CUDA
00029 
00030 #ifdef WIN32
00031 #define __thread __declspec(thread)
00032 #endif
00033 
00034 extern __thread int max_grid_size;
00035 
00036 extern __thread cudaStream_t stream;
00037 extern __thread cudaStream_t stream2;
00038 
00039 extern __thread DeviceCUDA *deviceCUDA;
00040 
00041 void cuda_errcheck(const char *msg) {
00042   cudaError_t err;
00043   if ((err = cudaGetLastError()) != cudaSuccess) {
00044     char host[128];
00045 #ifdef NOHOSTNAME
00046     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00047 #else
00048     gethostname(host, 128);  host[127] = 0;
00049 #endif
00050     char devstr[128] = "";
00051     int devnum;
00052     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00053       sprintf(devstr, " device %d", devnum);
00054     }
00055     char errmsg[1024];
00056     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00057     NAMD_die(errmsg);
00058   }
00059 }
00060 
00061 /*
00062 void cuda_die(const char *msg) {
00063     char host[128];
00064 #ifdef NOHOSTNAME
00065     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00066 #else
00067     gethostname(host, 128);  host[127] = 0;
00068 #endif
00069     char devstr[128] = "";
00070     int devnum;
00071     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00072       sprintf(devstr, " device %d", devnum);
00073     }
00074     char errmsg[1024];
00075     sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00076     NAMD_die(errmsg);
00077 }
00078 */
00079 
00080 /*
00081 char *devicelist;
00082 static __thread int usedevicelist;
00083 static __thread int ignoresharing;
00084 static __thread int mergegrids;
00085 static __thread int nomergegrids;
00086 static __thread int nostreaming;
00087 
00088 void cuda_getargs(char **argv) {
00089   devicelist = 0;
00090   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00091         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00092   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00093   mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00094   nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
00095   if ( mergegrids && nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
00096   nostreaming = CmiGetArgFlag(argv, "+nostreaming");
00097 }
00098 
00099 static __thread int shared_gpu;
00100 static __thread int first_pe_sharing_gpu;
00101 static __thread int next_pe_sharing_gpu;
00102 static __thread int devicePe;
00103 static __thread int numPesSharingDevice;
00104 static __thread int *pesSharingDevice;
00105 
00106 static __thread int gpu_is_mine;
00107 
00108 int cuda_device_pe() { return devicePe; }
00109 
00110 bool cuda_device_shared_with_pe(int pe) {
00111   for ( int i=0; i<numPesSharingDevice; ++i ) {
00112     if ( pesSharingDevice[i] == pe ) return true;
00113   }
00114   return false;
00115 }
00116 
00117 bool one_cuda_device_per_node() {
00118   if ( numPesSharingDevice != CkMyNodeSize() ) return false;
00119   int numPesOnNodeSharingDevice = 0;
00120   for ( int i=0; i<numPesSharingDevice; ++i ) {
00121     if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
00122       ++numPesOnNodeSharingDevice;
00123     }
00124   }
00125   return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
00126 }
00127 */
00128 
00129 static inline bool sortop_bitreverse(int a, int b) {
00130   if ( a == b ) return 0; 
00131   for ( int bit = 1; bit; bit *= 2 ) {
00132     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00133   }
00134   return 0;
00135 }
00136 
00137 /*
00138 BASE
00139 2 types (remote & local)
00140 16 pes per node
00141 3 phases (1, 2, 3)
00142 */
00143 
00144 /*
00145 #define CUDA_EVENT_ID_POLL_REMOTE 98
00146 #define CUDA_TRACE_POLL_REMOTE \
00147   traceUserEvent(CUDA_EVENT_ID_POLL_REMOTE)
00148 #define CUDA_EVENT_ID_POLL_LOCAL 99
00149 #define CUDA_TRACE_POLL_LOCAL \
00150   traceUserEvent(CUDA_EVENT_ID_POLL_LOCAL)
00151 #define CUDA_EVENT_ID_BASE 100
00152 #define CUDA_TRACE_REMOTE(START,END) \
00153   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00154        CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)
00155 #define CUDA_TRACE_LOCAL(START,END) \
00156   do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
00157        CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)
00158 
00159 void cuda_register_user_events() {
00160 
00161   traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
00162   traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);
00163 
00164 #define REGISTER_DEVICE_EVENTS(DEV) \
00165   traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00166   traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00167 
00168   REGISTER_DEVICE_EVENTS(0)
00169   REGISTER_DEVICE_EVENTS(1)
00170   REGISTER_DEVICE_EVENTS(2)
00171   REGISTER_DEVICE_EVENTS(3)
00172   REGISTER_DEVICE_EVENTS(4)
00173   REGISTER_DEVICE_EVENTS(5)
00174   REGISTER_DEVICE_EVENTS(6)
00175   REGISTER_DEVICE_EVENTS(7)
00176   REGISTER_DEVICE_EVENTS(8)
00177   REGISTER_DEVICE_EVENTS(9)
00178   REGISTER_DEVICE_EVENTS(10)
00179   REGISTER_DEVICE_EVENTS(11)
00180   REGISTER_DEVICE_EVENTS(12)
00181   REGISTER_DEVICE_EVENTS(13)
00182   REGISTER_DEVICE_EVENTS(14)
00183   REGISTER_DEVICE_EVENTS(15)
00184 
00185 }
00186 
00187 void cuda_initialize() {
00188 
00189   if ( 0 == CkMyPe() ) cuda_register_user_events();
00190 
00191   if ( 0 == CkMyPe() ) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
00192 
00193   char host[128];
00194 #ifdef NOHOSTNAME
00195   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00196 #else
00197   gethostname(host, 128);  host[127] = 0;
00198 #endif
00199 
00200   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00201   int myRankInPhysicalNode;
00202   int numPesOnPhysicalNode;
00203   int *pesOnPhysicalNode;
00204   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00205                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00206 
00207   {
00208     int i;
00209     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00210       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00211         i = numPesOnPhysicalNode;
00212         break;
00213       }
00214       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00215     }
00216     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00217       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00218       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00219         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00220           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00221       }
00222       myRankInPhysicalNode = 0;
00223       numPesOnPhysicalNode = 1;
00224       pesOnPhysicalNode = new int[1];
00225       pesOnPhysicalNode[0] = CkMyPe();
00226     } else {
00227       myRankInPhysicalNode = i;
00228     }
00229   }
00230   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00231 
00232   int deviceCount = 0;
00233   cudaGetDeviceCount(&deviceCount);
00234   cuda_errcheck("in cudaGetDeviceCount");
00235   if ( deviceCount <= 0 ) {
00236     cuda_die("No CUDA devices found.");
00237   }
00238 
00239   int *devices;
00240   int ndevices = 0;
00241   int nexclusive = 0;
00242   if ( usedevicelist ) {
00243     devices = new int[strlen(devicelist)];
00244     int i = 0;
00245     while ( devicelist[i] ) {
00246       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00247       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00248       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00249     }
00250   } else {
00251     if ( ! CkMyPe() ) {
00252       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00253     }
00254     devices = new int[deviceCount];
00255     for ( int i=0; i<deviceCount; ++i ) {
00256       int dev = i % deviceCount;
00257 #if CUDA_VERSION >= 2020
00258       cudaDeviceProp deviceProp;
00259       cudaGetDeviceProperties(&deviceProp, dev);
00260       cuda_errcheck("in cudaGetDeviceProperties");
00261       if ( deviceProp.computeMode != cudaComputeModeProhibited
00262            && (deviceProp.major >= 3)
00263            && deviceProp.canMapHostMemory
00264            && ( (deviceProp.multiProcessorCount > 2) ||
00265                 ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
00266          ) {
00267         devices[ndevices++] = dev;
00268       }
00269       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00270         ++nexclusive;
00271       }
00272 #else
00273       devices[ndevices++] = dev;
00274 #endif
00275     }
00276   }
00277 
00278   if ( ! ndevices ) {
00279     cuda_die("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
00280   }
00281 
00282   shared_gpu = 0;
00283   gpu_is_mine = 1;
00284   first_pe_sharing_gpu = CkMyPe();
00285   next_pe_sharing_gpu = CkMyPe();
00286 
00287  // if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) 
00288   {
00289 
00290   int dev;
00291   if ( numPesOnPhysicalNode > 1 ) {
00292     int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00293     dev = devices[myDeviceRank];
00294     devicePe = CkMyPe();
00295     if ( ignoresharing ) {
00296       pesSharingDevice = new int[1];
00297       pesSharingDevice[0] = CkMyPe();
00298       numPesSharingDevice = 1;
00299     } else {
00300       pesSharingDevice = new int[numPesOnPhysicalNode];
00301       devicePe = -1;
00302       numPesSharingDevice = 0;
00303       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00304         if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00305           int thisPe = pesOnPhysicalNode[i];
00306           pesSharingDevice[numPesSharingDevice++] = thisPe;
00307           if ( devicePe < 1 ) devicePe = thisPe;
00308           if ( WorkDistrib::pe_sortop_diffuse()(thisPe,devicePe) ) devicePe = thisPe;
00309         }
00310       }
00311       for ( int j = 0; j < ndevices; ++j ) {
00312         if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00313       }
00314     }
00315     if ( shared_gpu && devicePe == CkMyPe() ) {
00316       if ( CmiPhysicalNodeID(devicePe) < 2 )
00317       CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00318     }
00319   } else {  // in case phys node code is lying
00320     dev = devices[CkMyPe() % ndevices];
00321     devicePe = CkMyPe();
00322     pesSharingDevice = new int[1];
00323     pesSharingDevice[0] = CkMyPe();
00324     numPesSharingDevice = 1;
00325   }
00326 
00327   if ( devicePe != CkMyPe() ) {
00328     if ( CmiPhysicalNodeID(devicePe) < 2 )
00329     CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00330              CkMyPe(), myRankInPhysicalNode, devicePe);
00331 
00332     // for PME only
00333     cudaError_t err = cudaSetDevice(dev);
00334     if ( err != cudaSuccess ) {
00335       char errmsg[1024];
00336       sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00337                         dev, CkMyPe(), cudaGetErrorString(err));
00338       NAMD_die(errmsg);
00339     }
00340 
00341     return;
00342   }
00343 
00344   // disable token-passing but don't submit local until remote finished
00345   // if shared_gpu is true, otherwise submit all work immediately
00346   first_pe_sharing_gpu = CkMyPe();
00347   next_pe_sharing_gpu = CkMyPe();
00348 
00349   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00350 
00351   if ( dev >= deviceCount ) {
00352     char buf[256];
00353     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00354                 CkMyPe(), dev, host, deviceCount);
00355     NAMD_die(buf);
00356   }
00357 
00358   cudaError_t err;
00359   cudaDeviceProp deviceProp;
00360   err = cudaGetDeviceProperties(&deviceProp, dev);
00361   if (err == cudaSuccess) {
00362     if ( CmiPhysicalNodeID(devicePe) < 2 )
00363     CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00364              CkMyPe(), myRankInPhysicalNode, dev, host,
00365              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00366              deviceProp.major, deviceProp.minor);
00367 
00368     err = cudaSetDevice(dev);
00369   }
00370   if ( err != cudaSuccess) {
00371     char errmsg[1024];
00372     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00373                         dev, CkMyPe(), cudaGetErrorString(err));
00374     NAMD_die(errmsg);
00375   }
00376 
00377  }  // just let CUDA pick a device for us
00378 
00379   cudaSetDeviceFlags(cudaDeviceMapHost);
00380   cuda_errcheck("in cudaSetDeviceFlags");
00381 
00382   int dev;
00383   if ( cudaGetDevice(&dev) == cudaSuccess ) {
00384     cudaDeviceProp deviceProp;
00385     cudaGetDeviceProperties(&deviceProp, dev);
00386     cuda_errcheck("in cudaGetDeviceProperties");
00387     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00388       cuda_die("device in prohibited mode");
00389     if ( deviceProp.major < 3 )
00390       cuda_die("device not of compute capability 3.0 or higher");
00391     if ( ! deviceProp.canMapHostMemory )
00392       cuda_die("device cannot map host memory");
00393   }
00394 
00395 }
00396 */
00397 
00398 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00399 static __thread ComputeMgr *computeMgr = 0;
00400 
00401 void send_build_cuda_force_table() {
00402   computeMgr->sendBuildCudaForceTable();
00403 }
00404 
00405 void build_cuda_force_table() {
00406   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00407   ComputeNonbondedCUDA::build_lj_table();
00408   ComputeNonbondedCUDA::build_force_table();
00409 }
00410 
00411 void ComputeNonbondedCUDA::build_lj_table() {  // static
00412 
00413   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00414   const int dim = ljTable->get_table_dim();
00415 
00416   // round dim up to odd multiple of 16
00417   int tsize = (((dim+16+31)/32)*32)-16;
00418   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
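        // Worked example of the rounding above: dim = 54 gives
        // tsize = ((54+47)/32)*32 - 16 = 96 - 16 = 80 = 5*16, an odd multiple
        // of 16 that is >= dim.  (The odd multiple presumably staggers padded
        // rows across memory banks; the kernel-side layout itself is not
        // visible in this file.)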
00419 
00420   float2 *t = new float2[tsize*tsize];
00421   float2 *row = t;
00422   for ( int i=0; i<dim; ++i, row += tsize ) {
00423     for ( int j=0; j<dim; ++j ) {
00424       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00425       row[j].x = e->A * scaling;
00426       row[j].y = e->B * scaling;
00427     }
00428   }
00429 
00430   cuda_bind_lj_table(t,tsize);
00431   delete [] t;
00432 
00433   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00434     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00435   }
00436 }
00437 
00438 void ComputeNonbondedCUDA::build_force_table() {  // static
00439 
00440   float4 t[FORCE_TABLE_SIZE];
00441   float4 et[FORCE_TABLE_SIZE];  // energy table
00442 
00443   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00444   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00445   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00446   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00447 
00448   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00449   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00450     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00451     r2list[i] = r*r + r2_delta;
00452   }
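        // Since r = FORCE_TABLE_SIZE/(i+0.5), the table index is roughly
        // proportional to 1/r: resolution increases at small separations,
        // and r2_delta is the same offset used by the CPU interpolation tables.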
00453 
00454   union { double f; int32 i[2]; } byte_order_test;
00455   byte_order_test.f = 1.0;  // should occupy high-order bits only
00456   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
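        // The union test above picks which 32-bit half of a double holds the
        // sign/exponent word on this machine, so r2iilist[2*i] is the high word
        // of r2list[i].  Shifting it right by 14 (below) keeps the exponent
        // plus the top 6 mantissa bits, and r2_delta_expc removes the IEEE-754
        // exponent bias scaled by those 2^6 = 64 steps per octave, mirroring
        // the logarithmic r^2 table indexing of the CPU nonbonded code.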
00457 
00458   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00459     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00460     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00461 
00462     if ( r > cutoff ) {
00463       t[i].x = 0.;
00464       t[i].y = 0.;
00465       t[i].z = 0.;
00466       t[i].w = 0.;
00467       et[i].x = 0.;
00468       et[i].y = 0.;
00469       et[i].z = 0.;
00470       et[i].w = 0.;
00471       continue;
00472     }
00473 
00474     BigReal diffa = r2list[i] - r2_table[table_i];
00475 
00476     // coulomb 1/r or fast force
00477     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00478     {
00479       BigReal table_a = fast_table[4*table_i];
00480       BigReal table_b = fast_table[4*table_i+1];
00481       BigReal table_c = fast_table[4*table_i+2];
00482       BigReal table_d = fast_table[4*table_i+3];
00483       BigReal grad =
00484                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00485       t[i].x = 2. * grad;
00486       BigReal ener = table_a + diffa *
00487                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00488       et[i].x = ener;
00489     }
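          // Each table entry holds four cubic-spline coefficients (a,b,c,d) in
          // diffa = r2list[i] - r2_table[table_i]; "ener" is the spline value
          // and "grad" its derivative with respect to r^2, and the factor of 2
          // converts d/d(r^2) into the (1/r)*d/dr form applied to coordinate
          // differences (signs follow the CPU tables).  The same pattern is
          // repeated for the slow, vdW-B, and vdW-A terms below.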
00490 
00491 
00492     // pme correction for slow force
00493     // t[i].w = 0.;
00494     {
00495       BigReal table_a = scor_table[4*table_i];
00496       BigReal table_b = scor_table[4*table_i+1];
00497       BigReal table_c = scor_table[4*table_i+2];
00498       BigReal table_d = scor_table[4*table_i+3];
00499       BigReal grad =
00500                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00501       t[i].w = 2. * grad;
00502       BigReal ener = table_a + diffa *
00503                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00504       et[i].w = ener;
00505     }
00506 
00507 
00508     // vdw 1/r^6
00509     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00510     {
00511       BigReal table_a = vdwb_table[4*table_i];
00512       BigReal table_b = vdwb_table[4*table_i+1];
00513       BigReal table_c = vdwb_table[4*table_i+2];
00514       BigReal table_d = vdwb_table[4*table_i+3];
00515       BigReal grad =
00516                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00517       t[i].y = 2. * -1. * grad;
00518       BigReal ener = table_a + diffa *
00519                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00520       et[i].y = -1. * ener;
00521     }
00522 
00523 
00524     // vdw 1/r^12
00525     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00526     {
00527       BigReal table_a = vdwa_table[4*table_i];
00528       BigReal table_b = vdwa_table[4*table_i+1];
00529       BigReal table_c = vdwa_table[4*table_i+2];
00530       BigReal table_d = vdwa_table[4*table_i+3];
00531       BigReal grad =
00532                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00533       t[i].z = 2. * grad;
00534       BigReal ener = table_a + diffa *
00535                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00536       et[i].z = ener;
00537     }
00538 
00539     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00540     //   t[i].x, t[i].y, t[i].z, t[i].w);
00541 
00542 /*
00543     double r2 = r * r;
00544     double r4 = r2 * r2;
00545     double r8 = r4 * r4;
00546 
00547     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00548     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00549     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00550     t[i].w = 0.;
00551 */
00552   }
00553 
00554   t[0].x = 0.f;
00555   t[0].y = 0.f;
00556   t[0].z = 0.f;
00557   t[0].w = 0.f;
00558   et[0].x = et[1].x;
00559   et[0].y = et[1].y;
00560   et[0].z = et[1].z;
00561   et[0].w = et[1].w;
00562 
00563   cuda_bind_force_table(t,et);
00564 
00565   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00566     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00567   }
00568 }
00569 
00570 struct exlist_sortop {
00571   bool operator() (int32 *li, int32 *lj) {
00572     return ( li[1] < lj[1] );
00573   }
00574 };
00575 
00576 void build_cuda_exclusions() {
00577   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00578   ComputeNonbondedCUDA::build_exclusions();
00579 }
00580 
00581 static __thread int2 *exclusionsByAtom;
00582 
00583 void ComputeNonbondedCUDA::build_exclusions() {
00584   Molecule *mol = Node::Object()->molecule;
00585 
00586 #ifdef MEM_OPT_VERSION
00587   int natoms = mol->exclSigPoolSize;
00588 #else
00589   int natoms = mol->numAtoms; 
00590 #endif
00591 
00592   delete [] exclusionsByAtom;
00593   exclusionsByAtom = new int2[natoms];
00594 
00595   // create unique sorted lists
00596 
00597   ObjectArena<int32> listArena;
00598   ResizeArray<int32*> unique_lists;
00599   int32 **listsByAtom = new int32*[natoms];
00600   SortableResizeArray<int32> curList;
00601   for ( int i=0; i<natoms; ++i ) {
00602     curList.resize(0);
00603     curList.add(0);  // always excluded from self
00604 #ifdef MEM_OPT_VERSION
00605     const ExclusionSignature *sig = mol->exclSigPool + i;
00606     int n = sig->fullExclCnt;
00607     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00608     n += 1;
00609 #else
00610     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00611     int n = mol_list[0] + 1;
00612     for ( int j=1; j<n; ++j ) {
00613       curList.add(mol_list[j] - i);
00614     }
00615 #endif
00616     curList.sort();
00617 
00618     int j;
00619     for ( j=0; j<unique_lists.size(); ++j ) {
00620       if ( n != unique_lists[j][0] ) continue;  // no match
00621       int k;
00622       for ( k=0; k<n; ++k ) {
00623         if ( unique_lists[j][k+3] != curList[k] ) break;
00624       }
00625       if ( k == n ) break;  // found match
00626     }
00627     if ( j == unique_lists.size() ) {  // no match
00628       int32 *list = listArena.getNewArray(n+3);
00629       list[0] = n;
00630       int maxdiff = 0;
00631       maxdiff = -1 * curList[0];
00632       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00633       list[1] = maxdiff;
00634       for ( int k=0; k<n; ++k ) {
00635         list[k+3] = curList[k];
00636       }
00637       unique_lists.add(list);
00638     }
00639     listsByAtom[i] = unique_lists[j];
00640   }
00641   // sort lists by maxdiff
00642   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00643   long int totalbits = 0;
00644   int nlists = unique_lists.size();
00645   for ( int j=0; j<nlists; ++j ) {
00646     int32 *list = unique_lists[j];
00647     int maxdiff = list[1];
00648     list[2] = totalbits + maxdiff;
00649     totalbits += 2*maxdiff + 1;
00650   }
00651   for ( int i=0; i<natoms; ++i ) {
00652     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00653     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00654   }
00655   delete [] listsByAtom;
00656 
00657   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00658 
00659   {
00660     long int bytesneeded = totalbits / 8;
00661     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00662     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00663                 unique_lists.size(), bytesneeded);
00664     }
00665 
00666     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00667     if ( bytesneeded > bytesavail ) {
00668       char errmsg[512];
00669       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00670                      "but only %ld bytes can be addressed with 32-bit int.",
00671                      unique_lists.size(), bytesneeded, bytesavail);
00672       NAMD_die(errmsg);
00673     }
00674   }
00675 
00676 #define SET_EXCL(EXCL,BASE,DIFF) \
00677          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00678 
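        // Each unique exclusion list occupies a window of 2*maxdiff+1 bits in
        // exclusion_bits; list[2] (copied to exclusionsByAtom[].y above) is the
        // index of the window's center bit, so bit (start + diff) is set when
        // atoms i and i+diff are excluded.  The kernel then only needs the
        // per-atom (maxdiff, start) pair plus the shared bit array bound below.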
00679   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00680   memset(exclusion_bits, 0, totalbits/8);
00681 
00682   long int base = 0;
00683   for ( int i=0; i<unique_lists.size(); ++i ) {
00684     base += unique_lists[i][1];
00685     if ( unique_lists[i][2] != (int32)base ) {
00686       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00687     }
00688     int n = unique_lists[i][0];
00689     for ( int j=0; j<n; ++j ) {
00690       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00691     }
00692     base += unique_lists[i][1] + 1;
00693   }
00694 
00695   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00696 
00697   delete [] exclusion_bits;
00698 }
00699 
00700 
00701 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00702 
00703   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00704 
00705   cudaCompute->requirePatch(pid);
00706 
00707   ComputeNonbondedCUDA::compute_record cr;
00708   cr.c = c;
00709   cr.pid[0] = pid;  cr.pid[1] = pid;
00710   cr.offset = 0.;
00711   if ( cudaCompute->patchRecords[pid].isLocal ) {
00712     cudaCompute->localComputeRecords.add(cr);
00713   } else {
00714     cudaCompute->remoteComputeRecords.add(cr);
00715   }
00716 }
00717 
00718 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00719 
00720   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00721  
00722   cudaCompute->requirePatch(pid[0]);
00723   cudaCompute->requirePatch(pid[1]);
00724 
00725   ComputeNonbondedCUDA::compute_record cr;
00726   cr.c = c; 
00727   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00728 
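        // The transform indices t[0] and t[1] encode each patch's periodic
        // image shift as t = (dx+1) + 3*(dy+1) + 9*(dz+1) with dx,dy,dz in
        // {-1,0,1}; the lines below decode them so cr.offset becomes the
        // relative image offset between the two patches in lattice coordinates.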
00729   int t1 = t[0];
00730   int t2 = t[1];
00731   Vector offset = cudaCompute->patchMap->center(pid[0])
00732                 - cudaCompute->patchMap->center(pid[1]);
00733   offset.x += (t1%3-1) - (t2%3-1);
00734   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00735   offset.z += (t1/9-1) - (t2/9-1);
00736   cr.offset = offset;
00737 
00738   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00739     cudaCompute->localComputeRecords.add(cr);
00740   } else {
00741     cudaCompute->remoteComputeRecords.add(cr);
00742   }
00743 }
00744 
00745 void unregister_cuda_compute(ComputeID c) {  // static
00746 
00747   NAMD_bug("unregister_compute unimplemented");
00748 
00749 }
00750 
00751 // static __thread cudaEvent_t start_upload;
00752 static __thread cudaEvent_t start_calc;
00753 static __thread cudaEvent_t end_remote_download;
00754 static __thread cudaEvent_t end_local_download;
00755 
00756 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00757 static __thread ResizeArray<int> *patch_pair_num_ptr;
00758 
00759 void init_arrays();
00760 
00761 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00762                                            ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00763 #ifdef PRINT_GBIS
00764    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00765 #endif
00766 
00767   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00768   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00769   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00770 
00771   // CkPrintf("create ComputeNonbondedCUDA\n");
00772   master = m ? m : this;
00773   cudaCompute = this;
00774   computeMgr = mgr;
00775   patchMap = PatchMap::Object();
00776   atomMap = AtomMap::Object();
00777   reduction = 0;
00778 
00779   SimParameters *params = Node::Object()->simParameters;
00780   if (params->pressureProfileOn) {
00781     NAMD_die("pressure profile not supported in CUDA");
00782   }
00783 
00784   atomsChanged = 1;
00785   computesChanged = 1;
00786   patchPairsReordered = 1;
00787 
00788   pairlistsValid = 0;
00789   pairlistTolerance = 0.;
00790   usePairlists = 0;
00791   savePairlists = 0;
00792   plcutoff2 = 0.;
00793 
00794   workStarted = 0;
00795   basePriority = PROXY_DATA_PRIORITY;
00796   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00797 
00798   // Zero array sizes and pointers
00799   init_arrays();
00800 
00801   atoms_size = 0;
00802   atoms = NULL;
00803 
00804   forces_size = 0;
00805   forces = NULL;
00806   
00807   slow_forces_size = 0;
00808   slow_forces = NULL;
00809 
00810   psiSumH_size = 0;
00811   psiSumH = NULL;
00812 
00813   dEdaSumH_size = 0;
00814   dEdaSumH = NULL;
00815 
00816   if ( master != this ) { // I am slave
00817     masterPe = master->masterPe;
00818     master->slaves[slaveIndex] = this;
00819     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00820       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00821     }
00822     deviceID = master->deviceID;
00823     registerPatches();
00824     return;
00825   }
00826   masterPe = CkMyPe();
00827 
00828   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00829   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00830     deviceCUDA->setMergeGrids(1);
00831 
00832   // Sanity checks for New CUDA kernel
00833   if (params->GBISOn) {
00834     // GBIS
00835     if (deviceCUDA->getNoMergeGrids()) {
00836       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00837     }
00838     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00839     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00840     // Final sanity check
00841     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00842   } else {
00843     // non-GBIS
00844     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00845       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00846     }
00847   }
00848 
00849 #if CUDA_VERSION >= 5050
00850   int leastPriority, greatestPriority;
00851   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00852   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00853   if ( leastPriority != greatestPriority ) {
00854     if ( CkMyNode() == 0 ) {
00855       int dev = deviceCUDA->getDeviceID();
00856       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00857     }
00858     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00859       greatestPriority = leastPriority;
00860     }
00861     if (params->usePMECUDA) greatestPriority = leastPriority;
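          // If PME also runs on this device (offloaded here or via the
          // separate PME CUDA path), both nonbonded streams are created at the
          // lowest priority (presumably leaving the higher priority free for
          // the PME stream); otherwise stream gets the highest priority
          // available and stream2 the lowest.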
00862     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00863     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00864   } else
00865 #endif
00866   {
00867     cudaStreamCreate(&stream);
00868     cuda_errcheck("cudaStreamCreate");
00869     int dev = deviceCUDA->getDeviceID();
00870     cudaDeviceProp deviceProp;
00871     cudaGetDeviceProperties(&deviceProp, dev);
00872     cuda_errcheck("cudaGetDeviceProperties");
00873     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00874       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00875       cudaStreamCreate(&stream2);
00876     } else {
00877       stream2 = stream;
00878     }
00879   }
00880   cuda_errcheck("cudaStreamCreate");
00881 
00882   // Get GPU device ID
00883   deviceID = deviceCUDA->getDeviceID();
00884 
00885   cuda_init();
00886   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00887   build_exclusions();
00888   // cudaEventCreate(&start_upload);
00889   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00890   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00891   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00892 
00893   patchRecords.resize(patchMap->numPatches());
00894   patch_pairs_ptr = new ResizeArray<patch_pair>;
00895   patch_pair_num_ptr = new ResizeArray<int>;
00896 
00897   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00898 }
00899 
00900 
00901 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00902 
00903 void ComputeNonbondedCUDA::requirePatch(int pid) {
00904 
00905   computesChanged = 1;
00906   patch_record &pr = patchRecords[pid];
00907   if ( pr.refCount == 0 ) {
00908     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00909     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00910     if ( deviceCUDA->getMergeGrids() ) {
00911       pr.isLocal = 0;
00912     } else if ( CkNumNodes() < 2 ) {
00913       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00914          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00915     } else {
00916       pr.isLocal = pr.isSameNode;
00917     }
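          // Split patches into "local" and "remote" sets: with merged grids
          // everything is treated as remote; on a single node the split is a
          // checkerboard on the patch grid indices to balance the two kernel
          // grids; otherwise same-node patches are local, so that results
          // needed by other nodes are computed in the earlier (remote) grid.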
00918     if ( pr.isLocal ) {
00919       localActivePatches.add(pid);
00920     } else {
00921       remoteActivePatches.add(pid);
00922     }
00923     activePatches.add(pid);
00924     pr.patchID = pid;
00925     pr.hostPe = -1;
00926     pr.x = NULL;
00927     pr.xExt = NULL;
00928     pr.r = NULL;
00929     pr.f = NULL;
00930     pr.intRad      = NULL;
00931     pr.psiSum      = NULL;
00932     pr.bornRad     = NULL;
00933     pr.dEdaSum     = NULL;
00934     pr.dHdrPrefix  = NULL;
00935   }
00936   pr.refCount += 1;
00937 }
00938 
00939 void ComputeNonbondedCUDA::registerPatches() {
00940 
00941   SimParameters *simParams = Node::Object()->simParameters;
00942   int npatches = master->activePatches.size();
00943   int *pids = master->activePatches.begin();
00944   patch_record *recs = master->patchRecords.begin();
00945   for ( int i=0; i<npatches; ++i ) {
00946     int pid = pids[i];
00947     patch_record &pr = recs[pid];
00948     if ( pr.hostPe == CkMyPe() ) {
00949       pr.slave = this;
00950       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00951       hostedPatches.add(pid);
00952       if ( pr.isLocal ) {
00953         localHostedPatches.add(pid);
00954       } else {
00955         remoteHostedPatches.add(pid);
00956       }
00957       ProxyMgr::Object()->createProxy(pid);
00958       pr.p = patchMap->patch(pid);
00959       pr.positionBox = pr.p->registerPositionPickup(this);
00960       pr.forceBox = pr.p->registerForceDeposit(this);
00961       if (simParams->GBISOn) {
00962         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00963         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00964         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00965         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00966         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00967       }
00968     }
00969   }
00970   if ( master == this ) setNumPatches(activePatches.size());
00971   else setNumPatches(hostedPatches.size());
00972   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00973   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00974 }
00975 
00976 struct pid_sortop_reverse_priority {
00977   bool operator() (int pidj, int pidi) {  // i and j reversed
00978     int ppi = PATCH_PRIORITY(pidi);
00979     int ppj = PATCH_PRIORITY(pidj);
00980     if ( ppi != ppj ) return ppi < ppj;
00981     return pidi < pidj;
00982   }
00983 };
00984 
00985 void ComputeNonbondedCUDA::assignPatches() {
00986 
00987   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00988   int numPesOnNodeSharingDevice = 0;
00989   int masterIndex = -1;
00990   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00991     int pe = deviceCUDA->getPesSharingDevice(i);
00992     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00993     if ( CkNodeOf(pe) == CkMyNode() ) {
00994       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00995     }
00996   }
00997 
00998   int npatches = activePatches.size();
00999 
01000   if ( npatches ) {
01001     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01002   }
01003 
01004   // calculate priority rank of local home patch within pe
01005   {
01006     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
01007     for ( int i=0; i<npatches; ++i ) {
01008       int pid = activePatches[i];
01009       int homePe = patchMap->node(pid);
01010       if ( CkNodeOf(homePe) == CkMyNode() ) {
01011         homePatchByRank[CkRankOf(homePe)].add(pid);
01012       }
01013     }
01014     for ( int i=0; i<CkMyNodeSize(); ++i ) {
01015       pid_sortop_reverse_priority so;
01016       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
01017       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
01018       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
01019         int pid = homePatchByRank[i][j];
01020         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
01021       }
01022     }
01023   }
01024 
01025   int *count = new int[npatches];
01026   memset(count, 0, sizeof(int)*npatches);
01027   int *pcount = new int[numPesOnNodeSharingDevice];
01028   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
01029   int *rankpcount = new int[CkMyNodeSize()];
01030   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
01031   char *table = new char[npatches*numPesOnNodeSharingDevice];
01032   memset(table, 0, npatches*numPesOnNodeSharingDevice);
01033 
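        // table[i*numPesOnNodeSharingDevice+j] is set below when pe j already
        // holds patch i (home patch or existing proxy); count[i] is the number
        // of such pes, pcount[j] counts patches assigned to device-sharing
        // pe j, and rankpcount[] counts patches assigned to home pes on this
        // node that do not share the device.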
01034   int unassignedpatches = npatches;
01035 
01036   if ( 0 ) { // assign all to device pe
01037     for ( int i=0; i<npatches; ++i ) {
01038       int pid = activePatches[i];
01039       patch_record &pr = patchRecords[pid];
01040       pr.hostPe = CkMyPe();
01041     }
01042     unassignedpatches = 0;
01043     pcount[masterIndex] = npatches;
01044   } else 
01045 
01046   // assign if home pe and build table of natural proxies
01047   for ( int i=0; i<npatches; ++i ) {
01048     int pid = activePatches[i];
01049     patch_record &pr = patchRecords[pid];
01050     int homePe = patchMap->node(pid);
01051     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01052       int pe = pesOnNodeSharingDevice[j];
01053       if ( pe == homePe ) {
01054         pr.hostPe = pe;  --unassignedpatches;
01055         pcount[j] += 1;
01056       }
01057       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
01058         table[i*numPesOnNodeSharingDevice+j] = 1;
01059       }
01060     }
01061     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
01062       pr.hostPe = homePe;  --unassignedpatches;
01063       rankpcount[CkRankOf(homePe)] += 1;
01064     }
01065   }
01066   // assign if only one pe has a required proxy
01067   int assignj = 0;
01068   for ( int i=0; i<npatches; ++i ) {
01069     int pid = activePatches[i];
01070     patch_record &pr = patchRecords[pid];
01071     if ( pr.hostPe != -1 ) continue;
01072     int c = 0;
01073     int lastj;
01074     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01075       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
01076     }
01077     count[i] = c;
01078     if ( c == 1 ) {
01079       pr.hostPe = pesOnNodeSharingDevice[lastj];
01080       --unassignedpatches;
01081       pcount[lastj] += 1;
01082     }
01083   }
01084   while ( unassignedpatches ) {
01085     int i;
01086     for ( i=0; i<npatches; ++i ) {
01087       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
01088       int pid = activePatches[i];
01089       patch_record &pr = patchRecords[pid];
01090       if ( pr.hostPe != -1 ) continue;
01091       pr.hostPe = pesOnNodeSharingDevice[assignj];
01092       --unassignedpatches;
01093       pcount[assignj] += 1;
01094       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01095       break;
01096     }
01097     if ( i<npatches ) continue;  // start search again
01098     for ( i=0; i<npatches; ++i ) {
01099       int pid = activePatches[i];
01100       patch_record &pr = patchRecords[pid];
01101       if ( pr.hostPe != -1 ) continue;
01102       if ( count[i] ) continue;
01103       pr.hostPe = pesOnNodeSharingDevice[assignj];
01104       --unassignedpatches;
01105       pcount[assignj] += 1;
01106       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01107       break;
01108     }
01109     if ( i<npatches ) continue;  // start search again
01110     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01111   }
01112 
01113   for ( int i=0; i<npatches; ++i ) {
01114     int pid = activePatches[i];
01115     patch_record &pr = patchRecords[pid];
01116     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
01117   }
01118 
01119   slavePes = new int[CkMyNodeSize()];
01120   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
01121   numSlaves = 0;
01122   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01123     int pe = pesOnNodeSharingDevice[j];
01124     int rank = pe - CkNodeFirst(CkMyNode());
01125     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
01126     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
01127     if ( pe == CkMyPe() ) continue;
01128     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
01129     rankpcount[rank] = 0;  // skip in rank loop below
01130     slavePes[numSlaves] = pe;
01131     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
01132     ++numSlaves;
01133   }
01134   for ( int j=0; j<CkMyNodeSize(); ++j ) {
01135     int pe = CkNodeFirst(CkMyNode()) + j;
01136     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
01137     //          CkMyPe(),j,pe,rankpcount[j]);
01138     if ( ! rankpcount[j] ) continue;
01139     if ( pe == CkMyPe() ) continue;
01140     slavePes[numSlaves] = pe;
01141     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
01142     ++numSlaves;
01143   }
01144   registerPatches();
01145 
01146   delete [] pesOnNodeSharingDevice;
01147   delete [] count;
01148   delete [] pcount;
01149   delete [] rankpcount;
01150   delete [] table;
01151 }
01152 
01153 static __thread int atom_params_size;
01154 static __thread atom_param* atom_params;
01155 
01156 static __thread int vdw_types_size;
01157 static __thread int* vdw_types;
01158 
01159 static __thread int dummy_size;
01160 static __thread float* dummy_dev;
01161 
01162 static __thread int force_ready_queue_size;
01163 static __thread int *force_ready_queue;
01164 static __thread int force_ready_queue_len;
01165 static __thread int force_ready_queue_next;
01166 
01167 static __thread int block_order_size;
01168 static __thread int *block_order;
01169 
01170 static __thread int num_atoms;
01171 static __thread int num_local_atoms;
01172 static __thread int num_remote_atoms;
01173 
01174 static __thread int virials_size;
01175 static __thread float *virials;
01176 static __thread int num_virials;
01177 // NOTE: slow_virials is a computed pointer from "virials" - do not deallocate

01178 static __thread float *slow_virials;
01179 
01180 static __thread int energy_gbis_size;
01181 static __thread float *energy_gbis;
01182 
01183 //GBIS host pointers
01184 static __thread int intRad0H_size;
01185 static __thread float *intRad0H;
01186 static __thread int intRadSH_size;
01187 static __thread float *intRadSH;
01188 static __thread int bornRadH_size;
01189 static __thread float *bornRadH;
01190 static __thread int dHdrPrefixH_size;
01191 static __thread float *dHdrPrefixH;
01192 
01193 static __thread int cuda_timer_count;
01194 static __thread double cuda_timer_total;
01195 static __thread double kernel_time;
01196 static __thread double remote_submit_time;
01197 static __thread double local_submit_time;
01198 
01199 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
01200 
01201 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
01202 
01203 #ifdef PRINT_GBIS
01204 #define GBISP(...) CkPrintf(__VA_ARGS__);
01205 #else
01206 #define GBISP(...)
01207 #endif
01208 
01209 #define count_limit 1000000
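      // Assuming the CcdCallFnAfter delay in CUDA_POLL above is in
      // milliseconds (0.1 ms per poll), count_limit caps an unproductive
      // polling loop at roughly 100 seconds before the run is aborted as hung.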
01210 static __thread int check_count;
01211 static __thread int check_remote_count;
01212 static __thread int check_local_count;
01213 
01214 void init_arrays() {
01215 
01216   atom_params_size = 0;
01217   atom_params = NULL;
01218 
01219   vdw_types_size = 0;
01220   vdw_types = NULL;
01221   
01222   dummy_size = 0;
01223   dummy_dev = NULL;
01224 
01225   force_ready_queue_size = 0;
01226   force_ready_queue = NULL;
01227   force_ready_queue_len = 0;
01228   force_ready_queue_next = 0;
01229   
01230   block_order_size = 0;
01231   block_order = NULL;
01232   
01233   num_atoms = 0;
01234   num_local_atoms = 0;
01235   num_remote_atoms = 0;
01236 
01237   virials_size = 0;
01238   virials = NULL;
01239   num_virials = 0;
01240 
01241   energy_gbis_size = 0;
01242   energy_gbis = NULL;
01243 
01244   intRad0H_size = 0;
01245   intRad0H = NULL;
01246   intRadSH_size = 0;
01247   intRadSH = NULL;
01248   bornRadH_size = 0;
01249   bornRadH = NULL;
01250   dHdrPrefixH_size = 0;
01251   dHdrPrefixH = NULL;
01252 
01253 }
01254 
01255 void cuda_check_progress(void *arg, double walltime) {
01256   CUDA_TRACE_POLL_REMOTE;
01257 
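        // force_ready_queue lives in mapped, page-locked host memory
        // (allocated with cudaHostAllocMapped in doWork below); in streaming
        // mode the GPU writes the index of each patch whose forces are
        // complete, and this callback drains those entries, finishing patches
        // until all force_ready_queue_len of them have arrived.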
01258   int flindex;
01259   int poll_again = 1;
01260   while ( -1 != (flindex = force_ready_queue[force_ready_queue_next]) ) {
01261     //    CkPrintf("Pe %d forces ready %d is index %d at %lf\n",
01262     //       CkMyPe(), force_ready_queue_next, flindex, walltime);
01263     force_ready_queue[force_ready_queue_next] = -1;
01264     ++force_ready_queue_next;
01265     check_count = 0;
01266     if ( force_ready_queue_next == force_ready_queue_len ) {
01267       poll_again = 0;
01268       CUDA_TRACE_LOCAL(kernel_time,walltime);
01269       kernel_time = walltime - kernel_time;
01270       // need to guarantee this finishes before the last patch message!
01271       ((ComputeNonbondedCUDA *) arg)->workStarted = 0;
01272       ((ComputeNonbondedCUDA *) arg)->finishReductions();
01273     }
01274     ((ComputeNonbondedCUDA *) arg)->messageFinishPatch(flindex);
01275     if ( force_ready_queue_next == force_ready_queue_len ) break;
01276   }
01277   if ( ++check_count >= count_limit ) {
01278     char errmsg[256];
01279     sprintf(errmsg,"cuda_check_progress polled %d times over %f s on step %d",
01280             check_count, walltime - remote_submit_time,
01281             ((ComputeNonbondedCUDA *) arg)->step);
01282     cuda_errcheck(errmsg);
01283     NAMD_die(errmsg);
01284   }
01285   if ( poll_again ) {
01286     CcdCallBacksReset(0,walltime);  // fix Charm++
01287     CUDA_POLL(cuda_check_progress, arg);
01288   }
01289 }
01290 
01291 void cuda_check_remote_progress(void *arg, double walltime) {
01292 
01293   CUDA_TRACE_POLL_REMOTE;
01294   cudaError_t err = cudaEventQuery(end_remote_download);
01295   if ( err == cudaSuccess ) {
01296     local_submit_time = walltime;
01297     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01298     if ( deviceCUDA->getMergeGrids() ) {  // no local
01299       kernel_time = local_submit_time - kernel_time;
01300     }
01301     check_remote_count = 0;
01302     cuda_errcheck("at cuda remote stream completed");
01303     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
01304   } else if ( err != cudaErrorNotReady ) {
01305     cuda_errcheck("in cuda_check_remote_progress");
01306     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01307   } else if ( ++check_remote_count >= count_limit ) {
01308     char errmsg[256];
01309     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01310             check_remote_count, walltime - remote_submit_time,
01311             ((ComputeNonbondedCUDA *) arg)->step);
01312     cuda_errcheck(errmsg);
01313     NAMD_die(errmsg);
01314   } else if ( check_local_count ) {
01315     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01316   } else {
01317     CcdCallBacksReset(0,walltime);  // fix Charm++
01318     CUDA_POLL(cuda_check_remote_progress, arg);
01319   }
01320 }
01321 
01322 void cuda_check_local_progress(void *arg, double walltime) {
01323 
01324   CUDA_TRACE_POLL_LOCAL;
01325   cudaError_t err = cudaEventQuery(end_local_download);
01326   if ( err == cudaSuccess ) {
01327     CUDA_TRACE_LOCAL(local_submit_time,walltime);
01328     kernel_time = walltime - kernel_time;
01329     check_local_count = 0;
01330     cuda_errcheck("at cuda local stream completed");
01331     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
01332   } else if ( err != cudaErrorNotReady ) {
01333     cuda_errcheck("in cuda_check_local_progress");
01334     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01335   } else if ( ++check_local_count >= count_limit ) {
01336     char errmsg[256];
01337     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01338             check_local_count, walltime - local_submit_time,
01339             ((ComputeNonbondedCUDA *) arg)->step);
01340     cuda_errcheck(errmsg);
01341     NAMD_die(errmsg);
01342   } else if ( check_remote_count ) {
01343     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01344   } else {
01345     CcdCallBacksReset(0,walltime);  // fix Charm++
01346     CUDA_POLL(cuda_check_local_progress, arg);
01347   }
01348 }
01349 
01350 #if 0
01351 // don't use this one unless timer is part of stream, above is better
01352 void cuda_check_progress(void *arg, double walltime) {
01353   if ( cuda_stream_finished() ) {
01354     kernel_time = walltime - kernel_time;
01355     CcdCallBacksReset(0,walltime);  // fix Charm++
01356     CUDA_POLL(ccd_index);
01357     // ((ComputeNonbondedCUDA *) arg)->finishWork();
01358     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01359   }
01360 }
01361 #endif
01362 
01363 void ComputeNonbondedCUDA::atomUpdate() {
01364   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01365   atomsChanged = 1;
01366 }
01367 
01368 static __thread int kernel_launch_state = 0;
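      // Advanced by the master as the remote and local kernels are launched;
      // doWork() checks it (state > 2 below) to decide when to start polling
      // for the local download to complete.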
01369 
01370 struct cr_sortop_distance {
01371   const Lattice &l;
01372   cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
01373   bool operator() (ComputeNonbondedCUDA::compute_record i,
01374                         ComputeNonbondedCUDA::compute_record j) {
01375     Vector a = l.a();
01376     Vector b = l.b();
01377     Vector c = l.c();
01378     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01379     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01380     return ( ri < rj );
01381   }
01382 };
01383 
01384 struct cr_sortop_reverse_priority {
01385   cr_sortop_distance &distop;
01386   const ComputeNonbondedCUDA::patch_record *pr;
01387   cr_sortop_reverse_priority(cr_sortop_distance &sod,
01388        const ComputeNonbondedCUDA::patch_record *patchrecs) : distop(sod), pr(patchrecs) { }
01389   bool pid_compare_priority(int pidi, int pidj) {
01390     const ComputeNonbondedCUDA::patch_record &pri = pr[pidi];
01391     const ComputeNonbondedCUDA::patch_record &prj = pr[pidj];
01392     if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
01393     if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
01394     if ( pri.isSameNode && ! prj.isSameNode ) return 0;
01395     if ( prj.isSameNode && ! pri.isSameNode ) return 1;
01396     if ( pri.isSameNode ) {  // and prj.isSameNode
01397       int rpri = pri.reversePriorityRankInPe;
01398       int rprj = prj.reversePriorityRankInPe;
01399       if ( rpri != rprj ) return rpri > rprj;
01400       return sortop_bitreverse(CkRankOf(pri.hostPe),CkRankOf(prj.hostPe));
01401     }
01402     int ppi = PATCH_PRIORITY(pidi);
01403     int ppj = PATCH_PRIORITY(pidj);
01404     if ( ppi != ppj ) return ppi < ppj;
01405     return pidi < pidj;
01406   }
01407   bool operator() (ComputeNonbondedCUDA::compute_record j,
01408                         ComputeNonbondedCUDA::compute_record i) {  // i and j reversed
01409     int pidi = pid_compare_priority(i.pid[0],i.pid[1]) ? i.pid[0] : i.pid[1];
01410     int pidj = pid_compare_priority(j.pid[0],j.pid[1]) ? j.pid[0] : j.pid[1];
01411     if ( pidi != pidj ) return pid_compare_priority(pidi, pidj);
01412     return distop(i,j);
01413   }
01414 };
01415 
01416 void ComputeNonbondedCUDA::skip() {
01417   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01418   SimParameters *simParams = Node::Object()->simParameters;
01419   for ( int i=0; i<hostedPatches.size(); ++i ) {
01420     patch_record &pr = master->patchRecords[hostedPatches[i]];
01421     pr.positionBox->skip();
01422     pr.forceBox->skip();
01423     if (simParams->GBISOn) {
01424       pr.intRadBox->skip();
01425       pr.psiSumBox->skip();
01426       pr.bornRadBox->skip();
01427       pr.dEdaSumBox->skip();
01428       pr.dHdrPrefixBox->skip();
01429     }
01430   }
01431 }
01432 
01433 int ComputeNonbondedCUDA::noWork() {
01434 
01435   SimParameters *simParams = Node::Object()->simParameters;
01436   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01437   lattice = flags.lattice;
01438   doSlow = flags.doFullElectrostatics;
01439   doEnergy = flags.doEnergy;
01440   step = flags.step;
01441 
01442   if ( ! flags.doNonbonded ) {
01443     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01444     if ( master != this ) {
01445       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01446                         hostedPatches.size(),atomsChanged,sequence());
01447     } else {
01448       for ( int i = 0; i < numSlaves; ++i ) {
01449         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01450       }
01451       skip();
01452     }
01453     if ( reduction ) {
01454        reduction->submit();
01455     }
01456 
01457     return 1;
01458   }
01459 
01460   for ( int i=0; i<hostedPatches.size(); ++i ) {
01461     patch_record &pr = master->patchRecords[hostedPatches[i]];
01462     if (!simParams->GBISOn || gbisPhase == 1) {
01463       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01464       pr.x = pr.positionBox->open();
01465       pr.xExt = pr.p->getCompAtomExtInfo();
01466     }
01467 
01468     if (simParams->GBISOn) {
01469       if (gbisPhase == 1) {
01470         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01471         pr.intRad     = pr.intRadBox->open();
01472         pr.psiSum     = pr.psiSumBox->open();
01473       } else if (gbisPhase == 2) {
01474         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01475         pr.bornRad    = pr.bornRadBox->open();
01476         pr.dEdaSum    = pr.dEdaSumBox->open();
01477       } else if (gbisPhase == 3) {
01478         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01479         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01480       }
01481       GBISP("opened GBIS boxes");
01482     }
01483   }
01484 
01485   if ( master == this ) return 0; //work to do, enqueue as usual
01486 
01487   // message masterPe
01488   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01489                         hostedPatches.size(),atomsChanged,sequence());
01490   atomsChanged = 0;
01491 
01492   workStarted = 1;
01493 
01494   return 1;
01495 }
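      // Returning 1 from noWork() tells the scheduler not to enqueue doWork()
      // for this compute: slaves only report readiness to the master pe, while
      // the master (return 0 above) is enqueued as usual.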
01496 
01497 void ComputeNonbondedCUDA::doWork() {
01498 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01499 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01500 
01501   // Set GPU device ID
01502   cudaCheck(cudaSetDevice(deviceID));
01503 
01504   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01505   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01506 
01507   if ( workStarted ) { //if work already started, check if finished
01508     if ( finishWork() ) {  // finished
01509       workStarted = 0;
01510       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01511     } else {  // need to call again
01512       workStarted = 2;
01513       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01514       if ( master == this && kernel_launch_state > 2 ) {
01515         cuda_check_local_progress(this,0.);  // launches polling
01516       }
01517     }
01518     return;
01519   }
01520 
01521   workStarted = 1;
01522   basePriority = COMPUTE_PROXY_PRIORITY;
01523 
01524   Molecule *mol = Node::Object()->molecule;
01525   Parameters *params = Node::Object()->parameters;
01526   SimParameters *simParams = Node::Object()->simParameters;
01527 
01528   //execute only during GBIS phase 1, or if not using GBIS
01529   if (!simParams->GBISOn || gbisPhase == 1) {
01530 
01531     //bind new patches to GPU
01532     if ( atomsChanged || computesChanged ) {
01533       int npatches = activePatches.size();
01534 
01535       pairlistsValid = 0;
01536       pairlistTolerance = 0.;
01537 
01538       if ( computesChanged ) {
01539         computesChanged = 0;
01540 
01541         if ( ! dummy_size ) {
01542           dummy_size = 1024*1024;
01543           cudaMalloc((void**)&dummy_dev,dummy_size);
01544           cuda_errcheck("in cudaMalloc dummy_dev");
01545         }
01546 
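              // Grow the pinned, host-mapped buffers (the 1.2f factor leaves
              // ~20% headroom).  force_ready_queue holds one slot per patch,
              // initialized to -1; when streaming, the GPU side presumably fills
              // it as each patch's forces become ready, with block_order
              // recording the completion order of force blocks.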
01547         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01548           1.2f, cudaHostAllocMapped);
01549         if (did_realloc) {
01550           for (int k=0; k < force_ready_queue_size; ++k)
01551             force_ready_queue[k] = -1;
01552         }
01553         force_ready_queue_len = npatches;
01554         reallocate_host<int>(&block_order, &block_order_size,
01555           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01556           1.2f, cudaHostAllocMapped);
01557     
01558         num_virials = npatches;
01559         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01560           1.2f, cudaHostAllocMapped);
01561         slow_virials = virials + 16*num_virials;
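              // Two 16-float records per patch: the first num_virials records
              // hold the nonbonded virials, the second the slow (full
              // electrostatics) virials.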
01562 
01563         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01564           1.2f, cudaHostAllocMapped);
01565         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01566 
01567         int *ap = activePatches.begin();
01568         for ( int i=0; i<localActivePatches.size(); ++i ) {
01569           *(ap++) = localActivePatches[i];
01570         }
01571         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01572           *(ap++) = remoteActivePatches[i];
01573         }
01574 
01575         // sort computes by distance between patches
01576         cr_sortop_distance so(lattice);
01577         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01578         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01579 
01580         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01581 
01582         if ( streaming ) {
01583           // iout << "Reverse-priority sorting...\n" << endi;
01584           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01585           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01586           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01587           patchPairsReordered = 0;
01588           //patchPairsReordered = 1;
01589       // int len = remoteComputeRecords.size();
01590       // for ( int i=0; i<len; ++i ) {
01591       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01592       // }
01593       // int len2 = localComputeRecords.size();
01594       // for ( int i=0; i<len2; ++i ) {
01595       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01596       // }
01597       // iout << endi;
01598         } else {
01599           patchPairsReordered = 1;
01600         }
01601  
01602         int nlc = localComputeRecords.size();
01603         int nrc = remoteComputeRecords.size();
01604         computeRecords.resize(nlc+nrc);
01605         compute_record *cr = computeRecords.begin();
01606         for ( int i=0; i<nrc; ++i ) {
01607           *(cr++) = remoteComputeRecords[i];
01608         }
01609         for ( int i=0; i<nlc; ++i ) {
01610           *(cr++) = localComputeRecords[i];
01611         }
01612 
01613         // patch_pair_num[i] = number of patch pairs that involve patch i
01614         patch_pair_num.resize(npatches);
01615         for ( int i=0; i<npatches; ++i ) {
01616           patchRecords[activePatches[i]].localIndex = i;
01617           patch_pair_num[i] = 0;
01618         }
01619 
01620         int ncomputes = computeRecords.size();
01621         patch_pairs.resize(ncomputes);
01622         for ( int i=0; i<ncomputes; ++i ) {
01623           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01624           int lp1 = patchRecords[cr.pid[0]].localIndex;
01625           int lp2 = patchRecords[cr.pid[1]].localIndex;
01626           patch_pair_num[lp1]++;
01627           if (lp1 != lp2) patch_pair_num[lp2]++;
01628           patch_pair &pp = patch_pairs[i];
01629           pp.offset.x = cr.offset.x;
01630           pp.offset.y = cr.offset.y;
01631           pp.offset.z = cr.offset.z;
01632         }
01633 
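              // Second pass: the per-patch counts are now complete, so store
              // them in each pair record (patch1_num_pairs / patch2_num_pairs).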
01634         for ( int i=0; i<ncomputes; ++i ) {
01635           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01636           int lp1 = patchRecords[cr.pid[0]].localIndex;
01637           int lp2 = patchRecords[cr.pid[1]].localIndex;
01638           patch_pair &pp = patch_pairs[i];
01639           pp.patch1_ind = lp1;
01640           pp.patch2_ind = lp2;
01641           pp.patch1_num_pairs = patch_pair_num[lp1];
01642           pp.patch2_num_pairs = patch_pair_num[lp2];
01643         }
01644 
01645       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01646         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01647                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01648                  localComputeRecords.size(), remoteComputeRecords.size());
01649       }
01650     }  // computesChanged
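          // Computes unchanged but pairs not yet reordered: permute compute
          // records and patch pairs according to block_order (presumably the
          // completion order from the previous streaming step), keeping remote
          // records ahead of local ones.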
01651     else if ( ! patchPairsReordered ) {
01652       patchPairsReordered = 1;
01653       int len = patch_pairs.size();
01654       int nlc = localComputeRecords.size();
01655       int nrc = remoteComputeRecords.size();
01656       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01657       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01658       ResizeArray<patch_pair> new_patch_pairs(len);
01659       int irc=nrc;
01660       int ilc=len;
01661       for ( int i=0; i<len; ++i ) {
01662         int boi = block_order[i];
01663         int dest;
01664         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01665         new_computeRecords[dest] = computeRecords[boi];
01666         new_patch_pairs[dest] = patch_pairs[boi];
01667       }
01668       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01669       computeRecords.swap(new_computeRecords);
01670       patch_pairs.swap(new_patch_pairs);
01671     }
01672 
01673     int istart = 0;
01674     int i;
01675     for ( i=0; i<npatches; ++i ) {
01676       if ( i == localActivePatches.size() ) {
01677         num_local_atoms = istart;
01678       }
01679       patch_record &pr = patchRecords[activePatches[i]];
01680       pr.localStart = istart;
01681       int natoms = pr.p->getNumAtoms();
01682       int nfreeatoms = natoms;
01683       if ( fixedAtomsOn ) {
01684         const CompAtomExt *aExt = pr.xExt;
01685         for ( int j=0; j<natoms; ++j ) {
01686           if ( aExt[j].atomFixed ) --nfreeatoms;
01687         }
01688       }
01689       pr.numAtoms = natoms;
01690       pr.numFreeAtoms = nfreeatoms;
01691       istart += natoms;
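            // pad each patch to the next 16-atom boundary (a full extra 16 when
            // natoms is already a multiple of 16), presumably for aligned
            // access on the GPU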
01692       istart += 16 - (natoms & 15);
01693     }
01694     if ( i == localActivePatches.size() ) {
01695       num_local_atoms = istart;
01696     }
01697     num_atoms = istart;
01698     num_remote_atoms = num_atoms - num_local_atoms;
01699     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01700     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01701     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01702     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01703     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01704     if (simParams->GBISOn) {
01705       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01706       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01707       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01708       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01709       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01710       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01711     }
01712 
01713     int bfstart = 0;
01714     int exclmask_start = 0;
01715     int ncomputes = computeRecords.size();
01716     for ( int i=0; i<ncomputes; ++i ) {
01717       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01718       int p1 = cr.pid[0];
01719       int p2 = cr.pid[1];
01720       patch_pair &pp = patch_pairs[i];
01721       pp.patch1_start = patchRecords[p1].localStart;
01722       pp.patch1_size  = patchRecords[p1].numAtoms;
01723       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01724       pp.patch2_start = patchRecords[p2].localStart;
01725       pp.patch2_size  = patchRecords[p2].numAtoms;
01726       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01727       pp.plist_start = bfstart;
01728       // size1*size2 = number of WARPSIZE x WARPSIZE tile pairs for this patch pair
01729       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01730       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01731       pp.plist_size = (size1*size2-1)/32+1;
01732       bfstart += pp.plist_size;
01733       pp.exclmask_start = exclmask_start;
01734       exclmask_start += size1*size2;
01735     } //for ncomputes
01736 
01737     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01738       activePatches.size(), num_atoms, bfstart, 
01739       exclmask_start);
01740 
01741   }  // atomsChanged || computesChanged
01742 
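        // Folds Coulomb's constant, the nonbonded scaling factor, and the
        // inverse dielectric into the charges (sqrt so the product of two
        // scaled charges carries the full prefactor).  Within this routine it
        // appears to be used only by the disabled (#else) conversion path below.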
01743   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01744 
01745   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01746   float maxAtomMovement = 0.;
01747   float maxPatchTolerance = 0.;
01748 
01749   for ( int i=0; i<activePatches.size(); ++i ) {
01750     patch_record &pr = patchRecords[activePatches[i]];
01751 
01752     float maxMove = pr.p->flags.maxAtomMovement;
01753     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01754 
01755     float maxTol = pr.p->flags.pairlistTolerance;
01756     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01757 
01758     int start = pr.localStart;
01759     int n = pr.numAtoms;
01760     const CompAtom *a = pr.x;
01761     const CompAtomExt *aExt = pr.xExt;
01762     if ( atomsChanged ) {
01763 
01764       atom_param *ap = atom_params + start;
01765       for ( int k=0; k<n; ++k ) {
01766         int j = aExt[k].sortOrder;
01767         ap[k].vdw_type = a[j].vdwType;
01768         vdw_types[start + k] = a[j].vdwType;
01769         ap[k].index = aExt[j].id;
01770 #ifdef MEM_OPT_VERSION
01771         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01772         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01773 #else // ! MEM_OPT_VERSION
01774         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01775         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01776 #endif // MEM_OPT_VERSION
01777       }
01778     }
01779     {
01780 #if 1
01781       const CudaAtom *ac = pr.p->getCudaAtomList();
01782       CudaAtom *ap = atoms + start;
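            // note: 'atom' is the kernel-side struct, assumed here to match
            // CudaAtom in size (four floats), so this copies n CudaAtom entries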
01783       memcpy(ap, ac, sizeof(atom)*n);
01784 #else
01785       Vector center =
01786         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01787       atom *ap = atoms + start;
01788       for ( int k=0; k<n; ++k ) {
01789         int j = aExt[k].sortOrder;
01790         ap[k].position.x = a[j].position.x - center.x;
01791         ap[k].position.y = a[j].position.y - center.y;
01792         ap[k].position.z = a[j].position.z - center.z;
01793         ap[k].charge = charge_scaling * a[j].charge;
01794       }
01795 #endif
01796     }
01797   }
01798 
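        // Pairlist bookkeeping: saved pairlists are reused only while they are
        // still valid and 2 * maxAtomMovement <= pairlistTolerance; otherwise a
        // pairlist warning is counted and this step runs without pairlists.
        // When new lists are saved, the list cutoff is extended by twice the
        // largest patch tolerance.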
01799   savePairlists = 0;
01800   usePairlists = 0;
01801   if ( flags.savePairlists ) {
01802     savePairlists = 1;
01803     usePairlists = 1;
01804   } else if ( flags.usePairlists ) {
01805     if ( ! pairlistsValid ||
01806          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01807       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01808     } else {
01809       usePairlists = 1;
01810     }
01811   }
01812   if ( ! usePairlists ) {
01813     pairlistsValid = 0;
01814   }
01815   float plcutoff = cutoff;
01816   if ( savePairlists ) {
01817     pairlistsValid = 1;
01818     pairlistTolerance = 2. * maxPatchTolerance;
01819     plcutoff += pairlistTolerance;
01820   }
01821   plcutoff2 = plcutoff * plcutoff;
01822 
01823   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01824   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01825 
01826 #if 0
01827   // calculate warp divergence
01828   if ( 1 ) {
01829     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01830     Lattice &lattice = flags.lattice;
01831     float3 lata, latb, latc;
01832     lata.x = lattice.a().x;
01833     lata.y = lattice.a().y;
01834     lata.z = lattice.a().z;
01835     latb.x = lattice.b().x;
01836     latb.y = lattice.b().y;
01837     latb.z = lattice.b().z;
01838     latc.x = lattice.c().x;
01839     latc.y = lattice.c().y;
01840     latc.z = lattice.c().z;
01841 
01842     int ncomputes = computeRecords.size();
01843     for ( int ic=0; ic<ncomputes; ++ic ) {
01844       patch_pair &pp = patch_pairs[ic];
01845       atom *a1 = atoms + pp.patch1_atom_start;
01846       int n1 = pp.patch1_size;
01847       atom *a2 = atoms + pp.patch2_atom_start;
01848       int n2 = pp.patch2_size;
01849       float offx = pp.offset.x * lata.x
01850                + pp.offset.y * latb.x
01851                + pp.offset.z * latc.x;
01852       float offy = pp.offset.x * lata.y
01853                + pp.offset.y * latb.y
01854                + pp.offset.z * latc.y;
01855       float offz = pp.offset.x * lata.z
01856                + pp.offset.y * latb.z
01857                + pp.offset.z * latc.z;
01858       // CkPrintf("%f %f %f\n", offx, offy, offz);
01859       int atoms_tried = 0;
01860       int blocks_tried = 0;
01861       int atoms_used = 0;
01862       int blocks_used = 0;
01863       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01864         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01865           int block_used = 0;
01866           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01867             int atom_used = 0;
01868             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01869               float dx = offx + a1[i].position.x - a2[j].position.x;
01870               float dy = offy + a1[i].position.y - a2[j].position.y;
01871               float dz = offz + a1[i].position.z - a2[j].position.z;
01872               float r2 = dx*dx + dy*dy + dz*dz;
01873               if ( r2 < cutoff2 ) atom_used = 1;
01874             }
01875             ++atoms_tried;
01876             if ( atom_used ) { block_used = 1; ++atoms_used; }
01877           }
01878           ++blocks_tried;
01879           if ( block_used ) { ++blocks_used; }
01880         }
01881       }
01882       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01883                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01884                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01885     }
01886   }
01887 #endif
01888 
01889   } // !GBISOn || gbisPhase == 1
01890 
01891   //Do GBIS
01892   if (simParams->GBISOn) {
01893     //open GBIS boxes depending on phase
01894     for ( int i=0; i<activePatches.size(); ++i ) {
01895       patch_record &pr = master->patchRecords[activePatches[i]];
01896       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01897       if (gbisPhase == 1) {
01898         //Copy GBIS intRadius to Host
01899         if (atomsChanged) {
01900           float *intRad0 = intRad0H + pr.localStart;
01901           float *intRadS = intRadSH + pr.localStart;
01902           for ( int k=0; k<pr.numAtoms; ++k ) {
01903             int j = pr.xExt[k].sortOrder;
01904             intRad0[k] = pr.intRad[2*j+0];
01905             intRadS[k] = pr.intRad[2*j+1];
01906           }
01907         }
01908       } else if (gbisPhase == 2) {
01909         float *bornRad = bornRadH + pr.localStart;
01910         for ( int k=0; k<pr.numAtoms; ++k ) {
01911           int j = pr.xExt[k].sortOrder;
01912           bornRad[k] = pr.bornRad[j];
01913         }
01914       } else if (gbisPhase == 3) {
01915         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01916         for ( int k=0; k<pr.numAtoms; ++k ) {
01917           int j = pr.xExt[k].sortOrder;
01918           dHdrPrefix[k] = pr.dHdrPrefix[j];
01919         }
01920       } // end phases
01921     } // end for patches
01922   } // if GBISOn
01923 
01924   kernel_time = CkWallTimer();
01925   kernel_launch_state = 1;
01926   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01927   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01928 }
01929 
01930 void cuda_check_remote_calc(void *arg, double walltime) {
01931   // in theory we only need end_remote_calc, but overlap isn't reliable
01932   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01933   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01934 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01935     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01936 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01937   } else {
01938     CcdCallBacksReset(0,walltime);  // fix Charm++
01939     CUDA_POLL(cuda_check_remote_calc, arg);
01940   }
01941 }
01942 
01943 void cuda_check_local_calc(void *arg, double walltime) {
01944   // in theory we only need end_local_calc, but overlap isn't reliable
01945   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01946   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01947 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01948     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01949 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01950   } else {
01951     CcdCallBacksReset(0,walltime);  // fix Charm++
01952     CUDA_POLL(cuda_check_local_calc, arg);
01953   }
01954 }
01955 
01956 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01957 
01958 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01959 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01960 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01961 workStarted, gbisPhase, kernel_launch_state, pe)
01962 
01963   float3 lata, latb, latc;
01964   lata.x = lattice.a().x;
01965   lata.y = lattice.a().y;
01966   lata.z = lattice.a().z;
01967   latb.x = lattice.b().x;
01968   latb.y = lattice.b().y;
01969   latb.z = lattice.b().z;
01970   latc.x = lattice.c().x;
01971   latc.y = lattice.c().y;
01972   latc.z = lattice.c().z;
01973   SimParameters *simParams = Node::Object()->simParameters;
01974 
01975   // Set GPU device ID
01976   cudaSetDevice(deviceID);
01977 
01978   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01979 
01980   double walltime;
01981   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01982     walltime = CkWallTimer();
01983     CcdCallBacksReset(0,walltime);  // fix Charm++
01984   }
01985 
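        // kernel_launch_state is a simple state machine: case 1 launches the
        // remote-interaction kernels, case 2 the local ones, and the default
        // case marks the GPU as available again.  Fall-through between cases is
        // intentional; when the GPU is shared and grids are not merged, each
        // case breaks out early after queuing a polling callback.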
01986   switch ( kernel_launch_state ) {
01988 // Remote
01989   case 1:
01990 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01991     ++kernel_launch_state;
01992     //gpu_is_mine = 0;
01993     deviceCUDA->setGpuIsMine(0);
01994     remote_submit_time = walltime;
01995 
01996     if (!simParams->GBISOn || gbisPhase == 1) {
01997       // cudaEventRecord(start_upload, stream);
01998       if ( atomsChanged ) {
01999         cuda_bind_atom_params(atom_params);
02000         cuda_bind_vdw_types(vdw_types);
02001       }
02002       if ( simParams->GBISOn) {
02003         cuda_bind_GBIS_psiSum(psiSumH);
02004         if ( atomsChanged ) {
02005           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
02006         }
02007       }
02008       atomsChanged = 0;
02009       cuda_bind_atoms((const atom *)atoms);
02010       cuda_bind_forces(forces, slow_forces);
02011       cuda_bind_virials(virials, force_ready_queue, block_order);
02012       if ( simParams->GBISOn) {
02013         cuda_bind_GBIS_energy(energy_gbis);
02014       }
02015       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02016       //call CUDA Kernels
02017 
02018       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
02019                             0,remoteComputeRecords.size(),
02020                             remoteComputeRecords.size()+localComputeRecords.size(),
02021                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
02022       if (simParams->GBISOn) {
02023         cuda_GBIS_P1(
02024           0,remoteComputeRecords.size(),
02025                       localActivePatches.size(),remoteActivePatches.size(),
02026                       simParams->alpha_cutoff-simParams->fsMax,
02027                       simParams->coulomb_radius_offset,
02028                       lata, latb, latc, stream);
02029       }
02030       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
02031       //    num_local_atom_records,num_remote_atom_records);
02032       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
02033         cudaEventRecord(end_remote_download, stream);
02034       }
02035       if ( streaming ) {
02036         force_ready_queue_next = 0;
02037         CUDA_POLL(cuda_check_progress,this);
02038       } else {
02039         CUDA_POLL(cuda_check_remote_progress,this);
02040       }
02041       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
02042         CUDA_POLL(cuda_check_remote_calc,this);
02043         break;
02044       }
02045     } // !GBIS or gbisPhase==1
02046     if (simParams->GBISOn) {
02047       if (gbisPhase == 1) {
02048         //GBIS P1 Kernel launched in previous code block
02049       } else if (gbisPhase == 2) {
02050 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
02051         // cudaEventRecord(start_upload, stream);
02052         cuda_bind_GBIS_bornRad(bornRadH);
02053         cuda_bind_GBIS_dEdaSum(dEdaSumH);
02054         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02055         cuda_GBIS_P2(
02056           0,remoteComputeRecords.size(),
02057           localActivePatches.size(),remoteActivePatches.size(),
02058           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
02059           simParams->nonbondedScaling, simParams->kappa,
02060           (simParams->switchingActive ? simParams->switchingDist : -1.0),
02061           simParams->dielectric, simParams->solvent_dielectric,
02062           lata, latb, latc,
02063           doEnergy, doSlow, stream
02064           );
02065         cudaEventRecord(end_remote_download, stream);
02066         CUDA_POLL(cuda_check_remote_progress,this);
02067       } else if (gbisPhase == 3) {
02068 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
02069         // cudaEventRecord(start_upload, stream);
02070         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
02071         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02072         if (doSlow)
02073           cuda_GBIS_P3(
02074           0,remoteComputeRecords.size(),
02075           localActivePatches.size(),remoteActivePatches.size(),
02076           (simParams->alpha_cutoff-simParams->fsMax),
02077           simParams->coulomb_radius_offset,
02078           simParams->nonbondedScaling,
02079           lata, latb, latc, stream
02080           );
02081         cudaEventRecord(end_remote_download, stream);
02082         CUDA_POLL(cuda_check_remote_progress,this);
02083       }
02084     }
02085 
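      // No break at the end of case 1: when the GPU is not shared (or grids are
      // merged), control falls straight through to the local launches below.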
02087 // Local
02088   case 2:
02089 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
02090     ++kernel_launch_state;
02091     //gpu_is_mine = 0;
02092     deviceCUDA->setGpuIsMine(0);
02093 
02094     if ( stream2 != stream ) {
02095       // needed to ensure that upload is finished
02096       cudaStreamWaitEvent(stream2, start_calc, 0);
02097       // priorities do not prevent local from launching ahead
02098       // of remote, so delay local stream with a small memset
02099       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
02100     }
02101 
02102     if (!simParams->GBISOn || gbisPhase == 1) {
02103 
02104       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
02105                             remoteComputeRecords.size(),localComputeRecords.size(),
02106                             remoteComputeRecords.size()+localComputeRecords.size(),
02107                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
02108       if (simParams->GBISOn) {
02109         cuda_GBIS_P1(
02110                      remoteComputeRecords.size(),localComputeRecords.size(),
02111                      0,localActivePatches.size(),
02112                      simParams->alpha_cutoff-simParams->fsMax,
02113                      simParams->coulomb_radius_offset,
02114                      lata, latb, latc, stream2 );
02115       }
02116       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
02117       //    0,num_local_atom_records);
02118       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
02119       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
02120         cudaEventRecord(end_local_download, stream2);
02121       }
02122       if ( ! streaming && workStarted == 2 ) {
02123         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
02124 cuda_check_local_progress\n", CkMyPe())
02125         CUDA_POLL(cuda_check_local_progress,this);
02126       }
02127       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
02128         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
02129 cuda_check_local_calc\n", CkMyPe())
02130         CUDA_POLL(cuda_check_local_calc,this);
02131         break;
02132       }
02133 
02134     } // !GBIS or gbisPhase==1
02135     if (simParams->GBISOn) {
02136       if (gbisPhase == 1) {
02137         //GBIS P1 Kernel launched in previous code block
02138       } else if (gbisPhase == 2) {
02139 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
02140         cuda_GBIS_P2(
02141           remoteComputeRecords.size(),localComputeRecords.size(),
02142           0,localActivePatches.size(),
02143           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
02144           simParams->nonbondedScaling, simParams->kappa,
02145           (simParams->switchingActive ? simParams->switchingDist : -1.0),
02146           simParams->dielectric, simParams->solvent_dielectric,
02147           lata, latb, latc,
02148           doEnergy, doSlow, stream2
02149           );
02150         cudaEventRecord(end_local_download, stream2);
02151         if ( workStarted == 2 ) {
02152           CUDA_POLL(cuda_check_local_progress,this);
02153         }
02154       } else if (gbisPhase == 3) {
02155 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
02156         if (doSlow)
02157           cuda_GBIS_P3(
02158           remoteComputeRecords.size(),localComputeRecords.size(),
02159           0,localActivePatches.size(),
02160           (simParams->alpha_cutoff-simParams->fsMax),
02161           simParams->coulomb_radius_offset,
02162           simParams->nonbondedScaling,
02163           lata, latb, latc, stream2
02164           );
02165         cudaEventRecord(end_local_download, stream2);
02166         if ( workStarted == 2 ) {
02167           CUDA_POLL(cuda_check_local_progress,this);
02168         }
02169       } // phases
02170     } // GBISOn
02171     if ( simParams->PMEOn && simParams->PMEOffload  && !simParams->usePMECUDA) break;
02172 
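        // Reached by fall-through from case 2 unless the break above (offloaded
        // PME) intervenes; marks the GPU as available again.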
02173   default:
02174 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
02175     //gpu_is_mine = 1;
02176     deviceCUDA->setGpuIsMine(1);
02177     break;
02178   } // switch
02179 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
02180 }
02181 
02182 void ComputeNonbondedCUDA::messageFinishPatch(int flindex) {
02183   int pid = activePatches[flindex];
02184   patch_record &pr = patchRecords[pid];
02185   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
02186   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
02187 }
02188 
02189 void ComputeNonbondedCUDA::finishPatch(int flindex) {
02190   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
02191   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
02192   pr.r = pr.forceBox->open();
02193   finishPatch(pr);
02194   pr.positionBox->close(&(pr.x));
02195   pr.forceBox->close(&(pr.r));
02196 }
02197 
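      // Accumulates the GPU force arrays for one patch back into its Results
      // force arrays, using sortOrder to map from the CUDA atom ordering back
      // to the patch ordering; slow (full electrostatics) forces are added only
      // when doSlow is set.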
02198 void ComputeNonbondedCUDA::finishPatch(patch_record &pr) {
02199   int start = pr.localStart;
02200   const CompAtomExt *aExt = pr.xExt;
02201   int nfree = pr.numAtoms;
02202   pr.f = pr.r->f[Results::nbond];
02203   Force *f = pr.f;
02204   Force *f_slow = pr.r->f[Results::slow];
02205   const CompAtom *a = pr.x;
02206   // int *ao = atom_order + start;
02207   float4 *af = master->forces + start;
02208   float4 *af_slow = master->slow_forces + start;
02209   // only free atoms return forces
02210   for ( int k=0; k<nfree; ++k ) {
02211     int j = aExt[k].sortOrder;
02212     f[j].x += af[k].x;
02213     f[j].y += af[k].y;
02214     f[j].z += af[k].z;
02215     // wcount += af[k].w;
02216     // virial += af[k].w;
02217     if ( doSlow ) {
02218       f_slow[j].x += af_slow[k].x;
02219       f_slow[j].y += af_slow[k].y;
02220       f_slow[j].z += af_slow[k].z;
02221       // virial_slow += af_slow[k].w;
02222     }
02223   }
02224 }
02225 
02226 //dtanner
02227 int ComputeNonbondedCUDA::finishWork() {
02228   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
02229 
02230   if ( master == this ) {
02231     for ( int i = 0; i < numSlaves; ++i ) {
02232       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
02233     }
02234   }
02235 
02236 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
02237 CkMyPe(), workStarted, gbisPhase)
02238 
02239   Molecule *mol = Node::Object()->molecule;
02240   SimParameters *simParams = Node::Object()->simParameters;
02241 
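        // The remote pass (workStarted == 1) processes remoteHostedPatches;
        // the local pass (workStarted == 2) processes localHostedPatches.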
02242   ResizeArray<int> &patches( workStarted == 1 ?
02243                                 remoteHostedPatches : localHostedPatches );
02244 
02245   // long long int wcount = 0;
02246   // double virial = 0.;
02247   // double virial_slow = 0.;
02248   for ( int i=0; i<patches.size(); ++i ) {
02249     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
02250     patch_record &pr = master->patchRecords[patches[i]];
02251 
02252     if ( !simParams->GBISOn || gbisPhase == 1 ) {
02254 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
02255       pr.r = pr.forceBox->open();
02256     } // !GBISOn || gbisPhase==1
02257 
02258     int start = pr.localStart;
02259     const CompAtomExt *aExt = pr.xExt;
02260     if ( !simParams->GBISOn || gbisPhase == 3 ) {
02261       finishPatch(pr);
02262     } // !GBISOn || gbisPhase == 3
02263 
02264 #if 0
02265     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
02266       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
02267         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
02268         af[j].w);
02269     }
02270 #endif
02271 
02272     //Close Boxes depending on Phase
02273     if (simParams->GBISOn) {
02274       if (gbisPhase == 1) {
02275         //Copy psiSum from Host to Patch Box
02276         GBReal *psiSumMaster = master->psiSumH + start;
02277         for ( int k=0; k<pr.numAtoms; ++k ) {
02278           int j = aExt[k].sortOrder;
02279           pr.psiSum[j] += psiSumMaster[k];
02280         }
02281 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
02282         pr.psiSumBox->close(&(pr.psiSum));
02283 
02284       } else if (gbisPhase == 2) {
02285         //Copy dEdaSum from Host to Patch Box
02286         GBReal *dEdaSumMaster = master->dEdaSumH + start;
02287         for ( int k=0; k<pr.numAtoms; ++k ) {
02288           int j = aExt[k].sortOrder;
02289           pr.dEdaSum[j] += dEdaSumMaster[k];
02290         }
02291 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
02292         pr.dEdaSumBox->close(&(pr.dEdaSum));
02293 
02294       } else if (gbisPhase == 3) {
02295 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
02296         pr.intRadBox->close(&(pr.intRad)); //box 6
02297         pr.bornRadBox->close(&(pr.bornRad)); //box 7
02298         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
02299         pr.positionBox->close(&(pr.x)); //box 0
02300         pr.forceBox->close(&(pr.r));
02301       } //end phases
02302     } else { //not GBIS
02303 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
02304       pr.positionBox->close(&(pr.x));
02305       pr.forceBox->close(&(pr.r));
02306     }
02307   }// end for
02308 
02309 #if 0
02310   virial *= (-1./6.);
02311   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
02312   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
02313   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
02314   if ( doSlow ) {
02315     virial_slow *= (-1./6.);
02316     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
02317     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
02318     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02319   }
02320 #endif
02321 
02322   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
02323        ( localHostedPatches.size() || master == this ) ) {
02324     GBISP("not finished, call again\n");
02325     return 0;  // not finished, call again
02326   }
02327 
02328   if ( master != this ) {  // finished
02329     GBISP("finished\n");
02330     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02331     atomsChanged = 0;
02332     return 1;
02333   }
02334 
02335   cuda_timer_total += kernel_time;
02336 
02337   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02338 
02339     atomsChanged = 0;
02340     finishReductions();
02341 
02342   } // !GBISOn || gbisPhase==3  
02343 
02344   // Next GBIS Phase
02345 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02346     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02347 
02348   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02349   return 1;  // finished and ready for next step
02350 }
02351 
02352 
02353 void ComputeNonbondedCUDA::finishReductions() {
02354   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02355 
02356   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02357 
02358   SimParameters *simParams = Node::Object()->simParameters;
02359 
02360   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02361     if ( patchPairsReordered++ == 2) {
02362       int patch_len = patchRecords.size();
02363       ResizeArray<int> plast(patch_len);
02364       for ( int i=0; i<patch_len; ++i ) {
02365         plast[i] = -1;
02366       }
02367       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02368       for ( int i=0; i<order_len; ++i ) {
02369         plast[computeRecords[block_order[i]].pid[0]] = i;
02370         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02371       }
02372       iout << endi;
02373       for ( int i=0; i<patch_len; ++i ) {
02374         iout << "patch_last " << i << " " << plast[i] << "\n";
02375       }
02376       iout << endi;
02377     }
02378   }
02379 
02380   {
02381     Tensor virial_tensor;
02382     BigReal energyv = 0.;
02383     BigReal energye = 0.;
02384     BigReal energys = 0.;
02385     int nexcluded = 0;
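          // Each patch contributes a 16-float record: [0..8] the virial tensor,
          // [9..11] the van der Waals, electrostatic, and slow electrostatic
          // energies, and [12] (reinterpreted as int) the exclusion count.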
02386     for ( int i = 0; i < num_virials; ++i ) {
02387       virial_tensor.xx += virials[16*i];
02388       virial_tensor.xy += virials[16*i+1];
02389       virial_tensor.xz += virials[16*i+2];
02390       virial_tensor.yx += virials[16*i+3];
02391       virial_tensor.yy += virials[16*i+4];
02392       virial_tensor.yz += virials[16*i+5];
02393       virial_tensor.zx += virials[16*i+6];
02394       virial_tensor.zy += virials[16*i+7];
02395       virial_tensor.zz += virials[16*i+8];
02396       energyv += virials[16*i+9];
02397       energye += virials[16*i+10];
02398       energys += virials[16*i+11];
02399       nexcluded += ((int *)virials)[16*i+12];
02400       if (simParams->GBISOn) {
02401         energye += energy_gbis[i];
02402       }
02403     }
02404     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02405     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02406     if ( doEnergy ) {
02407       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02408       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02409       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02410     }
02411   }
02412   if ( doSlow ) {
02413     Tensor virial_slow_tensor;
02414     for ( int i = 0; i < num_virials; ++i ) {
02415       virial_slow_tensor.xx += slow_virials[16*i];
02416       virial_slow_tensor.xy += slow_virials[16*i+1];
02417       virial_slow_tensor.xz += slow_virials[16*i+2];
02418       virial_slow_tensor.yx += slow_virials[16*i+3];
02419       virial_slow_tensor.yy += slow_virials[16*i+4];
02420       virial_slow_tensor.yz += slow_virials[16*i+5];
02421       virial_slow_tensor.zx += slow_virials[16*i+6];
02422       virial_slow_tensor.zy += slow_virials[16*i+7];
02423       virial_slow_tensor.zz += slow_virials[16*i+8];
02424     }
02425     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02426   }
02427 
02428   reduction->submit();
02429 
02430   cuda_timer_count++;
02431   if ( simParams->outputCudaTiming &&
02432         step % simParams->outputCudaTiming == 0 ) {
02433 
02434     // int natoms = mol->numAtoms; 
02435     // double wpa = wcount;  wpa /= natoms;
02436 
02437     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02438     //  kernel_time * 1.e3, time * 1.e3, wpa);
02439 
02440 #if 0
02441     float upload_ms, remote_calc_ms;
02442     float local_calc_ms, total_ms;
02443     cuda_errcheck("before event timers");
02444     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02445     cuda_errcheck("in event timer 1");
02446     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02447     cuda_errcheck("in event timer 2");
02448     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02449     cuda_errcheck("in event timer 3");
02450     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02451     cuda_errcheck("in event timer 4");
02452     cuda_errcheck("in event timers");
02453 
02454     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02455              CkMyPe(), upload_ms, remote_calc_ms,
02456              local_calc_ms, total_ms);
02457 #endif
02458 
02459     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02460       cuda_timer_total /= cuda_timer_count;
02461       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02462                step, cuda_timer_total * 1.e3, CkMyPe());
02463     }
02464     cuda_timer_count = 0;
02465     cuda_timer_total = 0;
02466   }
02467 
02468 }
02469 
02470 
02471 #endif  // NAMD_CUDA
02472 
