ComputeNonbondedCUDA.C

00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDAKernel.h"
00014 #include "ComputeNonbondedCUDA.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include "Priorities.h"
00019 #include <algorithm>
00020 
00021 #include "NamdTypes.h"
00022 #include "DeviceCUDA.h"
00023 #include "CudaUtils.h"
00024 
00025 //#define PRINT_GBIS
00026 #undef PRINT_GBIS
00027 
00028 #ifdef NAMD_CUDA
00029 
00030 #ifdef WIN32
00031 #define __thread __declspec(thread)
00032 #endif
00033 
00034 extern __thread int max_grid_size;
00035 
00036 extern __thread cudaStream_t stream;
00037 extern __thread cudaStream_t stream2;
00038 
00039 extern __thread DeviceCUDA *deviceCUDA;
00040 
00041 void cuda_errcheck(const char *msg) {
00042   cudaError_t err;
00043   if ((err = cudaGetLastError()) != cudaSuccess) {
00044     char host[128];
00045 #ifdef NOHOSTNAME
00046     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00047 #else
00048     gethostname(host, 128);  host[127] = 0;
00049 #endif
00050     char devstr[128] = "";
00051     int devnum;  // only meaningful if cudaGetDevice succeeds below
00052     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00053       sprintf(devstr, " device %d", devnum);
00054       cudaDeviceProp deviceProp;
00055       if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
00056         sprintf(devstr, " device %d pci %x:%x:%x", devnum,
00057           deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00058       }
00059     }
00060     char errmsg[1024];
00061     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00062     NAMD_die(errmsg);
00063   }
00064 }
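// Typical usage, as later in this file: call cuda_errcheck() immediately
// after a CUDA runtime call, while cudaGetLastError() still reflects that
// call, e.g.
//   cudaStreamCreate(&stream);
//   cuda_errcheck("cudaStreamCreate");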
00065 
00066 static inline bool sortop_bitreverse(int a, int b) {
00067   if ( a == b ) return 0; 
00068   for ( int bit = 1; bit; bit *= 2 ) {
00069     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00070   }
00071   return 0;
00072 }
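// sortop_bitreverse orders integers by their bit-reversed value (comparing
// the least significant bit first), e.g. 0,1,2,3 sort as 0,2,1,3.  It is
// used below as a tie-breaker on ranks so that equal-priority work is spread
// across ranks rather than clustered (an interpretation of its use, not
// stated in the code).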
00073 
00074 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00075 static __thread ComputeMgr *computeMgr = 0;
00076 
00077 void send_build_cuda_force_table() {
00078   computeMgr->sendBuildCudaForceTable();
00079 }
00080 
00081 void build_cuda_force_table() {
00082   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00083   ComputeNonbondedCUDA::build_lj_table();
00084   ComputeNonbondedCUDA::build_force_table();
00085 }
00086 
00087 void ComputeNonbondedCUDA::build_lj_table() {  // static
00088 
00089   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00090   const int dim = ljTable->get_table_dim();
00091 
00092   // round dim up to odd multiple of 16
00093   int tsize = (((dim+16+31)/32)*32)-16;
00094   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
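  // e.g. dim = 16 keeps tsize = 16, dim = 17 gives 48, dim = 60 gives 80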
00095 
00096   float2 *t = new float2[tsize*tsize];
00097   float2 *row = t;
00098   for ( int i=0; i<dim; ++i, row += tsize ) {
00099     for ( int j=0; j<dim; ++j ) {
00100       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00101       row[j].x = e->A * scaling;
00102       row[j].y = e->B * scaling;
00103     }
00104   }
00105 
00106   cuda_bind_lj_table(t,tsize);
00107   delete [] t;
00108 
00109   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00110     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00111   }
00112 }
00113 
00114 void ComputeNonbondedCUDA::build_force_table() {  // static
00115 
00116   float4 t[FORCE_TABLE_SIZE];
00117   float4 et[FORCE_TABLE_SIZE];  // energy table
00118 
00119   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00120   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00121   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00122   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00123 
00124   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00125   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00126     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00127     r2list[i] = r*r + r2_delta;
00128   }
00129 
00130   union { double f; int32 i[2]; } byte_order_test;
00131   byte_order_test.f = 1.0;  // should occupy high-order bits only
00132   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
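  // In the loop below, r2iilist[2*i] >> 14 takes the high-order 32-bit word
  // of the double r2list[i] (sign, 11-bit exponent, top 20 mantissa bits)
  // and keeps the exponent plus the top 6 mantissa bits, giving 64 table
  // intervals per power of two of r2; r2_delta_expc then offsets this raw
  // value so that table_i starts at zero, matching the CPU-side
  // r2_delta / r2_delta_exp convention.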
00133 
00134   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00135     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00136     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00137 
00138     if ( r > cutoff ) {
00139       t[i].x = 0.;
00140       t[i].y = 0.;
00141       t[i].z = 0.;
00142       t[i].w = 0.;
00143       et[i].x = 0.;
00144       et[i].y = 0.;
00145       et[i].z = 0.;
00146       et[i].w = 0.;
00147       continue;
00148     }
00149 
00150     BigReal diffa = r2list[i] - r2_table[table_i];
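    // Each *_table entry holds four cubic coefficients (a,b,c,d) in the
    // interpolation variable diffa; the blocks below evaluate them by
    // Horner's rule:
    //   ener = a + b*diffa + c*diffa^2 + d*diffa^3
    //   grad = d(ener)/d(diffa) = b + 2*c*diffa + 3*d*diffa^2
    // with t[] receiving the force-factor entries and et[] the energies.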
00151 
00152     // coulomb 1/r or fast force
00153     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00154     {
00155       BigReal table_a = fast_table[4*table_i];
00156       BigReal table_b = fast_table[4*table_i+1];
00157       BigReal table_c = fast_table[4*table_i+2];
00158       BigReal table_d = fast_table[4*table_i+3];
00159       BigReal grad =
00160                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00161       t[i].x = 2. * grad;
00162       BigReal ener = table_a + diffa *
00163                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00164       et[i].x = ener;
00165     }
00166 
00167 
00168     // pme correction for slow force
00169     // t[i].w = 0.;
00170     {
00171       BigReal table_a = scor_table[4*table_i];
00172       BigReal table_b = scor_table[4*table_i+1];
00173       BigReal table_c = scor_table[4*table_i+2];
00174       BigReal table_d = scor_table[4*table_i+3];
00175       BigReal grad =
00176                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00177       t[i].w = 2. * grad;
00178       BigReal ener = table_a + diffa *
00179                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00180       et[i].w = ener;
00181     }
00182 
00183 
00184     // vdw 1/r^6
00185     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00186     {
00187       BigReal table_a = vdwb_table[4*table_i];
00188       BigReal table_b = vdwb_table[4*table_i+1];
00189       BigReal table_c = vdwb_table[4*table_i+2];
00190       BigReal table_d = vdwb_table[4*table_i+3];
00191       BigReal grad =
00192                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00193       t[i].y = 2. * -1. * grad;
00194       BigReal ener = table_a + diffa *
00195                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00196       et[i].y = -1. * ener;
00197     }
00198 
00199 
00200     // vdw 1/r^12
00201     // t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00202     {
00203       BigReal table_a = vdwa_table[4*table_i];
00204       BigReal table_b = vdwa_table[4*table_i+1];
00205       BigReal table_c = vdwa_table[4*table_i+2];
00206       BigReal table_d = vdwa_table[4*table_i+3];
00207       BigReal grad =
00208                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00209       t[i].z = 2. * grad;
00210       BigReal ener = table_a + diffa *
00211                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00212       et[i].z = ener;
00213     }
00214 
00215     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00216     //   t[i].x, t[i].y, t[i].z, t[i].w);
00217 
00218 /*
00219     double r2 = r * r;
00220     double r4 = r2 * r2;
00221     double r8 = r4 * r4;
00222 
00223     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00224     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00225     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00226     t[i].w = 0.;
00227 */
00228   }
00229 
00230   t[0].x = 0.f;
00231   t[0].y = 0.f;
00232   t[0].z = 0.f;
00233   t[0].w = 0.f;
00234   et[0].x = et[1].x;
00235   et[0].y = et[1].y;
00236   et[0].z = et[1].z;
00237   et[0].w = et[1].w;
00238 
00239   cuda_bind_force_table(t,et);
00240 
00241   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00242     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00243   }
00244 }
00245 
00246 struct exlist_sortop {
00247   bool operator() (int32 *li, int32 *lj) {
00248     return ( li[1] < lj[1] );
00249   }
00250 };
00251 
00252 void build_cuda_exclusions() {
00253   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00254   ComputeNonbondedCUDA::build_exclusions();
00255 }
00256 
00257 static __thread int2 *exclusionsByAtom;
00258 
00259 void ComputeNonbondedCUDA::build_exclusions() {
00260   Molecule *mol = Node::Object()->molecule;
00261 
00262 #ifdef MEM_OPT_VERSION
00263   int natoms = mol->exclSigPoolSize;
00264 #else
00265   int natoms = mol->numAtoms; 
00266 #endif
00267 
00268   delete [] exclusionsByAtom;
00269   exclusionsByAtom = new int2[natoms];
00270 
00271   // create unique sorted lists
00272 
00273   ObjectArena<int32> listArena;
00274   ResizeArray<int32*> unique_lists;
00275   int32 **listsByAtom = new int32*[natoms];
00276   SortableResizeArray<int32> curList;
00277   for ( int i=0; i<natoms; ++i ) {
00278     curList.resize(0);
00279     curList.add(0);  // always excluded from self
00280 #ifdef MEM_OPT_VERSION
00281     const ExclusionSignature *sig = mol->exclSigPool + i;
00282     int n = sig->fullExclCnt;
00283     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00284     n += 1;
00285 #else
00286     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00287     int n = mol_list[0] + 1;
00288     for ( int j=1; j<n; ++j ) {
00289       curList.add(mol_list[j] - i);
00290     }
00291 #endif
00292     curList.sort();
00293 
00294     int j;
00295     for ( j=0; j<unique_lists.size(); ++j ) {
00296       if ( n != unique_lists[j][0] ) continue;  // no match
00297       int k;
00298       for ( k=0; k<n; ++k ) {
00299         if ( unique_lists[j][k+3] != curList[k] ) break;
00300       }
00301       if ( k == n ) break;  // found match
00302     }
00303     if ( j == unique_lists.size() ) {  // no match
00304       int32 *list = listArena.getNewArray(n+3);
00305       list[0] = n;
00306       int maxdiff = 0;
00307       maxdiff = -1 * curList[0];
00308       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00309       list[1] = maxdiff;
00310       for ( int k=0; k<n; ++k ) {
00311         list[k+3] = curList[k];
00312       }
00313       unique_lists.add(list);
00314     }
00315     listsByAtom[i] = unique_lists[j];
00316   }
00317   // sort lists by maxdiff
00318   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00319   long int totalbits = 0;
00320   int nlists = unique_lists.size();
00321   for ( int j=0; j<nlists; ++j ) {
00322     int32 *list = unique_lists[j];
00323     int maxdiff = list[1];
00324     list[2] = totalbits + maxdiff;
00325     totalbits += 2*maxdiff + 1;
00326   }
00327   for ( int i=0; i<natoms; ++i ) {
00328     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00329     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00330   }
00331   delete [] listsByAtom;
00332 
00333   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00334 
00335   {
00336     long int bytesneeded = totalbits / 8;
00337     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00338     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00339                 unique_lists.size(), bytesneeded);
00340     }
00341 
00342     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00343     if ( bytesneeded > bytesavail ) {
00344       char errmsg[512];
00345       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00346                      "but only %ld bytes can be addressed with 32-bit int.",
00347                      unique_lists.size(), bytesneeded, bytesavail);
00348       NAMD_die(errmsg);
00349     }
00350   }
00351 
00352 #define SET_EXCL(EXCL,BASE,DIFF) \
00353          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
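// Sketch of the intended lookup (the reading side lives in the CUDA kernel,
// so this is an assumption based on the data layout built here): an
// exclusion between atoms with indices i and j is found by testing bit
// (excl_index_of_i + (j - i)) of this shared bit array, which is valid
// whenever |j - i| <= excl_maxdiff for atom i; excl_index and excl_maxdiff
// are the per-atom values copied from exclusionsByAtom into atom_param in
// doWork().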
00354 
00355   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00356   memset(exclusion_bits, 0, totalbits/8);
00357 
00358   long int base = 0;
00359   for ( int i=0; i<unique_lists.size(); ++i ) {
00360     base += unique_lists[i][1];
00361     if ( unique_lists[i][2] != (int32)base ) {
00362       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00363     }
00364     int n = unique_lists[i][0];
00365     for ( int j=0; j<n; ++j ) {
00366       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00367     }
00368     base += unique_lists[i][1] + 1;
00369   }
00370 
00371   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00372 
00373   delete [] exclusion_bits;
00374 }
00375 
00376 
00377 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00378 
00379   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00380 
00381   cudaCompute->requirePatch(pid);
00382 
00383   ComputeNonbondedCUDA::compute_record cr;
00384   cr.c = c;
00385   cr.pid[0] = pid;  cr.pid[1] = pid;
00386   cr.offset = 0.;
00387   if ( cudaCompute->patchRecords[pid].isLocal ) {
00388     cudaCompute->localComputeRecords.add(cr);
00389   } else {
00390     cudaCompute->remoteComputeRecords.add(cr);
00391   }
00392 }
00393 
00394 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00395 
00396   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00397  
00398   cudaCompute->requirePatch(pid[0]);
00399   cudaCompute->requirePatch(pid[1]);
00400 
00401   ComputeNonbondedCUDA::compute_record cr;
00402   cr.c = c; 
00403   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00404 
00405   int t1 = t[0];
00406   int t2 = t[1];
00407   Vector offset = cudaCompute->patchMap->center(pid[0])
00408                 - cudaCompute->patchMap->center(pid[1]);
00409   offset.x += (t1%3-1) - (t2%3-1);
00410   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00411   offset.z += (t1/9-1) - (t2/9-1);
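  // The transform indices t1 and t2 above each encode a periodic-image shift
  // as a base-3 number: (t%3)-1, ((t/3)%3)-1 and (t/9)-1 are the -1/0/+1
  // image offsets along the three lattice directions.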
00412   cr.offset = offset;
00413 
00414   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00415     cudaCompute->localComputeRecords.add(cr);
00416   } else {
00417     cudaCompute->remoteComputeRecords.add(cr);
00418   }
00419 }
00420 
00421 void unregister_cuda_compute(ComputeID c) {  // static
00422 
00423   NAMD_bug("unregister_compute unimplemented");
00424 
00425 }
00426 
00427 // static __thread cudaEvent_t start_upload;
00428 static __thread cudaEvent_t start_calc;
00429 static __thread cudaEvent_t end_remote_download;
00430 static __thread cudaEvent_t end_local_download;
00431 
00432 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00433 static __thread ResizeArray<int> *patch_pair_num_ptr;
00434 
00435 void init_arrays();
00436 
00437 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00438                                            ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00439 #ifdef PRINT_GBIS
00440    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00441 #endif
00442 
00443   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00444   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00445   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00446 
00447   // CkPrintf("create ComputeNonbondedCUDA\n");
00448   master = m ? m : this;
00449   cudaCompute = this;
00450   computeMgr = mgr;
00451   patchMap = PatchMap::Object();
00452   atomMap = AtomMap::Object();
00453   reduction = 0;
00454 
00455   SimParameters *params = Node::Object()->simParameters;
00456   if (params->pressureProfileOn) {
00457     NAMD_die("pressure profile not supported in CUDA");
00458   }
00459 
00460   atomsChanged = 1;
00461   computesChanged = 1;
00462   patchPairsReordered = 1;
00463 
00464   pairlistsValid = 0;
00465   pairlistTolerance = 0.;
00466   usePairlists = 0;
00467   savePairlists = 0;
00468   plcutoff2 = 0.;
00469 
00470   workStarted = 0;
00471   basePriority = PROXY_DATA_PRIORITY;
00472   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00473 
00474   // Zero array sizes and pointers
00475   init_arrays();
00476 
00477   atoms_size = 0;
00478   atoms = NULL;
00479 
00480   forces_size = 0;
00481   forces = NULL;
00482   
00483   slow_forces_size = 0;
00484   slow_forces = NULL;
00485 
00486   psiSumH_size = 0;
00487   psiSumH = NULL;
00488 
00489   dEdaSumH_size = 0;
00490   dEdaSumH = NULL;
00491 
00492   if ( master != this ) { // I am slave
00493     masterPe = master->masterPe;
00494     master->slaves[slaveIndex] = this;
00495     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00496       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00497     }
00498     deviceID = master->deviceID;
00499     registerPatches();
00500     return;
00501   }
00502   masterPe = CkMyPe();
00503 
00504   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00505   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00506     deviceCUDA->setMergeGrids(1);
00507 
00508   // Sanity checks for New CUDA kernel
00509   if (params->GBISOn) {
00510     // GBIS
00511     if (deviceCUDA->getNoMergeGrids()) {
00512       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00513     }
00514     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00515     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00516     // Final sanity check
00517     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00518   } else {
00519     // non-GBIS
00520     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00521       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00522     }
00523   }
00524 
00525 #if CUDA_VERSION >= 5050
00526   int leastPriority, greatestPriority;
00527   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00528   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00529   if ( leastPriority != greatestPriority ) {
00530     if ( CkMyNode() == 0 ) {
00531       int dev = deviceCUDA->getDeviceID();
00532       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00533     }
00534     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00535       greatestPriority = leastPriority;
00536     }
00537     if (params->usePMECUDA) greatestPriority = leastPriority;
00538     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00539     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00540   } else
00541 #endif
00542   {
00543     cudaStreamCreate(&stream);
00544     cuda_errcheck("cudaStreamCreate");
00545     int dev = deviceCUDA->getDeviceID();
00546     cudaDeviceProp deviceProp;
00547     cudaGetDeviceProperties(&deviceProp, dev);
00548     cuda_errcheck("cudaGetDeviceProperties");
00549     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00550       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00551       cudaStreamCreate(&stream2);
00552     } else {
00553       stream2 = stream;
00554     }
00555   }
00556   cuda_errcheck("cudaStreamCreate");
00557 
00558   // Get GPU device ID
00559   deviceID = deviceCUDA->getDeviceID();
00560 
00561   cuda_init();
00562   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00563   build_exclusions();
00564   // cudaEventCreate(&start_upload);
00565   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00566   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00567   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00568 
00569   patchRecords.resize(patchMap->numPatches());
00570   patch_pairs_ptr = new ResizeArray<patch_pair>;
00571   patch_pair_num_ptr = new ResizeArray<int>;
00572 
00573   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00574 }
00575 
00576 
00577 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00578 
00579 void ComputeNonbondedCUDA::requirePatch(int pid) {
00580 
00581   computesChanged = 1;
00582   patch_record &pr = patchRecords[pid];
00583   if ( pr.refCount == 0 ) {
00584     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00585     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00586     if ( deviceCUDA->getMergeGrids() ) {
00587       pr.isLocal = 0;
00588     } else if ( CkNumNodes() < 2 ) {
00589       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00590          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00591     } else {
00592       pr.isLocal = pr.isSameNode;
00593     }
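    // Merged grids mark every patch "remote"; a single-node run without
    // merged grids splits patches into local/remote halves in a 3-D
    // checkerboard of the patch grid indices (so remote and local kernel
    // work can overlap); otherwise a patch is "local" exactly when it is
    // hosted on this Charm++ node.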
00594     if ( pr.isLocal ) {
00595       localActivePatches.add(pid);
00596     } else {
00597       remoteActivePatches.add(pid);
00598     }
00599     activePatches.add(pid);
00600     pr.patchID = pid;
00601     pr.hostPe = -1;
00602     pr.x = NULL;
00603     pr.xExt = NULL;
00604     pr.r = NULL;
00605     pr.f = NULL;
00606     pr.intRad      = NULL;
00607     pr.psiSum      = NULL;
00608     pr.bornRad     = NULL;
00609     pr.dEdaSum     = NULL;
00610     pr.dHdrPrefix  = NULL;
00611   }
00612   pr.refCount += 1;
00613 }
00614 
00615 void ComputeNonbondedCUDA::registerPatches() {
00616 
00617   SimParameters *simParams = Node::Object()->simParameters;
00618   int npatches = master->activePatches.size();
00619   int *pids = master->activePatches.begin();
00620   patch_record *recs = master->patchRecords.begin();
00621   for ( int i=0; i<npatches; ++i ) {
00622     int pid = pids[i];
00623     patch_record &pr = recs[pid];
00624     if ( pr.hostPe == CkMyPe() ) {
00625       pr.slave = this;
00626       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00627       hostedPatches.add(pid);
00628       if ( pr.isLocal ) {
00629         localHostedPatches.add(pid);
00630       } else {
00631         remoteHostedPatches.add(pid);
00632       }
00633       ProxyMgr::Object()->createProxy(pid);
00634       pr.p = patchMap->patch(pid);
00635       pr.positionBox = pr.p->registerPositionPickup(this);
00636       pr.forceBox = pr.p->registerForceDeposit(this);
00637       if (simParams->GBISOn) {
00638         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00639         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00640         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00641         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00642         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00643       }
00644     }
00645   }
00646   if ( master == this ) setNumPatches(activePatches.size());
00647   else setNumPatches(hostedPatches.size());
00648   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00649   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00650 }
00651 
00652 struct pid_sortop_reverse_priority {
00653   bool operator() (int pidj, int pidi) {  // i and j reversed
00654     int ppi = PATCH_PRIORITY(pidi);
00655     int ppj = PATCH_PRIORITY(pidj);
00656     if ( ppi != ppj ) return ppi < ppj;
00657     return pidi < pidj;
00658   }
00659 };
00660 
00661 void ComputeNonbondedCUDA::assignPatches() {
00662 
00663   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00664   int numPesOnNodeSharingDevice = 0;
00665   int masterIndex = -1;
00666   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00667     int pe = deviceCUDA->getPesSharingDevice(i);
00668     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00669     if ( CkNodeOf(pe) == CkMyNode() ) {
00670       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00671     }
00672   }
00673 
00674   int npatches = activePatches.size();
00675 
00676   if ( npatches ) {
00677     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00678   }
00679 
00680   // calculate priority rank of local home patch within pe
00681   {
00682     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
00683     for ( int i=0; i<npatches; ++i ) {
00684       int pid = activePatches[i];
00685       int homePe = patchMap->node(pid);
00686       if ( CkNodeOf(homePe) == CkMyNode() ) {
00687         homePatchByRank[CkRankOf(homePe)].add(pid);
00688       }
00689     }
00690     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00691       pid_sortop_reverse_priority so;
00692       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00693       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00694       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00695         int pid = homePatchByRank[i][j];
00696         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
00697       }
00698     }
00699   }
00700 
00701   int *count = new int[npatches];
00702   memset(count, 0, sizeof(int)*npatches);
00703   int *pcount = new int[numPesOnNodeSharingDevice];
00704   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00705   int *rankpcount = new int[CkMyNodeSize()];
00706   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00707   char *table = new char[npatches*numPesOnNodeSharingDevice];
00708   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00709 
00710   int unassignedpatches = npatches;
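  // The assignment below proceeds in passes: a patch whose home PE is on
  // this node keeps its home PE; a patch with exactly one device-sharing PE
  // already holding a proxy goes to that PE; the remaining patches are dealt
  // round-robin over the device-sharing PEs, preferring a PE that already
  // holds a proxy for the patch and taking proxy-less patches only when no
  // such match exists.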
00711 
00712   if ( 0 ) { // assign all to device pe
00713     for ( int i=0; i<npatches; ++i ) {
00714       int pid = activePatches[i];
00715       patch_record &pr = patchRecords[pid];
00716       pr.hostPe = CkMyPe();
00717     }
00718     unassignedpatches = 0;
00719     pcount[masterIndex] = npatches;
00720   } else 
00721 
00722   // assign if home pe and build table of natural proxies
00723   for ( int i=0; i<npatches; ++i ) {
00724     int pid = activePatches[i];
00725     patch_record &pr = patchRecords[pid];
00726     int homePe = patchMap->node(pid);
00727     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00728       int pe = pesOnNodeSharingDevice[j];
00729       if ( pe == homePe ) {
00730         pr.hostPe = pe;  --unassignedpatches;
00731         pcount[j] += 1;
00732       }
00733       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00734         table[i*numPesOnNodeSharingDevice+j] = 1;
00735       }
00736     }
00737     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00738       pr.hostPe = homePe;  --unassignedpatches;
00739       rankpcount[CkRankOf(homePe)] += 1;
00740     }
00741   }
00742   // assign if only one pe has a required proxy
00743   int assignj = 0;
00744   for ( int i=0; i<npatches; ++i ) {
00745     int pid = activePatches[i];
00746     patch_record &pr = patchRecords[pid];
00747     if ( pr.hostPe != -1 ) continue;
00748     int c = 0;
00749     int lastj;
00750     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00751       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00752     }
00753     count[i] = c;
00754     if ( c == 1 ) {
00755       pr.hostPe = pesOnNodeSharingDevice[lastj];
00756       --unassignedpatches;
00757       pcount[lastj] += 1;
00758     }
00759   }
00760   while ( unassignedpatches ) {
00761     int i;
00762     for ( i=0; i<npatches; ++i ) {
00763       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00764       int pid = activePatches[i];
00765       patch_record &pr = patchRecords[pid];
00766       if ( pr.hostPe != -1 ) continue;
00767       pr.hostPe = pesOnNodeSharingDevice[assignj];
00768       --unassignedpatches;
00769       pcount[assignj] += 1;
00770       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00771       break;
00772     }
00773     if ( i<npatches ) continue;  // start search again
00774     for ( i=0; i<npatches; ++i ) {
00775       int pid = activePatches[i];
00776       patch_record &pr = patchRecords[pid];
00777       if ( pr.hostPe != -1 ) continue;
00778       if ( count[i] ) continue;
00779       pr.hostPe = pesOnNodeSharingDevice[assignj];
00780       --unassignedpatches;
00781       pcount[assignj] += 1;
00782       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00783       break;
00784     }
00785     if ( i<npatches ) continue;  // start search again
00786     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00787   }
00788 
00789   for ( int i=0; i<npatches; ++i ) {
00790     int pid = activePatches[i];
00791     patch_record &pr = patchRecords[pid];
00792     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00793   }
00794 
00795   slavePes = new int[CkMyNodeSize()];
00796   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00797   numSlaves = 0;
00798   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00799     int pe = pesOnNodeSharingDevice[j];
00800     int rank = pe - CkNodeFirst(CkMyNode());
00801     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00802     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00803     if ( pe == CkMyPe() ) continue;
00804     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00805     rankpcount[rank] = 0;  // skip in rank loop below
00806     slavePes[numSlaves] = pe;
00807     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00808     ++numSlaves;
00809   }
00810   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00811     int pe = CkNodeFirst(CkMyNode()) + j;
00812     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00813     //          CkMyPe(),j,pe,rankpcount[j]);
00814     if ( ! rankpcount[j] ) continue;
00815     if ( pe == CkMyPe() ) continue;
00816     slavePes[numSlaves] = pe;
00817     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00818     ++numSlaves;
00819   }
00820   registerPatches();
00821 
00822   delete [] pesOnNodeSharingDevice;
00823   delete [] count;
00824   delete [] pcount;
00825   delete [] rankpcount;
00826   delete [] table;
00827 }
00828 
00829 static __thread int atom_params_size;
00830 static __thread atom_param* atom_params;
00831 
00832 static __thread int vdw_types_size;
00833 static __thread int* vdw_types;
00834 
00835 static __thread int dummy_size;
00836 static __thread float* dummy_dev;
00837 
00838 static __thread int force_ready_queue_size;
00839 static __thread int *force_ready_queue;
00840 static __thread int force_ready_queue_len;
00841 static __thread int force_ready_queue_next;
00842 
00843 static __thread int block_order_size;
00844 static __thread int *block_order;
00845 
00846 static __thread int num_atoms;
00847 static __thread int num_local_atoms;
00848 static __thread int num_remote_atoms;
00849 
00850 static __thread int virials_size;
00851 static __thread float *virials;
00852 static __thread int num_virials;
00853 // NOTE: slow_virials is a computed pointer from "virials" - do not deallocate
00854 static __thread float *slow_virials;
00855 
00856 static __thread int energy_gbis_size;
00857 static __thread float *energy_gbis;
00858 
00859 //GBIS host pointers
00860 static __thread int intRad0H_size;
00861 static __thread float *intRad0H;
00862 static __thread int intRadSH_size;
00863 static __thread float *intRadSH;
00864 static __thread int bornRadH_size;
00865 static __thread float *bornRadH;
00866 static __thread int dHdrPrefixH_size;
00867 static __thread float *dHdrPrefixH;
00868 
00869 static __thread int cuda_timer_count;
00870 static __thread double cuda_timer_total;
00871 static __thread double kernel_time;
00872 static __thread double remote_submit_time;
00873 static __thread double local_submit_time;
00874 
00875 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
00876 
00877 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
00878 
00879 #ifdef PRINT_GBIS
00880 #define GBISP(...) CkPrintf(__VA_ARGS__);
00881 #else
00882 #define GBISP(...)
00883 #endif
00884 
00885 #define count_limit 1000000
00886 static __thread int check_count;
00887 static __thread int check_remote_count;
00888 static __thread int check_local_count;
00889 
00890 void init_arrays() {
00891 
00892   atom_params_size = 0;
00893   atom_params = NULL;
00894 
00895   vdw_types_size = 0;
00896   vdw_types = NULL;
00897   
00898   dummy_size = 0;
00899   dummy_dev = NULL;
00900 
00901   force_ready_queue_size = 0;
00902   force_ready_queue = NULL;
00903   force_ready_queue_len = 0;
00904   force_ready_queue_next = 0;
00905   
00906   block_order_size = 0;
00907   block_order = NULL;
00908   
00909   num_atoms = 0;
00910   num_local_atoms = 0;
00911   num_remote_atoms = 0;
00912 
00913   virials_size = 0;
00914   virials = NULL;
00915   num_virials = 0;
00916 
00917   energy_gbis_size = 0;
00918   energy_gbis = NULL;
00919 
00920   intRad0H_size = 0;
00921   intRad0H = NULL;
00922   intRadSH_size = 0;
00923   intRadSH = NULL;
00924   bornRadH_size = 0;
00925   bornRadH = NULL;
00926   dHdrPrefixH_size = 0;
00927   dHdrPrefixH = NULL;
00928 
00929 }
00930 
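// force_ready_queue is pinned, mapped host memory: the entries are
// pre-filled with -1, and as the forces for a patch are completed the index
// of that patch appears in the next slot (presumably written by the GPU
// through the zero-copy mapping).  cuda_check_progress() below polls the
// slots from the host and forwards each finished patch via
// messageFinishPatch().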
00931 void cuda_check_progress(void *arg, double walltime) {
00932   CUDA_TRACE_POLL_REMOTE;
00933 
00934   int flindex;
00935   int poll_again = 1;
00936   while ( -1 != (flindex = force_ready_queue[force_ready_queue_next]) ) {
00937     //    CkPrintf("Pe %d forces ready %d is index %d at %lf\n",
00938     //       CkMyPe(), force_ready_queue_next, flindex, walltime);
00939     force_ready_queue[force_ready_queue_next] = -1;
00940     ++force_ready_queue_next;
00941     check_count = 0;
00942     if ( force_ready_queue_next == force_ready_queue_len ) {
00943       poll_again = 0;
00944       CUDA_TRACE_LOCAL(kernel_time,walltime);
00945       kernel_time = walltime - kernel_time;
00946       // need to guarantee this finishes before the last patch message!
00947       ((ComputeNonbondedCUDA *) arg)->workStarted = 0;
00948       ((ComputeNonbondedCUDA *) arg)->finishReductions();
00949     }
00950     ((ComputeNonbondedCUDA *) arg)->messageFinishPatch(flindex);
00951     if ( force_ready_queue_next == force_ready_queue_len ) break;
00952   }
00953   if ( ++check_count >= count_limit ) {
00954     char errmsg[256];
00955     sprintf(errmsg,"cuda_check_progress polled %d times over %f s on step %d",
00956             check_count, walltime - remote_submit_time,
00957             ((ComputeNonbondedCUDA *) arg)->step);
00958     cuda_errcheck(errmsg);
00959     NAMD_die(errmsg);
00960   }
00961   if ( poll_again ) {
00962     CcdCallBacksReset(0,walltime);  // fix Charm++
00963     CUDA_POLL(cuda_check_progress, arg);
00964   }
00965 }
00966 
00967 void cuda_check_remote_progress(void *arg, double walltime) {
00968 
00969   CUDA_TRACE_POLL_REMOTE;
00970   cudaError_t err = cudaEventQuery(end_remote_download);
00971   if ( err == cudaSuccess ) {
00972     local_submit_time = walltime;
00973     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
00974     if ( deviceCUDA->getMergeGrids() ) {  // no local
00975       kernel_time = local_submit_time - kernel_time;
00976     }
00977     check_remote_count = 0;
00978     cuda_errcheck("at cuda remote stream completed");
00979     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
00980   } else if ( err != cudaErrorNotReady ) {
00981     cuda_errcheck("in cuda_check_remote_progress");
00982     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
00983   } else if ( ++check_remote_count >= count_limit ) {
00984     char errmsg[256];
00985     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
00986             check_remote_count, walltime - remote_submit_time,
00987             ((ComputeNonbondedCUDA *) arg)->step);
00988     cuda_errcheck(errmsg);
00989     NAMD_die(errmsg);
00990   } else if ( check_local_count ) {
00991     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
00992   } else {
00993     CcdCallBacksReset(0,walltime);  // fix Charm++
00994     CUDA_POLL(cuda_check_remote_progress, arg);
00995   }
00996 }
00997 
00998 void cuda_check_local_progress(void *arg, double walltime) {
00999 
01000   CUDA_TRACE_POLL_LOCAL;
01001   cudaError_t err = cudaEventQuery(end_local_download);
01002   if ( err == cudaSuccess ) {
01003     CUDA_TRACE_LOCAL(local_submit_time,walltime);
01004     kernel_time = walltime - kernel_time;
01005     check_local_count = 0;
01006     cuda_errcheck("at cuda local stream completed");
01007     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
01008   } else if ( err != cudaErrorNotReady ) {
01009     cuda_errcheck("in cuda_check_local_progress");
01010     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01011   } else if ( ++check_local_count >= count_limit ) {
01012     char errmsg[256];
01013     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01014             check_local_count, walltime - local_submit_time,
01015             ((ComputeNonbondedCUDA *) arg)->step);
01016     cuda_errcheck(errmsg);
01017     NAMD_die(errmsg);
01018   } else if ( check_remote_count ) {
01019     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01020   } else {
01021     CcdCallBacksReset(0,walltime);  // fix Charm++
01022     CUDA_POLL(cuda_check_local_progress, arg);
01023   }
01024 }
01025 
01026 #if 0
01027 // don't use this one unless timer is part of stream, above is better
01028 void cuda_check_progress(void *arg, double walltime) {
01029   if ( cuda_stream_finished() ) {
01030     kernel_time = walltime - kernel_time;
01031     CcdCallBacksReset(0,walltime);  // fix Charm++
01032     CUDA_POLL(ccd_index);
01033     // ((ComputeNonbondedCUDA *) arg)->finishWork();
01034     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01035   }
01036 }
01037 #endif
01038 
01039 void ComputeNonbondedCUDA::atomUpdate() {
01040   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01041   atomsChanged = 1;
01042 }
01043 
01044 static __thread int kernel_launch_state = 0;
01045 
01046 struct cr_sortop_distance {
01047   const Lattice &l;
01048   cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
01049   bool operator() (ComputeNonbondedCUDA::compute_record i,
01050                         ComputeNonbondedCUDA::compute_record j) {
01051     Vector a = l.a();
01052     Vector b = l.b();
01053     Vector c = l.c();
01054     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01055     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01056     return ( ri < rj );
01057   }
01058 };
01059 
01060 struct cr_sortop_reverse_priority {
01061   cr_sortop_distance &distop;
01062   const ComputeNonbondedCUDA::patch_record *pr;
01063   cr_sortop_reverse_priority(cr_sortop_distance &sod,
01064        const ComputeNonbondedCUDA::patch_record *patchrecs) : distop(sod), pr(patchrecs) { }
01065   bool pid_compare_priority(int pidi, int pidj) {
01066     const ComputeNonbondedCUDA::patch_record &pri = pr[pidi];
01067     const ComputeNonbondedCUDA::patch_record &prj = pr[pidj];
01068     if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
01069     if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
01070     if ( pri.isSameNode && ! prj.isSameNode ) return 0;
01071     if ( prj.isSameNode && ! pri.isSameNode ) return 1;
01072     if ( pri.isSameNode ) {  // and prj.isSameNode
01073       int rpri = pri.reversePriorityRankInPe;
01074       int rprj = prj.reversePriorityRankInPe;
01075       if ( rpri != rprj ) return rpri > rprj;
01076       return sortop_bitreverse(CkRankOf(pri.hostPe),CkRankOf(prj.hostPe));
01077     }
01078     int ppi = PATCH_PRIORITY(pidi);
01079     int ppj = PATCH_PRIORITY(pidj);
01080     if ( ppi != ppj ) return ppi < ppj;
01081     return pidi < pidj;
01082   }
01083   bool operator() (ComputeNonbondedCUDA::compute_record j,
01084                         ComputeNonbondedCUDA::compute_record i) {  // i and j reversed
01085     int pidi = pid_compare_priority(i.pid[0],i.pid[1]) ? i.pid[0] : i.pid[1];
01086     int pidj = pid_compare_priority(j.pid[0],j.pid[1]) ? j.pid[0] : j.pid[1];
01087     if ( pidi != pidj ) return pid_compare_priority(pidi, pidj);
01088     return distop(i,j);
01089   }
01090 };
01091 
01092 void ComputeNonbondedCUDA::skip() {
01093   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01094   SimParameters *simParams = Node::Object()->simParameters;
01095   for ( int i=0; i<hostedPatches.size(); ++i ) {
01096     patch_record &pr = master->patchRecords[hostedPatches[i]];
01097     pr.positionBox->skip();
01098     pr.forceBox->skip();
01099     if (simParams->GBISOn) {
01100       pr.intRadBox->skip();
01101       pr.psiSumBox->skip();
01102       pr.bornRadBox->skip();
01103       pr.dEdaSumBox->skip();
01104       pr.dHdrPrefixBox->skip();
01105     }
01106   }
01107 }
01108 
01109 int ComputeNonbondedCUDA::noWork() {
01110 
01111   SimParameters *simParams = Node::Object()->simParameters;
01112   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01113   lattice = flags.lattice;
01114   doSlow = flags.doFullElectrostatics;
01115   doEnergy = flags.doEnergy;
01116   step = flags.step;
01117 
01118   if ( ! flags.doNonbonded ) {
01119     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01120     if ( master != this ) {
01121       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01122                         hostedPatches.size(),atomsChanged,sequence());
01123     } else {
01124       for ( int i = 0; i < numSlaves; ++i ) {
01125         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01126       }
01127       skip();
01128     }
01129     if ( reduction ) {
01130        reduction->submit();
01131     }
01132 
01133     return 1;
01134   }
01135 
01136   for ( int i=0; i<hostedPatches.size(); ++i ) {
01137     patch_record &pr = master->patchRecords[hostedPatches[i]];
01138     if (!simParams->GBISOn || gbisPhase == 1) {
01139       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01140       pr.x = pr.positionBox->open();
01141       pr.xExt = pr.p->getCompAtomExtInfo();
01142     }
01143 
01144     if (simParams->GBISOn) {
01145       if (gbisPhase == 1) {
01146         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01147         pr.intRad     = pr.intRadBox->open();
01148         pr.psiSum     = pr.psiSumBox->open();
01149       } else if (gbisPhase == 2) {
01150         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01151         pr.bornRad    = pr.bornRadBox->open();
01152         pr.dEdaSum    = pr.dEdaSumBox->open();
01153       } else if (gbisPhase == 3) {
01154         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01155         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01156       }
01157       GBISP("opened GBIS boxes");
01158     }
01159   }
01160 
01161   if ( master == this ) return 0; //work to do, enqueue as usual
01162 
01163   // message masterPe
01164   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01165                         hostedPatches.size(),atomsChanged,sequence());
01166   atomsChanged = 0;
01167 
01168   workStarted = 1;
01169 
01170   return 1;
01171 }
01172 
01173 void ComputeNonbondedCUDA::doWork() {
01174 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01175 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01176 
01177   // Set GPU device ID
01178   cudaCheck(cudaSetDevice(deviceID));
01179 
01180   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01181   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01182 
01183   if ( workStarted ) { //if work already started, check if finished
01184     if ( finishWork() ) {  // finished
01185       workStarted = 0;
01186       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01187     } else {  // need to call again
01188       workStarted = 2;
01189       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01190       if ( master == this && kernel_launch_state > 2 ) {
01191         cuda_check_local_progress(this,0.);  // launches polling
01192       }
01193     }
01194     return;
01195   }
01196 
01197   workStarted = 1;
01198   basePriority = COMPUTE_PROXY_PRIORITY;
01199 
01200   Molecule *mol = Node::Object()->molecule;
01201   Parameters *params = Node::Object()->parameters;
01202   SimParameters *simParams = Node::Object()->simParameters;
01203 
01204   //execute only during GBIS phase 1, or if not using GBIS
01205   if (!simParams->GBISOn || gbisPhase == 1) {
01206 
01207     //bind new patches to GPU
01208     if ( atomsChanged || computesChanged ) {
01209       int npatches = activePatches.size();
01210 
01211       pairlistsValid = 0;
01212       pairlistTolerance = 0.;
01213 
01214       if ( computesChanged ) {
01215         computesChanged = 0;
01216 
01217         if ( ! dummy_size ) {
01218           dummy_size = 1024*1024;
01219           cudaMalloc((void**)&dummy_dev,dummy_size);
01220           cuda_errcheck("in cudaMalloc dummy_dev");
01221         }
01222 
01223         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01224           1.2f, cudaHostAllocMapped);
01225         if (did_realloc) {
01226           for (int k=0; k < force_ready_queue_size; ++k)
01227             force_ready_queue[k] = -1;
01228         }
01229         force_ready_queue_len = npatches;
01230         reallocate_host<int>(&block_order, &block_order_size,
01231           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01232           1.2f, cudaHostAllocMapped);
01233     
01234         num_virials = npatches;
01235         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01236           1.2f, cudaHostAllocMapped);
01237         slow_virials = virials + 16*num_virials;
01238 
01239         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01240           1.2f, cudaHostAllocMapped);
01241         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01242 
01243         int *ap = activePatches.begin();
01244         for ( int i=0; i<localActivePatches.size(); ++i ) {
01245           *(ap++) = localActivePatches[i];
01246         }
01247         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01248           *(ap++) = remoteActivePatches[i];
01249         }
01250 
01251         // sort computes by distance between patches
01252         cr_sortop_distance so(lattice);
01253         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01254         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01255 
01256         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01257 
01258         if ( streaming ) {
01259           // iout << "Reverse-priority sorting...\n" << endi;
01260           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01261           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01262           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01263           patchPairsReordered = 0;
01264           //patchPairsReordered = 1;
01265       // int len = remoteComputeRecords.size();
01266       // for ( int i=0; i<len; ++i ) {
01267       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01268       // }
01269       // int len2 = localComputeRecords.size();
01270       // for ( int i=0; i<len2; ++i ) {
01271       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01272       // }
01273       // iout << endi;
01274         } else {
01275           patchPairsReordered = 1;
01276         }
01277  
01278         int nlc = localComputeRecords.size();
01279         int nrc = remoteComputeRecords.size();
01280         computeRecords.resize(nlc+nrc);
01281         compute_record *cr = computeRecords.begin();
01282         for ( int i=0; i<nrc; ++i ) {
01283           *(cr++) = remoteComputeRecords[i];
01284         }
01285         for ( int i=0; i<nlc; ++i ) {
01286           *(cr++) = localComputeRecords[i];
01287         }
01288 
01289         // patch_pair_num[i] = number of patch pairs that involve patch i
01290         patch_pair_num.resize(npatches);
01291         for ( int i=0; i<npatches; ++i ) {
01292           patchRecords[activePatches[i]].localIndex = i;
01293           patch_pair_num[i] = 0;
01294         }
01295 
01296         int ncomputes = computeRecords.size();
01297         patch_pairs.resize(ncomputes);
01298         for ( int i=0; i<ncomputes; ++i ) {
01299           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01300           int lp1 = patchRecords[cr.pid[0]].localIndex;
01301           int lp2 = patchRecords[cr.pid[1]].localIndex;
01302           patch_pair_num[lp1]++;
01303           if (lp1 != lp2) patch_pair_num[lp2]++;
01304           patch_pair &pp = patch_pairs[i];
01305           pp.offset.x = cr.offset.x;
01306           pp.offset.y = cr.offset.y;
01307           pp.offset.z = cr.offset.z;
01308         }
01309 
01310         for ( int i=0; i<ncomputes; ++i ) {
01311           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01312           int lp1 = patchRecords[cr.pid[0]].localIndex;
01313           int lp2 = patchRecords[cr.pid[1]].localIndex;
01314           patch_pair &pp = patch_pairs[i];
01315           pp.patch1_ind = lp1;
01316           pp.patch2_ind = lp2;
01317           pp.patch1_num_pairs = patch_pair_num[lp1];
01318           pp.patch2_num_pairs = patch_pair_num[lp2];
01319         }
01320 
01321       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01322         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01323                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01324                  localComputeRecords.size(), remoteComputeRecords.size());
01325       }
01326     }  // computesChanged
01327     else if ( ! patchPairsReordered ) {
01328       patchPairsReordered = 1;
01329       int len = patch_pairs.size();
01330       int nlc = localComputeRecords.size();
01331       int nrc = remoteComputeRecords.size();
01332       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01333       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01334       ResizeArray<patch_pair> new_patch_pairs(len);
01335       int irc=nrc;
01336       int ilc=len;
01337       for ( int i=0; i<len; ++i ) {
01338         int boi = block_order[i];
01339         int dest;
01340         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01341         new_computeRecords[dest] = computeRecords[boi];
01342         new_patch_pairs[dest] = patch_pairs[boi];
01343       }
01344       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01345       computeRecords.swap(new_computeRecords);
01346       patch_pairs.swap(new_patch_pairs);
01347     }
01348 
01349     int istart = 0;
01350     int i;
01351     for ( i=0; i<npatches; ++i ) {
01352       if ( i == localActivePatches.size() ) {
01353         num_local_atoms = istart;
01354       }
01355       patch_record &pr = patchRecords[activePatches[i]];
01356       pr.localStart = istart;
01357       int natoms = pr.p->getNumAtoms();
01358       int nfreeatoms = natoms;
01359       if ( fixedAtomsOn ) {
01360         const CompAtomExt *aExt = pr.xExt;
01361         for ( int j=0; j<natoms; ++j ) {
01362           if ( aExt[j].atomFixed ) --nfreeatoms;
01363         }
01364       }
01365       pr.numAtoms = natoms;
01366       pr.numFreeAtoms = nfreeatoms;
01367       istart += natoms;
01368       istart += 16 - (natoms & 15);
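      // the padding above rounds each patch's block up to a 16-atom
      // boundary (a count already divisible by 16 still gains 16 atoms)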
01369     }
01370     if ( i == localActivePatches.size() ) {
01371       num_local_atoms = istart;
01372     }
01373     num_atoms = istart;
01374     num_remote_atoms = num_atoms - num_local_atoms;
01375     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01376     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01377     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01378     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01379     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01380     if (simParams->GBISOn) {
01381       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01382       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01383       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01384       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01385       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01386       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01387     }
01388 
01389     int bfstart = 0;
01390     int exclmask_start = 0;
01391     int ncomputes = computeRecords.size();
01392     for ( int i=0; i<ncomputes; ++i ) {
01393       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01394       int p1 = cr.pid[0];
01395       int p2 = cr.pid[1];
01396       patch_pair &pp = patch_pairs[i];
01397       pp.patch1_start = patchRecords[p1].localStart;
01398       pp.patch1_size  = patchRecords[p1].numAtoms;
01399       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01400       pp.patch2_start = patchRecords[p2].localStart;
01401       pp.patch2_size  = patchRecords[p2].numAtoms;
01402       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01403       pp.plist_start = bfstart;
01404       // size1*size2 = number of WARPSIZE-atom block pairs (tiles) for this patch pair
01405       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01406       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01407       pp.plist_size = (size1*size2-1)/32+1;
01408       bfstart += pp.plist_size;
01409       pp.exclmask_start = exclmask_start;
01410       exclmask_start += size1*size2;
01411     } //for ncomputes
01412 
01413     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01414       activePatches.size(), num_atoms, bfstart, 
01415       exclmask_start);
01416 
01417   }  // atomsChanged || computesChanged
01418 
01419   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01420 
01421   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01422   float maxAtomMovement = 0.;
01423   float maxPatchTolerance = 0.;
01424 
01425   for ( int i=0; i<activePatches.size(); ++i ) {
01426     patch_record &pr = patchRecords[activePatches[i]];
01427 
01428     float maxMove = pr.p->flags.maxAtomMovement;
01429     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01430 
01431     float maxTol = pr.p->flags.pairlistTolerance;
01432     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01433 
01434     int start = pr.localStart;
01435     int n = pr.numAtoms;
01436     const CompAtom *a = pr.x;
01437     const CompAtomExt *aExt = pr.xExt;
01438     if ( atomsChanged ) {
01439 
01440       atom_param *ap = atom_params + start;
01441       for ( int k=0; k<n; ++k ) {
01442         int j = aExt[k].sortOrder;
01443         ap[k].vdw_type = a[j].vdwType;
01444         vdw_types[start + k] = a[j].vdwType;
01445         ap[k].index = aExt[j].id;
01446 #ifdef MEM_OPT_VERSION
01447         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01448         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01449 #else // ! MEM_OPT_VERSION
01450         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01451         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01452 #endif // MEM_OPT_VERSION
01453       }
01454     }
01455     {
01456 #if 1
01457       const CudaAtom *ac = pr.p->getCudaAtomList();
01458       CudaAtom *ap = atoms + start;
01459       memcpy(ap, ac, sizeof(atom)*n);
01460 #else
01461       Vector center =
01462         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01463       atom *ap = atoms + start;
01464       for ( int k=0; k<n; ++k ) {
01465         int j = aExt[k].sortOrder;
01466         ap[k].position.x = a[j].position.x - center.x;
01467         ap[k].position.y = a[j].position.y - center.y;
01468         ap[k].position.z = a[j].position.z - center.z;
01469         ap[k].charge = charge_scaling * a[j].charge;
01470       }
01471 #endif
01472     }
01473   }
01474 
01475   savePairlists = 0;
01476   usePairlists = 0;
01477   if ( flags.savePairlists ) {
01478     savePairlists = 1;
01479     usePairlists = 1;
01480   } else if ( flags.usePairlists ) {
01481     if ( ! pairlistsValid ||
01482          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01483       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01484     } else {
01485       usePairlists = 1;
01486     }
01487   }
01488   if ( ! usePairlists ) {
01489     pairlistsValid = 0;
01490   }
01491   float plcutoff = cutoff;
01492   if ( savePairlists ) {
01493     pairlistsValid = 1;
01494     pairlistTolerance = 2. * maxPatchTolerance;
01495     plcutoff += pairlistTolerance;
01496   }
01497   plcutoff2 = plcutoff * plcutoff;
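  // Illustrative numbers for the pairlist logic above (not from a real run): with
  // cutoff = 12 A and every patch reporting pairlistTolerance = 1.0 A,
  //   pairlistTolerance (this object) = 2 * 1.0 = 2.0 A
  //   plcutoff = 12 + 2 = 14 A, plcutoff2 = 196
  // The saved lists remain usable on later steps while 2 * maxAtomMovement stays within
  // 2.0 A; once that is exceeded (or the lists were never valid), a
  // REDUCTION_PAIRLIST_WARNINGS entry is counted and the step runs without pairlists.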
01498 
01499   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01500   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01501 
01502 #if 0
01503   // calculate warp divergence
01504   if ( 1 ) {
01505     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01506     Lattice &lattice = flags.lattice;
01507     float3 lata, latb, latc;
01508     lata.x = lattice.a().x;
01509     lata.y = lattice.a().y;
01510     lata.z = lattice.a().z;
01511     latb.x = lattice.b().x;
01512     latb.y = lattice.b().y;
01513     latb.z = lattice.b().z;
01514     latc.x = lattice.c().x;
01515     latc.y = lattice.c().y;
01516     latc.z = lattice.c().z;
01517 
01518     int ncomputes = computeRecords.size();
01519     for ( int ic=0; ic<ncomputes; ++ic ) {
01520       patch_pair &pp = patch_pairs[ic];
01521       atom *a1 = atoms + pp.patch1_atom_start;
01522       int n1 = pp.patch1_size;
01523       atom *a2 = atoms + pp.patch2_atom_start;
01524       int n2 = pp.patch2_size;
01525       float offx = pp.offset.x * lata.x
01526                + pp.offset.y * latb.x
01527                + pp.offset.z * latc.x;
01528       float offy = pp.offset.x * lata.y
01529                + pp.offset.y * latb.y
01530                + pp.offset.z * latc.y;
01531       float offz = pp.offset.x * lata.z
01532                + pp.offset.y * latb.z
01533                + pp.offset.z * latc.z;
01534       // CkPrintf("%f %f %f\n", offx, offy, offz);
01535       int atoms_tried = 0;
01536       int blocks_tried = 0;
01537       int atoms_used = 0;
01538       int blocks_used = 0;
01539       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01540         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01541           int block_used = 0;
01542           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01543             int atom_used = 0;
01544             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01545               float dx = offx + a1[i].position.x - a2[j].position.x;
01546               float dy = offy + a1[i].position.y - a2[j].position.y;
01547               float dz = offz + a1[i].position.z - a2[j].position.z;
01548               float r2 = dx*dx + dy*dy + dz*dz;
01549               if ( r2 < cutoff2 ) atom_used = 1;
01550             }
01551             ++atoms_tried;
01552             if ( atom_used ) { block_used = 1; ++atoms_used; }
01553           }
01554           ++blocks_tried;
01555           if ( block_used ) { ++blocks_used; }
01556         }
01557       }
01558       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01559                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01560                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01561     }
01562   }
01563 #endif
01564 
01565   } // !GBISOn || gbisPhase == 1
01566 
01567   //Do GBIS
01568   if (simParams->GBISOn) {
01569     //open GBIS boxes depending on phase
01570     for ( int i=0; i<activePatches.size(); ++i ) {
01571       patch_record &pr = master->patchRecords[activePatches[i]];
01572       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01573       if (gbisPhase == 1) {
01574         //Copy GBIS intRadius to Host
01575         if (atomsChanged) {
01576           float *intRad0 = intRad0H + pr.localStart;
01577           float *intRadS = intRadSH + pr.localStart;
01578           for ( int k=0; k<pr.numAtoms; ++k ) {
01579             int j = pr.xExt[k].sortOrder;
01580             intRad0[k] = pr.intRad[2*j+0];
01581             intRadS[k] = pr.intRad[2*j+1];
01582           }
01583         }
01584       } else if (gbisPhase == 2) {
01585         float *bornRad = bornRadH + pr.localStart;
01586         for ( int k=0; k<pr.numAtoms; ++k ) {
01587           int j = pr.xExt[k].sortOrder;
01588           bornRad[k] = pr.bornRad[j];
01589         }
01590       } else if (gbisPhase == 3) {
01591         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01592         for ( int k=0; k<pr.numAtoms; ++k ) {
01593           int j = pr.xExt[k].sortOrder;
01594           dHdrPrefix[k] = pr.dHdrPrefix[j];
01595         }
01596       } // end phases
01597     } // end for patches
01598   } // if GBISOn
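  // GBIS staging summary: each phase copies one per-atom quantity into a host array,
  // again in CUDA sort order (j = sortOrder):
  //   phase 1: intrinsic radii (intRad0H / intRadSH), only when atoms changed;
  //   phase 2: Born radii (bornRadH) produced by the previous phase;
  //   phase 3: dH/dr prefix terms (dHdrPrefixH) produced by phase 2.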
01599 
01600   kernel_time = CkWallTimer();
01601   kernel_launch_state = 1;
01602   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01603   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01604 }
01605 
01606 void cuda_check_remote_calc(void *arg, double walltime) {
01607   // in theory we only need end_remote_calc, but overlap isn't reliable
01608   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01609   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01610 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01611     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01612 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01613   } else {
01614     CcdCallBacksReset(0,walltime);  // fix Charm++
01615     CUDA_POLL(cuda_check_remote_calc, arg);
01616   }
01617 }
01618 
01619 void cuda_check_local_calc(void *arg, double walltime) {
01620   // in theory we only need end_local_calc, but overlap isn't reliable
01621   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01622   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01623 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01624     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01625 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01626   } else {
01627     CcdCallBacksReset(0,walltime);  // fix Charm++
01628     CUDA_POLL(cuda_check_local_calc, arg);
01629   }
01630 }
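// Both poll callbacks above follow the same idiom: cudaEventQuery() is a non-blocking
// completion test, so rather than stalling the Charm++ scheduler with
// cudaEventSynchronize() the callback re-registers itself via CUDA_POLL until the
// download event reports cudaSuccess.  A minimal sketch of the pattern, using a
// hypothetical event and completion handler (not part of this file):
//
//   void check_done(void *arg, double walltime) {
//     if ( cudaEventQuery(some_event) == cudaSuccess ) {
//       handle_completion(arg);            // GPU work tied to this event has finished
//     } else {
//       CcdCallBacksReset(0, walltime);    // fix Charm++ callback bookkeeping
//       CUDA_POLL(check_done, arg);        // retry on a later scheduler pass
//     }
//   }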
01631 
01632 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01633 
01634 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01635 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01636 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01637 workStarted, gbisPhase, kernel_launch_state, pe)
01638 
01639   float3 lata, latb, latc;
01640   lata.x = lattice.a().x;
01641   lata.y = lattice.a().y;
01642   lata.z = lattice.a().z;
01643   latb.x = lattice.b().x;
01644   latb.y = lattice.b().y;
01645   latb.z = lattice.b().z;
01646   latc.x = lattice.c().x;
01647   latc.y = lattice.c().y;
01648   latc.z = lattice.c().z;
01649   SimParameters *simParams = Node::Object()->simParameters;
01650 
01651   // Set GPU device ID
01652   cudaSetDevice(deviceID);
01653 
01654   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01655 
01656   double walltime;
01657   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01658     walltime = CkWallTimer();
01659     CcdCallBacksReset(0,walltime);  // fix Charm++
01660   }
01661 
01662   switch ( kernel_launch_state ) {
01664 // Remote
01665   case 1:
01666 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01667     ++kernel_launch_state;
01668     //gpu_is_mine = 0;
01669     deviceCUDA->setGpuIsMine(0);
01670     remote_submit_time = walltime;
01671 
01672     if (!simParams->GBISOn || gbisPhase == 1) {
01673       // cudaEventRecord(start_upload, stream);
01674       if ( atomsChanged ) {
01675         cuda_bind_atom_params(atom_params);
01676         cuda_bind_vdw_types(vdw_types);
01677       }
01678       if ( simParams->GBISOn) {
01679         cuda_bind_GBIS_psiSum(psiSumH);
01680         if ( atomsChanged ) {
01681           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01682         }
01683       }
01684       atomsChanged = 0;
01685       cuda_bind_atoms((const atom *)atoms);
01686       cuda_bind_forces(forces, slow_forces);
01687       cuda_bind_virials(virials, force_ready_queue, block_order);
01688       if ( simParams->GBISOn) {
01689         cuda_bind_GBIS_energy(energy_gbis);
01690       }
01691       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01692       //call CUDA Kernels
01693 
01694       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01695                             0,remoteComputeRecords.size(),
01696                             remoteComputeRecords.size()+localComputeRecords.size(),
01697                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
01698       if (simParams->GBISOn) {
01699         cuda_GBIS_P1(
01700           0,remoteComputeRecords.size(),
01701                       localActivePatches.size(),remoteActivePatches.size(),
01702                       simParams->alpha_cutoff-simParams->fsMax,
01703                       simParams->coulomb_radius_offset,
01704                       lata, latb, latc, stream);
01705       }
01706       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01707       //    num_local_atom_records,num_remote_atom_records);
01708       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01709         cudaEventRecord(end_remote_download, stream);
01710       }
01711       if ( streaming ) {
01712         force_ready_queue_next = 0;
01713         CUDA_POLL(cuda_check_progress,this);
01714       } else {
01715         CUDA_POLL(cuda_check_remote_progress,this);
01716       }
01717       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01718         CUDA_POLL(cuda_check_remote_calc,this);
01719         break;
01720       }
01721     } // !GBIS or gbisPhase==1
01722     if (simParams->GBISOn) {
01723       if (gbisPhase == 1) {
01724         //GBIS P1 Kernel launched in previous code block
01725       } else if (gbisPhase == 2) {
01726 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01727         // cudaEventRecord(start_upload, stream);
01728         cuda_bind_GBIS_bornRad(bornRadH);
01729         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01730         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01731         cuda_GBIS_P2(
01732           0,remoteComputeRecords.size(),
01733           localActivePatches.size(),remoteActivePatches.size(),
01734           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01735           simParams->nonbondedScaling, simParams->kappa,
01736           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01737           simParams->dielectric, simParams->solvent_dielectric,
01738           lata, latb, latc,
01739           doEnergy, doSlow, stream
01740           );
01741         cudaEventRecord(end_remote_download, stream);
01742         CUDA_POLL(cuda_check_remote_progress,this);
01743       } else if (gbisPhase == 3) {
01744 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01745         // cudaEventRecord(start_upload, stream);
01746         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01747         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01748         if (doSlow)
01749           cuda_GBIS_P3(
01750           0,remoteComputeRecords.size(),
01751           localActivePatches.size(),remoteActivePatches.size(),
01752           (simParams->alpha_cutoff-simParams->fsMax),
01753           simParams->coulomb_radius_offset,
01754           simParams->nonbondedScaling,
01755           lata, latb, latc, stream
01756           );
01757         cudaEventRecord(end_remote_download, stream);
01758         CUDA_POLL(cuda_check_remote_progress,this);
01759       }
01760     }
01761 
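// Note: on the non-shared-GPU path there is no break at the end of case 1, so after the
// remote kernels are queued execution falls straight through to case 2 and the local
// kernels are queued on stream2 in the same call.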
01763 // Local
01764   case 2:
01765 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01766     ++kernel_launch_state;
01767     //gpu_is_mine = 0;
01768     deviceCUDA->setGpuIsMine(0);
01769 
01770     if ( stream2 != stream ) {
01771       // needed to ensure that upload is finished
01772       cudaStreamWaitEvent(stream2, start_calc, 0);
01773       // priorities do not prevent local from launching ahead
01774       // of remote, so delay local stream with a small memset
01775       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
01776     }
01777 
01778     if (!simParams->GBISOn || gbisPhase == 1) {
01779 
01780       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01781                             remoteComputeRecords.size(),localComputeRecords.size(),
01782                             remoteComputeRecords.size()+localComputeRecords.size(),
01783                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
01784       if (simParams->GBISOn) {
01785         cuda_GBIS_P1(
01786                      remoteComputeRecords.size(),localComputeRecords.size(),
01787                      0,localActivePatches.size(),
01788                      simParams->alpha_cutoff-simParams->fsMax,
01789                      simParams->coulomb_radius_offset,
01790                      lata, latb, latc, stream2 );
01791       }
01792       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01793       //    0,num_local_atom_records);
01794       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01795       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01796         cudaEventRecord(end_local_download, stream2);
01797       }
01798       if ( ! streaming && workStarted == 2 ) {
01799         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01800 cuda_check_local_progress\n", CkMyPe())
01801         CUDA_POLL(cuda_check_local_progress,this);
01802       }
01803       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01804         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01805 cuda_check_local_calc\n", CkMyPe())
01806         CUDA_POLL(cuda_check_local_calc,this);
01807         break;
01808       }
01809 
01810     } // !GBIS or gbisPhase==1
01811     if (simParams->GBISOn) {
01812       if (gbisPhase == 1) {
01813         //GBIS P1 Kernel launched in previous code block
01814       } else if (gbisPhase == 2) {
01815 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01816         cuda_GBIS_P2(
01817           remoteComputeRecords.size(),localComputeRecords.size(),
01818           0,localActivePatches.size(),
01819           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01820           simParams->nonbondedScaling, simParams->kappa,
01821           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01822           simParams->dielectric, simParams->solvent_dielectric,
01823           lata, latb, latc,
01824           doEnergy, doSlow, stream2
01825           );
01826         cudaEventRecord(end_local_download, stream2);
01827         if ( workStarted == 2 ) {
01828           CUDA_POLL(cuda_check_local_progress,this);
01829         }
01830       } else if (gbisPhase == 3) {
01831 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01832         if (doSlow)
01833           cuda_GBIS_P3(
01834           remoteComputeRecords.size(),localComputeRecords.size(),
01835           0,localActivePatches.size(),
01836           (simParams->alpha_cutoff-simParams->fsMax),
01837           simParams->coulomb_radius_offset,
01838           simParams->nonbondedScaling,
01839           lata, latb, latc, stream2
01840           );
01841         cudaEventRecord(end_local_download, stream2);
01842         if ( workStarted == 2 ) {
01843           CUDA_POLL(cuda_check_local_progress,this);
01844         }
01845       } // phases
01846     } // GBISOn
01847     if ( simParams->PMEOn && simParams->PMEOffload  && !simParams->usePMECUDA) break;
01848 
01849   default:
01850 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01851     //gpu_is_mine = 1;
01852     deviceCUDA->setGpuIsMine(1);
01853     break;
01854   } // switch
01855 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01856 }
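// Summary of the kernel_launch_state sequence driven by recvYieldDevice():
//   state 1: queue the "remote" work (results other nodes wait on) on stream; unless the
//            GPU is shared with another PE this falls through to state 2.
//   state 2: queue the "local" work on stream2, held back behind the remote upload by
//            cudaStreamWaitEvent(start_calc) plus a small dummy memset.
//   default: everything for this step has been launched, so the GPU is handed back via
//            deviceCUDA->setGpuIsMine(1).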
01857 
01858 void ComputeNonbondedCUDA::messageFinishPatch(int flindex) {
01859   int pid = activePatches[flindex];
01860   patch_record &pr = patchRecords[pid];
01861   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
01862   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
01863 }
01864 
01865 void ComputeNonbondedCUDA::finishPatch(int flindex) {
01866   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
01867   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
01868   pr.r = pr.forceBox->open();
01869   finishPatch(pr);
01870   pr.positionBox->close(&(pr.x));
01871   pr.forceBox->close(&(pr.r));
01872 }
01873 
01874 void ComputeNonbondedCUDA::finishPatch(patch_record &pr) {
01875   int start = pr.localStart;
01876   const CompAtomExt *aExt = pr.xExt;
01877   int nfree = pr.numAtoms;
01878   pr.f = pr.r->f[Results::nbond];
01879   Force *f = pr.f;
01880   Force *f_slow = pr.r->f[Results::slow];
01881   const CompAtom *a = pr.x;
01882   // int *ao = atom_order + start;
01883   float4 *af = master->forces + start;
01884   float4 *af_slow = master->slow_forces + start;
01885   // only free atoms return forces
01886   for ( int k=0; k<nfree; ++k ) {
01887     int j = aExt[k].sortOrder;
01888     f[j].x += af[k].x;
01889     f[j].y += af[k].y;
01890     f[j].z += af[k].z;
01891     // wcount += af[k].w;
01892     // virial += af[k].w;
01893     if ( doSlow ) {
01894       f_slow[j].x += af_slow[k].x;
01895       f_slow[j].y += af_slow[k].y;
01896       f_slow[j].z += af_slow[k].z;
01897       // virial_slow += af_slow[k].w;
01898     }
01899   }
01900 }
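// finishPatch() undoes the CUDA sort when accumulating results: device slot k of the
// float4 force arrays is added into patch atom j = aExt[k].sortOrder, always for the
// nonbonded forces and additionally for the slow (full electrostatic) forces when
// doSlow is set.  The commented-out .w accumulations appear to be leftover per-atom
// virial/diagnostic hooks.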
01901 
01902 //dtanner
01903 int ComputeNonbondedCUDA::finishWork() {
01904   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
01905 
01906   if ( master == this ) {
01907     for ( int i = 0; i < numSlaves; ++i ) {
01908       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01909     }
01910   }
01911 
01912 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01913 CkMyPe(), workStarted, gbisPhase)
01914 
01915   Molecule *mol = Node::Object()->molecule;
01916   SimParameters *simParams = Node::Object()->simParameters;
01917 
01918   ResizeArray<int> &patches( workStarted == 1 ?
01919                                 remoteHostedPatches : localHostedPatches );
01920 
01921   // long long int wcount = 0;
01922   // double virial = 0.;
01923   // double virial_slow = 0.;
01924   for ( int i=0; i<patches.size(); ++i ) {
01925     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01926     patch_record &pr = master->patchRecords[patches[i]];
01927 
01928     if ( !simParams->GBISOn || gbisPhase == 1 ) {
01929       patch_record &pr = master->patchRecords[patches[i]];
01930 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01931       pr.r = pr.forceBox->open();
01932     } // !GBISOn || gbisPhase==1
01933 
01934     int start = pr.localStart;
01935     const CompAtomExt *aExt = pr.xExt;
01936     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01937       finishPatch(pr);
01938     } // !GBISOn || gbisPhase == 3
01939 
01940 #if 0
01941     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01942       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01943         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01944         af[j].w);
01945     }
01946 #endif
01947 
01948     //Close Boxes depending on Phase
01949     if (simParams->GBISOn) {
01950       if (gbisPhase == 1) {
01951         //Copy psiSum from Host to Patch Box
01952         GBReal *psiSumMaster = master->psiSumH + start;
01953         for ( int k=0; k<pr.numAtoms; ++k ) {
01954           int j = aExt[k].sortOrder;
01955           pr.psiSum[j] += psiSumMaster[k];
01956         }
01957 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01958         pr.psiSumBox->close(&(pr.psiSum));
01959 
01960       } else if (gbisPhase == 2) {
01961         //Copy dEdaSum from Host to Patch Box
01962         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01963         for ( int k=0; k<pr.numAtoms; ++k ) {
01964           int j = aExt[k].sortOrder;
01965           pr.dEdaSum[j] += dEdaSumMaster[k];
01966         }
01967 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01968         pr.dEdaSumBox->close(&(pr.dEdaSum));
01969 
01970       } else if (gbisPhase == 3) {
01971 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01972         pr.intRadBox->close(&(pr.intRad)); //box 6
01973         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01974         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01975         pr.positionBox->close(&(pr.x)); //box 0
01976         pr.forceBox->close(&(pr.r));
01977       } //end phases
01978     } else { //not GBIS
01979 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01980       pr.positionBox->close(&(pr.x));
01981       pr.forceBox->close(&(pr.r));
01982     }
01983   }// end for
01984 
01985 #if 0
01986   virial *= (-1./6.);
01987   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01988   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01989   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01990   if ( doSlow ) {
01991     virial_slow *= (-1./6.);
01992     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01993     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01994     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01995   }
01996 #endif
01997 
01998   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
01999        ( localHostedPatches.size() || master == this ) ) {
02000     GBISP("not finished, call again\n");
02001     return 0;  // not finished, call again
02002   }
02003 
02004   if ( master != this ) {  // finished
02005     GBISP("finished\n");
02006     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02007     atomsChanged = 0;
02008     return 1;
02009   }
02010 
02011   cuda_timer_total += kernel_time;
02012 
02013   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02014 
02015     atomsChanged = 0;
02016     finishReductions();
02017 
02018   } // !GBISOn || gbisPhase==3  
02019 
02020   // Next GBIS Phase
02021 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02022     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02023 
02024   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02025   return 1;  // finished and ready for next step
02026 }
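// finishWork() return semantics: 0 means "remote half handled, call again for the local
// half" (only when workStarted == 1, grids are not merged, and this PE still has local
// patches or is the master); 1 means the step is complete for this compute.  The GBIS
// phase counter cycles 1 -> 2 -> 3 -> 1, and on the master finishReductions() runs only
// after phase 3 (or on every step when GBIS is off).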
02027 
02028 
02029 void ComputeNonbondedCUDA::finishReductions() {
02030   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02031 
02032   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02033 
02034   SimParameters *simParams = Node::Object()->simParameters;
02035 
02036   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02037     if ( patchPairsReordered++ == 2) {
02038       int patch_len = patchRecords.size();
02039       ResizeArray<int> plast(patch_len);
02040       for ( int i=0; i<patch_len; ++i ) {
02041         plast[i] = -1;
02042       }
02043       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02044       for ( int i=0; i<order_len; ++i ) {
02045         plast[computeRecords[block_order[i]].pid[0]] = i;
02046         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02047       }
02048       iout << endi;
02049       for ( int i=0; i<patch_len; ++i ) {
02050         iout << "patch_last " << i << " " << plast[i] << "\n";
02051       }
02052       iout << endi;
02053     }
02054   }
02055 
02056   {
02057     Tensor virial_tensor;
02058     BigReal energyv = 0.;
02059     BigReal energye = 0.;
02060     BigReal energys = 0.;
02061     int nexcluded = 0;
02062     for ( int i = 0; i < num_virials; ++i ) {
02063       virial_tensor.xx += virials[16*i];
02064       virial_tensor.xy += virials[16*i+1];
02065       virial_tensor.xz += virials[16*i+2];
02066       virial_tensor.yx += virials[16*i+3];
02067       virial_tensor.yy += virials[16*i+4];
02068       virial_tensor.yz += virials[16*i+5];
02069       virial_tensor.zx += virials[16*i+6];
02070       virial_tensor.zy += virials[16*i+7];
02071       virial_tensor.zz += virials[16*i+8];
02072       energyv += virials[16*i+9];
02073       energye += virials[16*i+10];
02074       energys += virials[16*i+11];
02075       nexcluded += ((int *)virials)[16*i+12];
02076       if (simParams->GBISOn) {
02077         energye += energy_gbis[i];
02078       }
02079     }
02080     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02081     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02082     if ( doEnergy ) {
02083       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02084       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02085       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02086     }
02087   }
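  // Layout implied by the loop above: each of the num_virials records is 16 floats
  // wide; slots 0-8 hold the 3x3 virial tensor, slots 9-11 the van der Waals,
  // electrostatic and slow electrostatic energies, and slot 12 the exclusion checksum
  // stored as an int.  GBIS self energies arrive separately in energy_gbis[] and are
  // folded into the electrostatic term.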
02088   if ( doSlow ) {
02089     Tensor virial_slow_tensor;
02090     for ( int i = 0; i < num_virials; ++i ) {
02091       virial_slow_tensor.xx += slow_virials[16*i];
02092       virial_slow_tensor.xy += slow_virials[16*i+1];
02093       virial_slow_tensor.xz += slow_virials[16*i+2];
02094       virial_slow_tensor.yx += slow_virials[16*i+3];
02095       virial_slow_tensor.yy += slow_virials[16*i+4];
02096       virial_slow_tensor.yz += slow_virials[16*i+5];
02097       virial_slow_tensor.zx += slow_virials[16*i+6];
02098       virial_slow_tensor.zy += slow_virials[16*i+7];
02099       virial_slow_tensor.zz += slow_virials[16*i+8];
02100     }
02101     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02102   }
02103 
02104   reduction->submit();
02105 
02106   cuda_timer_count++;
02107   if ( simParams->outputCudaTiming &&
02108         step % simParams->outputCudaTiming == 0 ) {
02109 
02110     // int natoms = mol->numAtoms; 
02111     // double wpa = wcount;  wpa /= natoms;
02112 
02113     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02114     //  kernel_time * 1.e3, time * 1.e3, wpa);
02115 
02116 #if 0
02117     float upload_ms, remote_calc_ms;
02118     float local_calc_ms, total_ms;
02119     cuda_errcheck("before event timers");
02120     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02121     cuda_errcheck("in event timer 1");
02122     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02123     cuda_errcheck("in event timer 2");
02124     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02125     cuda_errcheck("in event timer 3");
02126     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02127     cuda_errcheck("in event timer 4");
02128     cuda_errcheck("in event timers");
02129 
02130     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02131              CkMyPe(), upload_ms, remote_calc_ms,
02132              local_calc_ms, total_ms);
02133 #endif
02134 
02135     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02136       cuda_timer_total /= cuda_timer_count;
02137       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02138                step, cuda_timer_total * 1.e3, CkMyPe());
02139     }
02140     cuda_timer_count = 0;
02141     cuda_timer_total = 0;
02142   }
02143 
02144 }
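// Timing note: cuda_timer_total accumulates the per-step kernel_time and
// cuda_timer_count counts the contributing steps; when outputCudaTiming is set, the
// mean ms/step is printed once the count reaches that interval on an output step, and
// both counters are reset.  The #if 0 block shows how the recorded CUDA events could be
// used for a finer upload / remote / local breakdown.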
02145 
02146 
02147 #endif  // NAMD_CUDA
02148 
