ComputeNonbondedCUDA.C

00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ProxyMgr.h"
00013 #include "ComputeNonbondedCUDAKernel.h"
00014 #include "ComputeNonbondedCUDA.h"
00015 #include "LJTable.h"
00016 #include "ObjectArena.h"
00017 #include "SortAtoms.h"
00018 #include "Priorities.h"
00019 #include <algorithm>
00020 
00021 #include "NamdTypes.h"
00022 #include "DeviceCUDA.h"
00023 #include "CudaUtils.h"
00024 
00025 //#define PRINT_GBIS
00026 #undef PRINT_GBIS
00027 
00028 #ifdef NAMD_CUDA
00029 
00030 #ifdef WIN32
00031 #define __thread __declspec(thread)
00032 #endif
00033 
00034 extern __thread int max_grid_size;
00035 
00036 extern __thread cudaStream_t stream;
00037 extern __thread cudaStream_t stream2;
00038 
00039 extern __thread DeviceCUDA *deviceCUDA;
00040 
00041 void cuda_errcheck(const char *msg) {
00042   cudaError_t err;
00043   if ((err = cudaGetLastError()) != cudaSuccess) {
00044     char host[128];
00045     gethostname(host, 128);  host[127] = 0;
00046     char devstr[128] = "";
00047     int devnum;
00048     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00049       sprintf(devstr, " device %d", devnum);
00050     }
00051     cudaDeviceProp deviceProp;
00052     if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
00053       sprintf(devstr, " device %d pci %x:%x:%x", devnum,
00054         deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00055     }
00056     char errmsg[1024];
00057     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00058     NAMD_die(errmsg);
00059   }
00060 }
00061 
00062 static inline bool sortop_bitreverse(int a, int b) {
00063   if ( a == b ) return 0; 
00064   for ( int bit = 1; bit; bit *= 2 ) {
00065     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00066   }
00067   return 0;
00068 }
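Comparing two integers by their lowest differing bit, as sortop_bitreverse does, orders them as if their bits were reversed (least-significant bit first), which interleaves low- and high-numbered ranks. A minimal standalone sketch (editorial, not part of this file) showing the resulting order for 0..7:

#include <algorithm>
#include <cstdio>

static bool bitrev_less(int a, int b) {  // same comparison as sortop_bitreverse above
  if ( a == b ) return false;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
  }
  return false;
}

int main() {
  int v[8] = {0,1,2,3,4,5,6,7};
  std::sort(v, v+8, bitrev_less);
  for ( int i=0; i<8; ++i ) printf("%d ", v[i]);  // prints: 0 4 2 6 1 5 3 7
  printf("\n");
  return 0;
}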
00069 
00070 static __thread ComputeNonbondedCUDA* cudaCompute = 0;
00071 static __thread ComputeMgr *computeMgr = 0;
00072 
00073 void send_build_cuda_force_table() {
00074   computeMgr->sendBuildCudaForceTable();
00075 }
00076 
00077 void build_cuda_force_table() {
00078   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00079   ComputeNonbondedCUDA::build_lj_table();
00080   ComputeNonbondedCUDA::build_force_table();
00081 }
00082 
00083 void ComputeNonbondedCUDA::build_lj_table() {  // static
00084 
00085   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00086   const int dim = ljTable->get_table_dim();
00087 
00088   // round dim up to odd multiple of 16
00089   int tsize = (((dim+16+31)/32)*32)-16;
00090   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00091 
00092   float2 *t = new float2[tsize*tsize];
00093   float2 *row = t;
00094   for ( int i=0; i<dim; ++i, row += tsize ) {
00095     for ( int j=0; j<dim; ++j ) {
00096       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00097       row[j].x = e->A * scaling;
00098       row[j].y = e->B * scaling;
00099     }
00100   }
00101 
00102   cuda_bind_lj_table(t,tsize);
00103   delete [] t;
00104 
00105   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00106     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00107   }
00108 }
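The padding arithmetic above rounds the LJ table dimension up to the next odd multiple of 16 (16, 48, 80, ...), as the comment says. A few worked values of the expression (editorial note):

  // tsize = (((dim+16+31)/32)*32) - 16
  //   dim = 16  ->  ((63/32)*32) - 16 = 16
  //   dim = 20  ->  ((67/32)*32) - 16 = 48
  //   dim = 50  ->  ((97/32)*32) - 16 = 80

Only the top-left dim x dim block of the padded tsize x tsize array is filled; the remaining entries are left uninitialized and should never be indexed by a valid vdW type pair.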
00109 
00110 void ComputeNonbondedCUDA::build_force_table() {  // static
00111 
00112   float4 t[FORCE_TABLE_SIZE];
00113   float4 et[FORCE_TABLE_SIZE];  // energy table
00114 
00115   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00116   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00117   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00118   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00119 
00120   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00121   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00122     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00123     r2list[i] = r*r + r2_delta;
00124   }
00125 
00126   union { double f; int32 i[2]; } byte_order_test;
00127   byte_order_test.f = 1.0;  // should occupy high-order bits only
00128   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00129 
00130   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00131     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00132     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00133 
00134     if ( r > cutoff ) {
00135       t[i].x = 0.;
00136       t[i].y = 0.;
00137       t[i].z = 0.;
00138       t[i].w = 0.;
00139       et[i].x = 0.;
00140       et[i].y = 0.;
00141       et[i].z = 0.;
00142       et[i].w = 0.;
00143       continue;
00144     }
00145 
00146     BigReal diffa = r2list[i] - r2_table[table_i];
00147 
00148     // coulomb 1/r or fast force
00149     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00150     {
00151       BigReal table_a = fast_table[4*table_i];
00152       BigReal table_b = fast_table[4*table_i+1];
00153       BigReal table_c = fast_table[4*table_i+2];
00154       BigReal table_d = fast_table[4*table_i+3];
00155       BigReal grad =
00156                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00157       t[i].x = 2. * grad;
00158       BigReal ener = table_a + diffa *
00159                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00160       et[i].x = ener;
00161     }
00162 
00163 
00164     // pme correction for slow force
00165     // t[i].w = 0.;
00166     {
00167       BigReal table_a = scor_table[4*table_i];
00168       BigReal table_b = scor_table[4*table_i+1];
00169       BigReal table_c = scor_table[4*table_i+2];
00170       BigReal table_d = scor_table[4*table_i+3];
00171       BigReal grad =
00172                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00173       t[i].w = 2. * grad;
00174       BigReal ener = table_a + diffa *
00175                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00176       et[i].w = ener;
00177     }
00178 
00179 
00180     // vdw 1/r^6
00181     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00182     {
00183       BigReal table_a = vdwb_table[4*table_i];
00184       BigReal table_b = vdwb_table[4*table_i+1];
00185       BigReal table_c = vdwb_table[4*table_i+2];
00186       BigReal table_d = vdwb_table[4*table_i+3];
00187       BigReal grad =
00188                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00189       t[i].y = 2. * -1. * grad;
00190       BigReal ener = table_a + diffa *
00191                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00192       et[i].y = -1. * ener;
00193     }
00194 
00195 
00196     // vdw 1/r^12
00197     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00198     {
00199       BigReal table_a = vdwa_table[4*table_i];
00200       BigReal table_b = vdwa_table[4*table_i+1];
00201       BigReal table_c = vdwa_table[4*table_i+2];
00202       BigReal table_d = vdwa_table[4*table_i+3];
00203       BigReal grad =
00204                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00205       t[i].z = 2. * grad;
00206       BigReal ener = table_a + diffa *
00207                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00208       et[i].z = ener;
00209     }
00210 
00211     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00212     //   t[i].x, t[i].y, t[i].z, t[i].w);
00213 
00214 /*
00215     double r2 = r * r;
00216     double r4 = r2 * r2;
00217     double r8 = r4 * r4;
00218 
00219     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00220     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00221     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00222     t[i].w = 0.;
00223 */
00224   }
00225 
00226   t[0].x = 0.f;
00227   t[0].y = 0.f;
00228   t[0].z = 0.f;
00229   t[0].w = 0.f;
00230   et[0].x = et[1].x;
00231   et[0].y = et[1].y;
00232   et[0].z = et[1].z;
00233   et[0].w = et[1].w;
00234 
00235   cuda_bind_force_table(t,et);
00236 
00237   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00238     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00239   }
00240 }
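Each of the four interaction tables above is evaluated the same way: a cubic polynomial in diffa = r2list[i] - r2_table[table_i], with coefficients a, b, c, d stored consecutively starting at 4*table_i. A compact sketch of that evaluation (editorial; eval_table is a hypothetical helper, not part of NAMD):

static inline void eval_table(const BigReal *tbl, int table_i, BigReal diffa,
                              BigReal &ener, BigReal &grad) {
  BigReal a = tbl[4*table_i],   b = tbl[4*table_i+1];
  BigReal c = tbl[4*table_i+2], d = tbl[4*table_i+3];
  ener = a + ( ( d*diffa + c )*diffa + b )*diffa;   // spline value
  grad = ( 3.*d*diffa + 2.*c )*diffa + b;           // derivative with respect to r^2
}

The force entries store 2.*grad because (1/r) dU/dr = 2 dU/d(r^2), matching the reference formulas in the commented-out block; each table's sign convention is handled explicitly in the loops above (the 1/r^6 entries are negated).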
00241 
00242 struct exlist_sortop {
00243   bool operator() (int32 *li, int32 *lj) {
00244     return ( li[1] < lj[1] );
00245   }
00246 };
00247 
00248 void build_cuda_exclusions() {
00249   if ( deviceCUDA->getMasterPe() != CkMyPe() ) return;
00250   ComputeNonbondedCUDA::build_exclusions();
00251 }
00252 
00253 static __thread int2 *exclusionsByAtom;
00254 
00255 void ComputeNonbondedCUDA::build_exclusions() {
00256   Molecule *mol = Node::Object()->molecule;
00257 
00258 #ifdef MEM_OPT_VERSION
00259   int natoms = mol->exclSigPoolSize;
00260 #else
00261   int natoms = mol->numAtoms; 
00262 #endif
00263 
00264   delete [] exclusionsByAtom;
00265   exclusionsByAtom = new int2[natoms];
00266 
00267   // create unique sorted lists
00268 
00269   ObjectArena<int32> listArena;
00270   ResizeArray<int32*> unique_lists;
00271   int32 **listsByAtom = new int32*[natoms];
00272   SortableResizeArray<int32> curList;
00273   for ( int i=0; i<natoms; ++i ) {
00274     curList.resize(0);
00275     curList.add(0);  // always excluded from self
00276 #ifdef MEM_OPT_VERSION
00277     const ExclusionSignature *sig = mol->exclSigPool + i;
00278     int n = sig->fullExclCnt;
00279     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00280     n += 1;
00281 #else
00282     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00283     int n = mol_list[0] + 1;
00284     for ( int j=1; j<n; ++j ) {
00285       curList.add(mol_list[j] - i);
00286     }
00287 #endif
00288     curList.sort();
00289 
00290     int j;
00291     for ( j=0; j<unique_lists.size(); ++j ) {
00292       if ( n != unique_lists[j][0] ) continue;  // no match
00293       int k;
00294       for ( k=0; k<n; ++k ) {
00295         if ( unique_lists[j][k+3] != curList[k] ) break;
00296       }
00297       if ( k == n ) break;  // found match
00298     }
00299     if ( j == unique_lists.size() ) {  // no match
00300       int32 *list = listArena.getNewArray(n+3);
00301       list[0] = n;
00302       int maxdiff = 0;
00303       maxdiff = -1 * curList[0];
00304       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00305       list[1] = maxdiff;
00306       for ( int k=0; k<n; ++k ) {
00307         list[k+3] = curList[k];
00308       }
00309       unique_lists.add(list);
00310     }
00311     listsByAtom[i] = unique_lists[j];
00312   }
00313   // sort lists by maxdiff
00314   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00315   long int totalbits = 0;
00316   int nlists = unique_lists.size();
00317   for ( int j=0; j<nlists; ++j ) {
00318     int32 *list = unique_lists[j];
00319     int maxdiff = list[1];
00320     list[2] = totalbits + maxdiff;
00321     totalbits += 2*maxdiff + 1;
00322   }
00323   for ( int i=0; i<natoms; ++i ) {
00324     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00325     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00326   }
00327   delete [] listsByAtom;
00328 
00329   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00330 
00331   {
00332     long int bytesneeded = totalbits / 8;
00333     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00334     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00335                 unique_lists.size(), bytesneeded);
00336     }
00337 
00338     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00339     if ( bytesneeded > bytesavail ) {
00340       char errmsg[512];
00341       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00342                      "but only %ld bytes can be addressed with 32-bit int.",
00343                      unique_lists.size(), bytesneeded, bytesavail);
00344       NAMD_die(errmsg);
00345     }
00346   }
00347 
00348 #define SET_EXCL(EXCL,BASE,DIFF) \
00349          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00350 
00351   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00352   memset(exclusion_bits, 0, totalbits/8);
00353 
00354   long int base = 0;
00355   for ( int i=0; i<unique_lists.size(); ++i ) {
00356     base += unique_lists[i][1];
00357     if ( unique_lists[i][2] != (int32)base ) {
00358       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00359     }
00360     int n = unique_lists[i][0];
00361     for ( int j=0; j<n; ++j ) {
00362       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00363     }
00364     base += unique_lists[i][1] + 1;
00365   }
00366 
00367   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00368 
00369   delete [] exclusion_bits;
00370 }
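To summarize the encoding built above (editorial note): each unique exclusion list is stored as [n, maxdiff, base, diff_0 ... diff_{n-1}], where the diffs are signed offsets j - i of excluded partners relative to atom i and base is the bit position reserved for diff = 0, so a list's bits occupy [base - maxdiff, base + maxdiff]. Per atom only (maxdiff, base) survive, in exclusionsByAtom[i]. SET_EXCL sets one bit per excluded offset in the flat word array:

  // word index = (base + diff) >> 5,  bit index = (base + diff) & 31
  // a lookup of the same shape (the actual test lives in the CUDA kernel, not in this file):
  //   bool excluded = exclusion_bits[(base+diff)>>5] & (1u << ((base+diff)&31));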
00371 
00372 
00373 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00374 
00375   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00376 
00377   cudaCompute->requirePatch(pid);
00378 
00379   ComputeNonbondedCUDA::compute_record cr;
00380   cr.c = c;
00381   cr.pid[0] = pid;  cr.pid[1] = pid;
00382   cr.offset = 0.;
00383   if ( cudaCompute->patchRecords[pid].isLocal ) {
00384     cudaCompute->localComputeRecords.add(cr);
00385   } else {
00386     cudaCompute->remoteComputeRecords.add(cr);
00387   }
00388 }
00389 
00390 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00391 
00392   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00393  
00394   cudaCompute->requirePatch(pid[0]);
00395   cudaCompute->requirePatch(pid[1]);
00396 
00397   ComputeNonbondedCUDA::compute_record cr;
00398   cr.c = c; 
00399   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00400 
00401   int t1 = t[0];
00402   int t2 = t[1];
00403   Vector offset = cudaCompute->patchMap->center(pid[0])
00404                 - cudaCompute->patchMap->center(pid[1]);
00405   offset.x += (t1%3-1) - (t2%3-1);
00406   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00407   offset.z += (t1/9-1) - (t2/9-1);
00408   cr.offset = offset;
00409 
00410   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00411     cudaCompute->localComputeRecords.add(cr);
00412   } else {
00413     cudaCompute->remoteComputeRecords.add(cr);
00414   }
00415 }
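The transform codes t[0] and t[1] appear to index the 27 periodic images, with each shift component in {-1, 0, +1} recovered as (t%3 - 1, (t/3)%3 - 1, t/9 - 1); t = 13 decodes to (0,0,0), the untranslated image. A small decode sketch (editorial; decode_transform is hypothetical):

static inline void decode_transform(int t, int &dx, int &dy, int &dz) {
  dx = t % 3 - 1;         // e.g. t = 14 -> (+1, 0, 0)
  dy = (t / 3) % 3 - 1;
  dz = t / 9 - 1;
}

The pair offset stored in cr.offset is then the difference of the two patch centers plus the difference of the two decoded image shifts, exactly as the offset.x/y/z lines above compute.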
00416 
00417 void unregister_cuda_compute(ComputeID c) {  // static
00418 
00419   NAMD_bug("unregister_compute unimplemented");
00420 
00421 }
00422 
00423 // static __thread cudaEvent_t start_upload;
00424 static __thread cudaEvent_t start_calc;
00425 static __thread cudaEvent_t end_remote_download;
00426 static __thread cudaEvent_t end_local_download;
00427 
00428 static __thread ResizeArray<patch_pair> *patch_pairs_ptr;
00429 static __thread ResizeArray<int> *patch_pair_num_ptr;
00430 
00431 void init_arrays();
00432 
00433 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr,
00434                                            ComputeNonbondedCUDA *m, int idx) : Compute(c), slaveIndex(idx) {
00435 #ifdef PRINT_GBIS
00436    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00437 #endif
00438 
00439   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00440   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00441   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00442 
00443   // CkPrintf("create ComputeNonbondedCUDA\n");
00444   master = m ? m : this;
00445   cudaCompute = this;
00446   computeMgr = mgr;
00447   patchMap = PatchMap::Object();
00448   atomMap = AtomMap::Object();
00449   reduction = 0;
00450 
00451   SimParameters *params = Node::Object()->simParameters;
00452   if (params->pressureProfileOn) {
00453     NAMD_die("pressure profile not supported in CUDA");
00454   }
00455 
00456   atomsChanged = 1;
00457   computesChanged = 1;
00458   patchPairsReordered = 1;
00459 
00460   pairlistsValid = 0;
00461   pairlistTolerance = 0.;
00462   usePairlists = 0;
00463   savePairlists = 0;
00464   plcutoff2 = 0.;
00465 
00466   workStarted = 0;
00467   basePriority = PROXY_DATA_PRIORITY;
00468   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00469 
00470   // Zero array sizes and pointers
00471   init_arrays();
00472 
00473   atoms_size = 0;
00474   atoms = NULL;
00475 
00476   forces_size = 0;
00477   forces = NULL;
00478   
00479   slow_forces_size = 0;
00480   slow_forces = NULL;
00481 
00482   psiSumH_size = 0;
00483   psiSumH = NULL;
00484 
00485   dEdaSumH_size = 0;
00486   dEdaSumH = NULL;
00487 
00488   if ( master != this ) { // I am slave
00489     masterPe = master->masterPe;
00490     master->slaves[slaveIndex] = this;
00491     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00492       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00493     }
00494     deviceID = master->deviceID;
00495     registerPatches();
00496     return;
00497   }
00498   masterPe = CkMyPe();
00499 
00500   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00501   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00502     deviceCUDA->setMergeGrids(1);
00503 
00504   // Sanity checks for New CUDA kernel
00505   if (params->GBISOn) {
00506     // GBIS
00507     if (deviceCUDA->getNoMergeGrids()) {
00508       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00509     }
00510     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00511     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00512     // Final sanity check
00513     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00514   } else {
00515     // non-GBIS
00516     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00517       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00518     }
00519   }
00520 
00521 #if CUDA_VERSION >= 5050
00522   int leastPriority, greatestPriority;
00523   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00524   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00525   if ( leastPriority != greatestPriority ) {
00526     if ( CkMyNode() == 0 ) {
00527       int dev = deviceCUDA->getDeviceID();
00528       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00529     }
00530     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00531       greatestPriority = leastPriority;
00532     }
00533     if (params->usePMECUDA) greatestPriority = leastPriority;
00534     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00535     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00536   } else
00537 #endif
00538   {
00539     cudaStreamCreate(&stream);
00540     cuda_errcheck("cudaStreamCreate");
00541     int dev = deviceCUDA->getDeviceID();
00542     cudaDeviceProp deviceProp;
00543     cudaGetDeviceProperties(&deviceProp, dev);
00544     cuda_errcheck("cudaGetDeviceProperties");
00545     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00546       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00547       cudaStreamCreate(&stream2);
00548     } else {
00549       stream2 = stream;
00550     }
00551   }
00552   cuda_errcheck("cudaStreamCreate");
00553 
00554   // Get GPU device ID
00555   deviceID = deviceCUDA->getDeviceID();
00556 
00557   cuda_init();
00558   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00559   build_exclusions();
00560   // cudaEventCreate(&start_upload);
00561   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00562   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00563   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00564 
00565   patchRecords.resize(patchMap->numPatches());
00566   patch_pairs_ptr = new ResizeArray<patch_pair>;
00567   patch_pair_num_ptr = new ResizeArray<int>;
00568 
00569   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00570 }
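Two streams are set up by the constructor: judging from the end_remote_download and end_local_download events and the polling functions later in this file, stream appears to carry the remote pass and stream2 the local pass so the two halves of the nonbonded work can overlap; on devices without concurrent-kernel support the code falls back to a single shared stream. With CUDA 5.5 or newer the streams also get priorities, and the remote stream is dropped to the lowest priority when PME work shares the device (PMEOffload without usePMECUDA, or usePMECUDA), presumably to leave room for the PME kernels.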
00571 
00572 
00573 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00574 
00575 void ComputeNonbondedCUDA::requirePatch(int pid) {
00576 
00577   computesChanged = 1;
00578   patch_record &pr = patchRecords[pid];
00579   if ( pr.refCount == 0 ) {
00580     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00581     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00582     if ( deviceCUDA->getMergeGrids() ) {
00583       pr.isLocal = 0;
00584     } else if ( CkNumNodes() < 2 ) {
00585       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00586          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00587     } else {
00588       pr.isLocal = pr.isSameNode;
00589     }
00590     if ( pr.isLocal ) {
00591       localActivePatches.add(pid);
00592     } else {
00593       remoteActivePatches.add(pid);
00594     }
00595     activePatches.add(pid);
00596     pr.patchID = pid;
00597     pr.hostPe = -1;
00598     pr.x = NULL;
00599     pr.xExt = NULL;
00600     pr.r = NULL;
00601     pr.f = NULL;
00602     pr.intRad      = NULL;
00603     pr.psiSum      = NULL;
00604     pr.bornRad     = NULL;
00605     pr.dEdaSum     = NULL;
00606     pr.dHdrPrefix  = NULL;
00607   }
00608   pr.refCount += 1;
00609 }
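requirePatch also fixes the local/remote classification used throughout this file: with merged grids every patch is treated as remote; in a single-node run patches are checkerboarded by index parity, i.e. the bit test above is equivalent to

  // pr.isLocal = ( ( patchMap->index_a(pid) + patchMap->index_b(pid)
  //                  + patchMap->index_c(pid) ) % 2 == 0 );

and in the multi-node case a patch is local exactly when it is homed on this Charm++ node. The resulting localActivePatches / remoteActivePatches lists evidently drive the split of work into the remote and local kernel passes.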
00610 
00611 void ComputeNonbondedCUDA::registerPatches() {
00612 
00613   SimParameters *simParams = Node::Object()->simParameters;
00614   int npatches = master->activePatches.size();
00615   int *pids = master->activePatches.begin();
00616   patch_record *recs = master->patchRecords.begin();
00617   for ( int i=0; i<npatches; ++i ) {
00618     int pid = pids[i];
00619     patch_record &pr = recs[pid];
00620     if ( pr.hostPe == CkMyPe() ) {
00621       pr.slave = this;
00622       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00623       hostedPatches.add(pid);
00624       if ( pr.isLocal ) {
00625         localHostedPatches.add(pid);
00626       } else {
00627         remoteHostedPatches.add(pid);
00628       }
00629       ProxyMgr::Object()->createProxy(pid);
00630       pr.p = patchMap->patch(pid);
00631       pr.positionBox = pr.p->registerPositionPickup(this);
00632       pr.forceBox = pr.p->registerForceDeposit(this);
00633       if (simParams->GBISOn) {
00634         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00635         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00636         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00637         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00638         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00639       }
00640     }
00641   }
00642   if ( master == this ) setNumPatches(activePatches.size());
00643   else setNumPatches(hostedPatches.size());
00644   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00645   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00646 }
00647 
00648 struct pid_sortop_reverse_priority {
00649   bool operator() (int pidj, int pidi) {  // i and j reversed
00650     int ppi = PATCH_PRIORITY(pidi);
00651     int ppj = PATCH_PRIORITY(pidj);
00652     if ( ppi != ppj ) return ppi < ppj;
00653     return pidi < pidj;
00654   }
00655 };
00656 
00657 void ComputeNonbondedCUDA::assignPatches() {
00658 
00659   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00660   int numPesOnNodeSharingDevice = 0;
00661   int masterIndex = -1;
00662   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00663     int pe = deviceCUDA->getPesSharingDevice(i);
00664     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00665     if ( CkNodeOf(pe) == CkMyNode() ) {
00666       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00667     }
00668   }
00669 
00670   int npatches = activePatches.size();
00671 
00672   if ( npatches ) {
00673     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00674   }
00675 
00676   // calculate priority rank of local home patch within pe
00677   {
00678     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
00679     for ( int i=0; i<npatches; ++i ) {
00680       int pid = activePatches[i];
00681       int homePe = patchMap->node(pid);
00682       if ( CkNodeOf(homePe) == CkMyNode() ) {
00683         homePatchByRank[CkRankOf(homePe)].add(pid);
00684       }
00685     }
00686     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00687       pid_sortop_reverse_priority so;
00688       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00689       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00690       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00691         int pid = homePatchByRank[i][j];
00692         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
00693       }
00694     }
00695   }
00696 
00697   int *count = new int[npatches];
00698   memset(count, 0, sizeof(int)*npatches);
00699   int *pcount = new int[numPesOnNodeSharingDevice];
00700   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00701   int *rankpcount = new int[CkMyNodeSize()];
00702   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00703   char *table = new char[npatches*numPesOnNodeSharingDevice];
00704   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00705 
00706   int unassignedpatches = npatches;
00707 
00708   if ( 0 ) { // assign all to device pe
00709     for ( int i=0; i<npatches; ++i ) {
00710       int pid = activePatches[i];
00711       patch_record &pr = patchRecords[pid];
00712       pr.hostPe = CkMyPe();
00713     }
00714     unassignedpatches = 0;
00715     pcount[masterIndex] = npatches;
00716   } else 
00717 
00718   // assign if home pe and build table of natural proxies
00719   for ( int i=0; i<npatches; ++i ) {
00720     int pid = activePatches[i];
00721     patch_record &pr = patchRecords[pid];
00722     int homePe = patchMap->node(pid);
00723     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00724       int pe = pesOnNodeSharingDevice[j];
00725       if ( pe == homePe ) {
00726         pr.hostPe = pe;  --unassignedpatches;
00727         pcount[j] += 1;
00728       }
00729       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00730         table[i*numPesOnNodeSharingDevice+j] = 1;
00731       }
00732     }
00733     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00734       pr.hostPe = homePe;  --unassignedpatches;
00735       rankpcount[CkRankOf(homePe)] += 1;
00736     }
00737   }
00738   // assign if only one pe has a required proxy
00739   int assignj = 0;
00740   for ( int i=0; i<npatches; ++i ) {
00741     int pid = activePatches[i];
00742     patch_record &pr = patchRecords[pid];
00743     if ( pr.hostPe != -1 ) continue;
00744     int c = 0;
00745     int lastj;
00746     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00747       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00748     }
00749     count[i] = c;
00750     if ( c == 1 ) {
00751       pr.hostPe = pesOnNodeSharingDevice[lastj];
00752       --unassignedpatches;
00753       pcount[lastj] += 1;
00754     }
00755   }
00756   while ( unassignedpatches ) {
00757     int i;
00758     for ( i=0; i<npatches; ++i ) {
00759       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00760       int pid = activePatches[i];
00761       patch_record &pr = patchRecords[pid];
00762       if ( pr.hostPe != -1 ) continue;
00763       pr.hostPe = pesOnNodeSharingDevice[assignj];
00764       --unassignedpatches;
00765       pcount[assignj] += 1;
00766       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00767       break;
00768     }
00769     if ( i<npatches ) continue;  // start search again
00770     for ( i=0; i<npatches; ++i ) {
00771       int pid = activePatches[i];
00772       patch_record &pr = patchRecords[pid];
00773       if ( pr.hostPe != -1 ) continue;
00774       if ( count[i] ) continue;
00775       pr.hostPe = pesOnNodeSharingDevice[assignj];
00776       --unassignedpatches;
00777       pcount[assignj] += 1;
00778       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00779       break;
00780     }
00781     if ( i<npatches ) continue;  // start search again
00782     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00783   }
00784 
00785   for ( int i=0; i<npatches; ++i ) {
00786     int pid = activePatches[i];
00787     patch_record &pr = patchRecords[pid];
00788     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00789   }
00790 
00791   slavePes = new int[CkMyNodeSize()];
00792   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00793   numSlaves = 0;
00794   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00795     int pe = pesOnNodeSharingDevice[j];
00796     int rank = pe - CkNodeFirst(CkMyNode());
00797     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00798     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00799     if ( pe == CkMyPe() ) continue;
00800     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00801     rankpcount[rank] = 0;  // skip in rank loop below
00802     slavePes[numSlaves] = pe;
00803     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00804     ++numSlaves;
00805   }
00806   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00807     int pe = CkNodeFirst(CkMyNode()) + j;
00808     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00809     //          CkMyPe(),j,pe,rankpcount[j]);
00810     if ( ! rankpcount[j] ) continue;
00811     if ( pe == CkMyPe() ) continue;
00812     slavePes[numSlaves] = pe;
00813     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00814     ++numSlaves;
00815   }
00816   registerPatches();
00817 
00818   delete [] pesOnNodeSharingDevice;
00819   delete [] count;
00820   delete [] pcount;
00821   delete [] rankpcount;
00822   delete [] table;
00823 }
00824 
00825 static __thread int atom_params_size;
00826 static __thread atom_param* atom_params;
00827 
00828 static __thread int vdw_types_size;
00829 static __thread int* vdw_types;
00830 
00831 static __thread int dummy_size;
00832 static __thread float* dummy_dev;
00833 
00834 static __thread int force_ready_queue_size;
00835 static __thread int *force_ready_queue;
00836 static __thread int force_ready_queue_len;
00837 static __thread int force_ready_queue_next;
00838 
00839 static __thread int block_order_size;
00840 static __thread int *block_order;
00841 
00842 static __thread int num_atoms;
00843 static __thread int num_local_atoms;
00844 static __thread int num_remote_atoms;
00845 
00846 static __thread int virials_size;
00847 static __thread float *virials;
00848 static __thread int num_virials;
00849 // NOTE: slow_virials is a computed pointer from "virials" - do not deallocate
00850 static __thread float *slow_virials;
00851 
00852 static __thread int energy_gbis_size;
00853 static __thread float *energy_gbis;
00854 
00855 //GBIS host pointers
00856 static __thread int intRad0H_size;
00857 static __thread float *intRad0H;
00858 static __thread int intRadSH_size;
00859 static __thread float *intRadSH;
00860 static __thread int bornRadH_size;
00861 static __thread float *bornRadH;
00862 static __thread int dHdrPrefixH_size;
00863 static __thread float *dHdrPrefixH;
00864 
00865 static __thread int cuda_timer_count;
00866 static __thread double cuda_timer_total;
00867 static __thread double kernel_time;
00868 static __thread double remote_submit_time;
00869 static __thread double local_submit_time;
00870 
00871 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
00872 
00873 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime);  // fix Charm++
00874 
00875 #ifdef PRINT_GBIS
00876 #define GBISP(...) CkPrintf(__VA_ARGS__);
00877 #else
00878 #define GBISP(...)
00879 #endif
00880 
00881 #define count_limit 1000000
00882 static __thread int check_count;
00883 static __thread int check_remote_count;
00884 static __thread int check_local_count;
00885 
00886 void init_arrays() {
00887 
00888   atom_params_size = 0;
00889   atom_params = NULL;
00890 
00891   vdw_types_size = 0;
00892   vdw_types = NULL;
00893   
00894   dummy_size = 0;
00895   dummy_dev = NULL;
00896 
00897   force_ready_queue_size = 0;
00898   force_ready_queue = NULL;
00899   force_ready_queue_len = 0;
00900   force_ready_queue_next = 0;
00901   
00902   block_order_size = 0;
00903   block_order = NULL;
00904   
00905   num_atoms = 0;
00906   num_local_atoms = 0;
00907   num_remote_atoms = 0;
00908 
00909   virials_size = 0;
00910   virials = NULL;
00911   num_virials = 0;
00912 
00913   energy_gbis_size = 0;
00914   energy_gbis = NULL;
00915 
00916   intRad0H_size = 0;
00917   intRad0H = NULL;
00918   intRadSH_size = 0;
00919   intRadSH = NULL;
00920   bornRadH_size = 0;
00921   bornRadH = NULL;
00922   dHdrPrefixH_size = 0;
00923   dHdrPrefixH = NULL;
00924 
00925 }
00926 
00927 void cuda_check_progress(void *arg, double walltime) {
00928   CUDA_TRACE_POLL_REMOTE;
00929 
00930   int flindex;
00931   int poll_again = 1;
00932   while ( -1 != (flindex = force_ready_queue[force_ready_queue_next]) ) {
00933     //    CkPrintf("Pe %d forces ready %d is index %d at %lf\n",
00934     //       CkMyPe(), force_ready_queue_next, flindex, walltime);
00935     force_ready_queue[force_ready_queue_next] = -1;
00936     ++force_ready_queue_next;
00937     check_count = 0;
00938     if ( force_ready_queue_next == force_ready_queue_len ) {
00939       poll_again = 0;
00940       CUDA_TRACE_LOCAL(kernel_time,walltime);
00941       kernel_time = walltime - kernel_time;
00942       // need to guarantee this finishes before the last patch message!
00943       ((ComputeNonbondedCUDA *) arg)->workStarted = 0;
00944       ((ComputeNonbondedCUDA *) arg)->finishReductions();
00945     }
00946     ((ComputeNonbondedCUDA *) arg)->messageFinishPatch(flindex);
00947     if ( force_ready_queue_next == force_ready_queue_len ) break;
00948   }
00949   if ( ++check_count >= count_limit ) {
00950     char errmsg[256];
00951     sprintf(errmsg,"cuda_check_progress polled %d times over %f s on step %d",
00952             check_count, walltime - remote_submit_time,
00953             ((ComputeNonbondedCUDA *) arg)->step);
00954     cudaDie(errmsg,cudaSuccess);
00955   }
00956   if ( poll_again ) {
00957     CcdCallBacksReset(0,walltime);  // fix Charm++
00958     CUDA_POLL(cuda_check_progress, arg);
00959   }
00960 }
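Completion detection here works by polling rather than by stream callbacks (editorial note): force_ready_queue is a pinned, host-mapped int array (allocated with cudaHostAllocMapped in doWork) into which the device apparently writes each patch's index as that patch's forces finish. This callback drains the queue, sends messageFinishPatch for every ready patch, and re-arms itself through CUDA_POLL (CcdCallFnAfter with a short delay) until all force_ready_queue_len entries have arrived; check_count against count_limit aborts the run if nothing arrives after a million polls, which usually points to a hung or failed kernel.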
00961 
00962 void cuda_check_remote_progress(void *arg, double walltime) {
00963 
00964   CUDA_TRACE_POLL_REMOTE;
00965   cudaError_t err = cudaEventQuery(end_remote_download);
00966   if ( err == cudaSuccess ) {
00967     local_submit_time = walltime;
00968     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
00969     if ( deviceCUDA->getMergeGrids() ) {  // no local
00970       kernel_time = local_submit_time - kernel_time;
00971     }
00972     check_remote_count = 0;
00973     cuda_errcheck("at cuda remote stream completed");
00974     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
00975   } else if ( err != cudaErrorNotReady ) {
00976     char errmsg[256];
00977     sprintf(errmsg,"in cuda_check_remote_progress after polling %d times over %f s on step %d",
00978             check_remote_count, walltime - remote_submit_time,
00979             ((ComputeNonbondedCUDA *) arg)->step);
00980     cudaDie(errmsg,err);
00981   } else if ( ++check_remote_count >= count_limit ) {
00982     char errmsg[256];
00983     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
00984             check_remote_count, walltime - remote_submit_time,
00985             ((ComputeNonbondedCUDA *) arg)->step);
00986     cudaDie(errmsg,err);
00987   } else if ( check_local_count ) {
00988     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
00989   } else {
00990     CcdCallBacksReset(0,walltime);  // fix Charm++
00991     CUDA_POLL(cuda_check_remote_progress, arg);
00992   }
00993 }
00994 
00995 void cuda_check_local_progress(void *arg, double walltime) {
00996 
00997   CUDA_TRACE_POLL_LOCAL;
00998   cudaError_t err = cudaEventQuery(end_local_download);
00999   if ( err == cudaSuccess ) {
01000     CUDA_TRACE_LOCAL(local_submit_time,walltime);
01001     kernel_time = walltime - kernel_time;
01002     check_local_count = 0;
01003     cuda_errcheck("at cuda local stream completed");
01004     WorkDistrib::messageFinishCUDA((ComputeNonbondedCUDA *) arg);
01005   } else if ( err != cudaErrorNotReady ) {
01006     char errmsg[256];
01007     sprintf(errmsg,"in cuda_check_local_progress after polling %d times over %f s on step %d",
01008             check_local_count, walltime - local_submit_time,
01009             ((ComputeNonbondedCUDA *) arg)->step);
01010     cudaDie(errmsg,err);
01011   } else if ( ++check_local_count >= count_limit ) {
01012     char errmsg[256];
01013     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01014             check_local_count, walltime - local_submit_time,
01015             ((ComputeNonbondedCUDA *) arg)->step);
01016     cudaDie(errmsg,err);
01017   } else if ( check_remote_count ) {
01018     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01019   } else {
01020     CcdCallBacksReset(0,walltime);  // fix Charm++
01021     CUDA_POLL(cuda_check_local_progress, arg);
01022   }
01023 }
01024 
01025 #if 0
01026 // don't use this one unless timer is part of stream, above is better
01027 void cuda_check_progress(void *arg, double walltime) {
01028   if ( cuda_stream_finished() ) {
01029     kernel_time = walltime - kernel_time;
01030     CcdCallBacksReset(0,walltime);  // fix Charm++
01031     CUDA_POLL(ccd_index);
01032     // ((ComputeNonbondedCUDA *) arg)->finishWork();
01033     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
01034   }
01035 }
01036 #endif
01037 
01038 void ComputeNonbondedCUDA::atomUpdate() {
01039   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01040   atomsChanged = 1;
01041 }
01042 
01043 static __thread int kernel_launch_state = 0;
01044 
01045 struct cr_sortop_distance {
01046   const Lattice &l;
01047   cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
01048   bool operator() (ComputeNonbondedCUDA::compute_record i,
01049                         ComputeNonbondedCUDA::compute_record j) {
01050     Vector a = l.a();
01051     Vector b = l.b();
01052     Vector c = l.c();
01053     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01054     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01055     return ( ri < rj );
01056   }
01057 };
01058 
01059 struct cr_sortop_reverse_priority {
01060   cr_sortop_distance &distop;
01061   const ComputeNonbondedCUDA::patch_record *pr;
01062   cr_sortop_reverse_priority(cr_sortop_distance &sod,
01063        const ComputeNonbondedCUDA::patch_record *patchrecs) : distop(sod), pr(patchrecs) { }
01064   bool pid_compare_priority(int pidi, int pidj) {
01065     const ComputeNonbondedCUDA::patch_record &pri = pr[pidi];
01066     const ComputeNonbondedCUDA::patch_record &prj = pr[pidj];
01067     if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
01068     if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
01069     if ( pri.isSameNode && ! prj.isSameNode ) return 0;
01070     if ( prj.isSameNode && ! pri.isSameNode ) return 1;
01071     if ( pri.isSameNode ) {  // and prj.isSameNode
01072       int rpri = pri.reversePriorityRankInPe;
01073       int rprj = prj.reversePriorityRankInPe;
01074       if ( rpri != rprj ) return rpri > rprj;
01075       return sortop_bitreverse(CkRankOf(pri.hostPe),CkRankOf(prj.hostPe));
01076     }
01077     int ppi = PATCH_PRIORITY(pidi);
01078     int ppj = PATCH_PRIORITY(pidj);
01079     if ( ppi != ppj ) return ppi < ppj;
01080     return pidi < pidj;
01081   }
01082   bool operator() (ComputeNonbondedCUDA::compute_record j,
01083                         ComputeNonbondedCUDA::compute_record i) {  // i and j reversed
01084     int pidi = pid_compare_priority(i.pid[0],i.pid[1]) ? i.pid[0] : i.pid[1];
01085     int pidj = pid_compare_priority(j.pid[0],j.pid[1]) ? j.pid[0] : j.pid[1];
01086     if ( pidi != pidj ) return pid_compare_priority(pidi, pidj);
01087     return distop(i,j);
01088   }
01089 };
01090 
01091 void ComputeNonbondedCUDA::skip() {
01092   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01093   SimParameters *simParams = Node::Object()->simParameters;
01094   for ( int i=0; i<hostedPatches.size(); ++i ) {
01095     patch_record &pr = master->patchRecords[hostedPatches[i]];
01096     pr.positionBox->skip();
01097     pr.forceBox->skip();
01098     if (simParams->GBISOn) {
01099       pr.intRadBox->skip();
01100       pr.psiSumBox->skip();
01101       pr.bornRadBox->skip();
01102       pr.dEdaSumBox->skip();
01103       pr.dHdrPrefixBox->skip();
01104     }
01105   }
01106 }
01107 
01108 int ComputeNonbondedCUDA::noWork() {
01109 
01110   SimParameters *simParams = Node::Object()->simParameters;
01111   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01112   lattice = flags.lattice;
01113   doSlow = flags.doFullElectrostatics;
01114   doEnergy = flags.doEnergy;
01115   step = flags.step;
01116 
01117   if ( ! flags.doNonbonded ) {
01118     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01119     if ( master != this ) {
01120       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01121                         hostedPatches.size(),atomsChanged,sequence());
01122     } else {
01123       for ( int i = 0; i < numSlaves; ++i ) {
01124         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01125       }
01126       skip();
01127     }
01128     if ( reduction ) {
01129        reduction->submit();
01130     }
01131 
01132     return 1;
01133   }
01134 
01135   for ( int i=0; i<hostedPatches.size(); ++i ) {
01136     patch_record &pr = master->patchRecords[hostedPatches[i]];
01137     if (!simParams->GBISOn || gbisPhase == 1) {
01138       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01139       pr.x = pr.positionBox->open();
01140       pr.xExt = pr.p->getCompAtomExtInfo();
01141     }
01142 
01143     if (simParams->GBISOn) {
01144       if (gbisPhase == 1) {
01145         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01146         pr.intRad     = pr.intRadBox->open();
01147         pr.psiSum     = pr.psiSumBox->open();
01148       } else if (gbisPhase == 2) {
01149         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01150         pr.bornRad    = pr.bornRadBox->open();
01151         pr.dEdaSum    = pr.dEdaSumBox->open();
01152       } else if (gbisPhase == 3) {
01153         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01154         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01155       }
01156       GBISP("opened GBIS boxes");
01157     }
01158   }
01159 
01160   if ( master == this ) return 0; //work to do, enqueue as usual
01161 
01162   // message masterPe
01163   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01164                         hostedPatches.size(),atomsChanged,sequence());
01165   atomsChanged = 0;
01166 
01167   workStarted = 1;
01168 
01169   return 1;
01170 }
01171 
01172 void ComputeNonbondedCUDA::doWork() {
01173 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01174 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01175 
01176   // Set GPU device ID
01177   cudaCheck(cudaSetDevice(deviceID));
01178 
01179   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01180   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01181 
01182   if ( workStarted ) { //if work already started, check if finished
01183     if ( finishWork() ) {  // finished
01184       workStarted = 0;
01185       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01186     } else {  // need to call again
01187       workStarted = 2;
01188       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01189       if ( master == this && kernel_launch_state > 2 ) {
01190         cuda_check_local_progress(this,0.);  // launches polling
01191       }
01192     }
01193     return;
01194   }
01195 
01196   workStarted = 1;
01197   basePriority = COMPUTE_PROXY_PRIORITY;
01198 
01199   Molecule *mol = Node::Object()->molecule;
01200   Parameters *params = Node::Object()->parameters;
01201   SimParameters *simParams = Node::Object()->simParameters;
01202 
01203   //execute only during GBIS phase 1, or if not using GBIS
01204   if (!simParams->GBISOn || gbisPhase == 1) {
01205 
01206     //bind new patches to GPU
01207     if ( atomsChanged || computesChanged ) {
01208       int npatches = activePatches.size();
01209 
01210       pairlistsValid = 0;
01211       pairlistTolerance = 0.;
01212 
01213       if ( computesChanged ) {
01214         computesChanged = 0;
01215 
01216         if ( ! dummy_size ) {
01217           dummy_size = 1024*1024;
01218           cudaMalloc((void**)&dummy_dev,dummy_size);
01219           cuda_errcheck("in cudaMalloc dummy_dev");
01220         }
01221 
01222         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01223           1.2f, cudaHostAllocMapped);
01224         if (did_realloc) {
01225           for (int k=0; k < force_ready_queue_size; ++k)
01226             force_ready_queue[k] = -1;
01227         }
01228         force_ready_queue_len = npatches;
01229         reallocate_host<int>(&block_order, &block_order_size,
01230           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01231           1.2f, cudaHostAllocMapped);
01232     
01233         num_virials = npatches;
01234         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01235           1.2f, cudaHostAllocMapped);
01236         slow_virials = virials + 16*num_virials;
01237 
01238         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01239           1.2f, cudaHostAllocMapped);
01240         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01241 
01242         int *ap = activePatches.begin();
01243         for ( int i=0; i<localActivePatches.size(); ++i ) {
01244           *(ap++) = localActivePatches[i];
01245         }
01246         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01247           *(ap++) = remoteActivePatches[i];
01248         }
01249 
01250         // sort computes by distance between patches
01251         cr_sortop_distance so(lattice);
01252         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01253         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01254 
01255         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01256 
01257         if ( streaming ) {
01258           // iout << "Reverse-priority sorting...\n" << endi;
01259           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01260           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01261           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01262           patchPairsReordered = 0;
01263           //patchPairsReordered = 1;
01264       // int len = remoteComputeRecords.size();
01265       // for ( int i=0; i<len; ++i ) {
01266       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01267       // }
01268       // int len2 = localComputeRecords.size();
01269       // for ( int i=0; i<len2; ++i ) {
01270       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01271       // }
01272       // iout << endi;
01273         } else {
01274           patchPairsReordered = 1;
01275         }
01276  
01277         int nlc = localComputeRecords.size();
01278         int nrc = remoteComputeRecords.size();
01279         computeRecords.resize(nlc+nrc);
01280         compute_record *cr = computeRecords.begin();
01281         for ( int i=0; i<nrc; ++i ) {
01282           *(cr++) = remoteComputeRecords[i];
01283         }
01284         for ( int i=0; i<nlc; ++i ) {
01285           *(cr++) = localComputeRecords[i];
01286         }
01287 
01288         // patch_pair_num[i] = number of patch pairs that involve patch i
01289         patch_pair_num.resize(npatches);
01290         for ( int i=0; i<npatches; ++i ) {
01291           patchRecords[activePatches[i]].localIndex = i;
01292           patch_pair_num[i] = 0;
01293         }
01294 
01295         int ncomputes = computeRecords.size();
01296         patch_pairs.resize(ncomputes);
01297         for ( int i=0; i<ncomputes; ++i ) {
01298           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01299           int lp1 = patchRecords[cr.pid[0]].localIndex;
01300           int lp2 = patchRecords[cr.pid[1]].localIndex;
01301           patch_pair_num[lp1]++;
01302           if (lp1 != lp2) patch_pair_num[lp2]++;
01303           patch_pair &pp = patch_pairs[i];
01304           pp.offset.x = cr.offset.x;
01305           pp.offset.y = cr.offset.y;
01306           pp.offset.z = cr.offset.z;
01307         }
01308 
01309         for ( int i=0; i<ncomputes; ++i ) {
01310           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01311           int lp1 = patchRecords[cr.pid[0]].localIndex;
01312           int lp2 = patchRecords[cr.pid[1]].localIndex;
01313           patch_pair &pp = patch_pairs[i];
01314           pp.patch1_ind = lp1;
01315           pp.patch2_ind = lp2;
01316           pp.patch1_num_pairs = patch_pair_num[lp1];
01317           pp.patch2_num_pairs = patch_pair_num[lp2];
01318         }
01319 
01320       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01321         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01322                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01323                  localComputeRecords.size(), remoteComputeRecords.size());
01324       }
01325     }  // computesChanged
01326     else if ( ! patchPairsReordered ) {
01327       patchPairsReordered = 1;
01328       int len = patch_pairs.size();
01329       int nlc = localComputeRecords.size();
01330       int nrc = remoteComputeRecords.size();
01331       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01332       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01333       ResizeArray<patch_pair> new_patch_pairs(len);
01334       int irc=nrc;
01335       int ilc=len;
01336       for ( int i=0; i<len; ++i ) {
01337         int boi = block_order[i];
01338         int dest;
01339         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01340         new_computeRecords[dest] = computeRecords[boi];
01341         new_patch_pairs[dest] = patch_pairs[boi];
01342       }
01343       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01344       computeRecords.swap(new_computeRecords);
01345       patch_pairs.swap(new_patch_pairs);
01346     }
01347 
01348     int istart = 0;
01349     int i;
01350     for ( i=0; i<npatches; ++i ) {
01351       if ( i == localActivePatches.size() ) {
01352         num_local_atoms = istart;
01353       }
01354       patch_record &pr = patchRecords[activePatches[i]];
01355       pr.localStart = istart;
01356       int natoms = pr.p->getNumAtoms();
01357       int nfreeatoms = natoms;
01358       if ( fixedAtomsOn ) {
01359         const CompAtomExt *aExt = pr.xExt;
01360         for ( int j=0; j<natoms; ++j ) {
01361           if ( aExt[j].atomFixed ) --nfreeatoms;
01362         }
01363       }
01364       pr.numAtoms = natoms;
01365       pr.numFreeAtoms = nfreeatoms;
01366       istart += natoms;
01367       istart += 16 - (natoms & 15);
01368     }
01369     if ( i == localActivePatches.size() ) {
01370       num_local_atoms = istart;
01371     }
01372     num_atoms = istart;
01373     num_remote_atoms = num_atoms - num_local_atoms;
01374     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01375     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01376     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01377     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01378     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01379     if (simParams->GBISOn) {
01380       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01381       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01382       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01383       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01384       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01385       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01386     }
01387 
01388     int bfstart = 0;
01389     int exclmask_start = 0;
01390     int ncomputes = computeRecords.size();
01391     for ( int i=0; i<ncomputes; ++i ) {
01392       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01393       int p1 = cr.pid[0];
01394       int p2 = cr.pid[1];
01395       patch_pair &pp = patch_pairs[i];
01396       pp.patch1_start = patchRecords[p1].localStart;
01397       pp.patch1_size  = patchRecords[p1].numAtoms;
01398       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01399       pp.patch2_start = patchRecords[p2].localStart;
01400       pp.patch2_size  = patchRecords[p2].numAtoms;
01401       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01402       pp.plist_start = bfstart;
00403       // size1*size2 = number of warp-sized tile pairs within this patch pair
01404       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01405       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01406       pp.plist_size = (size1*size2-1)/32+1;
01407       bfstart += pp.plist_size;
01408       pp.exclmask_start = exclmask_start;
01409       exclmask_start += size1*size2;
01410     } //for ncomputes
01411 
01412     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01413       activePatches.size(), num_atoms, bfstart, 
01414       exclmask_start);
01415 
01416   }  // atomsChanged || computesChanged
01417 
01418   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01419 
01420   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01421   float maxAtomMovement = 0.;
01422   float maxPatchTolerance = 0.;
01423 
01424   for ( int i=0; i<activePatches.size(); ++i ) {
01425     patch_record &pr = patchRecords[activePatches[i]];
01426 
01427     float maxMove = pr.p->flags.maxAtomMovement;
01428     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01429 
01430     float maxTol = pr.p->flags.pairlistTolerance;
01431     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01432 
01433     int start = pr.localStart;
01434     int n = pr.numAtoms;
01435     const CompAtom *a = pr.x;
01436     const CompAtomExt *aExt = pr.xExt;
01437     if ( atomsChanged ) {
01438 
01439       atom_param *ap = atom_params + start;
01440       for ( int k=0; k<n; ++k ) {
01441         int j = aExt[k].sortOrder;
01442         ap[k].vdw_type = a[j].vdwType;
01443         vdw_types[start + k] = a[j].vdwType;
01444         ap[k].index = aExt[j].id;
01445 #ifdef MEM_OPT_VERSION
01446         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01447         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01448 #else // ! MEM_OPT_VERSION
01449         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01450         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01451 #endif // MEM_OPT_VERSION
01452       }
01453     }
01454     {
01455 #if 1
01456       const CudaAtom *ac = pr.p->getCudaAtomList();
01457       CudaAtom *ap = atoms + start;
01458       memcpy(ap, ac, sizeof(atom)*n);
01459 #else
01460       Vector center =
01461         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01462       atom *ap = atoms + start;
01463       for ( int k=0; k<n; ++k ) {
01464         int j = aExt[k].sortOrder;
01465         ap[k].position.x = a[j].position.x - center.x;
01466         ap[k].position.y = a[j].position.y - center.y;
01467         ap[k].position.z = a[j].position.z - center.z;
01468         ap[k].charge = charge_scaling * a[j].charge;
01469       }
01470 #endif
01471     }
01472   }
01473 
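        // Decide whether pairlists are rebuilt or reused this step.  A saved
        // list remains valid while 2 * maxAtomMovement stays within the
        // tolerance it was built with; when saving, the list cutoff is
        // extended by that tolerance.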
01474   savePairlists = 0;
01475   usePairlists = 0;
01476   if ( flags.savePairlists ) {
01477     savePairlists = 1;
01478     usePairlists = 1;
01479   } else if ( flags.usePairlists ) {
01480     if ( ! pairlistsValid ||
01481          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01482       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01483     } else {
01484       usePairlists = 1;
01485     }
01486   }
01487   if ( ! usePairlists ) {
01488     pairlistsValid = 0;
01489   }
01490   float plcutoff = cutoff;
01491   if ( savePairlists ) {
01492     pairlistsValid = 1;
01493     pairlistTolerance = 2. * maxPatchTolerance;
01494     plcutoff += pairlistTolerance;
01495   }
01496   plcutoff2 = plcutoff * plcutoff;
01497 
01498   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01499   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01500 
01501 #if 0
01502   // calculate warp divergence
01503   if ( 1 ) {
01504     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01505     Lattice &lattice = flags.lattice;
01506     float3 lata, latb, latc;
01507     lata.x = lattice.a().x;
01508     lata.y = lattice.a().y;
01509     lata.z = lattice.a().z;
01510     latb.x = lattice.b().x;
01511     latb.y = lattice.b().y;
01512     latb.z = lattice.b().z;
01513     latc.x = lattice.c().x;
01514     latc.y = lattice.c().y;
01515     latc.z = lattice.c().z;
01516 
01517     int ncomputes = computeRecords.size();
01518     for ( int ic=0; ic<ncomputes; ++ic ) {
01519       patch_pair &pp = patch_pairs[ic];
01520       atom *a1 = atoms + pp.patch1_atom_start;
01521       int n1 = pp.patch1_size;
01522       atom *a2 = atoms + pp.patch2_atom_start;
01523       int n2 = pp.patch2_size;
01524       float offx = pp.offset.x * lata.x
01525                + pp.offset.y * latb.x
01526                + pp.offset.z * latc.x;
01527       float offy = pp.offset.x * lata.y
01528                + pp.offset.y * latb.y
01529                + pp.offset.z * latc.y;
01530       float offz = pp.offset.x * lata.z
01531                + pp.offset.y * latb.z
01532                + pp.offset.z * latc.z;
01533       // CkPrintf("%f %f %f\n", offx, offy, offz);
01534       int atoms_tried = 0;
01535       int blocks_tried = 0;
01536       int atoms_used = 0;
01537       int blocks_used = 0;
01538       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01539         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01540           int block_used = 0;
01541           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01542             int atom_used = 0;
01543             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01544               float dx = offx + a1[i].position.x - a2[j].position.x;
01545               float dy = offy + a1[i].position.y - a2[j].position.y;
01546               float dz = offz + a1[i].position.z - a2[j].position.z;
01547               float r2 = dx*dx + dy*dy + dz*dz;
01548               if ( r2 < cutoff2 ) atom_used = 1;
01549             }
01550             ++atoms_tried;
01551             if ( atom_used ) { block_used = 1; ++atoms_used; }
01552           }
01553           ++blocks_tried;
01554           if ( block_used ) { ++blocks_used; }
01555         }
01556       }
01557       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01558                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01559                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01560     }
01561   }
01562 #endif
01563 
01564   } // !GBISOn || gbisPhase == 1
01565 
01566   //Do GBIS
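        // GBIS runs in three phases, each staging a different per-atom array
        // into host buffers in CUDA sort order: phase 1 intrinsic radii (only
        // when atoms changed), phase 2 Born radii, phase 3 dHdrPrefix values.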
01567   if (simParams->GBISOn) {
01568     //open GBIS boxes depending on phase
01569     for ( int i=0; i<activePatches.size(); ++i ) {
01570       patch_record &pr = master->patchRecords[activePatches[i]];
01571       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01572       if (gbisPhase == 1) {
01573         //Copy GBIS intRadius to Host
01574         if (atomsChanged) {
01575           float *intRad0 = intRad0H + pr.localStart;
01576           float *intRadS = intRadSH + pr.localStart;
01577           for ( int k=0; k<pr.numAtoms; ++k ) {
01578             int j = pr.xExt[k].sortOrder;
01579             intRad0[k] = pr.intRad[2*j+0];
01580             intRadS[k] = pr.intRad[2*j+1];
01581           }
01582         }
01583       } else if (gbisPhase == 2) {
01584         float *bornRad = bornRadH + pr.localStart;
01585         for ( int k=0; k<pr.numAtoms; ++k ) {
01586           int j = pr.xExt[k].sortOrder;
01587           bornRad[k] = pr.bornRad[j];
01588         }
01589       } else if (gbisPhase == 3) {
01590         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01591         for ( int k=0; k<pr.numAtoms; ++k ) {
01592           int j = pr.xExt[k].sortOrder;
01593           dHdrPrefix[k] = pr.dHdrPrefix[j];
01594         }
01595       } // end phases
01596     } // end for patches
01597   } // if GBISOn
01598 
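        // Start the kernel timer and arm the two-stage launch state machine;
        // recvYieldDevice(-1) launches immediately when this Pe currently owns
        // the GPU (or when no slow forces are requested), otherwise the launch
        // is deferred until another Pe sharing the device yields it.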
01599   kernel_time = CkWallTimer();
01600   kernel_launch_state = 1;
01601   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01602   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01603 }
01604 
01605 void cuda_check_remote_calc(void *arg, double walltime) {
01606   // in theory we only need end_remote_calc, but overlap isn't reliable
01607   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01608   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01609 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01610     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01611 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01612   } else {
01613     CcdCallBacksReset(0,walltime);  // fix Charm++
01614     CUDA_POLL(cuda_check_remote_calc, arg);
01615   }
01616 }
01617 
01618 void cuda_check_local_calc(void *arg, double walltime) {
01619   // in theory we only need end_local_calc, but overlap isn't reliable
01620   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01621   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01622 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01623     computeMgr->sendYieldDevice(deviceCUDA->getNextPeSharingGpu());
01624 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01625   } else {
01626     CcdCallBacksReset(0,walltime);  // fix Charm++
01627     CUDA_POLL(cuda_check_local_calc, arg);
01628   }
01629 }
01630 
01631 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01632 
01633 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
01634 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01635 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01636 workStarted, gbisPhase, kernel_launch_state, pe)
01637 
01638   float3 lata, latb, latc;
01639   lata.x = lattice.a().x;
01640   lata.y = lattice.a().y;
01641   lata.z = lattice.a().z;
01642   latb.x = lattice.b().x;
01643   latb.y = lattice.b().y;
01644   latb.z = lattice.b().z;
01645   latc.x = lattice.c().x;
01646   latc.y = lattice.c().y;
01647   latc.z = lattice.c().z;
01648   SimParameters *simParams = Node::Object()->simParameters;
01649 
01650   // Set GPU device ID
01651   cudaSetDevice(deviceID);
01652 
01653   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01654 
01655   double walltime;
01656   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01657     walltime = CkWallTimer();
01658     CcdCallBacksReset(0,walltime);  // fix Charm++
01659   }
01660 
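        // kernel_launch_state drives a two-stage launch: case 1 issues the
        // remote work on 'stream', case 2 the local work on 'stream2', and the
        // default case hands the GPU back.  Cases intentionally fall through
        // so both stages can be launched from a single call.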
01661   switch ( kernel_launch_state ) {
01663 // Remote
01664   case 1:
01665 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01666     ++kernel_launch_state;
01667     //gpu_is_mine = 0;
01668     deviceCUDA->setGpuIsMine(0);
01669     remote_submit_time = walltime;
01670 
01671     if (!simParams->GBISOn || gbisPhase == 1) {
01672       // cudaEventRecord(start_upload, stream);
01673       if ( atomsChanged ) {
01674         cuda_bind_atom_params(atom_params);
01675         cuda_bind_vdw_types(vdw_types);
01676       }
01677       if ( simParams->GBISOn) {
01678         cuda_bind_GBIS_psiSum(psiSumH);
01679         if ( atomsChanged ) {
01680           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01681         }
01682       }
01683       atomsChanged = 0;
01684       cuda_bind_atoms((const atom *)atoms);
01685       cuda_bind_forces(forces, slow_forces);
01686       cuda_bind_virials(virials, force_ready_queue, block_order);
01687       if ( simParams->GBISOn) {
01688         cuda_bind_GBIS_energy(energy_gbis);
01689       }
01690       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01691       //call CUDA Kernels
01692 
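            // remote pass: compute records [0, remoteComputeRecords.size())
            // are processed here; the local pass in case 2 handles the rest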
01693       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01694                             0,remoteComputeRecords.size(),
01695                             remoteComputeRecords.size()+localComputeRecords.size(),
01696                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
01697       if (simParams->GBISOn) {
01698         cuda_GBIS_P1(
01699           0,remoteComputeRecords.size(),
01700                       localActivePatches.size(),remoteActivePatches.size(),
01701                       simParams->alpha_cutoff-simParams->fsMax,
01702                       simParams->coulomb_radius_offset,
01703                       lata, latb, latc, stream);
01704       }
01705       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01706       //    num_local_atom_records,num_remote_atom_records);
01707       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01708         cudaEventRecord(end_remote_download, stream);
01709       }
01710       if ( streaming ) {
01711         force_ready_queue_next = 0;
01712         CUDA_POLL(cuda_check_progress,this);
01713       } else {
01714         CUDA_POLL(cuda_check_remote_progress,this);
01715       }
01716       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01717         CUDA_POLL(cuda_check_remote_calc,this);
01718         break;
01719       }
01720     } // !GBIS or gbisPhase==1
01721     if (simParams->GBISOn) {
01722       if (gbisPhase == 1) {
01723         //GBIS P1 Kernel launched in previous code block
01724       } else if (gbisPhase == 2) {
01725 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01726         // cudaEventRecord(start_upload, stream);
01727         cuda_bind_GBIS_bornRad(bornRadH);
01728         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01729         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01730         cuda_GBIS_P2(
01731           0,remoteComputeRecords.size(),
01732           localActivePatches.size(),remoteActivePatches.size(),
01733           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01734           simParams->nonbondedScaling, simParams->kappa,
01735           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01736           simParams->dielectric, simParams->solvent_dielectric,
01737           lata, latb, latc,
01738           doEnergy, doSlow, stream
01739           );
01740         cudaEventRecord(end_remote_download, stream);
01741         CUDA_POLL(cuda_check_remote_progress,this);
01742       } else if (gbisPhase == 3) {
01743 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01744         // cudaEventRecord(start_upload, stream);
01745         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01746         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01747         if (doSlow)
01748           cuda_GBIS_P3(
01749             0,remoteComputeRecords.size(),
01750             localActivePatches.size(),remoteActivePatches.size(),
01751             (simParams->alpha_cutoff-simParams->fsMax),
01752             simParams->coulomb_radius_offset,
01753             simParams->nonbondedScaling,
01754             lata, latb, latc, stream
01755             );
01756         cudaEventRecord(end_remote_download, stream);
01757         CUDA_POLL(cuda_check_remote_progress,this);
01758       }
01759     }
01760 
01762 // Local
01763   case 2:
01764 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01765     ++kernel_launch_state;
01766     //gpu_is_mine = 0;
01767     deviceCUDA->setGpuIsMine(0);
01768 
01769     if ( stream2 != stream ) {
01770       // needed to ensure that upload is finished
01771       cudaStreamWaitEvent(stream2, start_calc, 0);
01772       // priorities do not prevent local from launching ahead
01773       // of remote, so delay local stream with a small memset
01774       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
01775     }
01776 
01777     if (!simParams->GBISOn || gbisPhase == 1) {
01778 
01779       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01780                             remoteComputeRecords.size(),localComputeRecords.size(),
01781                             remoteComputeRecords.size()+localComputeRecords.size(),
01782                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
01783       if (simParams->GBISOn) {
01784         cuda_GBIS_P1(
01785                      remoteComputeRecords.size(),localComputeRecords.size(),
01786                      0,localActivePatches.size(),
01787                      simParams->alpha_cutoff-simParams->fsMax,
01788                      simParams->coulomb_radius_offset,
01789                      lata, latb, latc, stream2 );
01790       }
01791       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01792       //    0,num_local_atom_records);
01793       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01794       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01795         cudaEventRecord(end_local_download, stream2);
01796       }
01797       if ( ! streaming && workStarted == 2 ) {
01798         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01799 cuda_check_local_progress\n", CkMyPe())
01800         CUDA_POLL(cuda_check_local_progress,this);
01801       }
01802       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01803         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01804 cuda_check_local_calc\n", CkMyPe())
01805         CUDA_POLL(cuda_check_local_calc,this);
01806         break;
01807       }
01808 
01809     } // !GBIS or gbisPhase==1
01810     if (simParams->GBISOn) {
01811       if (gbisPhase == 1) {
01812         //GBIS P1 Kernel launched in previous code block
01813       } else if (gbisPhase == 2) {
01814 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01815         cuda_GBIS_P2(
01816           remoteComputeRecords.size(),localComputeRecords.size(),
01817           0,localActivePatches.size(),
01818           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01819           simParams->nonbondedScaling, simParams->kappa,
01820           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01821           simParams->dielectric, simParams->solvent_dielectric,
01822           lata, latb, latc,
01823           doEnergy, doSlow, stream2
01824           );
01825         cudaEventRecord(end_local_download, stream2);
01826         if ( workStarted == 2 ) {
01827           CUDA_POLL(cuda_check_local_progress,this);
01828         }
01829       } else if (gbisPhase == 3) {
01830 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01831         if (doSlow)
01832           cuda_GBIS_P3(
01833             remoteComputeRecords.size(),localComputeRecords.size(),
01834             0,localActivePatches.size(),
01835             (simParams->alpha_cutoff-simParams->fsMax),
01836             simParams->coulomb_radius_offset,
01837             simParams->nonbondedScaling,
01838             lata, latb, latc, stream2
01839             );
01840         cudaEventRecord(end_local_download, stream2);
01841         if ( workStarted == 2 ) {
01842           CUDA_POLL(cuda_check_local_progress,this);
01843         }
01844       } // phases
01845     } // GBISOn
01846     if ( simParams->PMEOn && simParams->PMEOffload && !simParams->usePMECUDA ) break;
01847 
01848   default:
01849 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01850     //gpu_is_mine = 1;
01851     deviceCUDA->setGpuIsMine(1);
01852     break;
01853   } // switch
01854 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01855 }
01856 
01857 void ComputeNonbondedCUDA::messageFinishPatch(int flindex) {
01858   int pid = activePatches[flindex];
01859   patch_record &pr = patchRecords[pid];
01860   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
01861   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
01862 }
01863 
01864 void ComputeNonbondedCUDA::finishPatch(int flindex) {
01865   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
01866   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
01867   pr.r = pr.forceBox->open();
01868   finishPatch(pr);
01869   pr.positionBox->close(&(pr.x));
01870   pr.forceBox->close(&(pr.r));
01871 }
01872 
01873 void ComputeNonbondedCUDA::finishPatch(patch_record &pr) {
01874   int start = pr.localStart;
01875   const CompAtomExt *aExt = pr.xExt;
01876   int nfree = pr.numAtoms;
01877   pr.f = pr.r->f[Results::nbond];
01878   Force *f = pr.f;
01879   Force *f_slow = pr.r->f[Results::slow];
01880   const CompAtom *a = pr.x;
01881   // int *ao = atom_order + start;
01882   float4 *af = master->forces + start;
01883   float4 *af_slow = master->slow_forces + start;
01884   // only free atoms return forces
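        // forces arrive from the GPU in CUDA sort order; sortOrder maps slot k
        // back to the patch-local atom index j before accumulating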
01885   for ( int k=0; k<nfree; ++k ) {
01886     int j = aExt[k].sortOrder;
01887     f[j].x += af[k].x;
01888     f[j].y += af[k].y;
01889     f[j].z += af[k].z;
01890     // wcount += af[k].w;
01891     // virial += af[k].w;
01892     if ( doSlow ) {
01893       f_slow[j].x += af_slow[k].x;
01894       f_slow[j].y += af_slow[k].y;
01895       f_slow[j].z += af_slow[k].z;
01896       // virial_slow += af_slow[k].w;
01897     }
01898   }
01899 }
01900 
01901 //dtanner
01902 int ComputeNonbondedCUDA::finishWork() {
01903   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
01904 
01905   if ( master == this ) {
01906     for ( int i = 0; i < numSlaves; ++i ) {
01907       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01908     }
01909   }
01910 
01911 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01912 CkMyPe(), workStarted, gbisPhase)
01913 
01914   Molecule *mol = Node::Object()->molecule;
01915   SimParameters *simParams = Node::Object()->simParameters;
01916 
01917   ResizeArray<int> &patches( workStarted == 1 ?
01918                                 remoteHostedPatches : localHostedPatches );
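        // workStarted == 1 finishes the remotely hosted patches; the second
        // pass finishes the locally hosted ones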
01919 
01920   // long long int wcount = 0;
01921   // double virial = 0.;
01922   // double virial_slow = 0.;
01923   for ( int i=0; i<patches.size(); ++i ) {
01924     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01925     patch_record &pr = master->patchRecords[patches[i]];
01926 
01927     if ( !simParams->GBISOn || gbisPhase == 1 ) {
01929 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01930       pr.r = pr.forceBox->open();
01931     } // !GBISOn || gbisPhase==1
01932 
01933     int start = pr.localStart;
01934     const CompAtomExt *aExt = pr.xExt;
01935     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01936       finishPatch(pr);
01937     } // !GBISOn || gbisPhase == 3
01938 
01939 #if 0
01940     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01941       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01942         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01943         af[j].w);
01944     }
01945 #endif
01946 
01947     //Close Boxes depending on Phase
01948     if (simParams->GBISOn) {
01949       if (gbisPhase == 1) {
01950         //Copy dEdaSum from Host to Patch Box
01951         GBReal *psiSumMaster = master->psiSumH + start;
01952         for ( int k=0; k<pr.numAtoms; ++k ) {
01953           int j = aExt[k].sortOrder;
01954           pr.psiSum[j] += psiSumMaster[k];
01955         }
01956 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01957         pr.psiSumBox->close(&(pr.psiSum));
01958 
01959       } else if (gbisPhase == 2) {
01960         //Copy dEdaSum from Host to Patch Box
01961         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01962         for ( int k=0; k<pr.numAtoms; ++k ) {
01963           int j = aExt[k].sortOrder;
01964           pr.dEdaSum[j] += dEdaSumMaster[k];
01965         }
01966 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01967         pr.dEdaSumBox->close(&(pr.dEdaSum));
01968 
01969       } else if (gbisPhase == 3) {
01970 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01971         pr.intRadBox->close(&(pr.intRad)); //box 6
01972         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01973         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01974         pr.positionBox->close(&(pr.x)); //box 0
01975         pr.forceBox->close(&(pr.r));
01976       } //end phases
01977     } else { //not GBIS
01978 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01979       pr.positionBox->close(&(pr.x));
01980       pr.forceBox->close(&(pr.r));
01981     }
01982   }// end for
01983 
01984 #if 0
01985   virial *= (-1./6.);
01986   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01987   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01988   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01989   if ( doSlow ) {
01990     virial_slow *= (-1./6.);
01991     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01992     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01993     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01994   }
01995 #endif
01996 
01997   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
01998        ( localHostedPatches.size() || master == this ) ) {
01999     GBISP("not finished, call again\n");
02000     return 0;  // not finished, call again
02001   }
02002 
02003   if ( master != this ) {  // finished
02004     GBISP("finished\n");
02005     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02006     atomsChanged = 0;
02007     return 1;
02008   }
02009 
02010   cuda_timer_total += kernel_time;
02011 
02012   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02013 
02014     atomsChanged = 0;
02015     finishReductions();
02016 
02017   } // !GBISOn || gbisPhase==3  
02018 
02019   // Next GBIS Phase
02020 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02021     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02022 
02023   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02024   return 1;  // finished and ready for next step
02025 }
02026 
02027 
02028 void ComputeNonbondedCUDA::finishReductions() {
02029   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02030 
02031   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02032 
02033   SimParameters *simParams = Node::Object()->simParameters;
02034 
02035   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02036     if ( patchPairsReordered++ == 2) {
02037       int patch_len = patchRecords.size();
02038       ResizeArray<int> plast(patch_len);
02039       for ( int i=0; i<patch_len; ++i ) {
02040         plast[i] = -1;
02041       }
02042       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02043       for ( int i=0; i<order_len; ++i ) {
02044         plast[computeRecords[block_order[i]].pid[0]] = i;
02045         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02046       }
02047       iout << endi;
02048       for ( int i=0; i<patch_len; ++i ) {
02049         iout << "patch_last " << i << " " << plast[i] << "\n";
02050       }
02051       iout << endi;
02052     }
02053   }
02054 
02055   {
02056     Tensor virial_tensor;
02057     BigReal energyv = 0.;
02058     BigReal energye = 0.;
02059     BigReal energys = 0.;
02060     int nexcluded = 0;
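          // each virial block holds 16 floats: 9 tensor components, the vdW,
          // electrostatic, and slow electrostatic energies, and an exclusion
          // count stored as an int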
02061     for ( int i = 0; i < num_virials; ++i ) {
02062       virial_tensor.xx += virials[16*i];
02063       virial_tensor.xy += virials[16*i+1];
02064       virial_tensor.xz += virials[16*i+2];
02065       virial_tensor.yx += virials[16*i+3];
02066       virial_tensor.yy += virials[16*i+4];
02067       virial_tensor.yz += virials[16*i+5];
02068       virial_tensor.zx += virials[16*i+6];
02069       virial_tensor.zy += virials[16*i+7];
02070       virial_tensor.zz += virials[16*i+8];
02071       energyv += virials[16*i+9];
02072       energye += virials[16*i+10];
02073       energys += virials[16*i+11];
02074       nexcluded += ((int *)virials)[16*i+12];
02075       if (simParams->GBISOn) {
02076         energye += energy_gbis[i];
02077       }
02078     }
02079     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02080     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02081     if ( doEnergy ) {
02082       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02083       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02084       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02085     }
02086   }
02087   if ( doSlow ) {
02088     Tensor virial_slow_tensor;
02089     for ( int i = 0; i < num_virials; ++i ) {
02090       virial_slow_tensor.xx += slow_virials[16*i];
02091       virial_slow_tensor.xy += slow_virials[16*i+1];
02092       virial_slow_tensor.xz += slow_virials[16*i+2];
02093       virial_slow_tensor.yx += slow_virials[16*i+3];
02094       virial_slow_tensor.yy += slow_virials[16*i+4];
02095       virial_slow_tensor.yz += slow_virials[16*i+5];
02096       virial_slow_tensor.zx += slow_virials[16*i+6];
02097       virial_slow_tensor.zy += slow_virials[16*i+7];
02098       virial_slow_tensor.zz += slow_virials[16*i+8];
02099     }
02100     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02101   }
02102 
02103   reduction->submit();
02104 
02105   cuda_timer_count++;
02106   if ( simParams->outputCudaTiming &&
02107         step % simParams->outputCudaTiming == 0 ) {
02108 
02109     // int natoms = mol->numAtoms; 
02110     // double wpa = wcount;  wpa /= natoms;
02111 
02112     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02113     //  kernel_time * 1.e3, time * 1.e3, wpa);
02114 
02115 #if 0
02116     float upload_ms, remote_calc_ms;
02117     float local_calc_ms, total_ms;
02118     cuda_errcheck("before event timers");
02119     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02120     cuda_errcheck("in event timer 1");
02121     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02122     cuda_errcheck("in event timer 2");
02123     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02124     cuda_errcheck("in event timer 3");
02125     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02126     cuda_errcheck("in event timer 4");
02127     cuda_errcheck("in event timers");
02128 
02129     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02130              CkMyPe(), upload_ms, remote_calc_ms,
02131              local_calc_ms, total_ms);
02132 #endif
02133 
02134     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02135       cuda_timer_total /= cuda_timer_count;
02136       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02137                step, cuda_timer_total * 1.e3, CkMyPe());
02138     }
02139     cuda_timer_count = 0;
02140     cuda_timer_total = 0;
02141   }
02142 
02143 }
02144 
02145 
02146 #endif  // NAMD_CUDA
02147 
