Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputeNonbondedCUDA.C

Go to the documentation of this file.
00001 
00002 #include "common.h"
00003 #include "charm++.h"
00004 
00005 #ifdef NAMD_CUDA
00006 #include <cuda_runtime.h>
00007 #include <cuda.h>
00008 #endif
00009 
00010 #include "WorkDistrib.h"
00011 #include "ComputeMgr.h"
00012 #include "ComputeNonbondedCUDA.h"
00013 #include "ComputeNonbondedCUDAKernel.h"
00014 #include "ObjectArena.h"
00015 
00016 #ifdef NAMD_CUDA
00017 
00018 extern cudaStream_t stream;
00019 
00020 void cuda_errcheck(const char *msg) {
00021   cudaError_t err;
00022   if ((err = cudaGetLastError()) != cudaSuccess) {
00023     char errmsg[1024];
00024     sprintf(errmsg,"CUDA error %s: %s", msg, cudaGetErrorString(err));
00025     NAMD_die(errmsg);
00026   }
00027 }
00028 
00029 char *devicelist;
00030 static int usedevicelist;
00031 static int ignoresharing;
00032 
00033 void cuda_getargs(char **argv) {
00034   devicelist = 0;
00035   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00036         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00037   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00038 }
00039 
00040 static int shared_gpu;
00041 static int first_pe_sharing_gpu;
00042 static int next_pe_sharing_gpu;
00043 
00044 static int gpu_is_mine;
00045 
00046 void cuda_initialize() {
00047 
00048   char host[128];
00049 #ifdef NOHOSTNAME
00050   sprintf(host,"unknown");
00051 #else
00052   gethostname(host, 128);  host[127] = 0;
00053 #endif
00054 
00055   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00056   int myRankInPhysicalNode;
00057   int numPesOnPhysicalNode;
00058   int *pesOnPhysicalNode;
00059   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00060                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00061   {
00062     int i;
00063     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00064       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00065         i = numPesOnPhysicalNode;
00066         break;
00067       }
00068       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00069     }
00070     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00071       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00072       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00073         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00074           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00075       }
00076       myRankInPhysicalNode = 0;
00077       numPesOnPhysicalNode = 1;
00078       pesOnPhysicalNode = new int[1];
00079       pesOnPhysicalNode[0] = CkMyPe();
00080     } else {
00081       myRankInPhysicalNode = i;
00082     }
00083   }
00084   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00085 
00086   int deviceCount = 0;
00087   cudaGetDeviceCount(&deviceCount);
00088   if ( deviceCount <= 0 ) {
00089     NAMD_die("No CUDA devices found.");
00090   }
00091 
00092   int *devices;
00093   int ndevices = 0;
00094   int nexclusive = 0;
00095   if ( usedevicelist ) {
00096     devices = new int[strlen(devicelist)];
00097     int i = 0;
00098     while ( devicelist[i] ) {
00099       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00100       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00101       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00102     }
00103   } else {
00104     if ( ! CkMyPe() ) {
00105       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00106     }
00107     devices = new int[deviceCount];
00108     for ( int i=0; i<deviceCount; ++i ) {
00109       int dev = i % deviceCount;
00110 #if CUDA_VERSION >= 2020
00111       cudaDeviceProp deviceProp;
00112       cudaGetDeviceProperties(&deviceProp, dev);
00113       if ( deviceProp.computeMode != cudaComputeModeProhibited
00114            && deviceProp.multiProcessorCount > 2 ) {  // exclude weak cards
00115         devices[ndevices++] = dev;
00116       }
00117       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00118         ++nexclusive;
00119       }
00120 #else
00121       devices[ndevices++] = dev;
00122 #endif
00123     }
00124   }
00125 
00126   if ( ! ndevices ) {
00127     NAMD_die("All CUDA devices are in prohibited mode.");
00128   }
00129 
00130   shared_gpu = 0;
00131   gpu_is_mine = 1;
00132   first_pe_sharing_gpu = CkMyPe();
00133   next_pe_sharing_gpu = CkMyPe();
00134 
00135  if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) {
00136 
00137   int dev;
00138   if ( numPesOnPhysicalNode > 1 ) {
00139     dev = devices[myRankInPhysicalNode % ndevices];
00140     if ( ! ignoresharing ) {
00141      for ( int i = (myRankInPhysicalNode + 1) % numPesOnPhysicalNode;
00142           i != myRankInPhysicalNode;
00143           i = (i + 1) % numPesOnPhysicalNode ) {
00144       if (devices[i % ndevices] == dev) {
00145         shared_gpu = 1;
00146         next_pe_sharing_gpu = pesOnPhysicalNode[i];
00147         break;
00148       }
00149      }
00150     }
00151     if ( shared_gpu ) {
00152       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00153         if (devices[i % ndevices] == dev) {
00154           first_pe_sharing_gpu = pesOnPhysicalNode[i];
00155           break;
00156         }
00157       }
00158       CkPrintf("Pe %d sharing CUDA device %d first %d next %d\n",
00159                 CkMyPe(), dev, first_pe_sharing_gpu, next_pe_sharing_gpu);
00160     }
00161   } else {  // in case phys node code is lying
00162     dev = devices[CkMyPe() % ndevices];
00163   }
00164 
00165   // disable token-passing but don't submit local until remote finished
00166   // if shared_gpu is true, otherwise submit all work immediately
00167   first_pe_sharing_gpu = CkMyPe();
00168   next_pe_sharing_gpu = CkMyPe();
00169 
00170   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00171 
00172   if ( dev >= deviceCount ) {
00173     char buf[256];
00174     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00175                 CkMyPe(), dev, host, deviceCount);
00176     NAMD_die(buf);
00177   }
00178 
00179   cudaDeviceProp deviceProp;
00180   cudaGetDeviceProperties(&deviceProp, dev);
00181   CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00182              CkMyPe(), myRankInPhysicalNode, dev, host,
00183              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00184              deviceProp.major, deviceProp.minor);
00185 
00186   cudaSetDevice(dev);
00187   cudaError_t err;
00188   if ((err = cudaGetLastError()) != cudaSuccess) {
00189     char errmsg[1024];
00190     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00191                         dev, CkMyPe(), cudaGetErrorString(err));
00192     NAMD_die(errmsg);
00193   }
00194 
00195  }  // just let CUDA pick a device for us
00196 
00197   if ( sizeof(patch_pair) & 15 ) NAMD_die("sizeof(patch_pair) % 16 != 0");
00198   if ( sizeof(force_list) & 15 ) NAMD_die("sizeof(force_list) % 16 != 0");
00199   if ( sizeof(atom) & 15 ) NAMD_die("sizeof(atom) % 16 != 0");
00200   if ( sizeof(atom_param) & 15 ) NAMD_die("sizeof(atom_param) % 16 != 0");
00201 
00202   cuda_init();
00203 
00204 }
00205 
00206 
00207 void build_cuda_force_table() {
00208   ComputeNonbondedCUDA::build_force_table();
00209 }
00210 
00211 void ComputeNonbondedCUDA::build_force_table() {  // static
00212 
00213   float4 t[FORCE_TABLE_SIZE];
00214 
00215   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00216   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00217   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00218   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00219 
00220   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00221   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00222     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00223     r2list[i] = r*r + r2_delta;
00224   }
00225 
00226   union { double f; int32 i[2]; } byte_order_test;
00227   byte_order_test.f = 1.0;  // should occupy high-order bits only
00228   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00229 
00230   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00231     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00232     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00233 
00234     if ( r > cutoff ) {
00235       t[i].x = 0.;
00236       t[i].y = 0.;
00237       t[i].z = 0.;
00238       t[i].w = 0.;
00239       continue;
00240     }
00241 
00242     BigReal diffa = r2list[i] - r2_table[table_i];
00243 
00244     // coulomb 1/r or fast force
00245     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00246     {
00247       // BigReal table_a = fast_table[4*table_i];
00248       BigReal table_b = fast_table[4*table_i+1];
00249       BigReal table_c = fast_table[4*table_i+2];
00250       BigReal table_d = fast_table[4*table_i+3];
00251       BigReal grad =
00252                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00253       t[i].x = 2. * grad;
00254     }
00255 
00256 
00257     // pme correction for slow force
00258     // t[i].w = 0.;
00259     {
00260       // BigReal table_a = scor_table[4*table_i];
00261       BigReal table_b = scor_table[4*table_i+1];
00262       BigReal table_c = scor_table[4*table_i+2];
00263       BigReal table_d = scor_table[4*table_i+3];
00264       BigReal grad =
00265                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00266       t[i].w = 2. * grad;
00267     }
00268 
00269 
00270     // vdw 1/r^6
00271     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00272     {
00273       // BigReal table_a = vdwb_table[4*table_i];
00274       BigReal table_b = vdwb_table[4*table_i+1];
00275       BigReal table_c = vdwb_table[4*table_i+2];
00276       BigReal table_d = vdwb_table[4*table_i+3];
00277       BigReal grad =
00278                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00279       t[i].y = 2. * -1. * grad;
00280     }
00281 
00282 
00283     // vdw 1/r^12
00284     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00285     {
00286       // BigReal table_a = vdwa_table[4*table_i];
00287       BigReal table_b = vdwa_table[4*table_i+1];
00288       BigReal table_c = vdwa_table[4*table_i+2];
00289       BigReal table_d = vdwa_table[4*table_i+3];
00290       BigReal grad =
00291                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00292       t[i].z = 2. * grad;
00293     }
00294 
00295     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00296     //   t[i].x, t[i].y, t[i].z, t[i].w);
00297 
00298 /*
00299     double r2 = r * r;
00300     double r4 = r2 * r2;
00301     double r8 = r4 * r4;
00302 
00303     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00304     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00305     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00306     t[i].w = 0.;
00307 */
00308   }
00309 
00310   t[0].x = 0.f;
00311   t[0].y = 0.f;
00312   t[0].z = 0.f;
00313   t[0].w = 0.f;
00314 
00315   cuda_bind_force_table(t);
00316 
00317   if ( ! CkMyPe() ) {
00318     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00319   }
00320 }
00321 
00322 static ComputeNonbondedCUDA* cudaCompute = 0;
00323 static ComputeMgr *computeMgr = 0;
00324 
00325 static ushort2 *exclusionsByAtom;
00326 
00327 void ComputeNonbondedCUDA::build_exclusions() {
00328   Molecule *mol = Node::Object()->molecule;
00329   int natoms = mol->numAtoms; 
00330   exclusionsByAtom = new ushort2[natoms];
00331 
00332   // create unique sorted lists
00333 
00334   ObjectArena<int32> listArena;
00335   ResizeArray<int32*> unique_lists;
00336   SortableResizeArray<int32> curList;
00337   int totalbits = 0;
00338   for ( int i=0; i<natoms; ++i ) {
00339     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00340     curList.resize(0);
00341     int n = mol_list[0] + 1;
00342     curList.add(0);  // always excluded from self
00343     for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00344     curList.sort();
00345 
00346     int j;
00347     for ( j=0; j<unique_lists.size(); ++j ) {
00348       if ( n != unique_lists[j][0] ) continue;  // no match
00349       int k;
00350       for ( k=0; k<n; ++k ) {
00351         if ( unique_lists[j][k+3] != curList[k] ) break;
00352       }
00353       if ( k == n ) break;  // found match
00354     }
00355     if ( j == unique_lists.size() ) {  // no match
00356       int32 *list = listArena.getNewArray(n+3);
00357       list[0] = n;
00358       int maxdiff = 0;
00359       maxdiff = -1 * curList[0];
00360       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00361       list[1] = maxdiff;
00362       list[2] = totalbits + maxdiff;
00363       totalbits += 2*maxdiff + 1;
00364       for ( int k=0; k<n; ++k ) {
00365         list[k+3] = curList[k];
00366       }
00367       unique_lists.add(list);
00368     }
00369     exclusionsByAtom[i].x = unique_lists[j][1];  // maxdiff
00370     exclusionsByAtom[i].y = unique_lists[j][2];  // start
00371   }
00372 
00373   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00374 
00375   if ( ! CkMyPe() ) {
00376   CkPrintf("Info: Found %d unique exclusion lists needing %d bytes\n",
00377                 unique_lists.size(), totalbits / 8);
00378   }
00379 
00380 #define SET_EXCL(EXCL,BASE,DIFF) \
00381          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00382 
00383   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00384   memset(exclusion_bits, 0, totalbits/8);
00385 
00386   int base = 0;
00387   for ( int i=0; i<unique_lists.size(); ++i ) {
00388     base += unique_lists[i][1];
00389     if ( base != unique_lists[i][2] ) {
00390       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00391     }
00392     int n = unique_lists[i][0];
00393     for ( int j=0; j<n; ++j ) {
00394       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00395     }
00396     base += unique_lists[i][1] + 1;
00397   }
00398 
00399   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00400 
00401   delete [] exclusion_bits;
00402 }
00403 
00404 
00405 void register_cuda_compute_self(ComputeID c, PatchID pid) {
00406 
00407   if ( ! cudaCompute ) NAMD_die("register_self called early");
00408 
00409   cudaCompute->requirePatch(pid);
00410 
00411   ComputeNonbondedCUDA::compute_record cr;
00412   cr.c = c;
00413   cr.pid[0] = pid;  cr.pid[1] = pid;
00414   cr.offset = 0.;
00415   if ( cudaCompute->patchMap->node(pid) == CkMyPe() ) {
00416     cudaCompute->localComputeRecords.add(cr);
00417   } else {
00418     cudaCompute->remoteComputeRecords.add(cr);
00419   }
00420 }
00421 
00422 void register_cuda_compute_pair(ComputeID c, PatchID pid[], int t[]) {
00423 
00424   if ( ! cudaCompute ) NAMD_die("register_pair called early");
00425  
00426   cudaCompute->requirePatch(pid[0]);
00427   cudaCompute->requirePatch(pid[1]);
00428 
00429   ComputeNonbondedCUDA::compute_record cr, cr2;
00430   cr.c = c;  cr2.c = c;
00431   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00432   cr2.pid[0] = pid[1];  cr2.pid[1] = pid[0];
00433 
00434   int t1 = t[0];
00435   int t2 = t[1];
00436   Vector offset = cudaCompute->patchMap->center(pid[0])
00437                 - cudaCompute->patchMap->center(pid[1]);
00438   offset.x += (t1%3-1) - (t2%3-1);
00439   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00440   offset.z += (t1/9-1) - (t2/9-1);
00441   cr.offset = offset;
00442   cr2.offset = -1. * offset;
00443     
00444   if ( cudaCompute->patchMap->node(pid[0]) == CkMyPe() ) {
00445     cudaCompute->localComputeRecords.add(cr);
00446   } else {
00447     cudaCompute->remoteComputeRecords.add(cr);
00448   }
00449   if ( cudaCompute->patchMap->node(pid[1]) == CkMyPe() ) {
00450     cudaCompute->localComputeRecords.add(cr2);
00451   } else {
00452     cudaCompute->remoteComputeRecords.add(cr2);
00453   }
00454 }
00455 
00456 void unregister_cuda_compute(ComputeID c) {  // static
00457 
00458   NAMD_die("unregister_compute unimplemented");
00459 
00460 }
00461 
00462 static int atomsChanged = 0;
00463 static int computesChanged = 0;
00464 
00465 static cudaEvent_t start_upload;
00466 static cudaEvent_t start_calc;
00467 static cudaEvent_t end_remote_calc;
00468 static cudaEvent_t end_remote_download;
00469 static cudaEvent_t end_local_calc;
00470 static cudaEvent_t end_local_download;
00471 
00472 ComputeNonbondedCUDA::ComputeNonbondedCUDA(ComputeID c, ComputeMgr *mgr) : Compute(c) {
00473   // CkPrintf("create ComputeNonbondedCUDA\n");
00474   cudaCompute = this;
00475   computeMgr = mgr;
00476   patchMap = PatchMap::Object();
00477   atomMap = AtomMap::Object();
00478   reduction = 0;
00479   build_exclusions();
00480 
00481   SimParameters *params = Node::Object()->simParameters;
00482   if (params->pressureProfileOn) {
00483     NAMD_die("pressure profile not supported in CUDA");
00484   }
00485 
00486   atomsChanged = 1;
00487   computesChanged = 1;
00488   workStarted = 0;
00489   basePriority = PROXY_DATA_PRIORITY;
00490 
00491   cudaEventCreate(&start_upload);
00492   cudaEventCreate(&start_calc);
00493   cudaEventCreate(&end_remote_calc);
00494   cudaEventCreate(&end_remote_download);
00495   cudaEventCreate(&end_local_calc);
00496   cudaEventCreate(&end_local_download);
00497 }
00498 
00499 
00500 ComputeNonbondedCUDA::~ComputeNonbondedCUDA() { ; }
00501 
00502 void ComputeNonbondedCUDA::requirePatch(int pid) {
00503 
00504   if ( ! reduction ) {
00505     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00506   }
00507   computesChanged = 1;
00508   patch_record &pr = patchRecords.item(pid);
00509   if ( pr.refCount == 0 ) {
00510     pr.isLocal = ( patchMap->node(pid) == CkMyPe() );
00511     if ( pr.isLocal ) {
00512       localActivePatches.add(pid);
00513     } else {
00514       remoteActivePatches.add(pid);
00515     }
00516     activePatches.add(pid);
00517     setNumPatches(activePatches.size());
00518     pr.patchID = pid;
00519     pr.p = patchMap->patch(pid);
00520     pr.positionBox = pr.p->registerPositionPickup(cid);
00521     pr.forceBox = pr.p->registerForceDeposit(cid);
00522     pr.x = NULL;
00523     pr.xExt = NULL;
00524     pr.r = NULL;
00525     pr.f = NULL;
00526   }
00527   pr.refCount += 1;
00528 }
00529 
00530 static ResizeArray<patch_pair> patch_pairs;
00531 static ResizeArray<force_list> force_lists;
00532 
00533 static int num_atom_records_allocated;
00534 static atom_param* atom_params;
00535 static atom* atoms;
00536 static float4* forces;
00537 static float4* slow_forces;
00538 
00539 static int cuda_timer_count;
00540 static double cuda_timer_total;
00541 static double kernel_time;
00542 
00543 static int ccd_index_remote_download;
00544 static int ccd_index_local_download;
00545 #define CUDA_CONDITION CcdPERIODIC
00546 
00547 void cuda_check_remote_progress(void *arg, double) {
00548   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
00549     // ((ComputeNonbondedCUDA *) arg)->finishWork();
00550     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00551   } else {
00552     ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION, cuda_check_remote_progress, arg);
00553   }
00554 }
00555 
00556 void cuda_check_local_progress(void *arg, double) {
00557   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
00558     kernel_time += CkWallTimer();
00559     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00560   } else {
00561     ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION, cuda_check_local_progress, arg);
00562   }
00563 }
00564 
00565 #if 0
00566 // don't use this one unless timer is part of stream, above is better
00567 void cuda_check_progress(void *arg, double) {
00568   if ( cuda_stream_finished() ) {
00569     kernel_time += CkWallTimer();
00570     CcdCallOnCondition(CUDA_CONDITION, ccd_index);
00571     // ((ComputeNonbondedCUDA *) arg)->finishWork();
00572     WorkDistrib::messageEnqueueWork((ComputeNonbondedCUDA *) arg);
00573   }
00574 }
00575 #endif
00576 
00577 void ComputeNonbondedCUDA::atomUpdate() { atomsChanged = 1; }
00578 
00579 static int kernel_launch_state = 0;
00580 
00581 void ComputeNonbondedCUDA::doWork() {
00582 
00583 // CkPrintf("Pe %d doWork %d\n", CkMyPe(), workStarted);
00584 
00585   if ( workStarted ) {
00586     if ( finishWork() ) {  // finished
00587       workStarted = 0;
00588       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
00589     } else {  // need to call again
00590       workStarted = 2;
00591       basePriority = COMPUTE_HOME_PRIORITY;  // lower for local
00592       if ( kernel_launch_state > 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00593     }
00594     return;
00595   }
00596   workStarted = 1;
00597   basePriority = COMPUTE_PROXY_PRIORITY;
00598 
00599   Molecule *mol = Node::Object()->molecule;
00600   Parameters *params = Node::Object()->parameters;
00601   SimParameters *simParams = Node::Object()->simParameters;
00602 
00603   if ( simParams->useFlexibleCell ) {
00604     NAMD_die("flexible-cell constant pressure not supported in CUDA");
00605   }
00606 
00607   for ( int i=0; i<activePatches.size(); ++i ) {
00608     patch_record &pr = patchRecords[activePatches[i]];
00609     pr.x = pr.positionBox->open();
00610     pr.xExt = pr.p->getCompAtomExtInfo();
00611   }
00612 
00613  if ( atomsChanged || computesChanged ) {
00614   int npatches = activePatches.size();
00615 
00616   if ( computesChanged ) {
00617     computesChanged = 0;
00618 
00619     int *ap = activePatches.begin();
00620     for ( int i=0; i<localActivePatches.size(); ++i ) {
00621       *(ap++) = localActivePatches[i];
00622     }
00623     for ( int i=0; i<remoteActivePatches.size(); ++i ) {
00624       *(ap++) = remoteActivePatches[i];
00625     }
00626 
00627     int nlc = localComputeRecords.size();
00628     int nrc = remoteComputeRecords.size();
00629     computeRecords.resize(nlc+nrc);
00630     compute_record *cr = computeRecords.begin();
00631     for ( int i=0; i<nlc; ++i ) {
00632       *(cr++) = localComputeRecords[i];
00633     }
00634     for ( int i=0; i<nrc; ++i ) {
00635       *(cr++) = remoteComputeRecords[i];
00636     }
00637 
00638     force_lists.resize(npatches);
00639     for ( int i=0; i<npatches; ++i ) {
00640       patchRecords[activePatches[i]].localIndex = i;
00641       force_lists[i].force_list_size = 0;
00642     }
00643 
00644     int ncomputes = computeRecords.size();
00645     patch_pairs.resize(ncomputes);
00646     for ( int i=0; i<ncomputes; ++i ) {
00647       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00648       int lp1 = patchRecords[cr.pid[0]].localIndex;
00649       int lp2 = patchRecords[cr.pid[1]].localIndex;
00650       force_lists[lp1].force_list_size++;
00651       patch_pair &pp = patch_pairs[i];
00652       pp.offset.x = cr.offset.x;
00653       pp.offset.y = cr.offset.y;
00654       pp.offset.z = cr.offset.z;
00655     }
00656 
00657    if ( simParams->outputCudaTiming ) {
00658     CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
00659         CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
00660         localComputeRecords.size(), remoteComputeRecords.size());
00661    }
00662   }
00663 
00664   int istart = 0;
00665   int flstart = 0;
00666   int max_atoms_per_patch = 0;
00667   int i;
00668   for ( i=0; i<npatches; ++i ) {
00669     if ( i == localActivePatches.size() ) {
00670        num_local_atom_records = istart;
00671     }
00672     force_lists[i].force_list_start = flstart;
00673     force_lists[i].force_output_start = istart;
00674     patch_record &pr = patchRecords[activePatches[i]];
00675     pr.localStart = istart;
00676     int natoms = pr.p->getNumAtoms();
00677     int nfreeatoms = natoms;
00678     if ( fixedAtomsOn ) {
00679       const CompAtomExt *aExt = pr.xExt;
00680       for ( int j=0; j<natoms; ++j ) {
00681         if ( aExt[j].atomFixed ) --nfreeatoms;
00682       }
00683     }
00684     if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
00685     pr.numAtoms = natoms;
00686     pr.numFreeAtoms = nfreeatoms;
00687     if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
00688     if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
00689     force_lists[i].patch_size = nfreeatoms;
00690     flstart += nfreeatoms * force_lists[i].force_list_size;
00691     istart += natoms;  // already rounded up
00692     force_lists[i].force_list_size = 0;  // rebuild below
00693   }
00694   if ( i == localActivePatches.size() ) {
00695      num_local_atom_records = istart;
00696   }
00697   num_force_records = flstart;
00698   num_atom_records = istart;
00699   num_remote_atom_records = num_atom_records - num_local_atom_records;
00700   if ( num_atom_records > num_atom_records_allocated ) {
00701     if ( num_atom_records_allocated ) {
00702       cudaFreeHost(atom_params);
00703       cudaFreeHost(atoms);
00704       cudaFreeHost(forces);
00705       cudaFreeHost(slow_forces);
00706     }
00707     num_atom_records_allocated = 1.1 * num_atom_records + 1;
00708     cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
00709     cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
00710     cudaMallocHost((void**)&forces,sizeof(float4)*num_atom_records_allocated);
00711     cudaMallocHost((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated);
00712   }
00713 
00714 #if 0
00715   if ( max_atoms_per_patch > MAX_ATOMS_PER_PATCH ) {
00716     char errstr[1024];
00717     sprintf(errstr,"Found patch with %d atoms; limit is %d for CUDA."
00718         "  Try enabling twoAwayX, twoAwayY, and/or twoAwayZ to fix this.",
00719         max_atoms_per_patch, MAX_ATOMS_PER_PATCH);
00720     NAMD_die(errstr);
00721   }
00722 #endif
00723 
00724   int ncomputes = computeRecords.size();
00725   for ( int i=0; i<ncomputes; ++i ) {
00726     ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00727     int p1 = cr.pid[0];
00728     int p2 = cr.pid[1];
00729     int lp1 = patchRecords[p1].localIndex;
00730     int lp2 = patchRecords[p2].localIndex;
00731     patch_pair &pp = patch_pairs[i];
00732     pp.patch1_atom_start = patchRecords[p1].localStart;
00733     pp.patch1_force_start = force_lists[lp1].force_list_start +
00734         force_lists[lp1].patch_size * force_lists[lp1].force_list_size;
00735     pp.patch1_size = patchRecords[p1].numAtoms;
00736     pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
00737     // istart += pp.patch1_size;
00738     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00739     pp.patch2_atom_start = patchRecords[p2].localStart;
00740     // pp.patch2_force_start = force_lists[lp2].force_list_start +
00741 //      force_lists[lp2].patch_size * force_lists[lp2].force_list_size;
00742     pp.patch2_size = patchRecords[p2].numAtoms;
00743     // pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
00744     // this must happen at end to get self computes right
00745     force_lists[lp1].force_list_size++;
00746     // if ( lp1 != lp2 || cr.t[0] != cr.t[1] ) {
00747     //   force_lists[lp2].force_list_size++;
00748     // }
00749     // istart += pp.patch2_size;
00750     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00751   }
00752 
00753 #if 0
00754   CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
00755         patch_pairs.size(), force_lists.size(),
00756         num_atom_records, num_force_records,
00757         max_atoms_per_patch);
00758 #endif
00759 
00760   int totalmem = patch_pairs.size() * sizeof(patch_pair) +
00761                 force_lists.size() * sizeof(force_list) +
00762                 num_force_records * sizeof(float4) +
00763                 num_atom_records * sizeof(atom) +
00764                 num_atom_records * sizeof(atom_param) +
00765                 num_atom_records * sizeof(float4);
00766   int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
00767 /*
00768   CkPrintf("Pe %d allocating %d MB of GPU memory, will copy %d kB per step\n",
00769                         CkMyPe(), totalmem >> 20, totalcopy >> 10);
00770 */
00771 
00772   cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
00773                         force_lists.begin(), force_lists.size(),
00774                         num_atom_records, num_force_records,
00775                         max_atoms_per_patch);
00776 
00777  }  // atomsChanged || computesChanged
00778 
00779   double charge_scaling = sqrt(COLOUMB * scaling * dielectric_1);
00780 
00781 
00782   for ( int i=0; i<activePatches.size(); ++i ) {
00783     patch_record &pr = patchRecords[activePatches[i]];
00784 
00785     int start = pr.localStart;
00786     int n = pr.numAtoms;
00787     const CompAtom *a = pr.x;
00788     const CompAtomExt *aExt = pr.xExt;
00789     if ( atomsChanged ) {
00790       atom_param *ap = atom_params + start;
00791       int k = 0;
00792       for ( int j=0; j<n; ++j ) {
00793         // put free atoms first
00794         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00795         ap[k].index = aExt[j].id;
00796         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00797         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00798         int vdwtype = mol->atomvdwtype(aExt[j].id);
00799         Real sig, eps, sig14, eps14;
00800         params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00801         ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00802         ap[k].half_sigma = 0.5 * sig;
00803 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00804         ++k;
00805       }
00806       if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00807         // put fixed atoms at end
00808         if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00809         ap[k].index = aExt[j].id;
00810         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00811         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00812         int vdwtype = mol->atomvdwtype(aExt[j].id);
00813         Real sig, eps, sig14, eps14;
00814         params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00815         ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00816         ap[k].half_sigma = 0.5 * sig;
00817 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00818         ++k;
00819       }
00820     }
00821     {
00822       Vector center =
00823         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
00824       atom *ap = atoms + start;
00825       int k = 0;
00826       for ( int j=0; j<n; ++j ) {
00827         // put free atoms first
00828         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00829         ap[k].position.x = a[j].position.x - center.x;
00830         ap[k].position.y = a[j].position.y - center.y;
00831         ap[k].position.z = a[j].position.z - center.z;
00832         ap[k].charge = charge_scaling * a[j].charge;
00833         ++k;
00834       }
00835       if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00836         // put fixed atoms at end
00837         if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00838         ap[k].position.x = a[j].position.x - center.x;
00839         ap[k].position.y = a[j].position.y - center.y;
00840         ap[k].position.z = a[j].position.z - center.z;
00841         ap[k].charge = charge_scaling * a[j].charge;
00842         ++k;
00843       }
00844     }
00845   }
00846 
00847   kernel_time = -1. * CkWallTimer();
00848 #if 0
00849   kernel_launch_state = 3;
00850 
00851   cudaEventRecord(start_upload, stream);
00852 
00853   if ( atomsChanged ) {
00854     cuda_bind_atom_params(atom_params);
00855   }
00856 
00857   XXX - THIS PATH NOT UPDATED FOR SLOW FORCES - DO NOT COMPILE
00858   cuda_bind_atoms(atoms);
00859   cudaEventRecord(start_calc, stream);
00860   cuda_nonbonded_forces(cutoff2,
00861         localComputeRecords.size(),remoteComputeRecords.size(),
00862         localActivePatches.size(),remoteActivePatches.size());
00863   cudaEventRecord(end_remote_calc, stream);
00864   cuda_load_forces(forces,num_local_atom_records,num_remote_atom_records);
00865   cudaEventRecord(end_remote_download, stream);
00866   cuda_nonbonded_forces(cutoff2,
00867         0,localComputeRecords.size(),
00868         0,localActivePatches.size());
00869   cudaEventRecord(end_local_calc, stream);
00870   cuda_load_forces(forces,0,num_local_atom_records);
00871   cudaEventRecord(end_local_download, stream);
00872 
00873   if ( cuda_stream_finished() ) {
00874     CkPrintf("CUDA not overlapping with CPU work.\n");
00875   }
00876 
00877   // finishWork();
00878 
00879   ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00880 #else
00881 
00882   kernel_launch_state = 1;
00883   if ( gpu_is_mine ) recvYieldDevice(-1);
00884 
00885 #endif
00886 }
00887 
00888 static int ccd_index_remote_calc;
00889 void cuda_check_remote_calc(void *arg, double) {
00890   // in theory we only need end_remote_calc, but overlap isn't reliable
00891   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
00892   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
00893 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
00894     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
00895 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
00896   } else {
00897     ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION, cuda_check_remote_calc, arg);
00898   }
00899 }
00900 
00901 static int ccd_index_local_calc;
00902 void cuda_check_local_calc(void *arg, double) {
00903   // in theory we only need end_local_calc, but overlap isn't reliable
00904   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
00905   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
00906 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
00907     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
00908 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
00909   } else {
00910     ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION, cuda_check_local_calc, arg);
00911   }
00912 }
00913 
00914 // computeMgr->sendYieldDevice(next_pe_sharing_gpu);
00915 
00916 void ComputeNonbondedCUDA::recvYieldDevice(int pe) {
00917 
00918 // CkPrintf("Pe %d entering state %d via yield from pe %d\n",
00919 //           CkMyPe(), kernel_launch_state, pe);
00920 
00921   Flags &flags = patchRecords[activePatches[0]].p->flags;
00922   int doSlow = flags.doFullElectrostatics;
00923 
00924   Lattice &lattice = flags.lattice;
00925   float3 lata, latb, latc;
00926   lata.x = lattice.a().x;
00927   lata.y = lattice.a().y;
00928   lata.z = lattice.a().z;
00929   latb.x = lattice.b().x;
00930   latb.y = lattice.b().y;
00931   latb.z = lattice.b().z;
00932   latc.x = lattice.c().x;
00933   latc.y = lattice.c().y;
00934   latc.z = lattice.c().z;
00935 
00936   switch ( kernel_launch_state ) {
00937   case 1:
00938     ++kernel_launch_state;
00939     gpu_is_mine = 0;
00940     cudaEventRecord(start_upload, stream);
00941 
00942     if ( atomsChanged ) {
00943       cuda_bind_atom_params(atom_params);
00944     }
00945 
00946     cuda_bind_atoms(atoms);
00947     cudaEventRecord(start_calc, stream);
00948     cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00949         localComputeRecords.size(),remoteComputeRecords.size(),
00950         localActivePatches.size(),remoteActivePatches.size(), doSlow);
00951     cudaEventRecord(end_remote_calc, stream);
00952     cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00953         num_local_atom_records,num_remote_atom_records);
00954     cudaEventRecord(end_remote_download, stream);
00955     ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00956     if ( shared_gpu ) {
00957       ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_calc,this);
00958       break;
00959     }
00960  
00961   case 2:
00962     ++kernel_launch_state;
00963     gpu_is_mine = 0;
00964     cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00965         0,localComputeRecords.size(),
00966         0,localActivePatches.size(), doSlow);
00967     cudaEventRecord(end_local_calc, stream);
00968     cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00969         0,num_local_atom_records);
00970     cudaEventRecord(end_local_download, stream);
00971     if ( workStarted == 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00972     if ( shared_gpu ) {
00973       ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_calc,this);
00974       break;
00975     }
00976 
00977   default:
00978     gpu_is_mine = 1;
00979     break;
00980   }
00981 
00982 }
00983 
00984 
00985 int ComputeNonbondedCUDA::finishWork() {
00986 
00987   cuda_errcheck("cuda stream completed");
00988 
00989   Molecule *mol = Node::Object()->molecule;
00990   SimParameters *simParams = Node::Object()->simParameters;
00991 
00992   Flags &flags = patchRecords[activePatches[0]].p->flags;
00993   int doSlow = flags.doFullElectrostatics;
00994 
00995   ResizeArray<int> &patches( workStarted == 1 ?
00996                                 remoteActivePatches : localActivePatches );
00997 
00998   for ( int i=0; i<patches.size(); ++i ) {
00999     patch_record &pr = patchRecords[patches[i]];
01000     pr.r = pr.forceBox->open();
01001     pr.f = pr.r->f[Results::nbond];
01002   }
01003 
01004   // long long int wcount = 0;
01005   double virial = 0.;
01006   double virial_slow = 0.;
01007 
01008   for ( int i=0; i<patches.size(); ++i ) {
01009     patch_record &pr = patchRecords[patches[i]];
01010     int start = pr.localStart;
01011     int n = pr.numAtoms;
01012     Force *f = pr.f;
01013     Force *f_slow = pr.r->f[Results::slow];
01014     const CompAtom *a = pr.x;
01015     const CompAtomExt *aExt = pr.xExt;
01016     float4 *af = forces + start;
01017     float4 *af_slow = slow_forces + start;
01018     int k = 0;
01019     for ( int j=0; j<n; ++j ) {
01020       // only free atoms return forces
01021       if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01022       f[j].x += af[k].x;
01023       f[j].y += af[k].y;
01024       f[j].z += af[k].z;
01025       // wcount += af[k].w;
01026       virial += af[k].w;
01027       if ( doSlow ) {
01028         f_slow[j].x += af_slow[k].x;
01029         f_slow[j].y += af_slow[k].y;
01030         f_slow[j].z += af_slow[k].z;
01031         virial_slow += af_slow[k].w;
01032       }
01033       ++k;
01034     }
01035 
01036 #if 0
01037     // check exclusions reported as w
01038     if ( CkNumPes() == 1 ) {
01039       const CompAtomExt *aExt = pr.xExt;
01040       int k = 0;
01041       for ( int j=0; j<n; ++j ) {
01042         // only free atoms return forces
01043         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01044         int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01045         if ( af[k].w != excl_expected ) {
01046           CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01047                 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01048         }
01049         ++k;
01050       }
01051     }
01052 #endif
01053     
01054 
01055 #if 0
01056     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01057       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01058         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01059         af[j].w);
01060     }
01061 #endif
01062     pr.positionBox->close(&(pr.x));
01063     pr.forceBox->close(&(pr.r));
01064   }
01065 
01066   virial *= (-1./6.);
01067   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01068   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01069   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01070   if ( doSlow ) {
01071     virial_slow *= (-1./6.);
01072     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01073     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01074     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01075   }
01076 
01077   if ( workStarted == 1 ) return 0;  // not finished, call again
01078 
01079   atomsChanged = 0;
01080   reduction->submit();
01081 
01082   // int natoms = mol->numAtoms; 
01083   // double wpa = wcount;  wpa /= natoms;
01084 
01085   // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
01086         //      kernel_time * 1.e3, time * 1.e3, wpa);
01087 
01088   float upload_ms, remote_calc_ms, remote_download_ms;
01089   float local_calc_ms, local_download_ms, total_ms;
01090   cuda_errcheck("before event timers");
01091   cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
01092   cuda_errcheck("event timer 1");
01093   cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_calc);
01094   cuda_errcheck("event timer 2");
01095   cudaEventElapsedTime(&remote_download_ms, end_remote_calc, end_remote_download);
01096   cuda_errcheck("event timer 3");
01097   cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_calc);
01098   cuda_errcheck("event timer 4");
01099   cudaEventElapsedTime(&local_download_ms, end_local_calc, end_local_download);
01100   cuda_errcheck("event timer 5");
01101   cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
01102   cuda_errcheck("event timer 6");
01103   cuda_errcheck("event timers");
01104 
01105   cuda_timer_total += kernel_time;
01106   if ( simParams->outputCudaTiming &&
01107         cuda_timer_count % simParams->outputCudaTiming == 0 ) {
01108     cuda_timer_total /= (cuda_timer_count + 1);
01109     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f %f %f\n",
01110         CkMyPe(), upload_ms, remote_calc_ms, remote_download_ms,
01111                         local_calc_ms, local_download_ms, total_ms);
01112     CkPrintf("CUDA TIMING: %f ms/step on node %d\n",
01113                         cuda_timer_total * 1.e3, CkMyPe());
01114     cuda_timer_count = 0;
01115     cuda_timer_total = 0;
01116   }
01117   cuda_timer_count++;
01118 
01119   return 1;  // finished and ready for next step
01120 }
01121 
01122 
01123 #endif  // NAMD_CUDA
01124 

Generated on Mon Nov 23 04:59:19 2009 for NAMD by  doxygen 1.3.9.1