Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputeNonbondedCUDA.C File Reference

#include "common.h"
#include "charm++.h"
#include <cuda_runtime.h>
#include <cuda.h>
#include "WorkDistrib.h"
#include "ComputeMgr.h"
#include "ProxyMgr.h"
#include "ComputeNonbondedCUDA.h"
#include "ComputeNonbondedCUDAKernel.h"
#include "LJTable.h"
#include "ObjectArena.h"
#include "SortAtoms.h"
#include <algorithm>
#include "NamdTypes.h"

Go to the source code of this file.

Classes

struct  exlist_sortop
struct  cr_sortop

Defines

#define CUDA_EVENT_ID_BASE   100
#define CUDA_TRACE_REMOTE(START, END)
#define CUDA_TRACE_LOCAL(START, END)
#define REGISTER_DEVICE_EVENTS(DEV)
#define SET_EXCL(EXCL, BASE, DIFF)   (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
#define PATCH_PAIRS_REF   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
#define FORCE_LISTS_REF   ResizeArray<force_list> &force_lists(*force_lists_ptr);
#define CUDA_POLL(FN, ARG)   CcdCallFnAfter(FN,ARG,0.1)
#define GBISP()
#define count_limit   1000000
#define stream2   stream

Functions

void cuda_errcheck (const char *msg)
void cuda_die (const char *msg)
void cuda_getargs (char **argv)
int cuda_device_pe ()
bool cuda_device_shared_with_pe (int pe)
bool sortop_bitreverse (int a, int b)
void cuda_register_user_events ()
void cuda_initialize ()
void send_build_cuda_force_table ()
void build_cuda_force_table ()
void register_cuda_compute_self (ComputeID c, PatchID pid)
void register_cuda_compute_pair (ComputeID c, PatchID pid[], int t[])
void unregister_cuda_compute (ComputeID c)
void cuda_check_remote_progress (void *arg, double)
void cuda_check_local_progress (void *arg, double)
void cuda_check_remote_calc (void *arg, double)
void cuda_check_local_calc (void *arg, double)

Variables

__thread cudaStream_t stream
__thread cudaStream_t stream2
char * devicelist
__thread int usedevicelist
__thread int ignoresharing
__thread int mergegrids
__thread int shared_gpu
__thread int first_pe_sharing_gpu
__thread int next_pe_sharing_gpu
__thread int devicePe
__thread int numPesSharingDevice
__thread int * pesSharingDevice
__thread int gpu_is_mine
__thread ComputeNonbondedCUDA * cudaCompute = 0
__thread ComputeMgr * computeMgr = 0
__thread int2 * exclusionsByAtom
__thread int atomsChanged = 0
__thread int computesChanged = 0
__thread int pairlistsValid = 0
__thread float pairlistTolerance = 0.
__thread int usePairlists = 0
__thread int savePairlists = 0
__thread float plcutoff2 = 0
__thread cudaEvent_t start_calc
__thread cudaEvent_t end_remote_download
__thread cudaEvent_t end_local_download
__thread ResizeArray< patch_pair > * patch_pairs_ptr
__thread ResizeArray< force_list > * force_lists_ptr
__thread int num_atom_records_allocated
__thread atom_param * atom_params
__thread atom * atoms
__thread int num_virials
__thread int num_virials_allocated
__thread float * virials
__thread float * slow_virials
__thread float * energy_gbis
__thread float * intRad0H
__thread float * intRadSH
__thread float * bornRadH
__thread float * dHdrPrefixH
__thread int cuda_timer_count
__thread double cuda_timer_total
__thread double kernel_time
__thread double remote_submit_time
__thread double local_submit_time
__thread int check_remote_count
__thread int check_local_count
__thread int kernel_launch_state = 0


Define Documentation

#define count_limit   1000000
 

Definition at line 1020 of file ComputeNonbondedCUDA.C.

#define CUDA_EVENT_ID_BASE   100
 

Definition at line 117 of file ComputeNonbondedCUDA.C.

#define CUDA_POLL(FN, ARG)   CcdCallFnAfter(FN,ARG,0.1)
 

Definition at line 1012 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

#define CUDA_TRACE_LOCAL(START, END)
 

Value:

do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
       CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)

Definition at line 121 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress().

#define CUDA_TRACE_REMOTE(START, END)
 

Value:

do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
       CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)

Definition at line 118 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress().

#define FORCE_LISTS_REF   ResizeArray<force_list> &force_lists(*force_lists_ptr);
 

Definition at line 714 of file ComputeNonbondedCUDA.C.

 
#define GBISP()
 

Definition at line 1017 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), ComputeNonbondedCUDA::noWork(), and ComputeNonbondedCUDA::recvYieldDevice().

#define PATCH_PAIRS_REF   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
 

Definition at line 713 of file ComputeNonbondedCUDA.C.

#define REGISTER_DEVICE_EVENTS(DEV)
 

Value:

traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

Referenced by cuda_register_user_events().

#define SET_EXCL(EXCL, BASE, DIFF)   (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
 

Referenced by ComputeNonbondedCUDA::build_exclusions().

#define stream2   stream
 

Referenced by cuda_init(), and ComputeNonbondedCUDA::recvYieldDevice().


Function Documentation

void build_cuda_force_table ()
 

Definition at line 354 of file ComputeNonbondedCUDA.C.

References ComputeNonbondedCUDA::build_force_table(), ComputeNonbondedCUDA::build_lj_table(), and devicePe.

Referenced by ComputeMgr::recvBuildCudaForceTable().

00354                               {
00355   if ( devicePe != CkMyPe() ) return;
00356   ComputeNonbondedCUDA::build_lj_table();
00357   ComputeNonbondedCUDA::build_force_table();
00358 }

void cuda_check_local_calc (void *arg, double)
 

Definition at line 1650 of file ComputeNonbondedCUDA.C.

References computeMgr, cuda_check_local_calc(), CUDA_POLL, end_local_download, next_pe_sharing_gpu, and ComputeMgr::sendYieldDevice().

Referenced by cuda_check_local_calc(), and ComputeNonbondedCUDA::recvYieldDevice().

01650                                               {
01651   // in theory we only need end_local_calc, but overlap isn't reliable
01652   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01653   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01654 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01655     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01656 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01657   } else {
01658     CUDA_POLL(cuda_check_local_calc, arg);
01659   }
01660 }

void cuda_check_local_progress (void *arg, double)
 

Definition at line 1049 of file ComputeNonbondedCUDA.C.

References check_local_count, cuda_check_local_progress(), cuda_errcheck(), CUDA_POLL, CUDA_TRACE_LOCAL, end_local_download, kernel_time, local_submit_time, NAMD_bug(), and NAMD_die().

Referenced by cuda_check_local_progress(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

01049                                                   {
01050 
01051   cudaError_t err = cudaEventQuery(end_local_download);
01052   if ( err == cudaSuccess ) {
01053     double wall_time = CkWallTimer();
01054     CUDA_TRACE_LOCAL(local_submit_time,wall_time);
01055     kernel_time = wall_time - kernel_time;
01056     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01057     check_local_count = 0;
01058   } else if ( err != cudaErrorNotReady ) {
01059     cuda_errcheck("in cuda_check_local_progress");
01060     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01061   } else if ( ++check_local_count >= count_limit ) {
01062     char errmsg[256];
01063     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01064             check_local_count, CkWallTimer() - local_submit_time,
01065             ((ComputeNonbondedCUDA *) arg)->step);
01066     cuda_errcheck(errmsg);
01067     NAMD_die(errmsg);
01068   } else if ( check_remote_count ) {
01069     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01070   } else {
01071     CUDA_POLL(cuda_check_local_progress, arg);
01072   }
01073 }

void cuda_check_remote_calc (void *arg, double)
 

Definition at line 1638 of file ComputeNonbondedCUDA.C.

References computeMgr, cuda_check_remote_calc(), CUDA_POLL, end_remote_download, next_pe_sharing_gpu, and ComputeMgr::sendYieldDevice().

Referenced by cuda_check_remote_calc(), and ComputeNonbondedCUDA::recvYieldDevice().

01638                                                {
01639   // in theory we only need end_remote_calc, but overlap isn't reliable
01640   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01641   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01642 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01643     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01644 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01645   } else {
01646     CUDA_POLL(cuda_check_remote_calc, arg);
01647   }
01648 }

void cuda_check_remote_progress (void *arg, double)
 

Definition at line 1024 of file ComputeNonbondedCUDA.C.

References check_remote_count, cuda_check_remote_progress(), cuda_errcheck(), CUDA_POLL, CUDA_TRACE_REMOTE, end_remote_download, local_submit_time, NAMD_bug(), NAMD_die(), and remote_submit_time.

Referenced by cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

01024                                                    {
01025 
01026   cudaError_t err = cudaEventQuery(end_remote_download);
01027   if ( err == cudaSuccess ) {
01028     local_submit_time = CkWallTimer();
01029     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01030     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01031     check_remote_count = 0;
01032   } else if ( err != cudaErrorNotReady ) {
01033     cuda_errcheck("in cuda_check_remote_progress");
01034     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01035   } else if ( ++check_remote_count >= count_limit ) {
01036     char errmsg[256];
01037     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01038             check_remote_count, CkWallTimer() - remote_submit_time,
01039             ((ComputeNonbondedCUDA *) arg)->step);
01040     cuda_errcheck(errmsg);
01041     NAMD_die(errmsg);
01042   } else if ( check_local_count ) {
01043     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01044   } else {
01045     CUDA_POLL(cuda_check_remote_progress, arg);
01046   }
01047 }

int cuda_device_pe ()
 

Definition at line 93 of file ComputeNonbondedCUDA.C.

Referenced by ComputeMgr::createComputes().

00093 { return devicePe; }

bool cuda_device_shared_with_pe (int pe)
 

Definition at line 95 of file ComputeNonbondedCUDA.C.

References pesSharingDevice.

Referenced by ComputeMgr::createComputes().

00095                                         {
00096   for ( int i=0; i<numPesSharingDevice; ++i ) {
00097     if ( pesSharingDevice[i] == pe ) return true;
00098   }
00099   return false;
00100 }

void cuda_die (const char *msg)
 

Definition at line 54 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, and NAMD_die().

Referenced by cuda_initialize().

00054                                {
00055     char host[128];
00056 #ifdef NOHOSTNAME
00057     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059     gethostname(host, 128);  host[127] = 0;
00060 #endif
00061     char devstr[128] = "";
00062     int devnum;
00063     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064       sprintf(devstr, " device %d", devnum);
00065     }
00066     char errmsg[1024];
00067     sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068     NAMD_die(errmsg);
00069 }

void cuda_errcheck (const char *msg)
 

Definition at line 34 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, and NAMD_die().

Referenced by cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_exclusions(), cuda_bind_force_table(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_lj_table(), cuda_bind_patch_pairs(), cuda_bind_virials(), cuda_check_local_progress(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_init(), cuda_initialize(), cuda_nonbonded_forces(), ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::messageFinishWork().

00034                                     {
00035   cudaError_t err;
00036   if ((err = cudaGetLastError()) != cudaSuccess) {
00037     char host[128];
00038 #ifdef NOHOSTNAME
00039     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041     gethostname(host, 128);  host[127] = 0;
00042 #endif
00043     char devstr[128] = "";
00044     int devnum;
00045     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046       sprintf(devstr, " device %d", devnum);
00047     }
00048     char errmsg[1024];
00049     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050     NAMD_die(errmsg);
00051   }
00052 }

void cuda_getargs (char **argv)
 

Definition at line 76 of file ComputeNonbondedCUDA.C.

References devicelist, ignoresharing, mergegrids, and usedevicelist.

Referenced by all_init().

00076                                {
00077   devicelist = 0;
00078   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081   mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }

void cuda_initialize ()
 

Definition at line 150 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, cuda_die(), cuda_errcheck(), cuda_register_user_events(), devicelist, devicePe, first_pe_sharing_gpu, gpu_is_mine, j, NAMD_bug(), NAMD_die(), next_pe_sharing_gpu, numPesSharingDevice, pesSharingDevice, shared_gpu, and sortop_bitreverse().

Referenced by all_init().

00150                        {
00151 
00152   if ( 0 == CkMyPe() ) cuda_register_user_events();
00153 
00154   char host[128];
00155 #ifdef NOHOSTNAME
00156   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00157 #else
00158   gethostname(host, 128);  host[127] = 0;
00159 #endif
00160 
00161   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00162   int myRankInPhysicalNode;
00163   int numPesOnPhysicalNode;
00164   int *pesOnPhysicalNode;
00165   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00166                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00167 
00168   {
00169     int i;
00170     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00171       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00172         i = numPesOnPhysicalNode;
00173         break;
00174       }
00175       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00176     }
00177     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00178       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00179       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00181           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00182       }
00183       myRankInPhysicalNode = 0;
00184       numPesOnPhysicalNode = 1;
00185       pesOnPhysicalNode = new int[1];
00186       pesOnPhysicalNode[0] = CkMyPe();
00187     } else {
00188       myRankInPhysicalNode = i;
00189     }
00190   }
00191   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00192 
00193   int deviceCount = 0;
00194   cudaGetDeviceCount(&deviceCount);
00195   cuda_errcheck("in cudaGetDeviceCount");
00196   if ( deviceCount <= 0 ) {
00197     cuda_die("No CUDA devices found.");
00198   }
00199 
00200   int *devices;
00201   int ndevices = 0;
00202   int nexclusive = 0;
00203   if ( usedevicelist ) {
00204     devices = new int[strlen(devicelist)];
00205     int i = 0;
00206     while ( devicelist[i] ) {
00207       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00208       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00209       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00210     }
00211   } else {
00212     if ( ! CkMyPe() ) {
00213       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00214     }
00215     devices = new int[deviceCount];
00216     for ( int i=0; i<deviceCount; ++i ) {
00217       int dev = i % deviceCount;
00218 #if CUDA_VERSION >= 2020
00219       cudaDeviceProp deviceProp;
00220       cudaGetDeviceProperties(&deviceProp, dev);
00221       cuda_errcheck("in cudaGetDeviceProperties");
00222       if ( deviceProp.computeMode != cudaComputeModeProhibited
00223            && (deviceProp.major > 1 || deviceProp.minor >= 1)
00224            && deviceProp.canMapHostMemory
00225            && deviceProp.multiProcessorCount > 2 ) {  // exclude weak cards
00226         devices[ndevices++] = dev;
00227       }
00228       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00229         ++nexclusive;
00230       }
00231 #else
00232       devices[ndevices++] = dev;
00233 #endif
00234     }
00235   }
00236 
00237   if ( ! ndevices ) {
00238     cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, or otherwise unusable.");
00239   }
00240 
00241   shared_gpu = 0;
00242   gpu_is_mine = 1;
00243   first_pe_sharing_gpu = CkMyPe();
00244   next_pe_sharing_gpu = CkMyPe();
00245 
00246  /* if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) */ {
00247 
00248   int dev;
00249   if ( numPesOnPhysicalNode > 1 ) {
00250     int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00251     dev = devices[myDeviceRank];
00252     devicePe = CkMyPe();
00253     if ( ignoresharing ) {
00254       pesSharingDevice = new int[1];
00255       pesSharingDevice[0] = CkMyPe();
00256       numPesSharingDevice = 1;
00257     } else {
00258       pesSharingDevice = new int[numPesOnPhysicalNode];
00259       devicePe = -1;
00260       numPesSharingDevice = 0;
00261       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00262         if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00263           int thisPe = pesOnPhysicalNode[i];
00264           pesSharingDevice[numPesSharingDevice++] = thisPe;
00265           if ( devicePe < 1 ) devicePe = thisPe;
00266           if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00267         }
00268       }
00269       for ( int j = 0; j < ndevices; ++j ) {
00270         if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00271       }
00272     }
00273     if ( shared_gpu && devicePe == CkMyPe() ) {
00274       CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00275     }
00276   } else {  // in case phys node code is lying
00277     dev = devices[CkMyPe() % ndevices];
00278     devicePe = CkMyPe();
00279     pesSharingDevice = new int[1];
00280     pesSharingDevice[0] = CkMyPe();
00281     numPesSharingDevice = 1;
00282   }
00283 
00284   if ( devicePe != CkMyPe() ) {
00285     CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00286              CkMyPe(), myRankInPhysicalNode, devicePe);
00287     return;
00288   }
00289 
00290   // disable token-passing but don't submit local until remote finished
00291   // if shared_gpu is true, otherwise submit all work immediately
00292   first_pe_sharing_gpu = CkMyPe();
00293   next_pe_sharing_gpu = CkMyPe();
00294 
00295   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00296 
00297   if ( dev >= deviceCount ) {
00298     char buf[256];
00299     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00300                 CkMyPe(), dev, host, deviceCount);
00301     NAMD_die(buf);
00302   }
00303 
00304   cudaError_t err;
00305   cudaDeviceProp deviceProp;
00306   err = cudaGetDeviceProperties(&deviceProp, dev);
00307   if (err == cudaSuccess) {
00308     CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00309              CkMyPe(), myRankInPhysicalNode, dev, host,
00310              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00311              deviceProp.major, deviceProp.minor);
00312 
00313     err = cudaSetDevice(dev);
00314   }
00315   if ( err != cudaSuccess) {
00316     char errmsg[1024];
00317     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00318                         dev, CkMyPe(), cudaGetErrorString(err));
00319     NAMD_die(errmsg);
00320   }
00321 
00322  }  // just let CUDA pick a device for us
00323 
00324   cudaSetDeviceFlags(cudaDeviceMapHost);
00325   cuda_errcheck("in cudaSetDeviceFlags");
00326 
00327   int dev;
00328   if ( cudaGetDevice(&dev) == cudaSuccess ) {
00329     cudaDeviceProp deviceProp;
00330     cudaGetDeviceProperties(&deviceProp, dev);
00331     cuda_errcheck("in cudaGetDeviceProperties");
00332     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00333       cuda_die("device in prohibited mode");
00334     if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00335       cuda_die("device not of compute capability 1.1 or higher");
00336     if ( ! deviceProp.canMapHostMemory )
00337       cuda_die("device cannot map host memory");
00338   }
00339 
00340   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00341   if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00342   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00343   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00344 
00345 }

void cuda_register_user_events ()
 

Definition at line 125 of file ComputeNonbondedCUDA.C.

References REGISTER_DEVICE_EVENTS.

Referenced by cuda_initialize().

00125                                  {
00126 
00127 #define REGISTER_DEVICE_EVENTS(DEV) \
00128   traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00129   traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00130 
00131   REGISTER_DEVICE_EVENTS(0)
00132   REGISTER_DEVICE_EVENTS(1)
00133   REGISTER_DEVICE_EVENTS(2)
00134   REGISTER_DEVICE_EVENTS(3)
00135   REGISTER_DEVICE_EVENTS(4)
00136   REGISTER_DEVICE_EVENTS(5)
00137   REGISTER_DEVICE_EVENTS(6)
00138   REGISTER_DEVICE_EVENTS(7)
00139   REGISTER_DEVICE_EVENTS(8)
00140   REGISTER_DEVICE_EVENTS(9)
00141   REGISTER_DEVICE_EVENTS(10)
00142   REGISTER_DEVICE_EVENTS(11)
00143   REGISTER_DEVICE_EVENTS(12)
00144   REGISTER_DEVICE_EVENTS(13)
00145   REGISTER_DEVICE_EVENTS(14)
00146   REGISTER_DEVICE_EVENTS(15)
00147 
00148 }

void register_cuda_compute_pair (ComputeID c, PatchID pid[], int t[])
 

Definition at line 657 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ComputeNonbondedCUDA::compute_record::c, PatchMap::center(), cudaCompute, ComputeNonbondedCUDA::localComputeRecords, NAMD_bug(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patchMap, ComputeNonbondedCUDA::patchRecords, ComputeNonbondedCUDA::compute_record::pid, ComputeNonbondedCUDA::remoteComputeRecords, ComputeNonbondedCUDA::requirePatch(), Vector::x, Vector::y, and Vector::z.

Referenced by ComputeNonbondedPair::initialize().

00657                                                                      {
00658 
00659   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00660  
00661   cudaCompute->requirePatch(pid[0]);
00662   cudaCompute->requirePatch(pid[1]);
00663 
00664   ComputeNonbondedCUDA::compute_record cr, cr2;
00665   cr.c = c;  cr2.c = c;
00666   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00667   cr2.pid[0] = pid[1];  cr2.pid[1] = pid[0];
00668 
00669   int t1 = t[0];
00670   int t2 = t[1];
00671   Vector offset = cudaCompute->patchMap->center(pid[0])
00672                 - cudaCompute->patchMap->center(pid[1]);
00673   offset.x += (t1%3-1) - (t2%3-1);
00674   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00675   offset.z += (t1/9-1) - (t2/9-1);
00676   cr.offset = offset;
00677   cr2.offset = -1. * offset;
00678     
00679   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00680     cudaCompute->localComputeRecords.add(cr);
00681   } else {
00682     cudaCompute->remoteComputeRecords.add(cr);
00683   }
00684   if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00685     cudaCompute->localComputeRecords.add(cr2);
00686   } else {
00687     cudaCompute->remoteComputeRecords.add(cr2);
00688   }
00689 }

void register_cuda_compute_self (ComputeID c, PatchID pid)
 

Definition at line 640 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ComputeNonbondedCUDA::compute_record::c, cudaCompute, ComputeNonbondedCUDA::localComputeRecords, NAMD_bug(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patchRecords, ComputeNonbondedCUDA::compute_record::pid, ComputeNonbondedCUDA::remoteComputeRecords, and ComputeNonbondedCUDA::requirePatch().

Referenced by ComputeNonbondedSelf::initialize().

00640                                                           {
00641 
00642   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00643 
00644   cudaCompute->requirePatch(pid);
00645 
00646   ComputeNonbondedCUDA::compute_record cr;
00647   cr.c = c;
00648   cr.pid[0] = pid;  cr.pid[1] = pid;
00649   cr.offset = 0.;
00650   if ( cudaCompute->patchRecords[pid].isLocal ) {
00651     cudaCompute->localComputeRecords.add(cr);
00652   } else {
00653     cudaCompute->remoteComputeRecords.add(cr);
00654   }
00655 }

void send_build_cuda_force_table ()
 

Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by The Board of Trustees of the University of Illinois. All rights reserved.

Definition at line 350 of file ComputeNonbondedCUDA.C.

References computeMgr, and ComputeMgr::sendBuildCudaForceTable().

Referenced by ComputeNonbondedUtil::select().

00350                                    {
00351   computeMgr->sendBuildCudaForceTable();
00352 }

bool sortop_bitreverse (int a, int b) [inline, static]
 

Definition at line 102 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

00102                                                    {
00103   if ( a == b ) return 0; 
00104   for ( int bit = 1; bit; bit *= 2 ) {
00105     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106   }
00107   return 0;
00108 }

void unregister_cuda_compute (ComputeID c)
 

Definition at line 691 of file ComputeNonbondedCUDA.C.

References NAMD_bug().

00691                                           {  // static
00692 
00693   NAMD_bug("unregister_compute unimplemented");
00694 
00695 }


Variable Documentation

__thread atom_param* atom_params [static]
 

Definition at line 988 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread atom* atoms [static]
 

Definition at line 989 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int atomsChanged = 0 [static]
 

Definition at line 697 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::atomUpdate(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::finishWork().

__thread float* bornRadH [static]
 

Definition at line 1002 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int check_local_count [static]
 

Definition at line 1022 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress().

__thread int check_remote_count [static]
 

Definition at line 1021 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress().

__thread ComputeMgr* computeMgr = 0 [static]
 

Definition at line 348 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::assignPatches(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_local_calc(), cuda_check_remote_calc(), ComputeNonbondedCUDA::messageFinishWork(), ComputeNonbondedCUDA::noWork(), and send_build_cuda_force_table().

__thread int computesChanged = 0 [static]
 

Definition at line 698 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::requirePatch().

__thread int cuda_timer_count [static]
 

Definition at line 1006 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::finishWork().

__thread double cuda_timer_total [static]
 

Definition at line 1007 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::finishWork().

__thread ComputeNonbondedCUDA* cudaCompute = 0 [static]
 

Definition at line 347 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), register_cuda_compute_pair(), and register_cuda_compute_self().

char* devicelist
 

Definition at line 71 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs(), and cuda_initialize().

__thread int devicePe [static]
 

Definition at line 87 of file ComputeNonbondedCUDA.C.

Referenced by build_cuda_force_table(), and cuda_initialize().

__thread float* dHdrPrefixH [static]
 

Definition at line 1004 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaEvent_t end_local_download [static]
 

Definition at line 709 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_local_calc(), cuda_check_local_progress(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaEvent_t end_remote_download [static]
 

Definition at line 708 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_remote_calc(), cuda_check_remote_progress(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* energy_gbis [static]
 

Definition at line 996 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int2* exclusionsByAtom [static]
 

Definition at line 525 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::build_exclusions(), and ComputeNonbondedCUDA::doWork().

__thread int first_pe_sharing_gpu [static]
 

Definition at line 85 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

__thread ResizeArray<force_list>* force_lists_ptr [static]
 

Definition at line 712 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

__thread int gpu_is_mine [static]
 

Definition at line 91 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int ignoresharing [static]
 

Definition at line 73 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs().

__thread float* intRad0H [static]
 

Definition at line 999 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* intRadSH [static]
 

Definition at line 1000 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int kernel_launch_state = 0 [static]
 

Definition at line 1089 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread double kernel_time [static]
 

Definition at line 1008 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress(), and ComputeNonbondedCUDA::doWork().

__thread double local_submit_time [static]
 

Definition at line 1010 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress(), and cuda_check_remote_progress().

__thread int mergegrids [static]
 

Definition at line 74 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs(), and ComputeNonbondedCUDA::finishWork().

__thread int next_pe_sharing_gpu [static]
 

Definition at line 86 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_calc(), cuda_check_remote_calc(), and cuda_initialize().

__thread int num_atom_records_allocated [static]
 

Definition at line 986 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int num_virials [static]
 

Definition at line 992 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int num_virials_allocated [static]
 

Definition at line 993 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int numPesSharingDevice [static]
 

Definition at line 88 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

__thread int pairlistsValid = 0 [static]
 

Definition at line 700 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread float pairlistTolerance = 0. [static]
 

Definition at line 701 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread ResizeArray<patch_pair>* patch_pairs_ptr [static]
 

Definition at line 711 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

__thread int* pesSharingDevice [static]
 

Definition at line 89 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::assignPatches(), cuda_device_shared_with_pe(), and cuda_initialize().

__thread float plcutoff2 = 0 [static]
 

Definition at line 704 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread double remote_submit_time [static]
 

Definition at line 1009 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int savePairlists = 0 [static]
 

Definition at line 703 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int shared_gpu [static]
 

Definition at line 84 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* slow_virials [static]
 

Definition at line 995 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::finishWork().

__thread cudaEvent_t start_calc [static]
 

Definition at line 707 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaStream_t stream
 

Definition at line 148 of file ComputeNonbondedCUDAKernel.cu.

Referenced by cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_intRad(), cuda_init(), cuda_nonbonded_forces(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaStream_t stream2
 

Definition at line 149 of file ComputeNonbondedCUDAKernel.cu.

__thread int usedevicelist [static]
 

Definition at line 72 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs().

__thread int usePairlists = 0 [static]
 

Definition at line 702 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread float* virials [static]
 

Definition at line 994 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().


Generated on Fri May 25 04:07:18 2012 for NAMD by  doxygen 1.3.9.1