Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Namespace Members | Class Members | File Members

ComputeNonbondedCUDA.C File Reference

#include "common.h"
#include "charm++.h"
#include <cuda_runtime.h>
#include <cuda.h>
#include "WorkDistrib.h"
#include "ComputeMgr.h"
#include "ProxyMgr.h"
#include "ComputeNonbondedCUDA.h"
#include "ComputeNonbondedCUDAKernel.h"
#include "LJTable.h"
#include "ObjectArena.h"
#include "SortAtoms.h"
#include <algorithm>
#include "NamdTypes.h"

Go to the source code of this file.

Classes

struct  exlist_sortop
struct  cr_sortop

Defines

#define CUDA_EVENT_ID_POLL_REMOTE   98
#define CUDA_TRACE_POLL_REMOTE   traceUserEvent(CUDA_EVENT_ID_POLL_REMOTE)
#define CUDA_EVENT_ID_POLL_LOCAL   99
#define CUDA_TRACE_POLL_LOCAL   traceUserEvent(CUDA_EVENT_ID_POLL_LOCAL)
#define CUDA_EVENT_ID_BASE   100
#define CUDA_TRACE_REMOTE(START, END)
#define CUDA_TRACE_LOCAL(START, END)
#define REGISTER_DEVICE_EVENTS(DEV)
#define SET_EXCL(EXCL, BASE, DIFF)   (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
#define PATCH_PAIRS_REF   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
#define FORCE_LISTS_REF   ResizeArray<force_list> &force_lists(*force_lists_ptr);
#define CUDA_POLL(FN, ARG)   CcdCallFnAfter(FN,ARG,0.1)
#define GBISP()
#define count_limit   1000000
#define stream2   stream

Functions

void cuda_errcheck (const char *msg)
void cuda_die (const char *msg)
void cuda_getargs (char **argv)
int cuda_device_pe ()
bool cuda_device_shared_with_pe (int pe)
bool sortop_bitreverse (int a, int b)
void cuda_register_user_events ()
void cuda_initialize ()
void send_build_cuda_force_table ()
void build_cuda_force_table ()
void register_cuda_compute_self (ComputeID c, PatchID pid)
void register_cuda_compute_pair (ComputeID c, PatchID pid[], int t[])
void unregister_cuda_compute (ComputeID c)
void cuda_check_remote_progress (void *arg, double walltime)
void cuda_check_local_progress (void *arg, double walltime)
void cuda_check_remote_calc (void *arg, double)
void cuda_check_local_calc (void *arg, double)

Variables

__thread cudaStream_t stream
__thread cudaStream_t stream2
char * devicelist
__thread int usedevicelist
__thread int ignoresharing
__thread int mergegrids
__thread int shared_gpu
__thread int first_pe_sharing_gpu
__thread int next_pe_sharing_gpu
__thread int devicePe
__thread int numPesSharingDevice
__thread int * pesSharingDevice
__thread int gpu_is_mine
__thread ComputeNonbondedCUDA * cudaCompute = 0
__thread ComputeMgr * computeMgr = 0
__thread int2 * exclusionsByAtom
__thread int atomsChanged = 0
__thread int computesChanged = 0
__thread int pairlistsValid = 0
__thread float pairlistTolerance = 0.
__thread int usePairlists = 0
__thread int savePairlists = 0
__thread float plcutoff2 = 0
__thread cudaEvent_t start_calc
__thread cudaEvent_t end_remote_download
__thread cudaEvent_t end_local_download
__thread ResizeArray< patch_pair > * patch_pairs_ptr
__thread ResizeArray< force_list > * force_lists_ptr
__thread int num_atom_records_allocated
__thread atom_param * atom_params
__thread atom * atoms
__thread int num_virials
__thread int num_virials_allocated
__thread float * virials
__thread float * slow_virials
__thread float * energy_gbis
__thread float * intRad0H
__thread float * intRadSH
__thread float * bornRadH
__thread float * dHdrPrefixH
__thread int cuda_timer_count
__thread double cuda_timer_total
__thread double kernel_time
__thread double remote_submit_time
__thread double local_submit_time
__thread int check_remote_count
__thread int check_local_count
__thread int kernel_launch_state = 0


Define Documentation

#define count_limit   1000000
 

Definition at line 1032 of file ComputeNonbondedCUDA.C.

#define CUDA_EVENT_ID_BASE   100
 

Definition at line 123 of file ComputeNonbondedCUDA.C.

#define CUDA_EVENT_ID_POLL_LOCAL   99
 

Definition at line 120 of file ComputeNonbondedCUDA.C.

Referenced by cuda_register_user_events().

#define CUDA_EVENT_ID_POLL_REMOTE   98
 

Definition at line 117 of file ComputeNonbondedCUDA.C.

Referenced by cuda_register_user_events().

#define CUDA_POLL(FN, ARG)   CcdCallFnAfter(FN,ARG,0.1)
 

Definition at line 1024 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

#define CUDA_TRACE_LOCAL(START, END)
 

Value:

do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
       CUDA_EVENT_ID_BASE + 2 * dev + 1, START, END); } while (0)

Definition at line 127 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress().

#define CUDA_TRACE_POLL_LOCAL   traceUserEvent(CUDA_EVENT_ID_POLL_LOCAL)
 

Definition at line 121 of file ComputeNonbondedCUDA.C.

#define CUDA_TRACE_POLL_REMOTE   traceUserEvent(CUDA_EVENT_ID_POLL_REMOTE)
 

Definition at line 118 of file ComputeNonbondedCUDA.C.

#define CUDA_TRACE_REMOTE(START, END)
 

Value:

do { int dev; cudaGetDevice(&dev); traceUserBracketEvent( \
       CUDA_EVENT_ID_BASE + 2 * dev, START, END); } while (0)

Definition at line 124 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress().

#define FORCE_LISTS_REF   ResizeArray<force_list> &force_lists(*force_lists_ptr);
 

Definition at line 723 of file ComputeNonbondedCUDA.C.

 
#define GBISP()
 

Definition at line 1029 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), ComputeNonbondedCUDA::noWork(), and ComputeNonbondedCUDA::recvYieldDevice().

#define PATCH_PAIRS_REF   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
 

Definition at line 722 of file ComputeNonbondedCUDA.C.

#define REGISTER_DEVICE_EVENTS(DEV)
 

Value:

traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

Referenced by cuda_register_user_events().

#define SET_EXCL(EXCL, BASE, DIFF)   (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
 

Referenced by ComputeNonbondedCUDA::build_exclusions().

#define stream2   stream
 

Referenced by cuda_init(), and ComputeNonbondedCUDA::recvYieldDevice().


Function Documentation

void build_cuda_force_table ()
 

Definition at line 363 of file ComputeNonbondedCUDA.C.

References ComputeNonbondedCUDA::build_force_table(), ComputeNonbondedCUDA::build_lj_table(), and devicePe.

Referenced by ComputeMgr::recvBuildCudaForceTable().

00363                               {
00364   if ( devicePe != CkMyPe() ) return;
00365   ComputeNonbondedCUDA::build_lj_table();
00366   ComputeNonbondedCUDA::build_force_table();
00367 }

void cuda_check_local_calc (void *arg, double)
 

Definition at line 1666 of file ComputeNonbondedCUDA.C.

References computeMgr, cuda_check_local_calc(), CUDA_POLL, end_local_download, next_pe_sharing_gpu, and ComputeMgr::sendYieldDevice().

Referenced by cuda_check_local_calc(), and ComputeNonbondedCUDA::recvYieldDevice().

01666                                                              {
01667   // in theory we only need end_local_calc, but overlap isn't reliable
01668   // if ( cudaEventQuery(end_local_calc) == cudaSuccess ) {
01669   if ( cudaEventQuery(end_local_download) == cudaSuccess ) {
01670 // CkPrintf("Pe %d yielding to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01671     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01672 // CkPrintf("Pe %d yielded to %d after local calc\n", CkMyPe(), next_pe_sharing_gpu);
01673   } else {
01674     CUDA_POLL(cuda_check_local_calc, arg);
01675   }
01676 }

void cuda_check_local_progress (void *arg, double walltime)
 

Definition at line 1065 of file ComputeNonbondedCUDA.C.

References check_local_count, cuda_check_local_progress(), cuda_errcheck(), CUDA_POLL, CUDA_TRACE_LOCAL, end_local_download, kernel_time, local_submit_time, NAMD_bug(), and NAMD_die().

Referenced by cuda_check_local_progress(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

01065                                                            {
01066 
01067   CUDA_TRACE_POLL_LOCAL;
01068   cudaError_t err = cudaEventQuery(end_local_download);
01069   if ( err == cudaSuccess ) {
01070     CUDA_TRACE_LOCAL(local_submit_time,walltime);
01071     kernel_time = walltime - kernel_time;
01072     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01073     check_local_count = 0;
01074   } else if ( err != cudaErrorNotReady ) {
01075     cuda_errcheck("in cuda_check_local_progress");
01076     NAMD_bug("cuda_errcheck missed error in cuda_check_local_progress");
01077   } else if ( ++check_local_count >= count_limit ) {
01078     char errmsg[256];
01079     sprintf(errmsg,"cuda_check_local_progress polled %d times over %f s on step %d",
01080             check_local_count, walltime - local_submit_time,
01081             ((ComputeNonbondedCUDA *) arg)->step);
01082     cuda_errcheck(errmsg);
01083     NAMD_die(errmsg);
01084   } else if ( check_remote_count ) {
01085     NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
01086   } else {
01087     CUDA_POLL(cuda_check_local_progress, arg);
01088   }
01089 }

void cuda_check_remote_calc (void *arg, double)
 

Definition at line 1654 of file ComputeNonbondedCUDA.C.

References computeMgr, cuda_check_remote_calc(), CUDA_POLL, end_remote_download, next_pe_sharing_gpu, and ComputeMgr::sendYieldDevice().

Referenced by cuda_check_remote_calc(), and ComputeNonbondedCUDA::recvYieldDevice().

01654                                                               {
01655   // in theory we only need end_remote_calc, but overlap isn't reliable
01656   // if ( cudaEventQuery(end_remote_calc) == cudaSuccess ) {
01657   if ( cudaEventQuery(end_remote_download) == cudaSuccess ) {
01658 // CkPrintf("Pe %d yielding to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01659     computeMgr->sendYieldDevice(next_pe_sharing_gpu);
01660 // CkPrintf("Pe %d yielded to %d after remote calc\n", CkMyPe(), next_pe_sharing_gpu);
01661   } else {
01662     CUDA_POLL(cuda_check_remote_calc, arg);
01663   }
01664 }

void cuda_check_remote_progress (void *arg, double walltime)
 

Definition at line 1036 of file ComputeNonbondedCUDA.C.

References check_remote_count, cuda_check_remote_progress(), cuda_errcheck(), CUDA_POLL, CUDA_TRACE_REMOTE, end_remote_download, kernel_time, local_submit_time, NAMD_bug(), NAMD_die(), and remote_submit_time.

Referenced by cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

01036                                                             {
01037 
01038   CUDA_TRACE_POLL_REMOTE;
01039   cudaError_t err = cudaEventQuery(end_remote_download);
01040   if ( err == cudaSuccess ) {
01041     local_submit_time = walltime;
01042     CUDA_TRACE_REMOTE(remote_submit_time,local_submit_time);
01043     if ( mergegrids ) {  // no local
01044       kernel_time = local_submit_time - kernel_time;
01045     }
01046     ((ComputeNonbondedCUDA *) arg)->messageFinishWork();
01047     check_remote_count = 0;
01048   } else if ( err != cudaErrorNotReady ) {
01049     cuda_errcheck("in cuda_check_remote_progress");
01050     NAMD_bug("cuda_errcheck missed error in cuda_check_remote_progress");
01051   } else if ( ++check_remote_count >= count_limit ) {
01052     char errmsg[256];
01053     sprintf(errmsg,"cuda_check_remote_progress polled %d times over %f s on step %d",
01054             check_remote_count, walltime - remote_submit_time,
01055             ((ComputeNonbondedCUDA *) arg)->step);
01056     cuda_errcheck(errmsg);
01057     NAMD_die(errmsg);
01058   } else if ( check_local_count ) {
01059     NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
01060   } else {
01061     CUDA_POLL(cuda_check_remote_progress, arg);
01062   }
01063 }

int cuda_device_pe ()
 

Definition at line 93 of file ComputeNonbondedCUDA.C.

Referenced by ComputeMgr::createComputes().

00093 { return devicePe; }

bool cuda_device_shared_with_pe (int pe)
 

Definition at line 95 of file ComputeNonbondedCUDA.C.

References pesSharingDevice.

Referenced by ComputeMgr::createComputes().

00095                                         {
00096   for ( int i=0; i<numPesSharingDevice; ++i ) {
00097     if ( pesSharingDevice[i] == pe ) return true;
00098   }
00099   return false;
00100 }

void cuda_die (const char *msg)
 

Definition at line 54 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, and NAMD_die().

Referenced by cuda_initialize().

00054                                {
00055     char host[128];
00056 #ifdef NOHOSTNAME
00057     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00058 #else
00059     gethostname(host, 128);  host[127] = 0;
00060 #endif
00061     char devstr[128] = "";
00062     int devnum;
00063     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00064       sprintf(devstr, " device %d", devnum);
00065     }
00066     char errmsg[1024];
00067     sprintf(errmsg,"CUDA error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
00068     NAMD_die(errmsg);
00069 }

void cuda_errcheck (const char *msg)
 

Definition at line 34 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, and NAMD_die().

Referenced by cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_exclusions(), cuda_bind_force_table(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_lj_table(), cuda_bind_patch_pairs(), cuda_bind_virials(), cuda_check_local_progress(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_init(), cuda_initialize(), cuda_nonbonded_forces(), ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::messageFinishWork().

00034                                     {
00035   cudaError_t err;
00036   if ((err = cudaGetLastError()) != cudaSuccess) {
00037     char host[128];
00038 #ifdef NOHOSTNAME
00039     sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00040 #else
00041     gethostname(host, 128);  host[127] = 0;
00042 #endif
00043     char devstr[128] = "";
00044     int devnum;
00045     if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00046       sprintf(devstr, " device %d", devnum);
00047     }
00048     char errmsg[1024];
00049     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
00050     NAMD_die(errmsg);
00051   }
00052 }

void cuda_getargs (char **argv)
 

Definition at line 76 of file ComputeNonbondedCUDA.C.

References devicelist, ignoresharing, mergegrids, and usedevicelist.

Referenced by all_init().

00076                                {
00077   devicelist = 0;
00078   usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
00079         "comma-delimited list of CUDA device numbers such as 0,2,1,2");
00080   ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
00081   mergegrids = CmiGetArgFlag(argv, "+mergegrids");
00082 }

void cuda_initialize ()
 

Definition at line 159 of file ComputeNonbondedCUDA.C.

References CmiPhysicalNodeID, cuda_die(), cuda_errcheck(), cuda_register_user_events(), devicelist, devicePe, first_pe_sharing_gpu, gpu_is_mine, j, NAMD_bug(), NAMD_die(), next_pe_sharing_gpu, numPesSharingDevice, pesSharingDevice, shared_gpu, and sortop_bitreverse().

Referenced by all_init().

00159                        {
00160 
00161   if ( 0 == CkMyPe() ) cuda_register_user_events();
00162 
00163   char host[128];
00164 #ifdef NOHOSTNAME
00165   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00166 #else
00167   gethostname(host, 128);  host[127] = 0;
00168 #endif
00169 
00170   int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
00171   int myRankInPhysicalNode;
00172   int numPesOnPhysicalNode;
00173   int *pesOnPhysicalNode;
00174   CmiGetPesOnPhysicalNode(myPhysicalNodeID,
00175                            &pesOnPhysicalNode,&numPesOnPhysicalNode);
00176 
00177   {
00178     int i;
00179     for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00180       if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
00181         i = numPesOnPhysicalNode;
00182         break;
00183       }
00184       if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
00185     }
00186     if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
00187       CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
00188       for ( i=0; i < numPesOnPhysicalNode; ++i ) {
00189         CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
00190           i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
00191       }
00192       myRankInPhysicalNode = 0;
00193       numPesOnPhysicalNode = 1;
00194       pesOnPhysicalNode = new int[1];
00195       pesOnPhysicalNode[0] = CkMyPe();
00196     } else {
00197       myRankInPhysicalNode = i;
00198     }
00199   }
00200   // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
00201 
00202   int deviceCount = 0;
00203   cudaGetDeviceCount(&deviceCount);
00204   cuda_errcheck("in cudaGetDeviceCount");
00205   if ( deviceCount <= 0 ) {
00206     cuda_die("No CUDA devices found.");
00207   }
00208 
00209   int *devices;
00210   int ndevices = 0;
00211   int nexclusive = 0;
00212   if ( usedevicelist ) {
00213     devices = new int[strlen(devicelist)];
00214     int i = 0;
00215     while ( devicelist[i] ) {
00216       ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
00217       while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
00218       while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
00219     }
00220   } else {
00221     if ( ! CkMyPe() ) {
00222       CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
00223     }
00224     devices = new int[deviceCount];
00225     for ( int i=0; i<deviceCount; ++i ) {
00226       int dev = i % deviceCount;
00227 #if CUDA_VERSION >= 2020
00228       cudaDeviceProp deviceProp;
00229       cudaGetDeviceProperties(&deviceProp, dev);
00230       cuda_errcheck("in cudaGetDeviceProperties");
00231       if ( deviceProp.computeMode != cudaComputeModeProhibited
00232            && (deviceProp.major > 1 || deviceProp.minor >= 1)
00233            && deviceProp.canMapHostMemory
00234            && deviceProp.multiProcessorCount > 2 ) {  // exclude weak cards
00235         devices[ndevices++] = dev;
00236       }
00237       if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
00238         ++nexclusive;
00239       }
00240 #else
00241       devices[ndevices++] = dev;
00242 #endif
00243     }
00244   }
00245 
00246   if ( ! ndevices ) {
00247     cuda_die("All CUDA devices are in prohibited mode, of compute capability 1.0, unable to map host memory, too small, or otherwise unusable.");
00248   }
00249 
00250   shared_gpu = 0;
00251   gpu_is_mine = 1;
00252   first_pe_sharing_gpu = CkMyPe();
00253   next_pe_sharing_gpu = CkMyPe();
00254 
00255  /* if ( (ndevices >= numPesOnPhysicalNode) || (nexclusive == 0) ) */ {
00256 
00257   int dev;
00258   if ( numPesOnPhysicalNode > 1 ) {
00259     int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
00260     dev = devices[myDeviceRank];
00261     devicePe = CkMyPe();
00262     if ( ignoresharing ) {
00263       pesSharingDevice = new int[1];
00264       pesSharingDevice[0] = CkMyPe();
00265       numPesSharingDevice = 1;
00266     } else {
00267       pesSharingDevice = new int[numPesOnPhysicalNode];
00268       devicePe = -1;
00269       numPesSharingDevice = 0;
00270       for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
00271         if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
00272           int thisPe = pesOnPhysicalNode[i];
00273           pesSharingDevice[numPesSharingDevice++] = thisPe;
00274           if ( devicePe < 1 ) devicePe = thisPe;
00275           if ( sortop_bitreverse(thisPe,devicePe) ) devicePe = thisPe;
00276         }
00277       }
00278       for ( int j = 0; j < ndevices; ++j ) {
00279         if ( devices[j] == dev && j != myDeviceRank ) shared_gpu = 1;
00280       }
00281     }
00282     if ( shared_gpu && devicePe == CkMyPe() ) {
00283       CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
00284     }
00285   } else {  // in case phys node code is lying
00286     dev = devices[CkMyPe() % ndevices];
00287     devicePe = CkMyPe();
00288     pesSharingDevice = new int[1];
00289     pesSharingDevice[0] = CkMyPe();
00290     numPesSharingDevice = 1;
00291   }
00292 
00293   if ( devicePe != CkMyPe() ) {
00294     CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
00295              CkMyPe(), myRankInPhysicalNode, devicePe);
00296     return;
00297   }
00298 
00299   // disable token-passing but don't submit local until remote finished
00300   // if shared_gpu is true, otherwise submit all work immediately
00301   first_pe_sharing_gpu = CkMyPe();
00302   next_pe_sharing_gpu = CkMyPe();
00303 
00304   gpu_is_mine = ( first_pe_sharing_gpu == CkMyPe() ); 
00305 
00306   if ( dev >= deviceCount ) {
00307     char buf[256];
00308     sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
00309                 CkMyPe(), dev, host, deviceCount);
00310     NAMD_die(buf);
00311   }
00312 
00313   cudaError_t err;
00314   cudaDeviceProp deviceProp;
00315   err = cudaGetDeviceProperties(&deviceProp, dev);
00316   if (err == cudaSuccess) {
00317     CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s'  Mem: %dMB  Rev: %d.%d\n",
00318              CkMyPe(), myRankInPhysicalNode, dev, host,
00319              deviceProp.name, deviceProp.totalGlobalMem / (1024*1024),
00320              deviceProp.major, deviceProp.minor);
00321 
00322     err = cudaSetDevice(dev);
00323   }
00324   if ( err != cudaSuccess) {
00325     char errmsg[1024];
00326     sprintf(errmsg,"CUDA error binding to device %d on pe %d: %s",
00327                         dev, CkMyPe(), cudaGetErrorString(err));
00328     NAMD_die(errmsg);
00329   }
00330 
00331  }  // just let CUDA pick a device for us
00332 
00333   cudaSetDeviceFlags(cudaDeviceMapHost);
00334   cuda_errcheck("in cudaSetDeviceFlags");
00335 
00336   int dev;
00337   if ( cudaGetDevice(&dev) == cudaSuccess ) {
00338     cudaDeviceProp deviceProp;
00339     cudaGetDeviceProperties(&deviceProp, dev);
00340     cuda_errcheck("in cudaGetDeviceProperties");
00341     if ( deviceProp.computeMode == cudaComputeModeProhibited )
00342       cuda_die("device in prohibited mode");
00343     if ( deviceProp.major < 2 && deviceProp.minor < 1 )
00344       cuda_die("device not of compute capability 1.1 or higher");
00345     if ( ! deviceProp.canMapHostMemory )
00346       cuda_die("device cannot map host memory");
00347   }
00348 
00349   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00350   if ( sizeof(force_list) & 15 ) NAMD_bug("sizeof(force_list) % 16 != 0");
00351   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00352   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00353 
00354 }

void cuda_register_user_events ()
 

Definition at line 131 of file ComputeNonbondedCUDA.C.

References CUDA_EVENT_ID_POLL_LOCAL, CUDA_EVENT_ID_POLL_REMOTE, and REGISTER_DEVICE_EVENTS.

Referenced by cuda_initialize().

00131                                  {
00132 
00133   traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
00134   traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);
00135 
00136 #define REGISTER_DEVICE_EVENTS(DEV) \
00137   traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
00138   traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
00139 
00140   REGISTER_DEVICE_EVENTS(0)
00141   REGISTER_DEVICE_EVENTS(1)
00142   REGISTER_DEVICE_EVENTS(2)
00143   REGISTER_DEVICE_EVENTS(3)
00144   REGISTER_DEVICE_EVENTS(4)
00145   REGISTER_DEVICE_EVENTS(5)
00146   REGISTER_DEVICE_EVENTS(6)
00147   REGISTER_DEVICE_EVENTS(7)
00148   REGISTER_DEVICE_EVENTS(8)
00149   REGISTER_DEVICE_EVENTS(9)
00150   REGISTER_DEVICE_EVENTS(10)
00151   REGISTER_DEVICE_EVENTS(11)
00152   REGISTER_DEVICE_EVENTS(12)
00153   REGISTER_DEVICE_EVENTS(13)
00154   REGISTER_DEVICE_EVENTS(14)
00155   REGISTER_DEVICE_EVENTS(15)
00156 
00157 }

void register_cuda_compute_pair (ComputeID c, PatchID pid[], int t[])
 

Definition at line 666 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ComputeNonbondedCUDA::compute_record::c, PatchMap::center(), cudaCompute, ComputeNonbondedCUDA::localComputeRecords, NAMD_bug(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patchMap, ComputeNonbondedCUDA::patchRecords, ComputeNonbondedCUDA::compute_record::pid, ComputeNonbondedCUDA::remoteComputeRecords, ComputeNonbondedCUDA::requirePatch(), Vector::x, Vector::y, and Vector::z.

Referenced by ComputeNonbondedPair::initialize().

00666                                                                      {
00667 
00668   if ( ! cudaCompute ) NAMD_bug("register_pair called early");
00669  
00670   cudaCompute->requirePatch(pid[0]);
00671   cudaCompute->requirePatch(pid[1]);
00672 
00673   ComputeNonbondedCUDA::compute_record cr, cr2;
00674   cr.c = c;  cr2.c = c;
00675   cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
00676   cr2.pid[0] = pid[1];  cr2.pid[1] = pid[0];
00677 
00678   int t1 = t[0];
00679   int t2 = t[1];
00680   Vector offset = cudaCompute->patchMap->center(pid[0])
00681                 - cudaCompute->patchMap->center(pid[1]);
00682   offset.x += (t1%3-1) - (t2%3-1);
00683   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00684   offset.z += (t1/9-1) - (t2/9-1);
00685   cr.offset = offset;
00686   cr2.offset = -1. * offset;
00687     
00688   if ( cudaCompute->patchRecords[pid[0]].isLocal ) {
00689     cudaCompute->localComputeRecords.add(cr);
00690   } else {
00691     cudaCompute->remoteComputeRecords.add(cr);
00692   }
00693   if ( cudaCompute->patchRecords[pid[1]].isLocal ) {
00694     cudaCompute->localComputeRecords.add(cr2);
00695   } else {
00696     cudaCompute->remoteComputeRecords.add(cr2);
00697   }
00698 }

void register_cuda_compute_self (ComputeID c, PatchID pid)
 

Definition at line 649 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ComputeNonbondedCUDA::compute_record::c, cudaCompute, ComputeNonbondedCUDA::localComputeRecords, NAMD_bug(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patchRecords, ComputeNonbondedCUDA::compute_record::pid, ComputeNonbondedCUDA::remoteComputeRecords, and ComputeNonbondedCUDA::requirePatch().

Referenced by ComputeNonbondedSelf::initialize().

00649                                                           {
00650 
00651   if ( ! cudaCompute ) NAMD_bug("register_self called early");
00652 
00653   cudaCompute->requirePatch(pid);
00654 
00655   ComputeNonbondedCUDA::compute_record cr;
00656   cr.c = c;
00657   cr.pid[0] = pid;  cr.pid[1] = pid;
00658   cr.offset = 0.;
00659   if ( cudaCompute->patchRecords[pid].isLocal ) {
00660     cudaCompute->localComputeRecords.add(cr);
00661   } else {
00662     cudaCompute->remoteComputeRecords.add(cr);
00663   }
00664 }

void send_build_cuda_force_table ()
 

Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by The Board of Trustees of the University of Illinois. All rights reserved.

Definition at line 359 of file ComputeNonbondedCUDA.C.

References computeMgr, and ComputeMgr::sendBuildCudaForceTable().

Referenced by ComputeNonbondedUtil::select().

00359                                    {
00360   computeMgr->sendBuildCudaForceTable();
00361 }

bool sortop_bitreverse (int a, int b) [inline, static]
 

Definition at line 102 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

00102                                                    {
00103   if ( a == b ) return 0; 
00104   for ( int bit = 1; bit; bit *= 2 ) {
00105     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
00106   }
00107   return 0;
00108 }

void unregister_cuda_compute ( ComputeID  c )
 

Definition at line 700 of file ComputeNonbondedCUDA.C.

References NAMD_bug().

00700                                           {  // static
00701 
00702   NAMD_bug("unregister_compute unimplemented");
00703 
00704 }


Variable Documentation

__thread atom_param* atom_params [static]
 

Definition at line 1000 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread atom* atoms [static]
 

Definition at line 1001 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int atomsChanged = 0 [static]
 

Definition at line 706 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::atomUpdate(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::finishWork().

__thread float* bornRadH [static]
 

Definition at line 1014 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int check_local_count [static]
 

Definition at line 1034 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress().

__thread int check_remote_count [static]
 

Definition at line 1033 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress().

__thread ComputeMgr* computeMgr = 0 [static]
 

Definition at line 357 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::assignPatches(), ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_local_calc(), cuda_check_remote_calc(), ComputeNonbondedCUDA::messageFinishWork(), ComputeNonbondedCUDA::noWork(), and send_build_cuda_force_table().

__thread int computesChanged = 0 [static]
 

Definition at line 707 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::requirePatch().

__thread int cuda_timer_count [static]
 

Definition at line 1018 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::finishWork().

__thread double cuda_timer_total [static]
 

Definition at line 1019 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::finishWork().

__thread ComputeNonbondedCUDA* cudaCompute = 0 [static]
 

Definition at line 356 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::doWork(), register_cuda_compute_pair(), and register_cuda_compute_self().

char* devicelist
 

Definition at line 71 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs(), and cuda_initialize().

__thread int devicePe [static]
 

Definition at line 87 of file ComputeNonbondedCUDA.C.

Referenced by build_cuda_force_table(), and cuda_initialize().

__thread float* dHdrPrefixH [static]
 

Definition at line 1016 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaEvent_t end_local_download [static]
 

Definition at line 718 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_local_calc(), cuda_check_local_progress(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaEvent_t end_remote_download [static]
 

Definition at line 717 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), cuda_check_remote_calc(), cuda_check_remote_progress(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* energy_gbis [static]
 

Definition at line 1008 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int2* exclusionsByAtom [static]
 

Definition at line 534 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::build_exclusions(), and ComputeNonbondedCUDA::doWork().

__thread int first_pe_sharing_gpu [static]
 

Definition at line 85 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

__thread ResizeArray<force_list>* force_lists_ptr [static]
 

Definition at line 721 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

__thread int gpu_is_mine [static]
 

Definition at line 91 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int ignoresharing [static]
 

Definition at line 73 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs().

__thread float* intRad0H [static]
 

Definition at line 1011 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* intRadSH [static]
 

Definition at line 1012 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int kernel_launch_state = 0 [static]
 

Definition at line 1105 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread double kernel_time [static]
 

Definition at line 1020 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress(), cuda_check_remote_progress(), and ComputeNonbondedCUDA::doWork().

__thread double local_submit_time [static]
 

Definition at line 1022 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_progress(), and cuda_check_remote_progress().

__thread int mergegrids [static]
 

Definition at line 74 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs(), and ComputeNonbondedCUDA::finishWork().

__thread int next_pe_sharing_gpu [static]
 

Definition at line 86 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_local_calc(), cuda_check_remote_calc(), and cuda_initialize().

__thread int num_atom_records_allocated [static]
 

Definition at line 998 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int num_virials [static]
 

Definition at line 1004 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int num_virials_allocated [static]
 

Definition at line 1005 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int numPesSharingDevice [static]
 

Definition at line 88 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize().

__thread int pairlistsValid = 0 [static]
 

Definition at line 709 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread float pairlistTolerance = 0. [static]
 

Definition at line 710 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread ResizeArray<patch_pair>* patch_pairs_ptr [static]
 

Definition at line 720 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

__thread int* pesSharingDevice [static]
 

Definition at line 89 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::assignPatches(), cuda_device_shared_with_pe(), and cuda_initialize().

__thread float plcutoff2 = 0 [static]
 

Definition at line 713 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread double remote_submit_time [static]
 

Definition at line 1021 of file ComputeNonbondedCUDA.C.

Referenced by cuda_check_remote_progress(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread int savePairlists = 0 [static]
 

Definition at line 712 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread int shared_gpu [static]
 

Definition at line 84 of file ComputeNonbondedCUDA.C.

Referenced by cuda_initialize(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread float* slow_virials [static]
 

Definition at line 1007 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), and ComputeNonbondedCUDA::finishWork().

__thread cudaEvent_t start_calc [static]
 

Definition at line 716 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaStream_t stream
 

Definition at line 148 of file ComputeNonbondedCUDAKernel.cu.

Referenced by cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_intRad(), cuda_init(), cuda_nonbonded_forces(), and ComputeNonbondedCUDA::recvYieldDevice().

__thread cudaStream_t stream2
 

Definition at line 149 of file ComputeNonbondedCUDAKernel.cu.

__thread int usedevicelist [static]
 

Definition at line 72 of file ComputeNonbondedCUDA.C.

Referenced by cuda_getargs().

__thread int usePairlists = 0 [static]
 

Definition at line 711 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork().

__thread float* virials [static]
 

Definition at line 1006 of file ComputeNonbondedCUDA.C.

Referenced by ComputeNonbondedCUDA::doWork(), ComputeNonbondedCUDA::finishWork(), and ComputeNonbondedCUDA::recvYieldDevice().


Generated on Wed May 22 04:07:19 2013 for NAMD by  doxygen 1.3.9.1