CudaUtils.C

Go to the documentation of this file.
00001 
00002 #include <stdio.h>
00003 #include "common.h"
00004 #include "charm++.h"
00005 #include "CudaUtils.h"
00006 
00007 #ifdef NAMD_CUDA
00008 
00009 void cudaDie(const char *msg, cudaError_t err) {
00010   char host[128];
00011 #ifdef NOHOSTNAME
00012   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00013 #else
00014   gethostname(host, 128);  host[127] = 0;
00015 #endif
00016   char devstr[128] = "";
00017   int devnum;
00018   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00019     sprintf(devstr, " device %d", devnum);
00020   }
00021   cudaDeviceProp deviceProp;
00022   if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
00023     sprintf(devstr, " device %d pci %x:%x:%x", devnum,
00024       deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00025   }
00026   char errmsg[1024];
00027   if (err == cudaSuccess) {
00028     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
00029   } else {
00030     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));    
00031   }
00032   NAMD_die(errmsg);
00033 }
00034 
00035 void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);}
00036 
00037 void cuda_affinity_initialize() {
00038   // called before Converse/Charm++ initialization so can't use Cmi/Ck utilities
00039   char *forkstr;
00040   forkstr=getenv("CmiMyForks");
00041   int forks = 0;
00042   if (forkstr) sscanf(forkstr,"%d",&forks);
00043   if ( forks ) return;  // CUDA will fail on child if initialized before fork
00044   int devcnt = 0;
00045   cudaError_t err = cudaGetDeviceCount(&devcnt);
00046   if ( devcnt == 1 ) {  // only one device so it must be ours
00047     int *dummy;
00048     if ( err == cudaSuccess ) err = cudaSetDevice(0);
00049     if ( err == cudaSuccess ) err = cudaSetDeviceFlags(cudaDeviceMapHost);
00050     if ( err == cudaSuccess ) err = cudaMalloc(&dummy, 4);
00051   }
00052   if ( err != cudaSuccess ) {
00053 #ifdef NOHOSTNAME
00054     fprintf(stderr,"CUDA initialization error: %s\n", cudaGetErrorString(err));
00055 #else
00056     char host[128];
00057     gethostname(host, 128);  host[127] = 0;
00058     fprintf(stderr,"CUDA initialization error on %s: %s\n", host, cudaGetErrorString(err));
00059 #endif
00060   }
00061 }
00062 
00063 //----------------------------------------------------------------------------------------
00064 
00065 void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) {
00066   cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
00067 }
00068 
00069 void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) {
00070   cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
00071 }
00072 
00073 //----------------------------------------------------------------------------------------
00074 //
00075 // Allocate page-locked host memory
00076 // pp = memory pointer
00077 // len = length of the array
00078 //
00079 void allocate_host_T(void **pp, const int len, const size_t sizeofT) {
00080   cudaCheck(cudaMallocHost(pp, sizeofT*len));
00081 }
00082 
00083 //----------------------------------------------------------------------------------------
00084 //
00085 // Allocate gpu memory
00086 // pp = memory pointer
00087 // len = length of the array
00088 //
00089 void allocate_device_T(void **pp, const int len, const size_t sizeofT) {
00090   cudaCheck(cudaMalloc(pp, sizeofT*len));
00091 }
00092 
00093 //----------------------------------------------------------------------------------------
00094 //
00095 // Deallocate gpu memory
00096 // pp = memory pointer
00097 //
00098 void deallocate_device_T(void **pp) {
00099   
00100   if (*pp != NULL) {
00101     cudaCheck(cudaFree((void *)(*pp)));
00102     *pp = NULL;
00103   }
00104 
00105 }
00106 
00107 //----------------------------------------------------------------------------------------
00108 //
00109 // Deallocate page-locked host memory
00110 // pp = memory pointer
00111 //
00112 void deallocate_host_T(void **pp) {
00113   
00114   if (*pp != NULL) {
00115     cudaCheck(cudaFreeHost((void *)(*pp)));
00116     *pp = NULL;
00117   }
00118 
00119 }
00120 
00121 //----------------------------------------------------------------------------------------
00122 //
00123 // Allocate & re-allocate device memory
00124 // pp = memory pointer
00125 // curlen = current length of the array
00126 // newlen = new required length of the array
00127 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00128 //
00129 // returns true if reallocation happened
00130 //
00131 bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT) {
00132 
00133   if (*pp != NULL && *curlen < newlen) {
00134     cudaCheck(cudaFree((void *)(*pp)));
00135     *pp = NULL;
00136   }
00137 
00138   if (*pp == NULL) {
00139     if (fac > 1.0f) {
00140       *curlen = (int)(((double)(newlen))*(double)fac);
00141     } else {
00142       *curlen = newlen;
00143     }
00144     cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
00145     return true;
00146   }
00147 
00148   return false;
00149 }
00150 
00151 //----------------------------------------------------------------------------------------
00152 //
00153 // Allocate & re-allocate page-locked host memory
00154 // pp = memory pointer
00155 // curlen = current length of the array
00156 // newlen = new required length of the array
00157 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00158 // flag = allocation type:
00159 //        cudaHostAllocDefault = default type, emulates cudaMallocHost
00160 //        cudaHostAllocMapped  = maps allocation into CUDA address space
00161 //
00162 // returns true if reallocation happened
00163 //
00164 bool reallocate_host_T(void **pp, int *curlen, const int newlen, 
00165                        const float fac, const unsigned int flag, const size_t sizeofT) {
00166 
00167   if (*pp != NULL && *curlen < newlen) {
00168     cudaCheck(cudaFreeHost((void *)(*pp)));
00169     *pp = NULL;
00170   }
00171 
00172   if (*pp == NULL) {
00173     if (fac > 1.0f) {
00174       *curlen = (int)(((double)(newlen))*(double)fac);
00175     } else {
00176       *curlen = newlen;
00177     }
00178     cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
00179     return true;
00180   }
00181 
00182   return false;
00183 }
00184 
00185 //----------------------------------------------------------------------------------------
00186 //
00187 // Copies memory Host -> Device
00188 //
00189 void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
00190            const size_t sizeofT) {
00191   cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
00192 }
00193 
00194 void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
00195      const size_t sizeofT) {
00196   cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
00197 }
00198 
00199 //----------------------------------------------------------------------------------------
00200 //
00201 // Copies memory Device -> Host
00202 //
00203 void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
00204            const size_t sizeofT) {
00205   cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
00206 }
00207 
00208 void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) {
00209   cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
00210 }
00211 
00212 //----------------------------------------------------------------------------------------
00213 //
00214 // Copies memory Device -> Device
00215 //
00216 void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00217            const size_t sizeofT) {
00218   cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
00219 }
00220 
00221 void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) {
00222   cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
00223 }
00224 
00225 //----------------------------------------------------------------------------------------
00226 //
00227 // Copies memory between two devices Device -> Device
00228 //
00229 void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
00230   const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00231   const size_t sizeofT) {
00232   cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
00233 }
00234 
00235 //----------------------------------------------------------------------------------------
00236 //
00237 // Copies 3D memory block Host -> Device
00238 //
00239 void copy3D_HtoD_T(void* src_data, void* dst_data,
00240   int src_x0, int src_y0, int src_z0,
00241   size_t src_xsize, size_t src_ysize,
00242   int dst_x0, int dst_y0, int dst_z0,
00243   size_t dst_xsize, size_t dst_ysize,
00244   size_t width, size_t height, size_t depth,
00245   size_t sizeofT, cudaStream_t stream) {
00246   cudaMemcpy3DParms parms = {0};
00247 
00248   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00249   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00250 
00251   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00252   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00253 
00254   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00255   parms.kind = cudaMemcpyHostToDevice;
00256 
00257   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00258 }
00259 
00260 //----------------------------------------------------------------------------------------
00261 //
00262 // Copies 3D memory block Device -> Host
00263 //
00264 void copy3D_DtoH_T(void* src_data, void* dst_data,
00265   int src_x0, int src_y0, int src_z0,
00266   size_t src_xsize, size_t src_ysize,
00267   int dst_x0, int dst_y0, int dst_z0,
00268   size_t dst_xsize, size_t dst_ysize,
00269   size_t width, size_t height, size_t depth,
00270   size_t sizeofT, cudaStream_t stream) {
00271   cudaMemcpy3DParms parms = {0};
00272 
00273   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00274   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00275 
00276   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00277   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00278 
00279   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00280   parms.kind = cudaMemcpyDeviceToHost;
00281 
00282   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00283 }
00284 
00285 //----------------------------------------------------------------------------------------
00286 //
00287 // Copies 3D memory block Device -> Device
00288 //
00289 void copy3D_DtoD_T(void* src_data, void* dst_data,
00290   int src_x0, int src_y0, int src_z0,
00291   size_t src_xsize, size_t src_ysize,
00292   int dst_x0, int dst_y0, int dst_z0,
00293   size_t dst_xsize, size_t dst_ysize,
00294   size_t width, size_t height, size_t depth,
00295   size_t sizeofT, cudaStream_t stream) {
00296   cudaMemcpy3DParms parms = {0};
00297 
00298   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00299   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00300 
00301   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00302   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00303 
00304   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00305   parms.kind = cudaMemcpyDeviceToDevice;
00306 
00307   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00308 }
00309 
00310 //----------------------------------------------------------------------------------------
00311 //
00312 // Copies 3D memory block between devices Device -> Device
00313 //
00314 void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
00315   void* src_data, void* dst_data,
00316   int src_x0, int src_y0, int src_z0,
00317   size_t src_xsize, size_t src_ysize,
00318   int dst_x0, int dst_y0, int dst_z0,
00319   size_t dst_xsize, size_t dst_ysize,
00320   size_t width, size_t height, size_t depth,
00321   size_t sizeofT, cudaStream_t stream) {
00322   cudaMemcpy3DPeerParms parms = {0};
00323 
00324   parms.srcDevice = src_dev;
00325   parms.dstDevice = dst_dev;
00326 
00327   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00328   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00329 
00330   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00331   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00332 
00333   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00334 
00335   cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
00336 }
00337 #endif // NAMD_CUDA

Generated on Fri Jun 22 01:17:13 2018 for NAMD by  doxygen 1.4.7