CudaUtils.C

Go to the documentation of this file.
00001 
00002 #include <stdio.h>
00003 #include "common.h"
00004 #include "charm++.h"
00005 #include "CudaUtils.h"
00006 
00007 #ifdef NAMD_CUDA
00008 
00009 void cudaDie(const char *msg, cudaError_t err) {
00010   char host[128];
00011 #ifdef NOHOSTNAME
00012   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00013 #else
00014   gethostname(host, 128);  host[127] = 0;
00015 #endif
00016   char devstr[128] = "";
00017   int devnum;
00018   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00019     sprintf(devstr, " device %d", devnum);
00020   }
00021   cudaDeviceProp deviceProp;
00022   if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
00023     sprintf(devstr, " device %d pci %x:%x:%x", devnum,
00024       deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
00025   }
00026   char errmsg[1024];
00027   if (err == cudaSuccess) {
00028     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
00029   } else {
00030     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));    
00031   }
00032   NAMD_die(errmsg);
00033 }
00034 
00035 void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);}
00036 
00037 //----------------------------------------------------------------------------------------
00038 
00039 void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) {
00040   cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
00041 }
00042 
00043 void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) {
00044   cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
00045 }
00046 
00047 //----------------------------------------------------------------------------------------
00048 //
00049 // Allocate page-locked host memory
00050 // pp = memory pointer
00051 // len = length of the array
00052 //
00053 void allocate_host_T(void **pp, const int len, const size_t sizeofT) {
00054   cudaCheck(cudaMallocHost(pp, sizeofT*len));
00055 }
00056 
00057 //----------------------------------------------------------------------------------------
00058 //
00059 // Allocate gpu memory
00060 // pp = memory pointer
00061 // len = length of the array
00062 //
00063 void allocate_device_T(void **pp, const int len, const size_t sizeofT) {
00064   cudaCheck(cudaMalloc(pp, sizeofT*len));
00065 }
00066 
00067 //----------------------------------------------------------------------------------------
00068 //
00069 // Deallocate gpu memory
00070 // pp = memory pointer
00071 //
00072 void deallocate_device_T(void **pp) {
00073   
00074   if (*pp != NULL) {
00075     cudaCheck(cudaFree((void *)(*pp)));
00076     *pp = NULL;
00077   }
00078 
00079 }
00080 
00081 //----------------------------------------------------------------------------------------
00082 //
00083 // Deallocate page-locked host memory
00084 // pp = memory pointer
00085 //
00086 void deallocate_host_T(void **pp) {
00087   
00088   if (*pp != NULL) {
00089     cudaCheck(cudaFreeHost((void *)(*pp)));
00090     *pp = NULL;
00091   }
00092 
00093 }
00094 
00095 //----------------------------------------------------------------------------------------
00096 //
00097 // Allocate & re-allocate device memory
00098 // pp = memory pointer
00099 // curlen = current length of the array
00100 // newlen = new required length of the array
00101 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00102 //
00103 // returns true if reallocation happened
00104 //
00105 bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT) {
00106 
00107   if (*pp != NULL && *curlen < newlen) {
00108     cudaCheck(cudaFree((void *)(*pp)));
00109     *pp = NULL;
00110   }
00111 
00112   if (*pp == NULL) {
00113     if (fac > 1.0f) {
00114       *curlen = (int)(((double)(newlen))*(double)fac);
00115     } else {
00116       *curlen = newlen;
00117     }
00118     cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
00119     return true;
00120   }
00121 
00122   return false;
00123 }
00124 
00125 //----------------------------------------------------------------------------------------
00126 //
00127 // Allocate & re-allocate page-locked host memory
00128 // pp = memory pointer
00129 // curlen = current length of the array
00130 // newlen = new required length of the array
00131 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00132 // flag = allocation type:
00133 //        cudaHostAllocDefault = default type, emulates cudaMallocHost
00134 //        cudaHostAllocMapped  = maps allocation into CUDA address space
00135 //
00136 // returns true if reallocation happened
00137 //
00138 bool reallocate_host_T(void **pp, int *curlen, const int newlen, 
00139                        const float fac, const unsigned int flag, const size_t sizeofT) {
00140 
00141   if (*pp != NULL && *curlen < newlen) {
00142     cudaCheck(cudaFreeHost((void *)(*pp)));
00143     *pp = NULL;
00144   }
00145 
00146   if (*pp == NULL) {
00147     if (fac > 1.0f) {
00148       *curlen = (int)(((double)(newlen))*(double)fac);
00149     } else {
00150       *curlen = newlen;
00151     }
00152     cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
00153     return true;
00154   }
00155 
00156   return false;
00157 }
00158 
00159 //----------------------------------------------------------------------------------------
00160 //
00161 // Copies memory Host -> Device
00162 //
00163 void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
00164            const size_t sizeofT) {
00165   cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
00166 }
00167 
00168 void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
00169      const size_t sizeofT) {
00170   cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
00171 }
00172 
00173 //----------------------------------------------------------------------------------------
00174 //
00175 // Copies memory Device -> Host
00176 //
00177 void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
00178            const size_t sizeofT) {
00179   cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
00180 }
00181 
00182 void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) {
00183   cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
00184 }
00185 
00186 //----------------------------------------------------------------------------------------
00187 //
00188 // Copies memory Device -> Device
00189 //
00190 void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00191            const size_t sizeofT) {
00192   cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
00193 }
00194 
00195 void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) {
00196   cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
00197 }
00198 
00199 //----------------------------------------------------------------------------------------
00200 //
00201 // Copies memory between two devices Device -> Device
00202 //
00203 void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
00204   const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00205   const size_t sizeofT) {
00206   cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
00207 }
00208 
00209 //----------------------------------------------------------------------------------------
00210 //
00211 // Copies 3D memory block Host -> Device
00212 //
00213 void copy3D_HtoD_T(void* src_data, void* dst_data,
00214   int src_x0, int src_y0, int src_z0,
00215   size_t src_xsize, size_t src_ysize,
00216   int dst_x0, int dst_y0, int dst_z0,
00217   size_t dst_xsize, size_t dst_ysize,
00218   size_t width, size_t height, size_t depth,
00219   size_t sizeofT, cudaStream_t stream) {
00220   cudaMemcpy3DParms parms = {0};
00221 
00222   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00223   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00224 
00225   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00226   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00227 
00228   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00229   parms.kind = cudaMemcpyHostToDevice;
00230 
00231   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00232 }
00233 
00234 //----------------------------------------------------------------------------------------
00235 //
00236 // Copies 3D memory block Device -> Host
00237 //
00238 void copy3D_DtoH_T(void* src_data, void* dst_data,
00239   int src_x0, int src_y0, int src_z0,
00240   size_t src_xsize, size_t src_ysize,
00241   int dst_x0, int dst_y0, int dst_z0,
00242   size_t dst_xsize, size_t dst_ysize,
00243   size_t width, size_t height, size_t depth,
00244   size_t sizeofT, cudaStream_t stream) {
00245   cudaMemcpy3DParms parms = {0};
00246 
00247   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00248   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00249 
00250   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00251   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00252 
00253   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00254   parms.kind = cudaMemcpyDeviceToHost;
00255 
00256   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00257 }
00258 
00259 //----------------------------------------------------------------------------------------
00260 //
00261 // Copies 3D memory block Device -> Device
00262 //
00263 void copy3D_DtoD_T(void* src_data, void* dst_data,
00264   int src_x0, int src_y0, int src_z0,
00265   size_t src_xsize, size_t src_ysize,
00266   int dst_x0, int dst_y0, int dst_z0,
00267   size_t dst_xsize, size_t dst_ysize,
00268   size_t width, size_t height, size_t depth,
00269   size_t sizeofT, cudaStream_t stream) {
00270   cudaMemcpy3DParms parms = {0};
00271 
00272   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00273   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00274 
00275   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00276   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00277 
00278   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00279   parms.kind = cudaMemcpyDeviceToDevice;
00280 
00281   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00282 }
00283 
00284 //----------------------------------------------------------------------------------------
00285 //
00286 // Copies 3D memory block between devices Device -> Device
00287 //
00288 void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
00289   void* src_data, void* dst_data,
00290   int src_x0, int src_y0, int src_z0,
00291   size_t src_xsize, size_t src_ysize,
00292   int dst_x0, int dst_y0, int dst_z0,
00293   size_t dst_xsize, size_t dst_ysize,
00294   size_t width, size_t height, size_t depth,
00295   size_t sizeofT, cudaStream_t stream) {
00296   cudaMemcpy3DPeerParms parms = {0};
00297 
00298   parms.srcDevice = src_dev;
00299   parms.dstDevice = dst_dev;
00300 
00301   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00302   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00303 
00304   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00305   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00306 
00307   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00308 
00309   cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
00310 }
00311 #endif // NAMD_CUDA

Generated on Sun Feb 25 01:17:14 2018 for NAMD by  doxygen 1.4.7