CudaUtils.C

Go to the documentation of this file.
00001 
00002 #include <stdio.h>
00003 #include "common.h"
00004 #include "charm++.h"
00005 #include "CudaUtils.h"
00006 
00007 #ifdef NAMD_CUDA
00008 
00009 void cudaDie(const char *msg, cudaError_t err) {
00010   char host[128];
00011 #ifdef NOHOSTNAME
00012   sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
00013 #else
00014   gethostname(host, 128);  host[127] = 0;
00015 #endif
00016   char devstr[128] = "";
00017   int devnum;
00018   if ( cudaGetDevice(&devnum) == cudaSuccess ) {
00019     sprintf(devstr, " device %d", devnum);
00020   }
00021   char errmsg[1024];
00022   if (err == cudaSuccess) {
00023     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s)", msg, CkMyPe(), host, devstr);
00024   } else {
00025     sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));    
00026   }
00027   NAMD_die(errmsg);
00028 }
00029 
00030 void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);}
00031 
00032 //----------------------------------------------------------------------------------------
00033 
00034 void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) {
00035   cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream));
00036 }
00037 
00038 void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) {
00039   cudaCheck(cudaMemset(data, 0, sizeofT*ndata));
00040 }
00041 
00042 //----------------------------------------------------------------------------------------
00043 //
00044 // Allocate page-locked host memory
00045 // pp = memory pointer
00046 // len = length of the array
00047 //
00048 void allocate_host_T(void **pp, const int len, const size_t sizeofT) {
00049   cudaCheck(cudaMallocHost(pp, sizeofT*len));
00050 }
00051 
00052 //----------------------------------------------------------------------------------------
00053 //
00054 // Allocate gpu memory
00055 // pp = memory pointer
00056 // len = length of the array
00057 //
00058 void allocate_device_T(void **pp, const int len, const size_t sizeofT) {
00059   cudaCheck(cudaMalloc(pp, sizeofT*len));
00060 }
00061 
00062 //----------------------------------------------------------------------------------------
00063 //
00064 // Deallocate gpu memory
00065 // pp = memory pointer
00066 //
00067 void deallocate_device_T(void **pp) {
00068   
00069   if (*pp != NULL) {
00070     cudaCheck(cudaFree((void *)(*pp)));
00071     *pp = NULL;
00072   }
00073 
00074 }
00075 
00076 //----------------------------------------------------------------------------------------
00077 //
00078 // Deallocate page-locked host memory
00079 // pp = memory pointer
00080 //
00081 void deallocate_host_T(void **pp) {
00082   
00083   if (*pp != NULL) {
00084     cudaCheck(cudaFreeHost((void *)(*pp)));
00085     *pp = NULL;
00086   }
00087 
00088 }
00089 
00090 //----------------------------------------------------------------------------------------
00091 //
00092 // Allocate & re-allocate device memory
00093 // pp = memory pointer
00094 // curlen = current length of the array
00095 // newlen = new required length of the array
00096 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00097 //
00098 // returns true if reallocation happened
00099 //
00100 bool reallocate_device_T(void **pp, int *curlen, const int newlen, const float fac, const size_t sizeofT) {
00101 
00102   if (*pp != NULL && *curlen < newlen) {
00103     cudaCheck(cudaFree((void *)(*pp)));
00104     *pp = NULL;
00105   }
00106 
00107   if (*pp == NULL) {
00108     if (fac > 1.0f) {
00109       *curlen = (int)(((double)(newlen))*(double)fac);
00110     } else {
00111       *curlen = newlen;
00112     }
00113     cudaCheck(cudaMalloc(pp, sizeofT*(*curlen)));
00114     return true;
00115   }
00116 
00117   return false;
00118 }
00119 
00120 //----------------------------------------------------------------------------------------
00121 //
00122 // Allocate & re-allocate page-locked host memory
00123 // pp = memory pointer
00124 // curlen = current length of the array
00125 // newlen = new required length of the array
00126 // fac = extra space allocation factor: in case of re-allocation new length will be fac*newlen
00127 // flag = allocation type:
00128 //        cudaHostAllocDefault = default type, emulates cudaMallocHost
00129 //        cudaHostAllocMapped  = maps allocation into CUDA address space
00130 //
00131 // returns true if reallocation happened
00132 //
00133 bool reallocate_host_T(void **pp, int *curlen, const int newlen, 
00134                        const float fac, const unsigned int flag, const size_t sizeofT) {
00135 
00136   if (*pp != NULL && *curlen < newlen) {
00137     cudaCheck(cudaFreeHost((void *)(*pp)));
00138     *pp = NULL;
00139   }
00140 
00141   if (*pp == NULL) {
00142     if (fac > 1.0f) {
00143       *curlen = (int)(((double)(newlen))*(double)fac);
00144     } else {
00145       *curlen = newlen;
00146     }
00147     cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag));
00148     return true;
00149   }
00150 
00151   return false;
00152 }
00153 
00154 //----------------------------------------------------------------------------------------
00155 //
00156 // Copies memory Host -> Device
00157 //
00158 void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream,
00159            const size_t sizeofT) {
00160   cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream));
00161 }
00162 
00163 void copy_HtoD_T(const void *h_array, void *d_array, int array_len,
00164      const size_t sizeofT) {
00165   cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice));
00166 }
00167 
00168 //----------------------------------------------------------------------------------------
00169 //
00170 // Copies memory Device -> Host
00171 //
00172 void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream,
00173            const size_t sizeofT) {
00174   cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream));
00175 }
00176 
00177 void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) {
00178   cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost));
00179 }
00180 
00181 //----------------------------------------------------------------------------------------
00182 //
00183 // Copies memory Device -> Device
00184 //
00185 void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00186            const size_t sizeofT) {
00187   cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream));
00188 }
00189 
00190 void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) {
00191   cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice));
00192 }
00193 
00194 //----------------------------------------------------------------------------------------
00195 //
00196 // Copies memory between two devices Device -> Device
00197 //
00198 void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev,
00199   const void *d_src, void *d_dst, const int array_len, cudaStream_t stream,
00200   const size_t sizeofT) {
00201   cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream));
00202 }
00203 
00204 //----------------------------------------------------------------------------------------
00205 //
00206 // Copies 3D memory block Host -> Device
00207 //
00208 void copy3D_HtoD_T(void* src_data, void* dst_data,
00209   int src_x0, int src_y0, int src_z0,
00210   size_t src_xsize, size_t src_ysize,
00211   int dst_x0, int dst_y0, int dst_z0,
00212   size_t dst_xsize, size_t dst_ysize,
00213   size_t width, size_t height, size_t depth,
00214   size_t sizeofT, cudaStream_t stream) {
00215   cudaMemcpy3DParms parms = {0};
00216 
00217   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00218   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00219 
00220   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00221   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00222 
00223   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00224   parms.kind = cudaMemcpyHostToDevice;
00225 
00226   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00227 }
00228 
00229 //----------------------------------------------------------------------------------------
00230 //
00231 // Copies 3D memory block Device -> Host
00232 //
00233 void copy3D_DtoH_T(void* src_data, void* dst_data,
00234   int src_x0, int src_y0, int src_z0,
00235   size_t src_xsize, size_t src_ysize,
00236   int dst_x0, int dst_y0, int dst_z0,
00237   size_t dst_xsize, size_t dst_ysize,
00238   size_t width, size_t height, size_t depth,
00239   size_t sizeofT, cudaStream_t stream) {
00240   cudaMemcpy3DParms parms = {0};
00241 
00242   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00243   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00244 
00245   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00246   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00247 
00248   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00249   parms.kind = cudaMemcpyDeviceToHost;
00250 
00251   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00252 }
00253 
00254 //----------------------------------------------------------------------------------------
00255 //
00256 // Copies 3D memory block Device -> Device
00257 //
00258 void copy3D_DtoD_T(void* src_data, void* dst_data,
00259   int src_x0, int src_y0, int src_z0,
00260   size_t src_xsize, size_t src_ysize,
00261   int dst_x0, int dst_y0, int dst_z0,
00262   size_t dst_xsize, size_t dst_ysize,
00263   size_t width, size_t height, size_t depth,
00264   size_t sizeofT, cudaStream_t stream) {
00265   cudaMemcpy3DParms parms = {0};
00266 
00267   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00268   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00269 
00270   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00271   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00272 
00273   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00274   parms.kind = cudaMemcpyDeviceToDevice;
00275 
00276   cudaCheck(cudaMemcpy3DAsync(&parms, stream));
00277 }
00278 
00279 //----------------------------------------------------------------------------------------
00280 //
00281 // Copies 3D memory block between devices Device -> Device
00282 //
00283 void copy3D_PeerDtoD_T(int src_dev, int dst_dev,
00284   void* src_data, void* dst_data,
00285   int src_x0, int src_y0, int src_z0,
00286   size_t src_xsize, size_t src_ysize,
00287   int dst_x0, int dst_y0, int dst_z0,
00288   size_t dst_xsize, size_t dst_ysize,
00289   size_t width, size_t height, size_t depth,
00290   size_t sizeofT, cudaStream_t stream) {
00291   cudaMemcpy3DPeerParms parms = {0};
00292 
00293   parms.srcDevice = src_dev;
00294   parms.dstDevice = dst_dev;
00295 
00296   parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0);
00297   parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize);
00298 
00299   parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0);
00300   parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize);
00301 
00302   parms.extent = make_cudaExtent(sizeofT*width, height, depth);
00303 
00304   cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream));
00305 }
00306 #endif // NAMD_CUDA

Generated on Tue Nov 21 01:17:13 2017 for NAMD by  doxygen 1.4.7