| version 1.1 | version 1.2 |
|---|
| |
| NAMD_die(errmsg); | NAMD_die(errmsg); |
| } | } |
| | |
| | void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);} |
| | |
| | //---------------------------------------------------------------------------------------- |
| | |
| | void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) { |
| | cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream)); |
| | } |
| | |
| | void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) { |
| | cudaCheck(cudaMemset(data, 0, sizeofT*ndata)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Allocate page-locked host memory |
| | // pp = memory pointer |
| | // len = length of the array |
| | // |
| | void allocate_host_T(void **pp, const int len, const size_t sizeofT) { |
| | cudaCheck(cudaMallocHost(pp, sizeofT*len)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Allocate gpu memory |
| | // pp = memory pointer |
| | // len = length of the array |
| | // |
| | void allocate_device_T(void **pp, const int len, const size_t sizeofT) { |
| | cudaCheck(cudaMalloc(pp, sizeofT*len)); |
| | } |
| | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Deallocate gpu memory | // Deallocate gpu memory |
| |
| | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Allocate gpu memory | |
| // pp = memory pointer | |
| // len = length of the array | |
| // | |
| void allocate_device_T(void **pp, const int len, const size_t sizeofT) { | |
| cudaCheck(cudaMalloc(pp, sizeofT*len)); | |
| } | |
| | |
| void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) { | |
| cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice)); | |
| } | |
| | |
| //---------------------------------------------------------------------------------------- | |
| // | |
| // Allocate & re-allocate page-locked host memory, preserves content | |
| // | |
| bool resize_device_T(void **pp, int *curlen, const int cur_size, const int new_size, | |
| const float fac, const size_t sizeofT) { | |
| | |
| void *old = NULL; | |
| | |
| if (*pp != NULL && *curlen < new_size) { | |
| allocate_device_T(&old, cur_size, sizeofT); | |
| copy_DtoD_T(*pp, old, cur_size, sizeofT); | |
| cudaCheck(cudaDeviceSynchronize()); //Make sure D-D copy is done | |
| cudaCheck(cudaFree((void *)(*pp))); | |
| *pp = NULL; | |
| } | |
| | |
| if (*pp == NULL) { | |
| if (fac > 1.0f) { | |
| *curlen = (int)(((double)(new_size))*(double)fac); | |
| } else { | |
| *curlen = new_size; | |
| } | |
| allocate_device_T(pp, *curlen, sizeofT); | |
| if (old != NULL) { | |
| copy_DtoD_T(old, *pp, cur_size, sizeofT); | |
| cudaCheck(cudaDeviceSynchronize()); //Make sure D-D copy is done | |
| deallocate_device_T(&old); | |
| } | |
| return true; | |
| } | |
| | |
| return false; | |
| } | |
| | |
| //---------------------------------------------------------------------------------------- | |
| // | |
| // Allocate & re-allocate page-locked host memory | // Allocate & re-allocate page-locked host memory |
| // pp = memory pointer | // pp = memory pointer |
| // curlen = current length of the array | // curlen = current length of the array |
| |
| | |
| //---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
| // | // |
| // Allocate & re-allocate page-locked host memory, preserves content | // Copies memory Host -> Device |
| // | // |
| bool resize_host_T(void **pp, int *curlen, const int cur_size, const int new_size, | void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, |
| const float fac, const unsigned int flag, const size_t sizeofT) { | const size_t sizeofT) { |
| | cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream)); |
| | } |
| | |
| char *old = NULL; | void copy_HtoD_T(const void *h_array, void *d_array, int array_len, |
| | const size_t sizeofT) { |
| | cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice)); |
| | } |
| | |
| if (*pp != NULL && *curlen < new_size) { | //---------------------------------------------------------------------------------------- |
| old = new char[cur_size*sizeofT]; | // |
| memcpy(old, *pp, cur_size*sizeofT); | // Copies memory Device -> Host |
| cudaCheck(cudaFreeHost((void *)(*pp))); | // |
| *pp = NULL; | void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT) { |
| | cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream)); |
| } | } |
| | |
| if (*pp == NULL) { | void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) { |
| if (fac > 1.0f) { | cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost)); |
| *curlen = (int)(((double)(new_size))*(double)fac); | |
| } else { | |
| *curlen = new_size; | |
| } | } |
| cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag)); | |
| if (old != NULL) { | //---------------------------------------------------------------------------------------- |
| memcpy(*pp, old, cur_size*sizeofT); | // |
| delete [] old; | // Copies memory Device -> Device |
| | // |
| | void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT) { |
| | cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream)); |
| } | } |
| return true; | |
| | void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) { |
| | cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice)); |
| } | } |
| | |
| return false; | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies memory between two devices Device -> Device |
| | // |
| | void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, |
| | const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| | const size_t sizeofT) { |
| | cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Host -> Device |
| | // |
| | void copy3D_HtoD_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream) { |
| | cudaMemcpy3DParms parms = {0}; |
| | |
| | parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| | parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| | |
| | parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| | parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| | |
| | parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| | parms.kind = cudaMemcpyHostToDevice; |
| | |
| | cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Device -> Host |
| | // |
| | void copy3D_DtoH_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream) { |
| | cudaMemcpy3DParms parms = {0}; |
| | |
| | parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| | parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| | |
| | parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| | parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| | |
| | parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| | parms.kind = cudaMemcpyDeviceToHost; |
| | |
| | cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
| | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block Device -> Device |
| | // |
| | void copy3D_DtoD_T(void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream) { |
| | cudaMemcpy3DParms parms = {0}; |
| | |
| | parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| | parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| | |
| | parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| | parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| | |
| | parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| | parms.kind = cudaMemcpyDeviceToDevice; |
| | |
| | cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
| } | } |
| | |
| | //---------------------------------------------------------------------------------------- |
| | // |
| | // Copies 3D memory block between devices Device -> Device |
| | // |
| | void copy3D_PeerDtoD_T(int src_dev, int dst_dev, |
| | void* src_data, void* dst_data, |
| | int src_x0, int src_y0, int src_z0, |
| | size_t src_xsize, size_t src_ysize, |
| | int dst_x0, int dst_y0, int dst_z0, |
| | size_t dst_xsize, size_t dst_ysize, |
| | size_t width, size_t height, size_t depth, |
| | size_t sizeofT, cudaStream_t stream) { |
| | cudaMemcpy3DPeerParms parms = {0}; |
| | |
| | parms.srcDevice = src_dev; |
| | parms.dstDevice = dst_dev; |
| | |
| | parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| | parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| | |
| | parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| | parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| | |
| | parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| | |
| | cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream)); |
| | } |
| #endif // NAMD_CUDA | #endif // NAMD_CUDA |