version 1.1 | version 1.2 |
---|
| |
NAMD_die(errmsg); | NAMD_die(errmsg); |
} | } |
| |
| void cudaNAMD_bug(const char *msg) {NAMD_bug(msg);} |
| |
| //---------------------------------------------------------------------------------------- |
| |
| void clear_device_array_async_T(void *data, const int ndata, cudaStream_t stream, const size_t sizeofT) { |
| cudaCheck(cudaMemsetAsync(data, 0, sizeofT*ndata, stream)); |
| } |
| |
| void clear_device_array_T(void *data, const int ndata, const size_t sizeofT) { |
| cudaCheck(cudaMemset(data, 0, sizeofT*ndata)); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Allocate page-locked host memory |
| // pp = memory pointer |
| // len = length of the array |
| // |
| void allocate_host_T(void **pp, const int len, const size_t sizeofT) { |
| cudaCheck(cudaMallocHost(pp, sizeofT*len)); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Allocate gpu memory |
| // pp = memory pointer |
| // len = length of the array |
| // |
| void allocate_device_T(void **pp, const int len, const size_t sizeofT) { |
| cudaCheck(cudaMalloc(pp, sizeofT*len)); |
| } |
| |
//---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
// | // |
// Deallocate gpu memory | // Deallocate gpu memory |
| |
| |
//---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
// | // |
// Allocate gpu memory | |
// pp = memory pointer | |
// len = length of the array | |
// | |
void allocate_device_T(void **pp, const int len, const size_t sizeofT) { | |
cudaCheck(cudaMalloc(pp, sizeofT*len)); | |
} | |
| |
void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) { | |
cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice)); | |
} | |
| |
//---------------------------------------------------------------------------------------- | |
// | |
// Allocate & re-allocate page-locked host memory, preserves content | |
// | |
bool resize_device_T(void **pp, int *curlen, const int cur_size, const int new_size, | |
const float fac, const size_t sizeofT) { | |
| |
void *old = NULL; | |
| |
if (*pp != NULL && *curlen < new_size) { | |
allocate_device_T(&old, cur_size, sizeofT); | |
copy_DtoD_T(*pp, old, cur_size, sizeofT); | |
cudaCheck(cudaDeviceSynchronize()); //Make sure D-D copy is done | |
cudaCheck(cudaFree((void *)(*pp))); | |
*pp = NULL; | |
} | |
| |
if (*pp == NULL) { | |
if (fac > 1.0f) { | |
*curlen = (int)(((double)(new_size))*(double)fac); | |
} else { | |
*curlen = new_size; | |
} | |
allocate_device_T(pp, *curlen, sizeofT); | |
if (old != NULL) { | |
copy_DtoD_T(old, *pp, cur_size, sizeofT); | |
cudaCheck(cudaDeviceSynchronize()); //Make sure D-D copy is done | |
deallocate_device_T(&old); | |
} | |
return true; | |
} | |
| |
return false; | |
} | |
| |
//---------------------------------------------------------------------------------------- | |
// | |
// Allocate & re-allocate page-locked host memory | // Allocate & re-allocate page-locked host memory |
// pp = memory pointer | // pp = memory pointer |
// curlen = current length of the array | // curlen = current length of the array |
| |
| |
//---------------------------------------------------------------------------------------- | //---------------------------------------------------------------------------------------- |
// | // |
// Allocate & re-allocate page-locked host memory, preserves content | // Copies memory Host -> Device |
// | // |
bool resize_host_T(void **pp, int *curlen, const int cur_size, const int new_size, | void copy_HtoD_async_T(const void *h_array, void *d_array, int array_len, cudaStream_t stream, |
const float fac, const unsigned int flag, const size_t sizeofT) { | const size_t sizeofT) { |
| cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream)); |
| } |
| |
char *old = NULL; | void copy_HtoD_T(const void *h_array, void *d_array, int array_len, |
| const size_t sizeofT) { |
| cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice)); |
| } |
| |
if (*pp != NULL && *curlen < new_size) { | //---------------------------------------------------------------------------------------- |
old = new char[cur_size*sizeofT]; | // |
memcpy(old, *pp, cur_size*sizeofT); | // Copies memory Device -> Host |
cudaCheck(cudaFreeHost((void *)(*pp))); | // |
*pp = NULL; | void copy_DtoH_async_T(const void *d_array, void *h_array, const int array_len, cudaStream_t stream, |
| const size_t sizeofT) { |
| cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream)); |
} | } |
| |
if (*pp == NULL) { | void copy_DtoH_T(const void *d_array, void *h_array, const int array_len, const size_t sizeofT) { |
if (fac > 1.0f) { | cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost)); |
*curlen = (int)(((double)(new_size))*(double)fac); | |
} else { | |
*curlen = new_size; | |
} | } |
cudaCheck(cudaHostAlloc(pp, sizeofT*(*curlen), flag)); | |
if (old != NULL) { | //---------------------------------------------------------------------------------------- |
memcpy(*pp, old, cur_size*sizeofT); | // |
delete [] old; | // Copies memory Device -> Device |
| // |
| void copy_DtoD_async_T(const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| const size_t sizeofT) { |
| cudaCheck(cudaMemcpyAsync(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice, stream)); |
} | } |
return true; | |
| void copy_DtoD_T(const void *d_src, void *d_dst, const int array_len, const size_t sizeofT) { |
| cudaCheck(cudaMemcpy(d_dst, d_src, sizeofT*array_len, cudaMemcpyDeviceToDevice)); |
} | } |
| |
return false; | //---------------------------------------------------------------------------------------- |
| // |
| // Copies memory between two devices Device -> Device |
| // |
| void copy_PeerDtoD_async_T(const int src_dev, const int dst_dev, |
| const void *d_src, void *d_dst, const int array_len, cudaStream_t stream, |
| const size_t sizeofT) { |
| cudaCheck(cudaMemcpyPeerAsync(d_dst, dst_dev, d_src, src_dev, sizeofT*array_len, stream)); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Copies 3D memory block Host -> Device |
| // |
| void copy3D_HtoD_T(void* src_data, void* dst_data, |
| int src_x0, int src_y0, int src_z0, |
| size_t src_xsize, size_t src_ysize, |
| int dst_x0, int dst_y0, int dst_z0, |
| size_t dst_xsize, size_t dst_ysize, |
| size_t width, size_t height, size_t depth, |
| size_t sizeofT, cudaStream_t stream) { |
| cudaMemcpy3DParms parms = {0}; |
| |
| parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| |
| parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| |
| parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| parms.kind = cudaMemcpyHostToDevice; |
| |
| cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Copies 3D memory block Device -> Host |
| // |
| void copy3D_DtoH_T(void* src_data, void* dst_data, |
| int src_x0, int src_y0, int src_z0, |
| size_t src_xsize, size_t src_ysize, |
| int dst_x0, int dst_y0, int dst_z0, |
| size_t dst_xsize, size_t dst_ysize, |
| size_t width, size_t height, size_t depth, |
| size_t sizeofT, cudaStream_t stream) { |
| cudaMemcpy3DParms parms = {0}; |
| |
| parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| |
| parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| |
| parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| parms.kind = cudaMemcpyDeviceToHost; |
| |
| cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Copies 3D memory block Device -> Device |
| // |
| void copy3D_DtoD_T(void* src_data, void* dst_data, |
| int src_x0, int src_y0, int src_z0, |
| size_t src_xsize, size_t src_ysize, |
| int dst_x0, int dst_y0, int dst_z0, |
| size_t dst_xsize, size_t dst_ysize, |
| size_t width, size_t height, size_t depth, |
| size_t sizeofT, cudaStream_t stream) { |
| cudaMemcpy3DParms parms = {0}; |
| |
| parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| |
| parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| |
| parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| parms.kind = cudaMemcpyDeviceToDevice; |
| |
| cudaCheck(cudaMemcpy3DAsync(&parms, stream)); |
} | } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Copies 3D memory block between devices Device -> Device |
| // |
| void copy3D_PeerDtoD_T(int src_dev, int dst_dev, |
| void* src_data, void* dst_data, |
| int src_x0, int src_y0, int src_z0, |
| size_t src_xsize, size_t src_ysize, |
| int dst_x0, int dst_y0, int dst_z0, |
| size_t dst_xsize, size_t dst_ysize, |
| size_t width, size_t height, size_t depth, |
| size_t sizeofT, cudaStream_t stream) { |
| cudaMemcpy3DPeerParms parms = {0}; |
| |
| parms.srcDevice = src_dev; |
| parms.dstDevice = dst_dev; |
| |
| parms.srcPos = make_cudaPos(sizeofT*src_x0, src_y0, src_z0); |
| parms.srcPtr = make_cudaPitchedPtr(src_data, sizeofT*src_xsize, src_xsize, src_ysize); |
| |
| parms.dstPos = make_cudaPos(sizeofT*dst_x0, dst_y0, dst_z0); |
| parms.dstPtr = make_cudaPitchedPtr(dst_data, sizeofT*dst_xsize, dst_xsize, dst_ysize); |
| |
| parms.extent = make_cudaExtent(sizeofT*width, height, depth); |
| |
| cudaCheck(cudaMemcpy3DPeerAsync(&parms, stream)); |
| } |
#endif // NAMD_CUDA | #endif // NAMD_CUDA |