#include <CudaPmeSolverUtil.h>
|
| CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream) |
|
| ~CudaPmeTranspose () |
|
void | setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data) |
|
void | setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data) |
|
void | transposeXYZtoYZX (const float2 *data) |
|
void | transposeXYZtoZXY (const float2 *data) |
|
void | waitStreamSynchronize () |
|
void | copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize) |
|
void | copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out) |
|
void | copyDataDeviceToDevice (const int iblock, float2 *data_out) |
|
float2 * | getBuffer (const int iblock) |
|
void | copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out) |
|
void | copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out) |
|
| PmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock) |
|
virtual | ~PmeTranspose () |
|
Definition at line 171 of file CudaPmeSolverUtil.h.
CudaPmeTranspose::CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
Definition at line 788 of file CudaPmeSolverUtil.C.
References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.
793 allocate_device<float2>(&d_data,
dataSize);
794 #ifndef P2P_ENABLE_3D
795 allocate_device<float2>(&d_buffer,
dataSize);
799 dataPtrsYZX.resize(
nblock, NULL);
800 dataPtrsZXY.resize(
nblock, NULL);
802 allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*
nblock);
803 allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*
nblock);
PmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
__thread cudaStream_t stream
CudaPmeTranspose::~CudaPmeTranspose ()
Definition at line 806 of file CudaPmeSolverUtil.C.
References cudaCheck.
808 deallocate_device<float2>(&d_data);
809 #ifndef P2P_ENABLE_3D
810 deallocate_device<float2>(&d_buffer);
812 deallocate_device< TransposeBatch<float2> >(&batchesZXY);
813 deallocate_device< TransposeBatch<float2> >(&batchesYZX);
void CudaPmeTranspose::copyDataDeviceToDevice (const int iblock, float2 *data_out)
Definition at line 1151 of file CudaPmeSolverUtil.C.
References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.
1155 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1158 int i0, i1, j0, j1, k0, k1;
1159 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1164 float2* data_in = d_buffer + i0*nj*nk;
1166 copy3D_DtoD<float2>(data_in, data_out,
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void CudaPmeTranspose::copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
Definition at line 1105 of file CudaPmeSolverUtil.C.
References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.
1109 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1111 int x0 =
pos[iblock];
1112 int nx =
pos[iblock+1] - x0;
1117 if (copyStart + copySize >
dataSize)
1118 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1120 if (copySize > h_dataSize)
1121 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1123 copy_DtoH<float2>(d_data+copyStart, h_data, copySize,
stream);
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
void CudaPmeTranspose::copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
Definition at line 1126 of file CudaPmeSolverUtil.C.
References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.
1130 NAMD_bug(
"CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1133 int i0, i1, j0, j1, k0, k1;
1134 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1139 copy3D_HtoD<float2>(data_in, data_out,
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void CudaPmeTranspose::copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void CudaPmeTranspose::copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
float2 * CudaPmeTranspose::getBuffer (const int iblock)
Definition at line 1177 of file CudaPmeSolverUtil.C.
References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.
1179 NAMD_bug(
"CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1182 int i0, i1, j0, j1, k0, k1;
1183 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1188 return d_buffer + i0*nj*nk;
void NAMD_bug(const char *err_msg)
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void CudaPmeTranspose::setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
Definition at line 819 of file CudaPmeSolverUtil.C.
References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.
820 if (dataPtrsYZX.size() != dataPtrsNew.size())
821 NAMD_bug(
"CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
822 for (
int iblock=0;iblock <
nblock;iblock++) {
823 dataPtrsYZX[iblock] = dataPtrsNew[iblock];
828 for (
int iperm=0;iperm < 3;iperm++) {
834 }
else if (iperm == 1) {
845 for (
int iblock=0;iblock <
nblock;iblock++) {
847 int x0 =
pos[iblock];
848 int nx =
pos[iblock+1] - x0;
849 max_nx = std::max(max_nx, nx);
853 if (dataPtrsYZX[iblock] == NULL) {
859 data_out = dataPtrsYZX[iblock];
860 width_out = isize_out;
870 h_batchesYZX[iperm*nblock + iblock] = batch;
879 max_nx_YZX[iperm] = max_nx;
882 copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*
nblock,
stream);
884 delete [] h_batchesYZX;
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
void CudaPmeTranspose::setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
Definition at line 890 of file CudaPmeSolverUtil.C.
References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.
891 if (dataPtrsZXY.size() != dataPtrsNew.size())
892 NAMD_bug(
"CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
893 for (
int iblock=0;iblock <
nblock;iblock++) {
894 dataPtrsZXY[iblock] = dataPtrsNew[iblock];
900 for (
int iperm=0;iperm < 3;iperm++) {
906 }
else if (iperm == 1) {
917 for (
int iblock=0;iblock <
nblock;iblock++) {
919 int x0 =
pos[iblock];
920 int nx =
pos[iblock+1] - x0;
921 max_nx = std::max(max_nx, nx);
925 if (dataPtrsZXY[iblock] == NULL) {
931 data_out = dataPtrsZXY[iblock];
932 width_out = isize_out;
942 h_batchesZXY[iperm*nblock + iblock] = batch;
945 max_nx_ZXY[iperm] = max_nx;
948 copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*
nblock,
stream);
950 delete [] h_batchesZXY;
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
void CudaPmeTranspose::transposeXYZtoYZX (const float2 *data) [virtual]
Implements PmeTranspose.
Definition at line 953 of file CudaPmeSolverUtil.C.
References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.
971 NAMD_bug(
"PmeTranspose::transposeXYZtoYZX, invalid permutation");
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
void CudaPmeTranspose::transposeXYZtoZXY (const float2 *data) [virtual]
Implements PmeTranspose.
Definition at line 1027 of file CudaPmeSolverUtil.C.
References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.
1045 NAMD_bug(
"PmeTranspose::transposeXYZtoZXY, invalid permutation");
__thread cudaStream_t stream
void NAMD_bug(const char *err_msg)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void CudaPmeTranspose::waitStreamSynchronize ()
The documentation for this class was generated from the following files: