CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:

PmeTranspose List of all members.

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 ~CudaPmeTranspose ()
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
void transposeXYZtoYZX (const float2 *data)
void transposeXYZtoZXY (const float2 *data)
void waitStreamSynchronize ()
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
float2getBuffer (const int iblock)
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)

Detailed Description

Definition at line 139 of file CudaPmeSolverUtil.h.


Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 605 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

00606                                                                            : 
00607   PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
00608   cudaCheck(cudaSetDevice(deviceID));
00609 
00610   allocate_device<float2>(&d_data, dataSize);
00611 #ifndef P2P_ENABLE_3D
00612   allocate_device<float2>(&d_buffer, dataSize);
00613 #endif
00614 
00615   // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
00616   dataPtrsYZX.resize(nblock, NULL);
00617   dataPtrsZXY.resize(nblock, NULL);
00618 
00619   allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
00620   allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
00621 }

CudaPmeTranspose::~CudaPmeTranspose (  ) 

Definition at line 623 of file CudaPmeSolverUtil.C.

References cudaCheck.

00623                                     {
00624   cudaCheck(cudaSetDevice(deviceID));
00625   deallocate_device<float2>(&d_data);
00626 #ifndef P2P_ENABLE_3D
00627   deallocate_device<float2>(&d_buffer);
00628 #endif
00629   deallocate_device< TransposeBatch<float2> >(&batchesZXY);
00630   deallocate_device< TransposeBatch<float2> >(&batchesYZX);
00631 }


Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 968 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00968                                                                                 {
00969   cudaCheck(cudaSetDevice(deviceID));
00970 
00971   if (iblock >= nblock)
00972     NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
00973 
00974   // Determine block size = how much we're copying
00975   int i0, i1, j0, j1, k0, k1;
00976   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00977   int ni = i1-i0+1;
00978   int nj = j1-j0+1;
00979   int nk = k1-k0+1;
00980 
00981   float2* data_in = d_buffer + i0*nj*nk;
00982 
00983   copy3D_DtoD<float2>(data_in, data_out,
00984     0, 0, 0,
00985     ni, nj,
00986     i0, 0, 0,
00987     isize, jsize,
00988     ni, nj, nk, stream);
00989 }

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 922 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

00922                                                                                                   {
00923   cudaCheck(cudaSetDevice(deviceID));
00924 
00925   if (iblock >= nblock)
00926     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
00927 
00928   int x0 = pos[iblock];
00929   int nx = pos[iblock+1] - x0;
00930 
00931   int copySize  = jsize*ksize*nx;
00932   int copyStart = jsize*ksize*x0;
00933 
00934   if (copyStart + copySize > dataSize)
00935     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
00936 
00937   if (copySize > h_dataSize) 
00938     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
00939 
00940   copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
00941 }

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 943 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00943                                                                                                {
00944   cudaCheck(cudaSetDevice(deviceID));
00945 
00946   if (iblock >= nblock)
00947     NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
00948 
00949   // Determine block size = how much we're copying
00950   int i0, i1, j0, j1, k0, k1;
00951   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00952   int ni = i1-i0+1;
00953   int nj = j1-j0+1;
00954   int nk = k1-k0+1;
00955 
00956   copy3D_HtoD<float2>(data_in, data_out,
00957     0, 0, 0,
00958     ni, nj,
00959     i0, 0, 0,
00960     isize, jsize,
00961     ni, nj, nk, stream);
00962 }

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1009 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01010                     {
01011 
01012   int iblock_out = jblock;
01013   int jblock_out = kblock;
01014   int kblock_out = iblock;
01015 
01016   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01017 }

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1019 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01020                     {
01021 
01022   int iblock_out = kblock;
01023   int jblock_out = iblock;
01024   int kblock_out = jblock;
01025 
01026   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01027 }

float2 * CudaPmeTranspose::getBuffer ( const int  iblock  ) 

Definition at line 994 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00994                                                     {
00995   if (iblock >= nblock)
00996     NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
00997 
00998   // Determine block size = how much we're copying
00999   int i0, i1, j0, j1, k0, k1;
01000   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
01001   int ni = i1-i0+1;
01002   int nj = j1-j0+1;
01003   int nk = k1-k0+1;
01004 
01005   return d_buffer + i0*nj*nk;
01006 }

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 636 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

00636                                                                                    {
00637   if (dataPtrsYZX.size() != dataPtrsNew.size())
00638     NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
00639   for (int iblock=0;iblock < nblock;iblock++) {
00640     dataPtrsYZX[iblock] = dataPtrsNew[iblock];
00641   }
00642   // Build batched data structures
00643   TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
00644 
00645   for (int iperm=0;iperm < 3;iperm++) {
00646     int isize_out;
00647     if (iperm == 0) {
00648       // Perm_Z_cX_Y:
00649       // ZXY -> XYZ
00650       isize_out = pmeGrid.K1/2+1;
00651     } else if (iperm == 1) {
00652       // Perm_cX_Y_Z:
00653       // XYZ -> YZX
00654       isize_out = pmeGrid.K2;
00655     } else {
00656       // Perm_Y_Z_cX:
00657       // YZX -> ZXY
00658       isize_out = pmeGrid.K3;
00659     }
00660 
00661     int max_nx = 0;
00662     for (int iblock=0;iblock < nblock;iblock++) {
00663 
00664       int x0 = pos[iblock];
00665       int nx = pos[iblock+1] - x0;
00666       max_nx = std::max(max_nx, nx);
00667 
00668       int width_out;
00669       float2* data_out;
00670       if (dataPtrsYZX[iblock] == NULL) {
00671         // Local transpose, use internal buffer
00672         data_out = d_data + jsize*ksize*x0;
00673         width_out = jsize;
00674       } else {
00675         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00676         data_out = dataPtrsYZX[iblock];
00677         width_out = isize_out;
00678       }
00679 
00680       TransposeBatch<float2> batch;
00681       batch.nx        = nx;
00682       batch.ysize_out = width_out;
00683       batch.zsize_out = ksize;
00684       batch.data_in   = data+x0;
00685       batch.data_out  = data_out;
00686 
00687       h_batchesYZX[iperm*nblock + iblock] = batch;
00688 
00689     // transpose_xyz_yzx(
00690     //   nx, jsize, ksize,
00691     //   isize, jsize,
00692     //   width_out, ksize,
00693     //   data+x0, data_out, stream);
00694     }
00695 
00696     max_nx_YZX[iperm] = max_nx;
00697   }
00698 
00699   copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
00700   cudaCheck(cudaStreamSynchronize(stream));
00701   delete [] h_batchesYZX;
00702 }

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 707 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

00707                                                                                    {
00708   if (dataPtrsZXY.size() != dataPtrsNew.size())
00709     NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
00710   for (int iblock=0;iblock < nblock;iblock++) {
00711     dataPtrsZXY[iblock] = dataPtrsNew[iblock];
00712   }
00713 
00714   // Build batched data structures
00715   TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
00716 
00717   for (int iperm=0;iperm < 3;iperm++) {
00718     int isize_out;
00719     if (iperm == 0) {
00720       // Perm_cX_Y_Z:
00721       // XYZ -> ZXY
00722       isize_out = pmeGrid.K3;
00723     } else if (iperm == 1) {
00724       // Perm_Z_cX_Y:
00725       // ZXY -> YZX
00726       isize_out = pmeGrid.K2;
00727     } else {
00728       // Perm_Y_Z_cX:
00729       // YZX -> XYZ
00730       isize_out = pmeGrid.K1/2+1;
00731     }
00732 
00733     int max_nx = 0;
00734     for (int iblock=0;iblock < nblock;iblock++) {
00735 
00736       int x0 = pos[iblock];
00737       int nx = pos[iblock+1] - x0;
00738       max_nx = std::max(max_nx, nx);
00739 
00740       int width_out;
00741       float2* data_out;
00742       if (dataPtrsZXY[iblock] == NULL) {
00743         // Local transpose, use internal buffer
00744         data_out = d_data + jsize*ksize*x0;
00745         width_out = ksize;
00746       } else {
00747         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00748         data_out = dataPtrsZXY[iblock];
00749         width_out = isize_out;
00750       }
00751 
00752       TransposeBatch<float2> batch;
00753       batch.nx        = nx;
00754       batch.zsize_out = width_out;
00755       batch.xsize_out = nx;
00756       batch.data_in   = data+x0;
00757       batch.data_out  = data_out;
00758 
00759       h_batchesZXY[iperm*nblock + iblock] = batch;
00760     }
00761 
00762     max_nx_ZXY[iperm] = max_nx;
00763   }
00764 
00765   copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
00766   cudaCheck(cudaStreamSynchronize(stream));
00767   delete [] h_batchesZXY;
00768 }

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 770 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00770                                                            {
00771   cudaCheck(cudaSetDevice(deviceID));
00772 
00773   int iperm;
00774   switch(permutation) {
00775     case Perm_Z_cX_Y:
00776     // ZXY -> XYZ
00777     iperm = 0;
00778     break;
00779     case Perm_cX_Y_Z:
00780     // XYZ -> YZX
00781     iperm = 1;
00782     break;
00783     case Perm_Y_Z_cX:
00784     // YZX -> ZXY
00785     iperm = 2;
00786     break;
00787     default:
00788     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00789     break;
00790   }
00791 
00792   batchTranspose_xyz_yzx(
00793     nblock, batchesYZX + iperm*nblock,
00794     max_nx_YZX[iperm], jsize, ksize,
00795     isize, jsize, stream);
00796 
00797 
00798 /*
00799   int isize_out;
00800   switch(permutation) {
00801     case Perm_Z_cX_Y:
00802     // ZXY -> XYZ
00803     isize_out = pmeGrid.K1/2+1;
00804     break;
00805     case Perm_cX_Y_Z:
00806     // XYZ -> YZX
00807     isize_out = pmeGrid.K2;
00808     break;
00809     case Perm_Y_Z_cX:
00810     // YZX -> ZXY
00811     isize_out = pmeGrid.K3;
00812     break;
00813     default:
00814     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00815     break;
00816   }
00817 
00818   for (int iblock=0;iblock < nblock;iblock++) {
00819 
00820     int x0 = pos[iblock];
00821     int nx = pos[iblock+1] - x0;
00822 
00823     int width_out;
00824     float2* data_out;
00825     if (dataPtrsYZX[iblock] == NULL) {
00826       // Local transpose, use internal buffer
00827       data_out = d_data + jsize*ksize*x0;
00828       width_out = jsize;
00829     } else {
00830       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00831       data_out = dataPtrsYZX[iblock];
00832       width_out = isize_out;
00833     }
00834 
00835     transpose_xyz_yzx(
00836       nx, jsize, ksize,
00837       isize, jsize,
00838       width_out, ksize,
00839       data+x0, data_out, stream);
00840   }
00841 */
00842 }

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 844 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00844                                                            {
00845   cudaCheck(cudaSetDevice(deviceID));
00846 
00847   int iperm;
00848   switch(permutation) {
00849     case Perm_cX_Y_Z:
00850     // XYZ -> ZXY
00851     iperm = 0;
00852     break;
00853     case Perm_Z_cX_Y:
00854     // ZXY -> YZX
00855     iperm = 1;
00856     break;
00857     case Perm_Y_Z_cX:
00858     // YZX -> XYZ
00859     iperm = 2;
00860     break;
00861     default:
00862     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00863     break;
00864   }
00865 
00866   batchTranspose_xyz_zxy(
00867     nblock, batchesZXY + iperm*nblock,
00868     max_nx_ZXY[iperm], jsize, ksize,
00869     isize, jsize, stream);
00870 
00871 /*
00872   int isize_out;
00873   switch(permutation) {
00874     case Perm_cX_Y_Z:
00875     // XYZ -> ZXY
00876     isize_out = pmeGrid.K3;
00877     break;
00878     case Perm_Z_cX_Y:
00879     // ZXY -> YZX
00880     isize_out = pmeGrid.K2;
00881     break;
00882     case Perm_Y_Z_cX:
00883     // YZX -> XYZ
00884     isize_out = pmeGrid.K1/2+1;
00885     break;
00886     default:
00887     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00888     break;
00889   }
00890 
00891   for (int iblock=0;iblock < nblock;iblock++) {
00892 
00893     int x0 = pos[iblock];
00894     int nx = pos[iblock+1] - x0;
00895 
00896     int width_out;
00897     float2* data_out;
00898     if (dataPtrsZXY[iblock] == NULL) {
00899       // Local transpose, use internal buffer
00900       data_out = d_data + jsize*ksize*x0;
00901       width_out = ksize;
00902     } else {
00903       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00904       data_out = dataPtrsZXY[iblock];
00905       width_out = isize_out;
00906     }
00907 
00908     transpose_xyz_zxy(
00909       nx, jsize, ksize,
00910       isize, jsize,
00911       width_out, nx,
00912       data+x0, data_out, stream);
00913   }
00914 */
00915 }

void CudaPmeTranspose::waitStreamSynchronize (  ) 

Definition at line 917 of file CudaPmeSolverUtil.C.

References cudaCheck.

00917                                              {
00918   cudaCheck(cudaSetDevice(deviceID));
00919   cudaCheck(cudaStreamSynchronize(stream));
00920 }


The documentation for this class was generated from the following files:
Generated on Fri Oct 19 01:17:19 2018 for NAMD by  doxygen 1.4.7