CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

List of all members.

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 ~CudaPmeTranspose ()
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
void transposeXYZtoYZX (const float2 *data)
void transposeXYZtoZXY (const float2 *data)
void waitStreamSynchronize ()
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
float2getBuffer (const int iblock)
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)

Detailed Description

Definition at line 139 of file CudaPmeSolverUtil.h.


Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 617 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

00618                                                                            : 
00619   PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
00620   cudaCheck(cudaSetDevice(deviceID));
00621 
00622   allocate_device<float2>(&d_data, dataSize);
00623 #ifndef P2P_ENABLE_3D
00624   allocate_device<float2>(&d_buffer, dataSize);
00625 #endif
00626 
00627   // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
00628   dataPtrsYZX.resize(nblock, NULL);
00629   dataPtrsZXY.resize(nblock, NULL);
00630 
00631   allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
00632   allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
00633 }

CudaPmeTranspose::~CudaPmeTranspose (  ) 

Definition at line 635 of file CudaPmeSolverUtil.C.

References cudaCheck.

00635                                     {
00636   cudaCheck(cudaSetDevice(deviceID));
00637   deallocate_device<float2>(&d_data);
00638 #ifndef P2P_ENABLE_3D
00639   deallocate_device<float2>(&d_buffer);
00640 #endif
00641   deallocate_device< TransposeBatch<float2> >(&batchesZXY);
00642   deallocate_device< TransposeBatch<float2> >(&batchesYZX);
00643 }


Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 980 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00980                                                                                 {
00981   cudaCheck(cudaSetDevice(deviceID));
00982 
00983   if (iblock >= nblock)
00984     NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
00985 
00986   // Determine block size = how much we're copying
00987   int i0, i1, j0, j1, k0, k1;
00988   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00989   int ni = i1-i0+1;
00990   int nj = j1-j0+1;
00991   int nk = k1-k0+1;
00992 
00993   float2* data_in = d_buffer + i0*nj*nk;
00994 
00995   copy3D_DtoD<float2>(data_in, data_out,
00996     0, 0, 0,
00997     ni, nj,
00998     i0, 0, 0,
00999     isize, jsize,
01000     ni, nj, nk, stream);
01001 }

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 934 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

00934                                                                                                   {
00935   cudaCheck(cudaSetDevice(deviceID));
00936 
00937   if (iblock >= nblock)
00938     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
00939 
00940   int x0 = pos[iblock];
00941   int nx = pos[iblock+1] - x0;
00942 
00943   int copySize  = jsize*ksize*nx;
00944   int copyStart = jsize*ksize*x0;
00945 
00946   if (copyStart + copySize > dataSize)
00947     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
00948 
00949   if (copySize > h_dataSize) 
00950     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
00951 
00952   copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
00953 }

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 955 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00955                                                                                                {
00956   cudaCheck(cudaSetDevice(deviceID));
00957 
00958   if (iblock >= nblock)
00959     NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
00960 
00961   // Determine block size = how much we're copying
00962   int i0, i1, j0, j1, k0, k1;
00963   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00964   int ni = i1-i0+1;
00965   int nj = j1-j0+1;
00966   int nk = k1-k0+1;
00967 
00968   copy3D_HtoD<float2>(data_in, data_out,
00969     0, 0, 0,
00970     ni, nj,
00971     i0, 0, 0,
00972     isize, jsize,
00973     ni, nj, nk, stream);
00974 }

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1021 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01022                     {
01023 
01024   int iblock_out = jblock;
01025   int jblock_out = kblock;
01026   int kblock_out = iblock;
01027 
01028   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01029 }

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1031 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01032                     {
01033 
01034   int iblock_out = kblock;
01035   int jblock_out = iblock;
01036   int kblock_out = jblock;
01037 
01038   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01039 }

float2 * CudaPmeTranspose::getBuffer ( const int  iblock  ) 

Definition at line 1006 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

01006                                                     {
01007   if (iblock >= nblock)
01008     NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
01009 
01010   // Determine block size = how much we're copying
01011   int i0, i1, j0, j1, k0, k1;
01012   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
01013   int ni = i1-i0+1;
01014   int nj = j1-j0+1;
01015   int nk = k1-k0+1;
01016 
01017   return d_buffer + i0*nj*nk;
01018 }

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 648 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

00648                                                                                    {
00649   if (dataPtrsYZX.size() != dataPtrsNew.size())
00650     NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
00651   for (int iblock=0;iblock < nblock;iblock++) {
00652     dataPtrsYZX[iblock] = dataPtrsNew[iblock];
00653   }
00654   // Build batched data structures
00655   TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
00656 
00657   for (int iperm=0;iperm < 3;iperm++) {
00658     int isize_out;
00659     if (iperm == 0) {
00660       // Perm_Z_cX_Y:
00661       // ZXY -> XYZ
00662       isize_out = pmeGrid.K1/2+1;
00663     } else if (iperm == 1) {
00664       // Perm_cX_Y_Z:
00665       // XYZ -> YZX
00666       isize_out = pmeGrid.K2;
00667     } else {
00668       // Perm_Y_Z_cX:
00669       // YZX -> ZXY
00670       isize_out = pmeGrid.K3;
00671     }
00672 
00673     int max_nx = 0;
00674     for (int iblock=0;iblock < nblock;iblock++) {
00675 
00676       int x0 = pos[iblock];
00677       int nx = pos[iblock+1] - x0;
00678       max_nx = std::max(max_nx, nx);
00679 
00680       int width_out;
00681       float2* data_out;
00682       if (dataPtrsYZX[iblock] == NULL) {
00683         // Local transpose, use internal buffer
00684         data_out = d_data + jsize*ksize*x0;
00685         width_out = jsize;
00686       } else {
00687         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00688         data_out = dataPtrsYZX[iblock];
00689         width_out = isize_out;
00690       }
00691 
00692       TransposeBatch<float2> batch;
00693       batch.nx        = nx;
00694       batch.ysize_out = width_out;
00695       batch.zsize_out = ksize;
00696       batch.data_in   = data+x0;
00697       batch.data_out  = data_out;
00698 
00699       h_batchesYZX[iperm*nblock + iblock] = batch;
00700 
00701     // transpose_xyz_yzx(
00702     //   nx, jsize, ksize,
00703     //   isize, jsize,
00704     //   width_out, ksize,
00705     //   data+x0, data_out, stream);
00706     }
00707 
00708     max_nx_YZX[iperm] = max_nx;
00709   }
00710 
00711   copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
00712   cudaCheck(cudaStreamSynchronize(stream));
00713   delete [] h_batchesYZX;
00714 }

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 719 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

00719                                                                                    {
00720   if (dataPtrsZXY.size() != dataPtrsNew.size())
00721     NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
00722   for (int iblock=0;iblock < nblock;iblock++) {
00723     dataPtrsZXY[iblock] = dataPtrsNew[iblock];
00724   }
00725 
00726   // Build batched data structures
00727   TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
00728 
00729   for (int iperm=0;iperm < 3;iperm++) {
00730     int isize_out;
00731     if (iperm == 0) {
00732       // Perm_cX_Y_Z:
00733       // XYZ -> ZXY
00734       isize_out = pmeGrid.K3;
00735     } else if (iperm == 1) {
00736       // Perm_Z_cX_Y:
00737       // ZXY -> YZX
00738       isize_out = pmeGrid.K2;
00739     } else {
00740       // Perm_Y_Z_cX:
00741       // YZX -> XYZ
00742       isize_out = pmeGrid.K1/2+1;
00743     }
00744 
00745     int max_nx = 0;
00746     for (int iblock=0;iblock < nblock;iblock++) {
00747 
00748       int x0 = pos[iblock];
00749       int nx = pos[iblock+1] - x0;
00750       max_nx = std::max(max_nx, nx);
00751 
00752       int width_out;
00753       float2* data_out;
00754       if (dataPtrsZXY[iblock] == NULL) {
00755         // Local transpose, use internal buffer
00756         data_out = d_data + jsize*ksize*x0;
00757         width_out = ksize;
00758       } else {
00759         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00760         data_out = dataPtrsZXY[iblock];
00761         width_out = isize_out;
00762       }
00763 
00764       TransposeBatch<float2> batch;
00765       batch.nx        = nx;
00766       batch.zsize_out = width_out;
00767       batch.xsize_out = nx;
00768       batch.data_in   = data+x0;
00769       batch.data_out  = data_out;
00770 
00771       h_batchesZXY[iperm*nblock + iblock] = batch;
00772     }
00773 
00774     max_nx_ZXY[iperm] = max_nx;
00775   }
00776 
00777   copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
00778   cudaCheck(cudaStreamSynchronize(stream));
00779   delete [] h_batchesZXY;
00780 }

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 782 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00782                                                            {
00783   cudaCheck(cudaSetDevice(deviceID));
00784 
00785   int iperm;
00786   switch(permutation) {
00787     case Perm_Z_cX_Y:
00788     // ZXY -> XYZ
00789     iperm = 0;
00790     break;
00791     case Perm_cX_Y_Z:
00792     // XYZ -> YZX
00793     iperm = 1;
00794     break;
00795     case Perm_Y_Z_cX:
00796     // YZX -> ZXY
00797     iperm = 2;
00798     break;
00799     default:
00800     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00801     break;
00802   }
00803 
00804   batchTranspose_xyz_yzx(
00805     nblock, batchesYZX + iperm*nblock,
00806     max_nx_YZX[iperm], jsize, ksize,
00807     isize, jsize, stream);
00808 
00809 
00810 /*
00811   int isize_out;
00812   switch(permutation) {
00813     case Perm_Z_cX_Y:
00814     // ZXY -> XYZ
00815     isize_out = pmeGrid.K1/2+1;
00816     break;
00817     case Perm_cX_Y_Z:
00818     // XYZ -> YZX
00819     isize_out = pmeGrid.K2;
00820     break;
00821     case Perm_Y_Z_cX:
00822     // YZX -> ZXY
00823     isize_out = pmeGrid.K3;
00824     break;
00825     default:
00826     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00827     break;
00828   }
00829 
00830   for (int iblock=0;iblock < nblock;iblock++) {
00831 
00832     int x0 = pos[iblock];
00833     int nx = pos[iblock+1] - x0;
00834 
00835     int width_out;
00836     float2* data_out;
00837     if (dataPtrsYZX[iblock] == NULL) {
00838       // Local transpose, use internal buffer
00839       data_out = d_data + jsize*ksize*x0;
00840       width_out = jsize;
00841     } else {
00842       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00843       data_out = dataPtrsYZX[iblock];
00844       width_out = isize_out;
00845     }
00846 
00847     transpose_xyz_yzx(
00848       nx, jsize, ksize,
00849       isize, jsize,
00850       width_out, ksize,
00851       data+x0, data_out, stream);
00852   }
00853 */
00854 }

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 856 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00856                                                            {
00857   cudaCheck(cudaSetDevice(deviceID));
00858 
00859   int iperm;
00860   switch(permutation) {
00861     case Perm_cX_Y_Z:
00862     // XYZ -> ZXY
00863     iperm = 0;
00864     break;
00865     case Perm_Z_cX_Y:
00866     // ZXY -> YZX
00867     iperm = 1;
00868     break;
00869     case Perm_Y_Z_cX:
00870     // YZX -> XYZ
00871     iperm = 2;
00872     break;
00873     default:
00874     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00875     break;
00876   }
00877 
00878   batchTranspose_xyz_zxy(
00879     nblock, batchesZXY + iperm*nblock,
00880     max_nx_ZXY[iperm], jsize, ksize,
00881     isize, jsize, stream);
00882 
00883 /*
00884   int isize_out;
00885   switch(permutation) {
00886     case Perm_cX_Y_Z:
00887     // XYZ -> ZXY
00888     isize_out = pmeGrid.K3;
00889     break;
00890     case Perm_Z_cX_Y:
00891     // ZXY -> YZX
00892     isize_out = pmeGrid.K2;
00893     break;
00894     case Perm_Y_Z_cX:
00895     // YZX -> XYZ
00896     isize_out = pmeGrid.K1/2+1;
00897     break;
00898     default:
00899     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00900     break;
00901   }
00902 
00903   for (int iblock=0;iblock < nblock;iblock++) {
00904 
00905     int x0 = pos[iblock];
00906     int nx = pos[iblock+1] - x0;
00907 
00908     int width_out;
00909     float2* data_out;
00910     if (dataPtrsZXY[iblock] == NULL) {
00911       // Local transpose, use internal buffer
00912       data_out = d_data + jsize*ksize*x0;
00913       width_out = ksize;
00914     } else {
00915       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00916       data_out = dataPtrsZXY[iblock];
00917       width_out = isize_out;
00918     }
00919 
00920     transpose_xyz_zxy(
00921       nx, jsize, ksize,
00922       isize, jsize,
00923       width_out, nx,
00924       data+x0, data_out, stream);
00925   }
00926 */
00927 }

void CudaPmeTranspose::waitStreamSynchronize (  ) 

Definition at line 929 of file CudaPmeSolverUtil.C.

References cudaCheck.

00929                                              {
00930   cudaCheck(cudaSetDevice(deviceID));
00931   cudaCheck(cudaStreamSynchronize(stream));
00932 }


The documentation for this class was generated from the following files:

Generated on 10 Dec 2018 for NAMD by  doxygen 1.6.1