CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:

PmeTranspose List of all members.

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 ~CudaPmeTranspose ()
void setDataPtrsYZX (std::vector< float2 * > &dataPtrsNew, float2 *data)
void setDataPtrsZXY (std::vector< float2 * > &dataPtrsNew, float2 *data)
void transposeXYZtoYZX (const float2 *data)
void transposeXYZtoZXY (const float2 *data)
void waitStreamSynchronize ()
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
float2getBuffer (const int iblock)
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)

Detailed Description

Definition at line 139 of file CudaPmeSolverUtil.h.


Constructor & Destructor Documentation

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 615 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

00616                                                                            : 
00617   PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
00618   cudaCheck(cudaSetDevice(deviceID));
00619 
00620   allocate_device<float2>(&d_data, dataSize);
00621 #ifndef P2P_ENABLE_3D
00622   allocate_device<float2>(&d_buffer, dataSize);
00623 #endif
00624 
00625   // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
00626   dataPtrsYZX.resize(nblock, NULL);
00627   dataPtrsZXY.resize(nblock, NULL);
00628 
00629   allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
00630   allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
00631 }

CudaPmeTranspose::~CudaPmeTranspose (  ) 

Definition at line 633 of file CudaPmeSolverUtil.C.

References cudaCheck.

00633                                     {
00634   cudaCheck(cudaSetDevice(deviceID));
00635   deallocate_device<float2>(&d_data);
00636 #ifndef P2P_ENABLE_3D
00637   deallocate_device<float2>(&d_buffer);
00638 #endif
00639   deallocate_device< TransposeBatch<float2> >(&batchesZXY);
00640   deallocate_device< TransposeBatch<float2> >(&batchesYZX);
00641 }


Member Function Documentation

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 data_out 
)

Definition at line 978 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00978                                                                                 {
00979   cudaCheck(cudaSetDevice(deviceID));
00980 
00981   if (iblock >= nblock)
00982     NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
00983 
00984   // Determine block size = how much we're copying
00985   int i0, i1, j0, j1, k0, k1;
00986   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00987   int ni = i1-i0+1;
00988   int nj = j1-j0+1;
00989   int nk = k1-k0+1;
00990 
00991   float2* data_in = d_buffer + i0*nj*nk;
00992 
00993   copy3D_DtoD<float2>(data_in, data_out,
00994     0, 0, 0,
00995     ni, nj,
00996     i0, 0, 0,
00997     isize, jsize,
00998     ni, nj, nk, stream);
00999 }

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 h_data,
const int  h_dataSize 
)

Definition at line 932 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

00932                                                                                                   {
00933   cudaCheck(cudaSetDevice(deviceID));
00934 
00935   if (iblock >= nblock)
00936     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
00937 
00938   int x0 = pos[iblock];
00939   int nx = pos[iblock+1] - x0;
00940 
00941   int copySize  = jsize*ksize*nx;
00942   int copyStart = jsize*ksize*x0;
00943 
00944   if (copyStart + copySize > dataSize)
00945     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
00946 
00947   if (copySize > h_dataSize) 
00948     NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
00949 
00950   copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
00951 }

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 data_in,
float2 data_out 
)

Definition at line 953 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

00953                                                                                                {
00954   cudaCheck(cudaSetDevice(deviceID));
00955 
00956   if (iblock >= nblock)
00957     NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
00958 
00959   // Determine block size = how much we're copying
00960   int i0, i1, j0, j1, k0, k1;
00961   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
00962   int ni = i1-i0+1;
00963   int nj = j1-j0+1;
00964   int nk = k1-k0+1;
00965 
00966   copy3D_HtoD<float2>(data_in, data_out,
00967     0, 0, 0,
00968     ni, nj,
00969     i0, 0, 0,
00970     isize, jsize,
00971     ni, nj, nk, stream);
00972 }

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1019 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01020                     {
01021 
01022   int iblock_out = jblock;
01023   int jblock_out = kblock;
01024   int kblock_out = iblock;
01025 
01026   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01027 }

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 data_out 
)

Definition at line 1029 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

01030                     {
01031 
01032   int iblock_out = kblock;
01033   int jblock_out = iblock;
01034   int kblock_out = jblock;
01035 
01036   copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
01037 }

float2 * CudaPmeTranspose::getBuffer ( const int  iblock  ) 

Definition at line 1004 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

01004                                                     {
01005   if (iblock >= nblock)
01006     NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
01007 
01008   // Determine block size = how much we're copying
01009   int i0, i1, j0, j1, k0, k1;
01010   getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
01011   int ni = i1-i0+1;
01012   int nj = j1-j0+1;
01013   int nk = k1-k0+1;
01014 
01015   return d_buffer + i0*nj*nk;
01016 }

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 646 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

00646                                                                                    {
00647   if (dataPtrsYZX.size() != dataPtrsNew.size())
00648     NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
00649   for (int iblock=0;iblock < nblock;iblock++) {
00650     dataPtrsYZX[iblock] = dataPtrsNew[iblock];
00651   }
00652   // Build batched data structures
00653   TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
00654 
00655   for (int iperm=0;iperm < 3;iperm++) {
00656     int isize_out;
00657     if (iperm == 0) {
00658       // Perm_Z_cX_Y:
00659       // ZXY -> XYZ
00660       isize_out = pmeGrid.K1/2+1;
00661     } else if (iperm == 1) {
00662       // Perm_cX_Y_Z:
00663       // XYZ -> YZX
00664       isize_out = pmeGrid.K2;
00665     } else {
00666       // Perm_Y_Z_cX:
00667       // YZX -> ZXY
00668       isize_out = pmeGrid.K3;
00669     }
00670 
00671     int max_nx = 0;
00672     for (int iblock=0;iblock < nblock;iblock++) {
00673 
00674       int x0 = pos[iblock];
00675       int nx = pos[iblock+1] - x0;
00676       max_nx = std::max(max_nx, nx);
00677 
00678       int width_out;
00679       float2* data_out;
00680       if (dataPtrsYZX[iblock] == NULL) {
00681         // Local transpose, use internal buffer
00682         data_out = d_data + jsize*ksize*x0;
00683         width_out = jsize;
00684       } else {
00685         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00686         data_out = dataPtrsYZX[iblock];
00687         width_out = isize_out;
00688       }
00689 
00690       TransposeBatch<float2> batch;
00691       batch.nx        = nx;
00692       batch.ysize_out = width_out;
00693       batch.zsize_out = ksize;
00694       batch.data_in   = data+x0;
00695       batch.data_out  = data_out;
00696 
00697       h_batchesYZX[iperm*nblock + iblock] = batch;
00698 
00699     // transpose_xyz_yzx(
00700     //   nx, jsize, ksize,
00701     //   isize, jsize,
00702     //   width_out, ksize,
00703     //   data+x0, data_out, stream);
00704     }
00705 
00706     max_nx_YZX[iperm] = max_nx;
00707   }
00708 
00709   copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
00710   cudaCheck(cudaStreamSynchronize(stream));
00711   delete [] h_batchesYZX;
00712 }

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 * > &  dataPtrsNew,
float2 data 
)

Definition at line 717 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

00717                                                                                    {
00718   if (dataPtrsZXY.size() != dataPtrsNew.size())
00719     NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
00720   for (int iblock=0;iblock < nblock;iblock++) {
00721     dataPtrsZXY[iblock] = dataPtrsNew[iblock];
00722   }
00723 
00724   // Build batched data structures
00725   TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
00726 
00727   for (int iperm=0;iperm < 3;iperm++) {
00728     int isize_out;
00729     if (iperm == 0) {
00730       // Perm_cX_Y_Z:
00731       // XYZ -> ZXY
00732       isize_out = pmeGrid.K3;
00733     } else if (iperm == 1) {
00734       // Perm_Z_cX_Y:
00735       // ZXY -> YZX
00736       isize_out = pmeGrid.K2;
00737     } else {
00738       // Perm_Y_Z_cX:
00739       // YZX -> XYZ
00740       isize_out = pmeGrid.K1/2+1;
00741     }
00742 
00743     int max_nx = 0;
00744     for (int iblock=0;iblock < nblock;iblock++) {
00745 
00746       int x0 = pos[iblock];
00747       int nx = pos[iblock+1] - x0;
00748       max_nx = std::max(max_nx, nx);
00749 
00750       int width_out;
00751       float2* data_out;
00752       if (dataPtrsZXY[iblock] == NULL) {
00753         // Local transpose, use internal buffer
00754         data_out = d_data + jsize*ksize*x0;
00755         width_out = ksize;
00756       } else {
00757         // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00758         data_out = dataPtrsZXY[iblock];
00759         width_out = isize_out;
00760       }
00761 
00762       TransposeBatch<float2> batch;
00763       batch.nx        = nx;
00764       batch.zsize_out = width_out;
00765       batch.xsize_out = nx;
00766       batch.data_in   = data+x0;
00767       batch.data_out  = data_out;
00768 
00769       h_batchesZXY[iperm*nblock + iblock] = batch;
00770     }
00771 
00772     max_nx_ZXY[iperm] = max_nx;
00773   }
00774 
00775   copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
00776   cudaCheck(cudaStreamSynchronize(stream));
00777   delete [] h_batchesZXY;
00778 }

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 780 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00780                                                            {
00781   cudaCheck(cudaSetDevice(deviceID));
00782 
00783   int iperm;
00784   switch(permutation) {
00785     case Perm_Z_cX_Y:
00786     // ZXY -> XYZ
00787     iperm = 0;
00788     break;
00789     case Perm_cX_Y_Z:
00790     // XYZ -> YZX
00791     iperm = 1;
00792     break;
00793     case Perm_Y_Z_cX:
00794     // YZX -> ZXY
00795     iperm = 2;
00796     break;
00797     default:
00798     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00799     break;
00800   }
00801 
00802   batchTranspose_xyz_yzx(
00803     nblock, batchesYZX + iperm*nblock,
00804     max_nx_YZX[iperm], jsize, ksize,
00805     isize, jsize, stream);
00806 
00807 
00808 /*
00809   int isize_out;
00810   switch(permutation) {
00811     case Perm_Z_cX_Y:
00812     // ZXY -> XYZ
00813     isize_out = pmeGrid.K1/2+1;
00814     break;
00815     case Perm_cX_Y_Z:
00816     // XYZ -> YZX
00817     isize_out = pmeGrid.K2;
00818     break;
00819     case Perm_Y_Z_cX:
00820     // YZX -> ZXY
00821     isize_out = pmeGrid.K3;
00822     break;
00823     default:
00824     NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
00825     break;
00826   }
00827 
00828   for (int iblock=0;iblock < nblock;iblock++) {
00829 
00830     int x0 = pos[iblock];
00831     int nx = pos[iblock+1] - x0;
00832 
00833     int width_out;
00834     float2* data_out;
00835     if (dataPtrsYZX[iblock] == NULL) {
00836       // Local transpose, use internal buffer
00837       data_out = d_data + jsize*ksize*x0;
00838       width_out = jsize;
00839     } else {
00840       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00841       data_out = dataPtrsYZX[iblock];
00842       width_out = isize_out;
00843     }
00844 
00845     transpose_xyz_yzx(
00846       nx, jsize, ksize,
00847       isize, jsize,
00848       width_out, ksize,
00849       data+x0, data_out, stream);
00850   }
00851 */
00852 }

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 data  )  [virtual]

Implements PmeTranspose.

Definition at line 854 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

00854                                                            {
00855   cudaCheck(cudaSetDevice(deviceID));
00856 
00857   int iperm;
00858   switch(permutation) {
00859     case Perm_cX_Y_Z:
00860     // XYZ -> ZXY
00861     iperm = 0;
00862     break;
00863     case Perm_Z_cX_Y:
00864     // ZXY -> YZX
00865     iperm = 1;
00866     break;
00867     case Perm_Y_Z_cX:
00868     // YZX -> XYZ
00869     iperm = 2;
00870     break;
00871     default:
00872     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00873     break;
00874   }
00875 
00876   batchTranspose_xyz_zxy(
00877     nblock, batchesZXY + iperm*nblock,
00878     max_nx_ZXY[iperm], jsize, ksize,
00879     isize, jsize, stream);
00880 
00881 /*
00882   int isize_out;
00883   switch(permutation) {
00884     case Perm_cX_Y_Z:
00885     // XYZ -> ZXY
00886     isize_out = pmeGrid.K3;
00887     break;
00888     case Perm_Z_cX_Y:
00889     // ZXY -> YZX
00890     isize_out = pmeGrid.K2;
00891     break;
00892     case Perm_Y_Z_cX:
00893     // YZX -> XYZ
00894     isize_out = pmeGrid.K1/2+1;
00895     break;
00896     default:
00897     NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
00898     break;
00899   }
00900 
00901   for (int iblock=0;iblock < nblock;iblock++) {
00902 
00903     int x0 = pos[iblock];
00904     int nx = pos[iblock+1] - x0;
00905 
00906     int width_out;
00907     float2* data_out;
00908     if (dataPtrsZXY[iblock] == NULL) {
00909       // Local transpose, use internal buffer
00910       data_out = d_data + jsize*ksize*x0;
00911       width_out = ksize;
00912     } else {
00913       // Non-local tranpose, use buffer in dataPtr[] and the size of that buffer
00914       data_out = dataPtrsZXY[iblock];
00915       width_out = isize_out;
00916     }
00917 
00918     transpose_xyz_zxy(
00919       nx, jsize, ksize,
00920       isize, jsize,
00921       width_out, nx,
00922       data+x0, data_out, stream);
00923   }
00924 */
00925 }

void CudaPmeTranspose::waitStreamSynchronize (  ) 

Definition at line 927 of file CudaPmeSolverUtil.C.

References cudaCheck.

00927                                              {
00928   cudaCheck(cudaSetDevice(deviceID));
00929   cudaCheck(cudaStreamSynchronize(stream));
00930 }


The documentation for this class was generated from the following files:
Generated on Tue Nov 21 01:17:18 2017 for NAMD by  doxygen 1.4.7