NAMD
Public Member Functions | List of all members
CudaPmeTranspose Class Reference

#include <CudaPmeSolverUtil.h>

Inheritance diagram for CudaPmeTranspose:
PmeTranspose

Public Member Functions

 CudaPmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
 
 ~CudaPmeTranspose ()
 
void setDataPtrsYZX (std::vector< float2 *> &dataPtrsNew, float2 *data)
 
void setDataPtrsZXY (std::vector< float2 *> &dataPtrsNew, float2 *data)
 
void transposeXYZtoYZX (const float2 *data)
 
void transposeXYZtoZXY (const float2 *data)
 
void waitStreamSynchronize ()
 
void copyDataDeviceToHost (const int iblock, float2 *h_data, const int h_dataSize)
 
void copyDataHostToDevice (const int iblock, float2 *data_in, float2 *data_out)
 
void copyDataDeviceToDevice (const int iblock, float2 *data_out)
 
float2 * getBuffer (const int iblock)
 
void copyDataToPeerDeviceYZX (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
void copyDataToPeerDeviceZXY (const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
 
- Public Member Functions inherited from PmeTranspose
 PmeTranspose (PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
 
virtual ~PmeTranspose ()
 

Additional Inherited Members

- Protected Attributes inherited from PmeTranspose
PmeGrid pmeGrid
 
const int permutation
 
const int jblock
 
const int kblock
 
int isize
 
int jsize
 
int ksize
 
int dataSize
 
int nblock
 
std::vector< int > pos
 

Detailed Description

Definition at line 158 of file CudaPmeSolverUtil.h.

Constructor & Destructor Documentation

◆ CudaPmeTranspose()

CudaPmeTranspose::CudaPmeTranspose ( PmeGrid  pmeGrid,
const int  permutation,
const int  jblock,
const int  kblock,
int  deviceID,
cudaStream_t  stream 
)

Definition at line 842 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, and PmeTranspose::nblock.

843  :
844  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
845  cudaCheck(cudaSetDevice(deviceID));
846 
847  allocate_device<float2>(&d_data, dataSize);
848 #ifndef P2P_ENABLE_3D
849  allocate_device<float2>(&d_buffer, dataSize);
850 #endif
851 
852  // Setup data pointers to NULL, these can be overridden later on by using setDataPtrs()
853  dataPtrsYZX.resize(nblock, NULL);
854  dataPtrsZXY.resize(nblock, NULL);
855 
856  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
857  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
858 }
const int permutation
PmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock)
PmeGrid pmeGrid
const int jblock
const int kblock
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ ~CudaPmeTranspose()

CudaPmeTranspose::~CudaPmeTranspose ( )

Definition at line 860 of file CudaPmeSolverUtil.C.

References cudaCheck.

860  {
861  cudaCheck(cudaSetDevice(deviceID));
862  deallocate_device<float2>(&d_data);
863 #ifndef P2P_ENABLE_3D
864  deallocate_device<float2>(&d_buffer);
865 #endif
866  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
867  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
868 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

Member Function Documentation

◆ copyDataDeviceToDevice()

void CudaPmeTranspose::copyDataDeviceToDevice ( const int  iblock,
float2 *  data_out 
)

Definition at line 1204 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1204  {
1205  cudaCheck(cudaSetDevice(deviceID));
1206 
1207  if (iblock >= nblock)
1208  NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1209 
1210  // Determine block size = how much we're copying
1211  int i0, i1, j0, j1, k0, k1;
1212  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1213  int ni = i1-i0+1;
1214  int nj = j1-j0+1;
1215  int nk = k1-k0+1;
1216 
1217  float2* data_in = d_buffer + i0*nj*nk;
1218 
1219  copy3D_DtoD<float2>(data_in, data_out,
1220  0, 0, 0,
1221  ni, nj,
1222  i0, 0, 0,
1223  isize, jsize,
1224  ni, nj, nk, stream);
1225 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:195
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ copyDataDeviceToHost()

void CudaPmeTranspose::copyDataDeviceToHost ( const int  iblock,
float2 *  h_data,
const int  h_dataSize 
)

Definition at line 1158 of file CudaPmeSolverUtil.C.

References cudaCheck, PmeTranspose::dataSize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, and PmeTranspose::pos.

1158  {
1159  cudaCheck(cudaSetDevice(deviceID));
1160 
1161  if (iblock >= nblock)
1162  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1163 
1164  int x0 = pos[iblock];
1165  int nx = pos[iblock+1] - x0;
1166 
1167  int copySize = jsize*ksize*nx;
1168  int copyStart = jsize*ksize*x0;
1169 
1170  if (copyStart + copySize > dataSize)
1171  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1172 
1173  if (copySize > h_dataSize)
1174  NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1175 
1176  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1177 }
std::vector< int > pos
void NAMD_bug(const char *err_msg)
Definition: common.C:195
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ copyDataHostToDevice()

void CudaPmeTranspose::copyDataHostToDevice ( const int  iblock,
float2 *  data_in,
float2 *  data_out 
)

Definition at line 1179 of file CudaPmeSolverUtil.C.

References cudaCheck, getBlockDim(), PmeTranspose::isize, PmeTranspose::jblock, PmeTranspose::jsize, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1179  {
1180  cudaCheck(cudaSetDevice(deviceID));
1181 
1182  if (iblock >= nblock)
1183  NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1184 
1185  // Determine block size = how much we're copying
1186  int i0, i1, j0, j1, k0, k1;
1187  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1188  int ni = i1-i0+1;
1189  int nj = j1-j0+1;
1190  int nk = k1-k0+1;
1191 
1192  copy3D_HtoD<float2>(data_in, data_out,
1193  0, 0, 0,
1194  ni, nj,
1195  i0, 0, 0,
1196  isize, jsize,
1197  ni, nj, nk, stream);
1198 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:195
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ copyDataToPeerDeviceYZX()

void CudaPmeTranspose::copyDataToPeerDeviceYZX ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 *  data_out 
)

Definition at line 1245 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1246  {
1247 
1248  int iblock_out = jblock;
1249  int jblock_out = kblock;
1250  int kblock_out = iblock;
1251 
1252  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1253 }
const int jblock
const int kblock

◆ copyDataToPeerDeviceZXY()

void CudaPmeTranspose::copyDataToPeerDeviceZXY ( const int  iblock,
int  deviceID_out,
int  permutation_out,
float2 *  data_out 
)

Definition at line 1255 of file CudaPmeSolverUtil.C.

References PmeTranspose::jblock, and PmeTranspose::kblock.

1256  {
1257 
1258  int iblock_out = kblock;
1259  int jblock_out = iblock;
1260  int kblock_out = jblock;
1261 
1262  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1263 }
const int jblock
const int kblock

◆ getBuffer()

float2 * CudaPmeTranspose::getBuffer ( const int  iblock)

Definition at line 1230 of file CudaPmeSolverUtil.C.

References getBlockDim(), PmeTranspose::jblock, PmeTranspose::kblock, NAMD_bug(), PmeTranspose::nblock, PmeTranspose::permutation, and PmeTranspose::pmeGrid.

1230  {
1231  if (iblock >= nblock)
1232  NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1233 
1234  // Determine block size = how much we're copying
1235  int i0, i1, j0, j1, k0, k1;
1236  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
1237  int ni = i1-i0+1;
1238  int nj = j1-j0+1;
1239  int nk = k1-k0+1;
1240 
1241  return d_buffer + i0*nj*nk;
1242 }
const int permutation
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:195
const int jblock
const int kblock
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
Definition: PmeSolverUtil.h:89

◆ setDataPtrsYZX()

void CudaPmeTranspose::setDataPtrsYZX ( std::vector< float2 *> &  dataPtrsNew,
float2 *  data 
)

Definition at line 873 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::ysize_out, and TransposeBatch< T >::zsize_out.

873  {
874  if (dataPtrsYZX.size() != dataPtrsNew.size())
875  NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
876  for (int iblock=0;iblock < nblock;iblock++) {
877  dataPtrsYZX[iblock] = dataPtrsNew[iblock];
878  }
879  // Build batched data structures
880  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
881 
882  for (int iperm=0;iperm < 3;iperm++) {
883  int isize_out;
884  if (iperm == 0) {
885  // Perm_Z_cX_Y:
886  // ZXY -> XYZ
887  isize_out = pmeGrid.K1/2+1;
888  } else if (iperm == 1) {
889  // Perm_cX_Y_Z:
890  // XYZ -> YZX
891  isize_out = pmeGrid.K2;
892  } else {
893  // Perm_Y_Z_cX:
894  // YZX -> ZXY
895  isize_out = pmeGrid.K3;
896  }
897 
898  int max_nx = 0;
899  for (int iblock=0;iblock < nblock;iblock++) {
900 
901  int x0 = pos[iblock];
902  int nx = pos[iblock+1] - x0;
903  max_nx = std::max(max_nx, nx);
904 
905  int width_out;
906  float2* data_out;
907  if (dataPtrsYZX[iblock] == NULL) {
908  // Local transpose, use internal buffer
909  data_out = d_data + jsize*ksize*x0;
910  width_out = jsize;
911  } else {
912  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
913  data_out = dataPtrsYZX[iblock];
914  width_out = isize_out;
915  }
916 
917  TransposeBatch<float2> batch;
918  batch.nx = nx;
919  batch.ysize_out = width_out;
920  batch.zsize_out = ksize;
921  batch.data_in = data+x0;
922  batch.data_out = data_out;
923 
924  h_batchesYZX[iperm*nblock + iblock] = batch;
925 
926  // transpose_xyz_yzx(
927  // nx, jsize, ksize,
928  // isize, jsize,
929  // width_out, ksize,
930  // data+x0, data_out, stream);
931  }
932 
933  max_nx_YZX[iperm] = max_nx;
934  }
935 
936  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
937  cudaCheck(cudaStreamSynchronize(stream));
938  delete [] h_batchesYZX;
939 }
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
std::vector< int > pos
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int K3
Definition: PmeBase.h:21
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ setDataPtrsZXY()

void CudaPmeTranspose::setDataPtrsZXY ( std::vector< float2 *> &  dataPtrsNew,
float2 *  data 
)

Definition at line 944 of file CudaPmeSolverUtil.C.

References cudaCheck, TransposeBatch< T >::data_in, TransposeBatch< T >::data_out, PmeTranspose::jsize, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, TransposeBatch< T >::nx, PmeTranspose::pmeGrid, PmeTranspose::pos, TransposeBatch< T >::xsize_out, and TransposeBatch< T >::zsize_out.

944  {
945  if (dataPtrsZXY.size() != dataPtrsNew.size())
946  NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
947  for (int iblock=0;iblock < nblock;iblock++) {
948  dataPtrsZXY[iblock] = dataPtrsNew[iblock];
949  }
950 
951  // Build batched data structures
952  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
953 
954  for (int iperm=0;iperm < 3;iperm++) {
955  int isize_out;
956  if (iperm == 0) {
957  // Perm_cX_Y_Z:
958  // XYZ -> ZXY
959  isize_out = pmeGrid.K3;
960  } else if (iperm == 1) {
961  // Perm_Z_cX_Y:
962  // ZXY -> YZX
963  isize_out = pmeGrid.K2;
964  } else {
965  // Perm_Y_Z_cX:
966  // YZX -> XYZ
967  isize_out = pmeGrid.K1/2+1;
968  }
969 
970  int max_nx = 0;
971  for (int iblock=0;iblock < nblock;iblock++) {
972 
973  int x0 = pos[iblock];
974  int nx = pos[iblock+1] - x0;
975  max_nx = std::max(max_nx, nx);
976 
977  int width_out;
978  float2* data_out;
979  if (dataPtrsZXY[iblock] == NULL) {
980  // Local transpose, use internal buffer
981  data_out = d_data + jsize*ksize*x0;
982  width_out = ksize;
983  } else {
984  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
985  data_out = dataPtrsZXY[iblock];
986  width_out = isize_out;
987  }
988 
989  TransposeBatch<float2> batch;
990  batch.nx = nx;
991  batch.zsize_out = width_out;
992  batch.xsize_out = nx;
993  batch.data_in = data+x0;
994  batch.data_out = data_out;
995  h_batchesZXY[iperm*nblock + iblock] = batch;
996  }
997 
998  max_nx_ZXY[iperm] = max_nx;
999  }
1000 
1001  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
1002  cudaCheck(cudaStreamSynchronize(stream));
1003  delete [] h_batchesZXY;
1004 }
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
std::vector< int > pos
PmeGrid pmeGrid
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int K3
Definition: PmeBase.h:21
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ transposeXYZtoYZX()

void CudaPmeTranspose::transposeXYZtoYZX ( const float2 *  data)
virtual

Implements PmeTranspose.

Definition at line 1006 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_yzx(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

1006  {
1007  cudaCheck(cudaSetDevice(deviceID));
1008 
1009  int iperm;
1010  switch(permutation) {
1011  case Perm_Z_cX_Y:
1012  // ZXY -> XYZ
1013  iperm = 0;
1014  break;
1015  case Perm_cX_Y_Z:
1016  // XYZ -> YZX
1017  iperm = 1;
1018  break;
1019  case Perm_Y_Z_cX:
1020  // YZX -> ZXY
1021  iperm = 2;
1022  break;
1023  default:
1024  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1025  break;
1026  }
1027 
1028  batchTranspose_xyz_yzx(
1029  nblock, batchesYZX + iperm*nblock,
1030  max_nx_YZX[iperm], jsize, ksize,
1031  isize, jsize, stream);
1032 
1033 
1034 /*
1035  int isize_out;
1036  switch(permutation) {
1037  case Perm_Z_cX_Y:
1038  // ZXY -> XYZ
1039  isize_out = pmeGrid.K1/2+1;
1040  break;
1041  case Perm_cX_Y_Z:
1042  // XYZ -> YZX
1043  isize_out = pmeGrid.K2;
1044  break;
1045  case Perm_Y_Z_cX:
1046  // YZX -> ZXY
1047  isize_out = pmeGrid.K3;
1048  break;
1049  default:
1050  NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
1051  break;
1052  }
1053 
1054  for (int iblock=0;iblock < nblock;iblock++) {
1055 
1056  int x0 = pos[iblock];
1057  int nx = pos[iblock+1] - x0;
1058 
1059  int width_out;
1060  float2* data_out;
1061  if (dataPtrsYZX[iblock] == NULL) {
1062  // Local transpose, use internal buffer
1063  data_out = d_data + jsize*ksize*x0;
1064  width_out = jsize;
1065  } else {
1066  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1067  data_out = dataPtrsYZX[iblock];
1068  width_out = isize_out;
1069  }
1070 
1071  transpose_xyz_yzx(
1072  nx, jsize, ksize,
1073  isize, jsize,
1074  width_out, ksize,
1075  data+x0, data_out, stream);
1076  }
1077 */
1078 }
const int permutation
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ transposeXYZtoZXY()

void CudaPmeTranspose::transposeXYZtoZXY ( const float2 *  data)
virtual

Implements PmeTranspose.

Definition at line 1080 of file CudaPmeSolverUtil.C.

References batchTranspose_xyz_zxy(), cudaCheck, PmeTranspose::isize, PmeTranspose::jsize, PmeTranspose::ksize, NAMD_bug(), PmeTranspose::nblock, Perm_cX_Y_Z, Perm_Y_Z_cX, Perm_Z_cX_Y, and PmeTranspose::permutation.

1080  {
1081  cudaCheck(cudaSetDevice(deviceID));
1082 
1083  int iperm;
1084  switch(permutation) {
1085  case Perm_cX_Y_Z:
1086  // XYZ -> ZXY
1087  iperm = 0;
1088  break;
1089  case Perm_Z_cX_Y:
1090  // ZXY -> YZX
1091  iperm = 1;
1092  break;
1093  case Perm_Y_Z_cX:
1094  // YZX -> XYZ
1095  iperm = 2;
1096  break;
1097  default:
1098  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1099  break;
1100  }
1101 
1102  batchTranspose_xyz_zxy(
1103  nblock, batchesZXY + iperm*nblock,
1104  max_nx_ZXY[iperm], jsize, ksize,
1105  isize, jsize, stream);
1106 
1107 /*
1108  int isize_out;
1109  switch(permutation) {
1110  case Perm_cX_Y_Z:
1111  // XYZ -> ZXY
1112  isize_out = pmeGrid.K3;
1113  break;
1114  case Perm_Z_cX_Y:
1115  // ZXY -> YZX
1116  isize_out = pmeGrid.K2;
1117  break;
1118  case Perm_Y_Z_cX:
1119  // YZX -> XYZ
1120  isize_out = pmeGrid.K1/2+1;
1121  break;
1122  default:
1123  NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
1124  break;
1125  }
1126 
1127  for (int iblock=0;iblock < nblock;iblock++) {
1128 
1129  int x0 = pos[iblock];
1130  int nx = pos[iblock+1] - x0;
1131 
1132  int width_out;
1133  float2* data_out;
1134  if (dataPtrsZXY[iblock] == NULL) {
1135  // Local transpose, use internal buffer
1136  data_out = d_data + jsize*ksize*x0;
1137  width_out = ksize;
1138  } else {
1139  // Non-local transpose, use buffer in dataPtr[] and the size of that buffer
1140  data_out = dataPtrsZXY[iblock];
1141  width_out = isize_out;
1142  }
1143 
1144  transpose_xyz_zxy(
1145  nx, jsize, ksize,
1146  isize, jsize,
1147  width_out, nx,
1148  data+x0, data_out, stream);
1149  }
1150 */
1151 }
const int permutation
void NAMD_bug(const char *err_msg)
Definition: common.C:195
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)

◆ waitStreamSynchronize()

void CudaPmeTranspose::waitStreamSynchronize ( )

Definition at line 1153 of file CudaPmeSolverUtil.C.

References cudaCheck.

1153  {
1154  cudaCheck(cudaSetDevice(deviceID));
1155  cudaCheck(cudaStreamSynchronize(stream));
1156 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

The documentation for this class was generated from the following files: