4 #include <cuda_runtime.h>
8 #include <hip/hip_runtime.h>
15 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// -- writeComplexToDisk (debug helper, fragment) --
// Copies `size` float2 elements from device memory into a host staging buffer
// and dumps them as "re im" text lines, one element per line.
// NOTE(review): extraction gaps hide the h_data allocation, fclose and
// delete[]; on the visible lines the fopen result is used without a NULL
// check — confirm against the full source.
19 fprintf(stderr,
"writeComplexToDisk %d %s\n", size, filename);
21 copy_DtoH<float2>(d_data, h_data, size,
stream);
23 FILE *handle = fopen(filename,
"w");
24 for (
int i=0;i < size;i++)
25 fprintf(handle,
"%f %f\n", h_data[i].
x, h_data[i].
y);
// -- writeHostComplexToDisk (debug helper, fragment) --
// Same text format as writeComplexToDisk, but reads directly from a host
// buffer (no device copy). NOTE(review): fclose is not visible here and the
// fopen result is used unchecked — verify in the full source.
31 FILE *handle = fopen(filename,
"w");
32 for (
int i=0;i < size;i++)
33 fprintf(handle,
"%f %f\n", h_data[i].
x, h_data[i].
y);
// -- writeRealToDisk (debug helper, fragment) --
// Copies `size` floats from device memory into a freshly allocated host
// buffer and writes one value per line. NOTE(review): the matching delete[]
// and fclose fall in extraction gaps; fopen result is used unchecked on the
// visible lines.
38 fprintf(stderr,
"writeRealToDisk %d %s\n", size, filename);
39 float* h_data =
new float[size];
40 copy_DtoH<float>(d_data, h_data, size,
stream);
42 FILE *handle = fopen(filename,
"w");
43 for (
int i=0;i < size;i++)
44 fprintf(handle,
"%f\n", h_data[i]);
// Creates the 3D real-to-complex (forward) and complex-to-real (backward)
// cuFFT plans. cufftPlan3d takes the outermost dimension first, hence the
// reversed order n[2], n[1], n[0].
// NOTE(review): `flags` is unused on the visible lines — presumably kept for
// interface parity with other FFT backends; confirm against callers.
50 void CudaFFTCompute::plan3D(
int *n,
int flags) {
52 forwardType = CUFFT_R2C;
53 backwardType = CUFFT_C2R;
54 cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
55 cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
// Creates batched ("many") 2D R2C/C2R cuFFT plans for `howmany` transforms.
// Dimensions are passed outermost-first via nt = {n[1], n[0]}. With
// inembed/onembed == NULL, cuFFT's advanced-layout rule makes the
// stride/dist arguments (1, 0 here) irrelevant.
60 void CudaFFTCompute::plan2D(
int *n,
int howmany,
int flags) {
62 forwardType = CUFFT_R2C;
63 backwardType = CUFFT_C2R;
64 int nt[2] = {n[1], n[0]};
65 cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
66 cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
// Batched 1D transforms along X: real-to-complex forward, complex-to-real
// backward. The stride/dist arguments here are 0 (vs 1 in plan2D) but both
// variants are ignored by cuFFT because inembed/onembed are NULL.
71 void CudaFFTCompute::plan1DX(
int *n,
int howmany,
int flags) {
73 forwardType = CUFFT_R2C;
74 backwardType = CUFFT_C2R;
75 cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
76 cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
// Batched 1D transforms along Y: complex-to-complex in both directions
// (unlike the X pencils, no R2C/C2R here).
81 void CudaFFTCompute::plan1DY(
int *n,
int howmany,
int flags) {
83 forwardType = CUFFT_C2C;
84 backwardType = CUFFT_C2C;
85 cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
86 cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
// Batched 1D transforms along Z: complex-to-complex in both directions,
// mirroring plan1DY.
91 void CudaFFTCompute::plan1DZ(
int *n,
int howmany,
int flags) {
93 forwardType = CUFFT_C2C;
94 backwardType = CUFFT_C2C;
95 cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
96 cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
// allocateData (fragment): allocates a device float buffer of the requested
// size; the return of `tmp` falls in an extraction gap.
109 float* CudaFFTCompute::allocateData(
const int dataSizeRequired) {
112 allocate_device<float>(&tmp, dataSizeRequired);
// forward() (fragment): dispatches on forwardType — R2C and C2C execution
// bodies are hidden; unknown types abort via cudaNAMD_bug.
121 if (forwardType == CUFFT_R2C) {
133 }
else if (forwardType == CUFFT_C2C) {
149 cudaNAMD_bug(
"CudaFFTCompute::forward(), unsupported FFT type");
// backward() (fragment): same dispatch structure for the inverse transform.
155 if (backwardType == CUFFT_C2R) {
170 }
else if (backwardType == CUFFT_C2C) {
182 cudaNAMD_bug(
"CudaFFTCompute::backward(), unsupported FFT type");
// Attaches the compute's CUDA stream to both cuFFT plans so transform
// execution is ordered with the rest of the work queued on `stream`.
186 void CudaFFTCompute::setStream() {
188 cufftCheck(cufftSetStream(forwardPlan, stream));
189 cufftCheck(cufftSetStream(backwardPlan, stream));
// -- createPlans (rocFFT backend, fragment) --
// Builds not-in-place, single-precision forward and backward rocFFT plans
// (the rocfft_plan_create call heads are hidden by extraction gaps), then
// creates per-direction execution infos and, if the plans require it,
// device work buffers.
193 void CudaFFTCompute::createPlans(
194 rocfft_transform_type forwardTransformType, rocfft_transform_type backwardTransformType,
195 size_t dimensions,
const size_t* lengths,
size_t transforms)
200 rocfft_placement_notinplace,
201 forwardTransformType,
202 rocfft_precision_single,
203 dimensions, lengths, transforms,
nullptr
209 rocfft_placement_notinplace,
210 backwardTransformType,
211 rocfft_precision_single,
212 dimensions, lengths, transforms,
nullptr
// Execution infos carry the stream and work-buffer bindings for each plan.
217 rocfftCheck(rocfft_execution_info_create(&forwardPlanInfo));
218 rocfftCheck(rocfft_execution_info_create(&backwardPlanInfo));
// Allocate the forward work buffer only when the plan needs one; otherwise
// the pointer is left null so the destructor can test it safely.
220 size_t workBufferSize;
221 rocfftCheck(rocfft_plan_get_work_buffer_size(forwardPlan, &workBufferSize));
222 if(workBufferSize > 0) {
223 cudaCheck(hipMalloc(&forwardWorkBuffer, workBufferSize));
224 rocfftCheck(rocfft_execution_info_set_work_buffer(forwardPlanInfo, forwardWorkBuffer, workBufferSize));
226 forwardWorkBuffer =
nullptr;
// Same work-buffer protocol for the backward plan.
228 rocfftCheck(rocfft_plan_get_work_buffer_size(backwardPlan, &workBufferSize));
229 if(workBufferSize > 0) {
230 cudaCheck(hipMalloc(&backwardWorkBuffer, workBufferSize));
231 rocfftCheck(rocfft_execution_info_set_work_buffer(backwardPlanInfo, backwardWorkBuffer, workBufferSize));
233 backwardWorkBuffer =
nullptr;
// HIP plan3D (fragment): rocFFT takes lengths fastest-varying first, so no
// reversal of n[] is needed (cf. the cuFFT variant above). The call into
// createPlans is partially hidden.
237 void CudaFFTCompute::plan3D(
int *n,
int flags) {
239 size_t lengths[3] = { (size_t)n[0], (
size_t)n[1], (size_t)n[2] };
241 rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
// HIP plan2D (fragment): batched real forward / real inverse 2D plans.
247 void CudaFFTCompute::plan2D(
int *n,
int howmany,
int flags) {
249 size_t lengths[2] = { (size_t)n[0], (
size_t)n[1] };
251 rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
// HIP plan1DX (fragment): 1D real transforms along X.
257 void CudaFFTCompute::plan1DX(
int *n,
int howmany,
int flags) {
259 size_t lengths[1] = { (size_t)n[0] };
261 rocfft_transform_type_real_forward, rocfft_transform_type_real_inverse,
// HIP plan1DY (fragment): 1D complex transforms along Y.
267 void CudaFFTCompute::plan1DY(
int *n,
int howmany,
int flags) {
269 size_t lengths[1] = { (size_t)n[0] };
271 rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse,
// HIP plan1DZ (fragment): 1D complex transforms along Z.
277 void CudaFFTCompute::plan1DZ(
int *n,
int howmany,
int flags) {
279 size_t lengths[1] = { (size_t)n[0] };
281 rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse,
// Destructor (rocFFT fragment): tears down both plans and execution infos,
// then frees the work buffers only if they were allocated (the hipFree calls
// themselves fall in extraction gaps).
289 rocfftCheck(rocfft_plan_destroy(forwardPlan));
290 rocfftCheck(rocfft_plan_destroy(backwardPlan));
291 rocfftCheck(rocfft_execution_info_destroy(forwardPlanInfo));
292 rocfftCheck(rocfft_execution_info_destroy(backwardPlanInfo));
293 if (forwardWorkBuffer !=
nullptr)
297 if (backwardWorkBuffer !=
nullptr)
// allocateData (HIP fragment): device float buffer of the requested size;
// the return statement is hidden by an extraction gap.
305 float* CudaFFTCompute::allocateData(
const int dataSizeRequired) {
308 allocate_device<float>(&tmp, dataSizeRequired);
// forward() (fragment): out-of-place transform dataSrc -> dataDst.
// rocfft_execute takes arrays of buffer pointers, so each single pointer is
// passed as a one-element array via (void**)&.
314 rocfftCheck(rocfft_execute(forwardPlan, (
void**)&
dataSrc, (
void**)&
dataDst, forwardPlanInfo));
// backward() (fragment): inverse transform dataDst -> dataSrc using the same
// one-element pointer-array convention.
319 rocfftCheck(rocfft_execute(backwardPlan, (
void**)&
dataDst, (
void**)&
dataSrc, backwardPlanInfo));
// Binds the compute's stream to both rocFFT execution infos so forward and
// backward transforms are ordered on `stream` (rocFFT analogue of
// cufftSetStream above).
322 void CudaFFTCompute::setStream() {
324 rocfftCheck(rocfft_execution_info_set_stream(forwardPlanInfo, stream));
325 rocfftCheck(rocfft_execution_info_set_stream(backwardPlanInfo, stream));
// -- CudaPmeKSpaceCompute constructor (fragment) --
// Captures device/stream, converts the double-precision B-spline moduli
// (bm1/bm2/bm3, K1/K2/K3 elements each) to float, uploads them to the
// device, and allocates the energy/virial accumulators.
333 const int jblock,
const int kblock,
double kappa,
int deviceID, cudaStream_t stream) :
335 deviceID(deviceID), stream(stream) {
// Host staging arrays for the float conversion.
// NOTE(review): the matching delete[] calls are not visible here (lines
// 352-354 are hidden) — confirm they exist in the full source.
340 float *bm1f =
new float[pmeGrid.
K1];
341 float *bm2f =
new float[pmeGrid.
K2];
342 float *bm3f =
new float[pmeGrid.
K3];
// Narrow each modulus array from double to float.
343 for (
int i=0;i < pmeGrid.
K1;i++) bm1f[i] = (
float)
bm1[i];
344 for (
int i=0;i < pmeGrid.
K2;i++) bm2f[i] = (
float)
bm2[i];
345 for (
int i=0;i < pmeGrid.
K3;i++) bm3f[i] = (
float)
bm3[i];
// Device-resident copies of the moduli.
346 allocate_device<float>(&d_bm1, pmeGrid.
K1);
347 allocate_device<float>(&d_bm2, pmeGrid.
K2);
348 allocate_device<float>(&d_bm3, pmeGrid.
K3);
// Synchronous H2D upload — safe to free the staging arrays afterwards.
349 copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.
K1);
350 copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.
K2);
351 copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.
K3);
// One EnergyVirial accumulator on the device plus a host-side mirror used
// by the async readback in solve().
355 allocate_device<EnergyVirial>(&d_energyVirial, 1);
356 allocate_host<EnergyVirial>(&h_energyVirial, 1);
// BUGFIX(encoding): '&copy' at the start of '&copyEnergyVirialEvent' had been
// collapsed into the HTML entity '©' by extraction. Restore the address-of on
// the event member (name grounded by cudaEventDestroy(copyEnergyVirialEvent)
// in the destructor below).
358 cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
// Destructor (fragment): releases the device-side moduli, the device and
// host energy/virial accumulators, and the readback-completion event created
// by the constructor.
364 deallocate_device<float>(&d_bm1);
365 deallocate_device<float>(&d_bm2);
366 deallocate_device<float>(&d_bm3);
367 deallocate_device<EnergyVirial>(&d_energyVirial);
368 deallocate_host<EnergyVirial>(&h_energyVirial);
369 cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
// -- solve (fragment) --
// Debug dump of the lattice vectors, followed by setup of the reciprocal-
// space scalar sum: grid sizes, prefactor pointers and reciprocal vector
// components are selected per permutation (selection code hidden); the
// scalar_sum kernel launch itself is only partially visible.
375 fprintf(stderr,
"K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
376 lattice.
a().
x, lattice.
a().
y, lattice.
a().
z,
377 lattice.
b().
x, lattice.
b().
y, lattice.
b().
z,
378 lattice.
c().
x, lattice.
c().
y, lattice.
c().
z);
// Energy/virial accumulation is requested when either quantity is needed.
382 const bool doEnergyVirial = (doEnergy || doVirial);
384 int nfft1, nfft2, nfft3;
385 float *prefac1, *prefac2, *prefac3;
391 float recip1x, recip1y, recip1z;
392 float recip2x, recip2y, recip2z;
393 float recip3x, recip3y, recip3z;
430 NAMD_bug(
"CudaPmeKSpaceCompute::solve, invalid permutation");
// Zero the device accumulator before the kernel adds into it.
464 if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
467 recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
468 volume, prefac1, prefac2, prefac3,
j0,
k0, doEnergyVirial,
469 &d_energyVirial->energy, d_energyVirial->virial, (
float2*)data,
// Queue the async D2H readback of the accumulator and record the event that
// energyAndVirialCheck polls for completion.
473 if (doEnergyVirial) {
474 copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
475 cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
// -- energyAndVirialCheck (polling callback, fragment) --
// Scheduler callback that polls copyEnergyVirialEvent: when the readback has
// completed it notifies whichever pencil owner is set (XYZ or Z); while not
// ready it re-arms itself via CcdCallFnAfter. Gives up with an error after
// 1,000,000 polls.
486 void CudaPmeKSpaceCompute::energyAndVirialCheck(
void *arg,
double walltime) {
489 cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
490 if (err == cudaSuccess) {
493 if (c->pencilXYZPtr != NULL)
495 else if (c->pencilZPtr != NULL)
498 NAMD_bug(
"CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
500 }
else if (err == cudaErrorNotReady) {
503 if (c->checkCount >= 1000000) {
// NOTE(review): sprintf into errmsg — the buffer declaration is hidden;
// prefer snprintf if the buffer size is not provably sufficient.
505 sprintf(errmsg,
"CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
512 sprintf(errmsg,
"in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
// Re-arm the poll; 0.1 is the delay passed to CcdCallFnAfter (units per
// Charm++/Converse convention — confirm, presumably milliseconds).
519 CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
524 pencilXYZPtr = pencilPtr;
529 CcdCallFnAfter(energyAndVirialCheck,
this, 0.1);
535 pencilZPtr = pencilPtr;
539 CcdCallFnAfter(energyAndVirialCheck,
this, 0.1);
543 return h_energyVirial->energy;
// getVirial (fragment): expands the 6 stored symmetric virial components
// into the caller's row-major 3x3 matrix. Two layouts exist depending on the
// permutation test surrounding these lines (hidden by extraction gaps).
// First branch: permuted accumulation order.
549 virial[0] = h_energyVirial->virial[3];
550 virial[1] = h_energyVirial->virial[4];
551 virial[2] = h_energyVirial->virial[1];
553 virial[3] = h_energyVirial->virial[4];
554 virial[4] = h_energyVirial->virial[5];
555 virial[5] = h_energyVirial->virial[2];
557 virial[6] = h_energyVirial->virial[1];
// BUGFIX: was `virial[7] = h_energyVirial->virial[7]`. Both branches
// otherwise reference only components 0-5, and symmetry of the 3x3 output
// requires virial[7] == virial[5] (the second branch honors this:
// virial[5] and virial[7] are both component 4). In this branch
// virial[5] is component 2, so virial[7] must be too.
558 virial[7] = h_energyVirial->virial[2];
559 virial[8] = h_energyVirial->virial[0];
// Second branch: natural order — diagonal from components 0,3,5 and
// symmetric off-diagonals from components 1,2,4.
562 virial[0] = h_energyVirial->virial[0];
563 virial[1] = h_energyVirial->virial[1];
564 virial[2] = h_energyVirial->virial[2];
566 virial[3] = h_energyVirial->virial[1];
567 virial[4] = h_energyVirial->virial[3];
568 virial[5] = h_energyVirial->virial[4];
570 virial[6] = h_energyVirial->virial[2];
571 virial[7] = h_energyVirial->virial[4];
572 virial[8] = h_energyVirial->virial[5];
585 const int jblock,
const int kblock,
int deviceID, cudaStream_t
stream) :
588 NAMD_bug(
"CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
603 cudaCheck(cudaEventCreate(&gatherForceEvent));
611 if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
612 if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
615 deallocate_device<float>(&
data);
616 cudaCheck(cudaEventDestroy(gatherForceEvent));
645 reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity,
numAtoms, 1.5f);
// -- cuda_gatherforce_check (polling callback, fragment) --
// Mirrors energyAndVirialCheck for the force gather: polls gatherForceEvent,
// notifies the device owner on completion, re-arms itself while not ready,
// and aborts after 1,000,000 polls.
670 void CudaPmeRealSpaceCompute::cuda_gatherforce_check(
void *arg,
double walltime) {
674 cudaError_t err = cudaEventQuery(c->gatherForceEvent);
675 if (err == cudaSuccess) {
681 }
else if (err == cudaErrorNotReady) {
684 if (c->checkCount >= 1000000) {
// NOTE(review): sprintf into errmsg — buffer declaration hidden; prefer
// snprintf if the size is not provably sufficient.
686 sprintf(errmsg,
"CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
693 sprintf(errmsg,
"in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
// Re-arm the poll with the same 0.1 delay used elsewhere in this file.
700 CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
714 cudaCheck(cudaEventSynchronize(gatherForceEvent));
// -- setupGridData (fragment) --
// (Re)binds the charge grid to a texture object for the gather kernel.
// Early-outs when neither the pointer nor the length changed; otherwise
// rebuilds a linear-memory resource descriptor over `data` and creates a
// fresh texture object.
// NOTE(review): on the visible lines cudaCreateTextureObject is called
// without destroying the previous gridTexObj — confirm the destroy happens
// in the hidden lines, else this leaks a texture object per grid change.
717 void CudaPmeRealSpaceCompute::setupGridData(
float* data,
int data_len) {
729 if (tex_data == data && tex_data_len == data_len)
return;
731 tex_data_len = data_len;
// Describe `data` as linear device memory of data_len 32-bit floats.
733 cudaResourceDesc resDesc;
734 memset(&resDesc, 0,
sizeof(resDesc));
735 resDesc.resType = cudaResourceTypeLinear;
736 resDesc.res.linear.devPtr =
data;
737 resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
738 resDesc.res.linear.desc.x =
sizeof(float)*8;
739 resDesc.res.linear.sizeInBytes = data_len*
sizeof(float);
// Element-type reads (no normalization/filtering for raw float fetches).
740 cudaTextureDesc texDesc;
741 memset(&texDesc, 0,
sizeof(texDesc));
742 texDesc.readMode = cudaReadModeElementType;
743 cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
// Non-texture path bookkeeping (alternate build, e.g. HIP): same
// pointer+length change detection.
745 if (grid_data == data && grid_data_len == data_len)
return;
747 grid_data_len = data_len;
755 reallocate_device<CudaForce>(&d_force, &d_forceCapacity,
numAtoms, 1.5f);
766 copy_DtoH<CudaForce>(d_force, force,
numAtoms, stream);
768 cudaCheck(cudaEventRecord(gatherForceEvent, stream));
// -- CudaPmeTranspose constructor (fragment) --
// Allocates the main device buffer (and, when 3D peer-to-peer copies are
// disabled, a staging buffer of the same size), sizes the per-block output
// pointer tables, and allocates the device-side TransposeBatch arrays
// (3 permutations x nblock entries each).
789 const int jblock,
const int kblock,
int deviceID, cudaStream_t
stream) :
790 PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
793 allocate_device<float2>(&d_data,
dataSize);
794 #ifndef P2P_ENABLE_3D
795 allocate_device<float2>(&d_buffer,
dataSize);
799 dataPtrsYZX.resize(
nblock, NULL);
800 dataPtrsZXY.resize(
nblock, NULL);
802 allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*
nblock);
803 allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*
nblock);
// Destructor (fragment): releases the buffers allocated above; d_buffer
// exists only in the non-P2P_ENABLE_3D configuration.
808 deallocate_device<float2>(&d_data);
809 #ifndef P2P_ENABLE_3D
810 deallocate_device<float2>(&d_buffer);
812 deallocate_device< TransposeBatch<float2> >(&batchesZXY);
813 deallocate_device< TransposeBatch<float2> >(&batchesYZX);
// -- setDataPtrsYZX (fragment) --
// Records per-block output pointers for XYZ->YZX transposes, then rebuilds
// the host batch table (3 permutations x nblock TransposeBatch entries,
// batch construction partially hidden) and uploads it to batchesYZX. The
// stream sync after the H2D copy makes it safe to delete the host staging
// array immediately.
820 if (dataPtrsYZX.size() != dataPtrsNew.size())
821 NAMD_bug(
"CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
822 for (
int iblock=0;iblock <
nblock;iblock++) {
823 dataPtrsYZX[iblock] = dataPtrsNew[iblock];
828 for (
int iperm=0;iperm < 3;iperm++) {
834 }
else if (iperm == 1) {
845 for (
int iblock=0;iblock <
nblock;iblock++) {
847 int x0 =
pos[iblock];
848 int nx =
pos[iblock+1] - x0;
849 max_nx = std::max(max_nx, nx);
853 if (dataPtrsYZX[iblock] == NULL) {
859 data_out = dataPtrsYZX[iblock];
860 width_out = isize_out;
870 h_batchesYZX[iperm*nblock + iblock] = batch;
879 max_nx_YZX[iperm] = max_nx;
882 copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*
nblock, stream);
883 cudaCheck(cudaStreamSynchronize(stream));
884 delete [] h_batchesYZX;
// -- setDataPtrsZXY (fragment) --
// Mirror of setDataPtrsYZX for the XYZ->ZXY direction: store per-block
// output pointers, rebuild the 3 x nblock host batch table, upload it to
// batchesZXY, and free the host staging array after the stream sync.
891 if (dataPtrsZXY.size() != dataPtrsNew.size())
892 NAMD_bug(
"CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
893 for (
int iblock=0;iblock <
nblock;iblock++) {
894 dataPtrsZXY[iblock] = dataPtrsNew[iblock];
900 for (
int iperm=0;iperm < 3;iperm++) {
906 }
else if (iperm == 1) {
917 for (
int iblock=0;iblock <
nblock;iblock++) {
919 int x0 =
pos[iblock];
920 int nx =
pos[iblock+1] - x0;
921 max_nx = std::max(max_nx, nx);
925 if (dataPtrsZXY[iblock] == NULL) {
931 data_out = dataPtrsZXY[iblock];
932 width_out = isize_out;
942 h_batchesZXY[iperm*nblock + iblock] = batch;
945 max_nx_ZXY[iperm] = max_nx;
948 copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*
nblock, stream);
949 cudaCheck(cudaStreamSynchronize(stream));
950 delete [] h_batchesZXY;
971 NAMD_bug(
"PmeTranspose::transposeXYZtoYZX, invalid permutation");
1045 NAMD_bug(
"PmeTranspose::transposeXYZtoZXY, invalid permutation");
// waitStreamSynchronize (fragment): blocks the host until all work queued on
// `stream` has completed.
1102 cudaCheck(cudaStreamSynchronize(stream));
// -- copyDataDeviceToHost (fragment) --
// Validates the block index, then bounds-checks the copy region against both
// the device buffer (dataSize) and the caller's host buffer (h_dataSize)
// before queueing the D2H copy of block `iblock` on `stream`.
// NOTE(review): the copy is queued on `stream` — presumably asynchronous, so
// the caller must synchronize before reading h_data; confirm copy_DtoH's
// semantics.
1109 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
1111 int x0 =
pos[iblock];
1112 int nx =
pos[iblock+1] - x0;
1117 if (copyStart + copySize >
dataSize)
1118 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");
1120 if (copySize > h_dataSize)
1121 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");
1123 copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
1130 NAMD_bug(
"CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
1133 int i0, i1, j0, j1, k0, k1;
1134 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1139 copy3D_HtoD<float2>(data_in, data_out,
1144 ni, nj, nk, stream);
1147 #ifndef P2P_ENABLE_3D
1155 NAMD_bug(
"CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
1158 int i0, i1, j0, j1, k0, k1;
1159 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1164 float2* data_in = d_buffer + i0*nj*nk;
1166 copy3D_DtoD<float2>(data_in, data_out,
1171 ni, nj, nk, stream);
1179 NAMD_bug(
"CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
1182 int i0, i1, j0, j1, k0, k1;
1183 getBlockDim(
pmeGrid,
permutation, iblock,
jblock,
kblock, i0, i1, j0, j1, k0, k1);
1188 return d_buffer + i0*nj*nk;
1197 int kblock_out = iblock;
1199 copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1206 int jblock_out = iblock;
1209 copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
1212 void CudaPmeTranspose::copyDataToPeerDevice(
const int iblock,
1213 const int iblock_out,
const int jblock_out,
const int kblock_out,
1214 int deviceID_out,
int permutation_out,
float2* data_out) {
1219 int i0, i1, j0, j1, k0, k1;
1220 getBlockDim(
pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
1226 int isize_out = i1-i0+1;
1227 int jsize_out = j1-j0+1;
1229 int x0 =
pos[iblock];
1232 #ifndef P2P_ENABLE_3D
1234 copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
1236 copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
1240 isize_out, jsize_out,
1241 ni, nj, nk, stream);
void energyAndVirialSetCallback(CudaPmePencilXYZ *pencilPtr)
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void setDataPtrsYZX(std::vector< float2 * > &dataPtrsNew, float2 *data)
CudaPmeTranspose(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
~CudaPmeRealSpaceCompute()
static __thread atom * atoms
void energyAndVirialDone()
static void getPencilDim(const PmeGrid &pmeGrid, const int permutation, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void spreadCharge(Lattice &lattice)
void CcdCallBacksReset(void *ignored, double curWallTime)
void copyAtoms(const int numAtoms, const CudaAtom *atoms)
CudaPmeRealSpaceCompute(PmeGrid pmeGrid, const int jblock, const int kblock, int deviceID, cudaStream_t stream)
void energyAndVirialDone()
void copyDataToPeerDeviceZXY(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
__thread cudaStream_t stream
void copyDataDeviceToDevice(const int iblock, float2 *data_out)
void copyDataDeviceToHost(const int iblock, float2 *h_data, const int h_dataSize)
void NAMD_bug(const char *err_msg)
void writeHostComplexToDisk(const float2 *h_data, const int size, const char *filename)
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
void getVirial(double *virial)
CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream)
BigReal volume(void) const
void writeComplexToDisk(const float2 *d_data, const int size, const char *filename, cudaStream_t stream)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void cudaNAMD_bug(const char *msg)
__global__ void gather_force(const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force)
void setDataPtrsZXY(std::vector< float2 * > &dataPtrsNew, float2 *data)
void spread_charge(const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
static void getBlockDim(const PmeGrid &pmeGrid, const int permutation, const int iblock, const int jblock, const int kblock, int &i0, int &i1, int &j0, int &j1, int &k0, int &k1)
void gatherForce(Lattice &lattice, CudaForce *force)
void waitStreamSynchronize()
void solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float *data)
void gatherForceSetCallback(ComputePmeCUDADevice *devicePtr_in)
void transposeXYZtoYZX(const float2 *data)
void copyDataHostToDevice(const int iblock, float2 *data_in, float2 *data_out)
void transposeXYZtoZXY(const float2 *data)
void waitGatherForceDone()
void copyDataToPeerDeviceYZX(const int iblock, int deviceID_out, int permutation_out, float2 *data_out)
float2 * getBuffer(const int iblock)
void writeRealToDisk(const float *d_data, const int size, const char *filename, cudaStream_t stream)