#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream) :
  deviceID(deviceID), stream(stream) {
void CudaFFTCompute::plan3D(int *n, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
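// Note on plan layout: cufftPlan3d()/cufftPlanMany() take dimensions from slowest- to
// fastest-varying, so passing n[2], n[1], n[0] keeps n[0] (the x dimension) contiguous in
// memory.  The X-direction transforms are real-to-complex (CUFFT_R2C forward, CUFFT_C2R
// backward) because the charge grid is real; the Y and Z pencils operate on already-complex
// data and therefore use CUFFT_C2C in both directions.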
float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  allocate_device<float>(&tmp, dataSizeRequired);
void CudaFFTCompute::forward() {
  if (forwardType == CUFFT_R2C) {

    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP FORWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    allocate_host<float>(&tran, m);
    copy_DtoH<float>(dataDst, tran, m, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("tran_charge_grid_good.bin",
        "transformed charge grid good", tran, m);
    deallocate_host<float>(&tran);

  } else if (forwardType == CUFFT_C2C) {

  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
void CudaFFTCompute::backward() {
  if (backwardType == CUFFT_C2R) {

    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP BACKWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    allocate_host<float>(&grid, gridsize);
    copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("potential_grid_good.bin",
        "potential grid good", grid, gridsize);
    deallocate_host<float>(&grid);

  } else if (backwardType == CUFFT_C2C) {

  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
void CudaFFTCompute::setStream() {
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
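// Binding both plans to the compute stream keeps the FFTs ordered with the charge spreading,
// reciprocal-space sum, and force gathering submitted to the same stream, so no extra
// device-wide synchronization is needed between those stages.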
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream, unsigned int iGrid) :
    deviceID(deviceID), stream(stream) {
  for (int i=0; i < pmeGrid.K1; i++) bm1f[i] = (float) bm1[i];
  for (int i=0; i < pmeGrid.K2; i++) bm2f[i] = (float) bm2[i];
  for (int i=0; i < pmeGrid.K3; i++) bm3f[i] = (float) bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
      lattice.a().x, lattice.a().y, lattice.a().z,
      lattice.b().x, lattice.b().y, lattice.b().z,
      lattice.c().x, lattice.c().y, lattice.c().z);
  const bool doEnergyVirial = (doEnergy || doVirial);

  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;

  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

      NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");

  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP calling scalar sum\n");
  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
      (permutation == Perm_cX_Y_Z) ? "true" : "false");
  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
  fprintf(stderr, "kappa=%g\n", kappa);
  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
      (double)recip1x, (double)recip1y, (double)recip1z);
  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
      (double)recip2x, (double)recip2y, (double)recip2z);
  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
      (double)recip3x, (double)recip3y, (double)recip3z);
  fprintf(stderr, "volume=%g\n", volume);
  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
  allocate_host<float>(&bm1, nfft1);
  allocate_host<float>(&bm2, nfft2);
  allocate_host<float>(&bm3, nfft3);
  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good", bm1, nfft1);
  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good", bm2, nfft2);
  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good", bm3, nfft3);
  deallocate_host<float>(&bm1);
  deallocate_host<float>(&bm2);
  deallocate_host<float>(&bm3);
      recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
      volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
      &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SCALAR SUM\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
  allocate_host<float>(&tran, m);
  copy_DtoH<float>((float*)data, tran, m, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("tran_potential_grid_good.bin",
      "transformed potential grid good", tran, m);
  deallocate_host<float>(&tran);
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {

    if (c->pencilXYZPtr != NULL)

    else if (c->pencilZPtr != NULL)

      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");

  } else if (err == cudaErrorNotReady) {

    if (c->checkCount >= 1000000) {

      sprintf(errmsg, "CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",

      sprintf(errmsg, "in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",

  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
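// Polling pattern: energyAndVirialCheck() re-arms itself through CcdCallFnAfter() until
// cudaEventQuery() reports that copyEnergyVirialEvent has completed, i.e. until the
// asynchronous device-to-host copy of the energy/virial issued in solve() has finished.
// Exceeding the checkCount limit is reported as an error instead of polling forever.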
  pencilXYZPtr = pencilPtr;

  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);

  pencilZPtr = pencilPtr;

  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
  return h_energyVirial->energy;
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];
    virial[8] = h_energyVirial->virial[0];

    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];

  fprintf(stderr, "AP PME VIRIAL =\n"
      " %g %g %g\n %g %g %g\n %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);
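// The reciprocal-space kernel accumulates only the six independent components of the
// symmetric PME virial; getVirial() expands them into a full row-major 3x3 tensor, with the
// index mapping depending on which pencil permutation produced the data (e.g. Perm_cX_Y_Z).
// Symmetry requires virial[1]==virial[3], virial[2]==virial[6], and virial[5]==virial[7],
// which is why each off-diagonal pair reuses the same packed component.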
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
    const int jblock, const int kblock,
    int deviceID, cudaStream_t stream) :

    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");

  cudaCheck(cudaEventCreate(&gatherForceEvent));
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
  allocate_host<float>(&xyzq, 4*natoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  char fname[64], remark[64];
  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
  deallocate_host<float>(&xyzq);
  fprintf(stderr, "Calling spread_charge with parameters:\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SPREAD CHARGES\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  allocate_host<float>(&xyzq, 4*numAtoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
  allocate_host<float>(&grid, gridlen);
  copy_DtoH<float>(data, grid, gridlen, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
      grid, gridlen);
  deallocate_host<float>(&xyzq);
  deallocate_host<float>(&grid);
void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {

  } else if (err == cudaErrorNotReady) {

    if (c->checkCount >= 1000000) {

      sprintf(errmsg, "CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",

      sprintf(errmsg, "in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",

  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
  devicePtr = devicePtr_in;

  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);

  cudaCheck(cudaEventSynchronize(gatherForceEvent));
void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {

  if (tex_data == data && tex_data_len == data_len) return;

  tex_data_len = data_len;

  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));

  if (grid_data == data && grid_data_len == data_len) return;

  grid_data_len = data_len;
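// setupGridData exposes the charge/potential grid to the gather-force kernel as a texture
// object bound to plain linear device memory: a single 32-bit float channel
// (cudaResourceTypeLinear, cudaReadModeElementType).  The tex_data/tex_data_len and
// grid_data/grid_data_len checks above skip the rebuild when the same buffer is passed again.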
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  fprintf(stderr, "AP gather force arguments\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP GATHER FORCE\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  allocate_host<float>(&xyz, 3*natoms);
  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("gather_force_good.bin",
      "gather force good", xyz, 3*natoms);
  deallocate_host<float>(&xyz);
  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
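// Recording gatherForceEvent after the force download lets the host detect completion of the
// gather kernel and the device-to-host force copy (via cudaEventQuery in
// cuda_gatherforce_check, or cudaEventSynchronize in waitGatherForceDone) before the forces
// are consumed.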
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock,
    int deviceID, cudaStream_t stream) :
    PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0; iblock < nblock; iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];

  for (int iperm=0; iperm < 3; iperm++) {

    } else if (iperm == 1) {

    for (int iblock=0; iblock < nblock; iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      if (dataPtrsYZX[iblock] == NULL) {

      data_out = dataPtrsYZX[iblock];
      width_out = isize_out;

      h_batchesYZX[iperm*nblock + iblock] = batch;

    max_nx_YZX[iperm] = max_nx;

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
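// setDataPtrsYZX fills a host-side array of TransposeBatch descriptors, one per
// (permutation, block) pair, then uploads the whole table to the device with a single
// copy_HtoD so the batched xyz-to-yzx transpose kernel can be launched once over all blocks
// of a pencil instead of once per block.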
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0; iblock < nblock; iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];

  for (int iperm=0; iperm < 3; iperm++) {

    } else if (iperm == 1) {

    for (int iblock=0; iblock < nblock; iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      if (dataPtrsZXY[iblock] == NULL) {

      data_out = dataPtrsZXY[iblock];
      width_out = isize_out;

      h_batchesZXY[iperm*nblock + iblock] = batch;

    max_nx_ZXY[iperm] = max_nx;

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
      NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");

      NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");

  cudaCheck(cudaStreamSynchronize(stream));
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  copy3D_HtoD<float2>(data_in, data_out,
      ni, nj, nk, stream);
#ifndef P2P_ENABLE_3D

    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
      ni, nj, nk, stream);
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  return d_buffer + i0*nj*nk;
  int kblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);

  int jblock_out = iblock;

  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out) {

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);

  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];

#ifndef P2P_ENABLE_3D
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
      isize_out, jsize_out,
      ni, nj, nk, stream);
#endif
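// When P2P_ENABLE_3D is not defined, the whole block is pushed to the peer GPU with a flat
// copy_PeerDtoD into that device's staging buffer (see getBuffer()/copyDataDeviceToDevice()
// above); with 3D peer copies enabled, copy3D_PeerDtoD writes the strided sub-volume
// directly into the destination pencil layout.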
CudaPmeOneDevice::CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_) :
  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
  natoms(0), d_atoms(0), d_forces(0),
  d_grids(0), gridsize(0),
  d_trans(0), transize(0),
  d_bm1(0), d_bm2(0), d_bm3(0),
  self_energy_alch_first_time(true),
  force_scaling_alch_first_time(true),
  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
{

    NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
#ifdef NODEGROUP_FORCE_REGISTER
  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];

  devData.f_slow_size = natoms;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)

  cudaDeviceProp deviceProp;
  const int texture_alignment = int(deviceProp.textureAlignment);

  if ((gridsize % texture_alignment) != 0) {
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t) gridsize);
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  double *bm1 = new double[k1];
  double *bm2 = new double[k2];
  double *bm3 = new double[k3];

  float *bm1f = new float[k1];
  float *bm2f = new float[k2];
  float *bm3f = new float[k3];
  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
  allocate_device<float>(&d_bm1, k1);
  allocate_device<float>(&d_bm2, k2);
  allocate_device<float>(&d_bm3, k3);
  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
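// The PME B-spline moduli are computed on the host in double precision (bm1/bm2/bm3 above)
// and only then truncated to float (bm1f/bm2f/bm3f) before the synchronous upload, since the
// reciprocal-space kernels consume the single-precision d_bm1/d_bm2/d_bm3 arrays of length
// k1, k2, and k3.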
  deallocate_device<float4>(&d_atoms);
  deallocate_device<float3>(&d_forces);
  deallocate_device<float2>(&d_trans);
  deallocate_device<float>(&d_grids);

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#if defined(NAMD_CUDA)  // only CUDA uses texture objects

#if defined(NAMD_CUDA)  // only CUDA uses texture objects

  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);

  if (reductionGpuResident) {
    delete reductionGpuResident;
  double volume = lattice.volume();

  if (doEnergyVirial) {

  if (updateSelfEnergy && (sim_params.alchOn == false)) {

      float(k1), (float)k2, (float)k3, order3,

  scalar_sum(true, k1, k2, k3, (k1/2 + 1), k2, k3,
      kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,

  if (doEnergyVirial) {

  scaleAndMergeForce(m_step);

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData* patchData = cpdata.ckLocalBranch();

  double energy, energy_F, energy_TI_1, energy_TI_2;
  fprintf(stderr, "PME VIRIAL =\n"
      " %g %g %g\n %g %g %g\n %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);

  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {

      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
      (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||

    calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP,
        d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple,
        lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);

      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||

    calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2,
        selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
        d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple,
        lambda1Up, lambda1Down, stream);
void CudaPmeOneDevice::scaleAndMergeForce(int step) {

      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||

    scale_factors[0] = lambda1Up;
    scale_factors[1] = lambda1Down;

    scale_factors[2] = 1.0 - lambda1Up;
    scale_factors[3] = 1.0 - lambda1Down;

    scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials,
    int step, double& energy, double& energy_F, double (&virial)[9]) {
  double scale1 = 1.0;
  double scale2 = 1.0;

  for (unsigned int i = 0; i < 9; ++i) {

  energy += energyVirials[0].energy * elecLambdaUp;
  energy_F += energyVirials[0].energy * elecLambda2Up;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_F += energyVirials[1].energy * elecLambda2Down;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;

  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);

  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));

  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
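// FEP grid bookkeeping (as read from the accumulation above): grids 0 and 1 are scaled by
// elecLambdaUp/elecLambdaDown (and by elecLambda2Up/elecLambda2Down for the energy_F
// comparison window), grids 2 and 3 by the complementary (1 - lambda) factors, and the last
// contribution by -(elecLambdaUp + elecLambdaDown - 1).  With alchDecouple disabled, that
// final term is taken from grid 2 instead of grid 4 (the second block above).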
void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials,
    int step, double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
  double scale1 = 1.0;

  for (unsigned int i = 0; i < 9; ++i) {

  energy += energyVirials[0].energy * elecLambdaUp;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_TI_1 += energyVirials[0].energy;
  energy_TI_2 += energyVirials[1].energy;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;

  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
  energy_TI_1 += -1.0 * energyVirials[2].energy;
  energy_TI_2 += -1.0 * energyVirials[3].energy;
  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_TI_1 += -1.0 * energyVirials[4].energy;
  energy_TI_2 += -1.0 * energyVirials[4].energy;
  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));

  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_TI_1 += -1.0 * energyVirials[2].energy;
  energy_TI_2 += -1.0 * energyVirials[2].energy;
  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
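// The TI path uses the same grid combination as the FEP path above, but additionally
// accumulates the unscaled per-grid energies into energy_TI_1 and energy_TI_2 (positive for
// grids 0/1, negative for their complements), which correspond to the dU/dlambda terms
// reported for thermodynamic integration.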
  double gw = w * grid;

    const int numThreads, const int3 patchGridDim, const int order) {

  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
      order * sizeof(float);
  const int indexBytes = numThreads * sizeof(char4);

  return gridBytes + thetaBytes + indexBytes;

    const int numThreads, const int3 patchGridDim, const int order) {

  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);

  return gridBytes + thetaBytes;
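// Shared-memory budget for the patch-level kernels (computeSharedMemoryPatchLevelSpreadCharge
// and computeSharedMemoryPatchLevelGatherForce above): the per-patch grid tile
// (patchGridDim.x*y*z floats) plus per-thread B-spline theta storage and, for charge
// spreading, one char4 grid-index entry per thread.  The compatibility checks below use
// these byte counts, evaluated for the order-8 constexpr patch grid, to decide whether the
// patch-level spread/gather kernels can be used on this device.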
  use = use && (order == 8);
  use = use && (periodicY);
  use = use && (periodicZ);

  const int3 constexprPatchGridDim = make_int3(

      constexprPatchGridDim, 8 );

      constexprPatchGridDim, 8 );
    double3* patchMin, double3* patchMax, double3* awayDists) {

  for (int i = 0; i < numPatches; i++) {

    double3 width = pmax - pmin;

    marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
    marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
    marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);

    double3 minAtom = pmin - marginVal;
    double3 maxAtom = pmax + marginVal;

    gridWidth.x = gridMax.x - gridMin.x + order;
    gridWidth.y = gridMax.y - gridMin.y + order;
    gridWidth.z = gridMax.z - gridMin.z + order;

      numPatches, nullptr);
  cudaStreamSynchronize(nullptr);