#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif
// ...

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  // ...
  FILE *handle = fopen(filename, "w");
  for (int i = 0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  // ...
}

void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i = 0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  // ...
}
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  // ...
  FILE *handle = fopen(filename, "w");
  for (int i = 0; i < size; i++)
    fprintf(handle, "%f\n", h_data[i]);
  // ...
}
CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream) :
  deviceID(deviceID), stream(stream) {
  // ...
}
void CudaFFTCompute::plan3D(int *n, int flags) {
  // ...
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
  // ...
}
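// Note on the 3D plans above: cufftPlan3d() expects its dimensions with the
// slowest-varying (outermost) axis first, so passing n[2], n[1], n[0] makes
// n[0] the contiguous, fastest-varying dimension of the real-space grid.
// Separate forward (R2C) and backward (C2R) plans are created over the same
// dimensions.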
void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  // ...
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
  // ...
}
void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  // ...
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
  // ...
}
void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  // ...
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  // ...
}
void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  // ...
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  // ...
}
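// The 1D X-pencil transforms act on real-space charge data, so they use
// batched real-to-complex (CUFFT_R2C) and complex-to-real (CUFFT_C2R) plans.
// The Y- and Z-pencil stages operate on data that is already complex after the
// X stage, so both directions there use CUFFT_C2C plans.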
float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  // ...
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  // ...
  return tmp;
}
void CudaFFTCompute::forward() {
  // ...
  if (forwardType == CUFFT_R2C) {
    // ...
    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP FORWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    // ...
    allocate_host<float>(&tran, m);
    copy_DtoH<float>(dataDst, tran, m, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("tran_charge_grid_good.bin",
        "transformed charge grid good", tran, m);
    deallocate_host<float>(&tran);
    // ...
  } else if (forwardType == CUFFT_C2C) {
    // ...
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}
void CudaFFTCompute::backward() {
  // ...
  if (backwardType == CUFFT_C2R) {
    // ...
    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP BACKWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    // ...
    allocate_host<float>(&grid, gridsize);
    copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("potential_grid_good.bin",
        "potential grid good", grid, gridsize);
    deallocate_host<float>(&grid);
    // ...
  } else if (backwardType == CUFFT_C2C) {
    // ...
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}
void CudaFFTCompute::setStream() {
  // ...
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream, unsigned int iGrid) :
  // ...
  deviceID(deviceID), stream(stream) {
  // ...
  for (int i = 0; i < pmeGrid.K1; i++) bm1f[i] = (float) bm1[i];
  for (int i = 0; i < pmeGrid.K2; i++) bm2f[i] = (float) bm2[i];
  for (int i = 0; i < pmeGrid.K3; i++) bm3f[i] = (float) bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
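  // The B-spline moduli (bm1/bm2/bm3) are computed on the host in double
  // precision, truncated to float above, and uploaded once with blocking
  // copies; they depend only on the grid dimensions, so no per-step transfer
  // is needed.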
  // ...
  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  // ...
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
  // ...
}
CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  // ...
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
  // ...
}
void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
  // ...
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
      lattice.a().x, lattice.a().y, lattice.a().z,
      lattice.b().x, lattice.b().y, lattice.b().z,
      lattice.c().x, lattice.c().y, lattice.c().z);
  const bool doEnergyVirial = (doEnergy || doVirial);
  // ...
  int nfft1, nfft2, nfft3;
  float *prefac1, *prefac2, *prefac3;
  // ...
  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;
  // ...
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  // ...
  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP calling scalar sum\n");
  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
      /* ... */);
  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
  // ...
  fprintf(stderr, "kappa=%g\n", kappa);
  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
      (double)recip1x, (double)recip1y, (double)recip1z);
  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
      (double)recip2x, (double)recip2y, (double)recip2z);
  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
      (double)recip3x, (double)recip3y, (double)recip3z);
  fprintf(stderr, "volume=%g\n", volume);
  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);
  float *bm1, *bm2, *bm3;
  allocate_host<float>(&bm1, nfft1);
  allocate_host<float>(&bm2, nfft2);
  allocate_host<float>(&bm3, nfft3);
  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good", bm1, nfft1);
  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good", bm2, nfft2);
  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good", bm3, nfft3);
  deallocate_host<float>(&bm1);
  deallocate_host<float>(&bm2);
  deallocate_host<float>(&bm3);
  scalar_sum(/* ... */,
      recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
      volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
      &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
      /* ... */);
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SCALAR SUM\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
  float *tran;
  allocate_host<float>(&tran, m);
  copy_DtoH<float>((float*)data, tran, m, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("tran_potential_grid_good.bin",
      "transformed potential grid good", tran, m);
  deallocate_host<float>(&tran);
  // ...
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
    // ...
  }
  // ...
}
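// The energy/virial result above is copied back asynchronously on `stream`;
// the recorded event is later polled with cudaEventQuery() from
// energyAndVirialCheck() below, which re-arms itself via CcdCallFnAfter()
// until the copy has completed and the owning pencil can be notified.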
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;
  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // ...
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // ...
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
          c->checkCount);
      // ...
    }
  } else {
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
        c->checkCount);
    // ...
  }
  // Call again
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}
void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  // ...
  pencilXYZPtr = pencilPtr;
  // ...
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

// ... (the corresponding callback setter for the Z pencil)
  pencilZPtr = pencilPtr;
  // ...
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}

void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];  // ZY mirrors YZ; the packed virial has only 6 entries (indexing [7] was out of bounds)
    virial[8] = h_energyVirial->virial[0];
  } else if (permutation == Perm_cX_Y_Z) {
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
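    // d_energyVirial->virial holds the 6 unique components of the symmetric
    // virial tensor; the two branches above unpack them into a full row-major
    // 3x3 matrix for the two possible data permutations, duplicating the
    // off-diagonal terms.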
  }
  // ...
  fprintf(stderr, "AP PME VIRIAL =\n"
      " %g %g %g\n %g %g %g\n %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);
  // ...
}
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  // ... (base-class and member initializers)
{
  // ...
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  // ...
  cudaCheck(cudaEventCreate(&gatherForceEvent));
  // ...
}

CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  // ...
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  // ...
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
  // ...
}
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  // ...
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
  // ...
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);
  // ...
  float *xyzq;
  allocate_host<float>(&xyzq, 4*natoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  char fname[64], remark[64];
  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
  deallocate_host<float>(&xyzq);
  // ...
}
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  // ...
  fprintf(stderr, "Calling spread_charge with parameters:\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SPREAD CHARGES\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  float *xyzq;
  allocate_host<float>(&xyzq, 4*numAtoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
  // ...
  float *grid;
  allocate_host<float>(&grid, gridlen);
  copy_DtoH<float>(data, grid, gridlen, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
      grid, gridlen);
  deallocate_host<float>(&xyzq);
  deallocate_host<float>(&grid);
  // ...
}
void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;
  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // ...
  } else if (err == cudaErrorNotReady) {
    // ...
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
          c->checkCount);
      // ...
    }
  } else {
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
        c->checkCount);
    // ...
  }
  // Call again
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}

void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  // ...
  devicePtr = devicePtr_in;
  // ...
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}
void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  // ...
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}
void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
  // ...
  if (tex_data == data && tex_data_len == data_len) return;
  // ...
  tex_data_len = data_len;
  // ...
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
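  // The charge grid is exposed to the gather-force kernel as a texture object
  // bound to linear device memory (cudaResourceTypeLinear) with a 32-bit float
  // channel; re-creating the texture object is skipped when the same pointer
  // and length were already bound (the early return above).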
  // ...
  if (grid_data == data && grid_data_len == data_len) return;
  // ...
  grid_data_len = data_len;
  // ...
}
void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  // ...
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);
  // ...
  fprintf(stderr, "AP gather force arguments\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
  // ...
  fprintf(stderr, "gridTexObj = %p\n", gridTexObj);
  // ...
  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP GATHER FORCE\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  float *xyz;
  allocate_host<float>(&xyz, 3*natoms);
  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("gather_force_good.bin",
      "gather force good", xyz, 3*natoms);
  deallocate_host<float>(&xyz);
  // ...
  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
  // ...
  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
  // ...
}
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, int deviceID, cudaStream_t stream) :
  PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {
  // ...
  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  allocate_device<float2>(&d_buffer, dataSize);
#endif
  // ...
  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);
  // ...
  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
  // ...
}

CudaPmeTranspose::~CudaPmeTranspose() {
  // ...
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
  // ...
}
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock = 0; iblock < nblock; iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }
  // ...
  for (int iperm = 0; iperm < 3; iperm++) {
    // ...
    } else if (iperm == 1) {
    // ...
    }
    // ...
    for (int iblock = 0; iblock < nblock; iblock++) {
      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);
      // ...
      if (dataPtrsYZX[iblock] == NULL) {
        // ...
      } else {
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }
      // ...
      h_batchesYZX[iperm*nblock + iblock] = batch;
    }
    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock = 0; iblock < nblock; iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }
  // ...
  for (int iperm = 0; iperm < 3; iperm++) {
    // ...
    } else if (iperm == 1) {
    // ...
    }
    // ...
    for (int iblock = 0; iblock < nblock; iblock++) {
      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);
      // ...
      if (dataPtrsZXY[iblock] == NULL) {
        // ...
      } else {
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }
      // ...
      h_batchesZXY[iperm*nblock + iblock] = batch;
    }
    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}
void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  // ...
      NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
  // ...
}

void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  // ...
      NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
  // ...
}
// ...
  cudaCheck(cudaStreamSynchronize(stream));
// ...

void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  // ...
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");
  // ...
  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;
  // ...
  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
  // ...
}
void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  // ...
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");
  // ...
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  // ...
  copy3D_HtoD<float2>(data_in, data_out,
      // ...
      ni, nj, nk, stream);
  // ...
}
#ifndef P2P_ENABLE_3D
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  // ...
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");
  // ...
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  // ...
  float2* data_in = d_buffer + i0*nj*nk;
  // ...
  copy3D_DtoD<float2>(data_in, data_out,
      // ...
      ni, nj, nk, stream);
  // ...
}
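// When 3D peer-to-peer copies are unavailable (P2P_ENABLE_3D undefined), the
// sending device first lands the block in this device's d_buffer via a flat
// peer copy (see copyDataToPeerDevice() below), and copyDataDeviceToDevice()
// then performs the strided 3D copy locally with copy3D_DtoD().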
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  // ...
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");
  // ...
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);
  // ...
  return d_buffer + i0*nj*nk;
}
#endif  // P2P_ENABLE_3D
void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out,
    int permutation_out, float2* data_out) {
  // ...
  int kblock_out = iblock;
  // ...
  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out,
    int permutation_out, float2* data_out) {
  // ...
  int jblock_out = iblock;
  // ...
  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}
void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out) {
  // ...
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);
  // ...
  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;
  // ...
  int x0 = pos[iblock];
  // ...
#ifndef P2P_ENABLE_3D
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
      // ...
      isize_out, jsize_out,
      ni, nj, nk, stream);
#endif
  // ...
}
CudaPmeOneDevice::CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_) :
  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
  natoms(0), d_atoms(0), d_forces(0),
  d_grids(0), gridsize(0),
  d_trans(0), transize(0),
  d_bm1(0), d_bm2(0), d_bm3(0),
  // ...
  self_energy_alch_first_time(true),
  force_scaling_alch_first_time(true),
  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
{
  // ...
    NAMD_bug("CudaPmeOneDevice requires GPU-resident mode");
  // ...
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
  // ...
#ifdef NODEGROUP_FORCE_REGISTER
  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
  // ...
  devData.f_slow_size = natoms;
  // ...
#endif
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  cudaDeviceProp deviceProp;
  // ...
  const int texture_alignment = int(deviceProp.textureAlignment);
  // ...
  if ((gridsize % texture_alignment) != 0) {
    // ...
  }
  // ...
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  // ...
  double *bm1 = new double[k1];
  double *bm2 = new double[k2];
  double *bm3 = new double[k3];
  // ...
  float *bm1f = new float[k1];
  float *bm2f = new float[k2];
  float *bm3f = new float[k3];
  for (int i = 0; i < k1; i++) bm1f[i] = (float) bm1[i];
  for (int i = 0; i < k2; i++) bm2f[i] = (float) bm2[i];
  for (int i = 0; i < k3; i++) bm3f[i] = (float) bm3[i];
  allocate_device<float>(&d_bm1, k1);
  allocate_device<float>(&d_bm2, k2);
  allocate_device<float>(&d_bm3, k3);
  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
  // ...
}
CudaPmeOneDevice::~CudaPmeOneDevice() {
  // ...
  deallocate_device<float4>(&d_atoms);
  deallocate_device<float3>(&d_forces);
  deallocate_device<float2>(&d_trans);
  deallocate_device<float>(&d_grids);
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#if defined(NAMD_CUDA)  // only CUDA uses texture objects
  // ...
#endif
  // ...
#if defined(NAMD_CUDA)  // only CUDA uses texture objects
  // ...
#endif
  // ...
#endif
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  // ...
  if (reductionGpuResident) {
    delete reductionGpuResident;
    // ...
  }
  // ...
}
void CudaPmeOneDevice::compute(const Lattice &lattice, int doEnergyVirial, int step) {
  // ...
  double volume = lattice.volume();
  // ...
  if (doEnergyVirial) {
    // ...
  }
  // ...
  if (updateSelfEnergy && (sim_params.alchOn == false)) {
    // ...
  }
  // ...
      float(k1), (float)k2, (float)k3, order3,
  // ...
  scalar_sum(true, k1, k2, k3, (k1/2 + 1), k2, k3,
      kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
      /* ... */);
  // ...
  if (doEnergyVirial) {
    // ...
  }
  // ...
  scaleAndMergeForce(m_step);
  // ...
}

void CudaPmeOneDevice::finishReduction(bool doEnergyVirial) {
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData* patchData = cpdata.ckLocalBranch();
  // ...
  double energy, energy_F, energy_TI_1, energy_TI_2;
  // ...
  fprintf(stderr, "PME VIRIAL =\n"
      " %g %g %g\n %g %g %g\n %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);
  // ...
  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial[0];
  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += virial[1];
  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += virial[2];
  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += virial[3];
  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial[4];
  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += virial[5];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += virial[6];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += virial[7];
  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial[8];
  // ...
}
void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
  // ...
      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
      (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
  // ...
    calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP,
        d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple,
        lambda1Up, lambda2Up, lambda1Down, lambda2Down, stream);
  // ...
      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
  // ...
    calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2,
        selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
        d_atoms, d_partition, natoms, kappa, sim_params.alchDecouple,
        lambda1Up, lambda1Down, stream);
  // ...
}
void CudaPmeOneDevice::scaleAndMergeForce(int step) {
  // ...
      (lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
  // ...
  scale_factors[0] = lambda1Up;
  scale_factors[1] = lambda1Down;
  // ...
  scale_factors[2] = 1.0 - lambda1Up;
  scale_factors[3] = 1.0 - lambda1Down;
  // ...
  scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
  // ...
}
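// Assumed reading of the per-grid scale factors above for alchemical runs:
// grids 0/1 carry the "up"/"down" partitions and are scaled by the
// corresponding electrostatic lambdas, grids 2/3 by their complements, and the
// last grid by -(lambda_up + lambda_down - 1) to remove the double-counted
// remainder before the per-grid forces are merged.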
void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step,
    double& energy, double& energy_F, double (&virial)[9]) {
  double scale1 = 1.0;
  double scale2 = 1.0;
  // ...
  for (unsigned int i = 0; i < 9; ++i) {
    // ...
  }
  // ...
  energy += energyVirials[0].energy * elecLambdaUp;
  energy_F += energyVirials[0].energy * elecLambda2Up;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_F += energyVirials[1].energy * elecLambda2Down;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
  // ...
  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
  energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
  energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
  // ...
  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  // ...
  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  // ...
}
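// In the accumulations above, each EnergyVirial::virial appears to be packed as
// the 6 unique components in the order XX, XY, XZ, YY, YZ, ZZ; indices 0..8 of
// the output `virial` form the full row-major 3x3 tensor, so each stored
// off-diagonal component is added twice (once per symmetric position).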
void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step,
    double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
  double scale1 = 1.0;
  // ...
  for (unsigned int i = 0; i < 9; ++i) {
    // ...
  }
  // ...
  energy += energyVirials[0].energy * elecLambdaUp;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_TI_1 += energyVirials[0].energy;
  energy_TI_2 += energyVirials[1].energy;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;
  // ...
  energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
  energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
  energy_TI_1 += -1.0 * energyVirials[2].energy;
  energy_TI_2 += -1.0 * energyVirials[3].energy;
  virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
  virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
  virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
  virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
  virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
  virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
  virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
  virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
  virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
  virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
  virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
  virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
  energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_TI_1 += -1.0 * energyVirials[4].energy;
  energy_TI_2 += -1.0 * energyVirials[4].energy;
  virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  // ...
  energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  energy_TI_1 += -1.0 * energyVirials[2].energy;
  energy_TI_2 += -1.0 * energyVirials[2].energy;
  virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  // ...
}
// ...
  double gw = w * grid;
// ...
int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
    const int numThreads, const int3 patchGridDim, const int order) {
  // ...
  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
  const int thetaBytes = /* ... */ order * sizeof(float);
  const int indexBytes = numThreads * sizeof(char4);
  // ...
  return gridBytes + thetaBytes + indexBytes;
}
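// The patch-level spread-charge kernel's dynamic shared memory is budgeted as
// the per-patch grid tile, what appears to be per-thread B-spline coefficient
// storage (the `order * sizeof(float)` factor above), and one char4 index per
// thread, matching the three terms summed in the return statement.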
int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
    const int numThreads, const int3 patchGridDim, const int order) {
  // ...
  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
  // ...
  return gridBytes + thetaBytes;
}
void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order, const bool periodicY, const bool periodicZ) {
  // ...
  use = use && (order == 8);
  use = use && (periodicY);
  use = use && (periodicZ);
  // ...
}

// ...
  const int3 constexprPatchGridDim = make_int3(/* ... */);
  // ...
      constexprPatchGridDim, 8);
  // ...
      constexprPatchGridDim, 8);
// ...
void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets(
    const Lattice &lattice, const int numPatches, const CudaLocalRecord* localRecords,
    double3* patchMin, double3* patchMax, double3* awayDists) {
  // ...
  for (int i = 0; i < numPatches; i++) {
    // ...
    double3 width = pmax - pmin;
    // ...
    marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
    marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
    marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);
    // ...
    double3 minAtom = pmin - marginVal;
    double3 maxAtom = pmax + marginVal;
    // ...
    gridWidth.x = gridMax.x - gridMin.x + order;
    gridWidth.y = gridMax.y - gridMin.y + order;
    gridWidth.z = gridMax.z - gridMin.z + order;
    // ...
  }
  // ...
      numPatches, nullptr);
  cudaStreamSynchronize(nullptr);
  // ...
}