#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// Debug helper: copy a complex (float2) device array to the host and write it
// to a text file, one "re im" pair per line.
void writeComplexToDisk(const float2 *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeComplexToDisk %d %s\n", size, filename);
  float2* h_data = new float2[size];
  copy_DtoH<float2>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
  delete [] h_data;
}
// Debug helper: write a host-resident complex array to a text file.
void writeHostComplexToDisk(const float2 *h_data, const int size, const char* filename) {
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y);
  fclose(handle);
}
// Debug helper: copy a real (float) device array to the host and write it to
// a text file, one value per line.
void writeRealToDisk(const float *d_data, const int size, const char* filename, cudaStream_t stream) {
  fprintf(stderr, "writeRealToDisk %d %s\n", size, filename);
  float* h_data = new float[size];
  copy_DtoH<float>(d_data, h_data, size, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  FILE *handle = fopen(filename, "w");
  for (int i=0; i < size; i++)
    fprintf(handle, "%f\n", h_data[i]);
  fclose(handle);
  delete [] h_data;
}
CudaFFTCompute::CudaFFTCompute(int deviceID, cudaStream_t stream) :
  deviceID(deviceID), stream(stream) {
}
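// The plan*() methods below create matching forward/backward cuFFT plans for
// the different pencil decompositions: real-to-complex (R2C) along x and
// complex-to-complex (C2C) along y and z. cuFFT expects dimensions ordered
// slowest-to-fastest, so n[2], n[1], n[0] are passed to cufftPlan3d when n[]
// stores the fastest-varying (x) dimension first.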
void CudaFFTCompute::plan3D(int *n, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlan3d(&forwardPlan, n[2], n[1], n[0], CUFFT_R2C));
  cufftCheck(cufftPlan3d(&backwardPlan, n[2], n[1], n[0], CUFFT_C2R));
}
void CudaFFTCompute::plan2D(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  int nt[2] = {n[1], n[0]};
  cufftCheck(cufftPlanMany(&forwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 2, nt, NULL, 1, 0, NULL, 1, 0, CUFFT_C2R, howmany));
}
void CudaFFTCompute::plan1DX(int *n, int howmany, int flags) {
  forwardType = CUFFT_R2C;
  backwardType = CUFFT_C2R;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_R2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2R, howmany));
}
void CudaFFTCompute::plan1DY(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
}
void CudaFFTCompute::plan1DZ(int *n, int howmany, int flags) {
  forwardType = CUFFT_C2C;
  backwardType = CUFFT_C2C;
  cufftCheck(cufftPlanMany(&forwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
  cufftCheck(cufftPlanMany(&backwardPlan, 1, n, NULL, 0, 0, NULL, 0, 0, CUFFT_C2C, howmany));
}
float* CudaFFTCompute::allocateData(const int dataSizeRequired) {
  float* tmp = NULL;
  allocate_device<float>(&tmp, dataSizeRequired);
  return tmp;
}
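// forward() and backward() execute the single-precision transforms
// asynchronously on the compute stream: the R2C/C2R plans transform between
// the real charge/potential grid and its half-complex representation, and the
// C2C plans handle the 1D y/z pencil stages. The stderr/TestArray_write blocks
// below are debug instrumentation that synchronizes the stream and dumps the
// grids to disk.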
void CudaFFTCompute::forward() {
  if (forwardType == CUFFT_R2C) {
    cufftCheck(cufftExecR2C(forwardPlan, (cufftReal *)dataSrc, (cufftComplex *)dataDst));

    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP FORWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    float *tran;
    allocate_host<float>(&tran, m);
    copy_DtoH<float>(dataDst, tran, m, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("tran_charge_grid_good.bin",
        "transformed charge grid good", tran, m);
    deallocate_host<float>(&tran);
  } else if (forwardType == CUFFT_C2C) {
    cufftCheck(cufftExecC2C(forwardPlan, (cufftComplex *)dataSrc, (cufftComplex *)dataDst, CUFFT_FORWARD));
  } else {
    cudaNAMD_bug("CudaFFTCompute::forward(), unsupported FFT type");
  }
}
void CudaFFTCompute::backward() {
  if (backwardType == CUFFT_C2R) {
    cufftCheck(cufftExecC2R(backwardPlan, (cufftComplex *)dataDst, (cufftReal *)dataSrc));

    cudaCheck(cudaStreamSynchronize(stream));
    fprintf(stderr, "AP BACKWARD FFT\n");
    fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
    float *grid;
    allocate_host<float>(&grid, gridsize);
    copy_DtoH<float>((float*)dataSrc, grid, gridsize, stream);
    cudaCheck(cudaStreamSynchronize(stream));
    TestArray_write<float>("potential_grid_good.bin",
        "potential grid good", grid, gridsize);
    deallocate_host<float>(&grid);
  } else if (backwardType == CUFFT_C2C) {
    cufftCheck(cufftExecC2C(backwardPlan, (cufftComplex *)dataDst, (cufftComplex *)dataSrc, CUFFT_INVERSE));
  } else {
    cudaNAMD_bug("CudaFFTCompute::backward(), unsupported FFT type");
  }
}
void CudaFFTCompute::setStream() {
  cufftCheck(cufftSetStream(forwardPlan, stream));
  cufftCheck(cufftSetStream(backwardPlan, stream));
}
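// CudaPmeKSpaceCompute performs the reciprocal-space part of PME on the
// device: it keeps single-precision copies of the B-spline structure-factor
// moduli (bm1/bm2/bm3) and a host/device pair of EnergyVirial buffers so that
// energy and virial results can be copied back asynchronously and picked up
// later by a polling callback.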
CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock, double kappa,
    int deviceID, cudaStream_t stream, unsigned int iGrid) :
    PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa, iGrid),
    deviceID(deviceID), stream(stream) {

  // Store the B-spline structure-factor moduli on the device in single precision.
  float *bm1f = new float[pmeGrid.K1];
  float *bm2f = new float[pmeGrid.K2];
  float *bm3f = new float[pmeGrid.K3];
  for (int i=0; i < pmeGrid.K1; i++) bm1f[i] = (float)bm1[i];
  for (int i=0; i < pmeGrid.K2; i++) bm2f[i] = (float)bm2[i];
  for (int i=0; i < pmeGrid.K3; i++) bm3f[i] = (float)bm3[i];
  allocate_device<float>(&d_bm1, pmeGrid.K1);
  allocate_device<float>(&d_bm2, pmeGrid.K2);
  allocate_device<float>(&d_bm3, pmeGrid.K3);
  copy_HtoD_sync<float>(bm1f, d_bm1, pmeGrid.K1);
  copy_HtoD_sync<float>(bm2f, d_bm2, pmeGrid.K2);
  copy_HtoD_sync<float>(bm3f, d_bm3, pmeGrid.K3);
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;

  allocate_device<EnergyVirial>(&d_energyVirial, 1);
  allocate_host<EnergyVirial>(&h_energyVirial, 1);
  cudaCheck(cudaEventCreate(&copyEnergyVirialEvent));
}
CudaPmeKSpaceCompute::~CudaPmeKSpaceCompute() {
  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
  deallocate_device<EnergyVirial>(&d_energyVirial);
  deallocate_host<EnergyVirial>(&h_energyVirial);
  cudaCheck(cudaEventDestroy(copyEnergyVirialEvent));
}
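// solve() applies the reciprocal-space ("scalar sum") operator to the
// Fourier-transformed charge grid: each mode is scaled by, essentially,
// exp(-pi^2 m^2 / kappa^2) / m^2 together with the B-spline moduli, and the
// energy and virial are optionally accumulated into d_energyVirial on the
// device. The heavy stderr/TestArray_write output below is debug
// instrumentation.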
void CudaPmeKSpaceCompute::solve(Lattice &lattice, const bool doEnergy, const bool doVirial, float* data) {
  fprintf(stderr, "K-SPACE LATTICE %g %g %g %g %g %g %g %g %g\n",
      lattice.a().x, lattice.a().y, lattice.a().z,
      lattice.b().x, lattice.b().y, lattice.b().z,
      lattice.c().x, lattice.c().y, lattice.c().z);

  const bool doEnergyVirial = (doEnergy || doVirial);
  double volume = lattice.volume();

  int nfft1, nfft2, nfft3;
  int size1, size2, size3;
  float *prefac1, *prefac2, *prefac3;

  float recip1x, recip1y, recip1z;
  float recip2x, recip2y, recip2z;
  float recip3x, recip3y, recip3z;

  switch(permutation) {
  case Perm_cX_Y_Z:
    // nfft*, size*, recip*, and prefac* set for the x-contiguous layout
    break;
  case Perm_Z_cX_Y:
    // nfft*, size*, recip*, and prefac* set for the z-contiguous layout
    break;
  default:
    NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation");
  }

  if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream);

  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP calling scalar sum\n");
  fprintf(stderr, "(permutation == Perm_cX_Y_Z) = %s\n",
      (permutation == Perm_cX_Y_Z) ? "true" : "false");
  fprintf(stderr, "nfft1=%d nfft2=%d nfft3=%d\n", nfft1, nfft2, nfft3);
  fprintf(stderr, "kappa=%g\n", kappa);
  fprintf(stderr, "recip1x=%g recip1y=%g recip1z=%g\n",
      (double)recip1x, (double)recip1y, (double)recip1z);
  fprintf(stderr, "recip2x=%g recip2y=%g recip2z=%g\n",
      (double)recip2x, (double)recip2y, (double)recip2z);
  fprintf(stderr, "recip3x=%g recip3y=%g recip3z=%g\n",
      (double)recip3x, (double)recip3y, (double)recip3z);
  fprintf(stderr, "volume=%g\n", volume);
  fprintf(stderr, "j0=%d k0=%d\n", j0, k0);

  // Dump the single-precision structure-factor moduli used as prefactors.
  float *bm1, *bm2, *bm3;
  allocate_host<float>(&bm1, nfft1);
  allocate_host<float>(&bm2, nfft2);
  allocate_host<float>(&bm3, nfft3);
  copy_DtoH<float>(prefac1, bm1, nfft1, stream);
  copy_DtoH<float>(prefac2, bm2, nfft2, stream);
  copy_DtoH<float>(prefac3, bm3, nfft3, stream);
  TestArray_write<float>("bm1_good.bin", "structure factor bm1 good", bm1, nfft1);
  TestArray_write<float>("bm2_good.bin", "structure factor bm2 good", bm2, nfft2);
  TestArray_write<float>("bm3_good.bin", "structure factor bm3 good", bm3, nfft3);
  deallocate_host<float>(&bm1);
  deallocate_host<float>(&bm2);
  deallocate_host<float>(&bm3);

  scalar_sum((permutation == Perm_cX_Y_Z), nfft1, nfft2, nfft3, size1, size2, size3, kappa,
      recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z,
      volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial,
      &d_energyVirial->energy, d_energyVirial->virial, (float2*)data,
      stream);

  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SCALAR SUM\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  int m = 2 * (nfft1/2 + 1) * nfft2 * nfft3;
  float *tran;
  allocate_host<float>(&tran, m);
  copy_DtoH<float>((float*)data, tran, m, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("tran_potential_grid_good.bin",
      "transformed potential grid good", tran, m);
  deallocate_host<float>(&tran);

  // Start the asynchronous copy of energy and virial back to the host; the
  // recorded event is polled by energyAndVirialCheck().
  if (doEnergyVirial) {
    copy_DtoH<EnergyVirial>(d_energyVirial, h_energyVirial, 1, stream);
    cudaCheck(cudaEventRecord(copyEnergyVirialEvent, stream));
  }
}
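// energyAndVirialCheck() implements a non-blocking completion check: instead
// of synchronizing the stream, it queries copyEnergyVirialEvent and, if the
// asynchronous copy has not finished, re-arms itself through the Charm++
// CcdCallFnAfter timer (every 0.1 time units) until the event completes or a
// poll limit is exceeded.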
void CudaPmeKSpaceCompute::energyAndVirialCheck(void *arg, double walltime) {
  CudaPmeKSpaceCompute* c = (CudaPmeKSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->copyEnergyVirialEvent);
  if (err == cudaSuccess) {
    // Copy has finished: hand the result to whichever pencil requested it.
    if (c->pencilXYZPtr != NULL)
      c->pencilXYZPtr->energyAndVirialDone(c->multipleGridIndex);
    else if (c->pencilZPtr != NULL)
      c->pencilZPtr->energyAndVirialDone(c->multipleGridIndex);
    else
      NAMD_bug("CudaPmeKSpaceCompute::energyAndVirialCheck, pencilXYZPtr and pencilZPtr not set");
    return;
  } else if (err == cudaErrorNotReady) {
    // Not ready yet: count the poll and give up if it never completes.
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeKSpaceCompute::energyAndVirialCheck polled %d times",
          c->checkCount);
      cudaDie(errmsg, err);
    }
  } else {
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeKSpaceCompute::energyAndVirialCheck after polling %d times",
        c->checkCount);
    cudaDie(errmsg, err);
  }

  // Re-arm the callback and poll again later.
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(energyAndVirialCheck, arg, 0.1);
}
void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilXYZ* pencilPtr) {
  pencilXYZPtr = pencilPtr;
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

void CudaPmeKSpaceCompute::energyAndVirialSetCallback(CudaPmePencilZ* pencilPtr) {
  pencilZPtr = pencilPtr;
  CcdCallFnAfter(energyAndVirialCheck, this, 0.1);
}

double CudaPmeKSpaceCompute::getEnergy() {
  return h_energyVirial->energy;
}
void CudaPmeKSpaceCompute::getVirial(double *virial) {
  if (permutation == Perm_Z_cX_Y) {
    // The device virial is stored as a 6-component symmetric tensor; expand it
    // into the full 3x3 matrix for this pencil ordering.
    virial[0] = h_energyVirial->virial[3];
    virial[1] = h_energyVirial->virial[4];
    virial[2] = h_energyVirial->virial[1];

    virial[3] = h_energyVirial->virial[4];
    virial[4] = h_energyVirial->virial[5];
    virial[5] = h_energyVirial->virial[2];

    virial[6] = h_energyVirial->virial[1];
    virial[7] = h_energyVirial->virial[2];  // symmetric partner of virial[5]
    virial[8] = h_energyVirial->virial[0];
  } else {
    virial[0] = h_energyVirial->virial[0];
    virial[1] = h_energyVirial->virial[1];
    virial[2] = h_energyVirial->virial[2];

    virial[3] = h_energyVirial->virial[1];
    virial[4] = h_energyVirial->virial[3];
    virial[5] = h_energyVirial->virial[4];

    virial[6] = h_energyVirial->virial[2];
    virial[7] = h_energyVirial->virial[4];
    virial[8] = h_energyVirial->virial[5];
  }

  fprintf(stderr, "AP PME VIRIAL =\n"
      "  %g %g %g\n  %g %g %g\n  %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);
}
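// CudaPmeRealSpaceCompute handles the real-space half of PME on the device:
// atoms are stored as float4 (x, y, z, q), spread_charge() accumulates the
// B-spline-weighted charges onto the grid, and gather_force() interpolates
// forces back from the inverse-transformed potential grid. The CUDA event
// gatherForceEvent marks when the gathered forces are ready to be copied out.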
CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid,
    const int jblock, const int kblock,
    int deviceID, cudaStream_t stream) :
    PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) {
  if (dataSize < xsize*ysize*zsize)
    NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize");
  cudaCheck(cudaEventCreate(&gatherForceEvent));
}
CudaPmeRealSpaceCompute::~CudaPmeRealSpaceCompute() {
  if (d_atoms != NULL) deallocate_device<CudaAtom>(&d_atoms);
  if (d_force != NULL) deallocate_device<CudaForce>(&d_force);
  deallocate_device<float>(&data);
  cudaCheck(cudaEventDestroy(gatherForceEvent));
}
void CudaPmeRealSpaceCompute::copyAtoms(const int numAtoms, const CudaAtom* atoms) {
  // Grow the device buffer if needed (1.5x slack) and copy the atoms over.
  reallocate_device<CudaAtom>(&d_atoms, &d_atomsCapacity, numAtoms, 1.5f);
  copy_HtoD<CudaAtom>(atoms, d_atoms, numAtoms, stream);

  // Debug dump of the SOA xyzq atom data for this step.
  float *xyzq;
  allocate_host<float>(&xyzq, 4*natoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  char fname[64], remark[64];
  sprintf(fname, "pme_atoms_xyzq_soa_%d.bin", step);
  sprintf(remark, "SOA PME atoms xyzq, step %d\n", step);
  TestArray_write<float>(fname, remark, xyzq, 4*natoms);
  deallocate_host<float>(&xyzq);
}
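// spreadCharge() launches the spread_charge kernel: every atom adds its
// charge, weighted by the product of three order-sized B-spline factors, to
// the surrounding grid points of this pencil (K1 x K2 x K3 full grid,
// xsize/ysize/zsize local sub-grid starting at y0/z0). The parameter dump and
// the grid/atom dumps around the launch are debug instrumentation.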
void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) {
  fprintf(stderr, "Calling spread_charge with parameters:\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));

  // spread_charge kernel launch (arguments listed above)

  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP SPREAD CHARGES\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  float *xyzq;
  allocate_host<float>(&xyzq, 4*numAtoms);
  copy_DtoH<float>((float *)d_atoms, xyzq, 4*numAtoms, stream);
  float *grid;
  allocate_host<float>(&grid, gridlen);
  copy_DtoH<float>(data, grid, gridlen, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("xyzq_good.bin", "xyzq good", xyzq, 4*numAtoms);
  TestArray_write<float>("charge_grid_good.bin", "charge grid good",
      grid, gridlen);
  deallocate_host<float>(&xyzq);
  deallocate_host<float>(&grid);
}
void CudaPmeRealSpaceCompute::cuda_gatherforce_check(void *arg, double walltime) {
  CudaPmeRealSpaceCompute* c = (CudaPmeRealSpaceCompute *)arg;

  cudaError_t err = cudaEventQuery(c->gatherForceEvent);
  if (err == cudaSuccess) {
    // Forces are ready: notify the device compute object.
    c->devicePtr->gatherForceDone(c->multipleGridIndex);
    return;
  } else if (err == cudaErrorNotReady) {
    c->checkCount++;
    if (c->checkCount >= 1000000) {
      char errmsg[256];
      sprintf(errmsg, "CudaPmeRealSpaceCompute::cuda_gatherforce_check polled %d times",
          c->checkCount);
      cudaDie(errmsg, err);
    }
  } else {
    char errmsg[256];
    sprintf(errmsg, "in CudaPmeRealSpaceCompute::cuda_gatherforce_check after polling %d times",
        c->checkCount);
    cudaDie(errmsg, err);
  }

  // Re-arm the callback and poll again later.
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(cuda_gatherforce_check, arg, 0.1);
}
void CudaPmeRealSpaceCompute::gatherForceSetCallback(ComputePmeCUDADevice* devicePtr_in) {
  devicePtr = devicePtr_in;
  CcdCallFnAfter(cuda_gatherforce_check, this, 0.1);
}

void CudaPmeRealSpaceCompute::waitGatherForceDone() {
  cudaCheck(cudaEventSynchronize(gatherForceEvent));
}
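// setupGridData() prepares the potential grid for the gather kernel. On CUDA
// the grid is exposed through a texture object bound to the linear device
// buffer (cudaResourceTypeLinear with a 32-bit float channel); on HIP the raw
// pointer is kept instead. The cached tex_data/grid_data pointers avoid
// re-creating the binding when the same buffer is passed in again.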
void CudaPmeRealSpaceCompute::setupGridData(float* data, int data_len) {
#ifdef NAMD_CUDA
  // Already bound to this buffer? Nothing to do.
  if (tex_data == data && tex_data_len == data_len) return;
  tex_data = data;
  tex_data_len = data_len;
  // Bind the grid to a linear texture object.
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = data;
  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
  resDesc.res.linear.desc.x = sizeof(float)*8;
  resDesc.res.linear.sizeInBytes = data_len*sizeof(float);
  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType;
  cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL));
#else
  if (grid_data == data && grid_data_len == data_len) return;
  grid_data = data;
  grid_data_len = data_len;
#endif
}
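// gatherForce() interpolates forces from the potential grid back onto the
// atoms (reading the grid through gridTexObj on CUDA), stores them in d_force,
// and then starts an asynchronous copy into the caller's CudaForce array.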
void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) {
  reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f);

  fprintf(stderr, "AP gather force arguments\n");
  fprintf(stderr, "numAtoms = %d\n", numAtoms);
  fprintf(stderr, "pmeGrid.K1 = %d\n", pmeGrid.K1);
  fprintf(stderr, "pmeGrid.K2 = %d\n", pmeGrid.K2);
  fprintf(stderr, "pmeGrid.K3 = %d\n", pmeGrid.K3);
  fprintf(stderr, "xsize = %d\n", xsize);
  fprintf(stderr, "ysize = %d\n", ysize);
  fprintf(stderr, "zsize = %d\n", zsize);
  fprintf(stderr, "y0 = %d\n", y0);
  fprintf(stderr, "z0 = %d\n", z0);
  fprintf(stderr, "(pmeGrid.yBlocks == 1) = %d\n", (pmeGrid.yBlocks == 1));
  fprintf(stderr, "(pmeGrid.zBlocks == 1) = %d\n", (pmeGrid.zBlocks == 1));
  fprintf(stderr, "gridTexObj = %p\n", (void *)gridTexObj);

  // gather_force kernel launch (arguments listed above)

  cudaCheck(cudaStreamSynchronize(stream));
  fprintf(stderr, "AP GATHER FORCE\n");
  fprintf(stderr, "COPY DEVICE ARRAYS BACK TO HOST\n");
  float *xyz;
  allocate_host<float>(&xyz, 3*natoms);
  copy_DtoH<float>((float*)d_force, xyz, 3*natoms, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  TestArray_write<float>("gather_force_good.bin",
      "gather force good", xyz, 3*natoms);
  deallocate_host<float>(&xyz);

  // Copy the gathered forces to the caller; gatherForceEvent marks completion.
  copy_DtoH<CudaForce>(d_force, force, numAtoms, stream);
  cudaCheck(cudaEventRecord(gatherForceEvent, stream));
}
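// CudaPmeTranspose reshuffles the complex grid between pencil orientations
// (XYZ -> YZX -> ZXY) so that each 1D FFT stage sees its dimension
// contiguously. The per-block copy parameters are precomputed into
// TransposeBatch tables (batchesYZX / batchesZXY, three permutations each) and
// uploaded once, so a transpose reduces to a single batched kernel launch.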
CudaPmeTranspose::CudaPmeTranspose(PmeGrid pmeGrid, const int permutation,
    const int jblock, const int kblock,
    int deviceID, cudaStream_t stream) :
    PmeTranspose(pmeGrid, permutation, jblock, kblock), deviceID(deviceID), stream(stream) {

  allocate_device<float2>(&d_data, dataSize);
#ifndef P2P_ENABLE_3D
  // Staging buffer for device-to-device copies when 3D peer access is unavailable.
  allocate_device<float2>(&d_buffer, dataSize);
#endif

  dataPtrsYZX.resize(nblock, NULL);
  dataPtrsZXY.resize(nblock, NULL);

  allocate_device< TransposeBatch<float2> >(&batchesYZX, 3*nblock);
  allocate_device< TransposeBatch<float2> >(&batchesZXY, 3*nblock);
}
CudaPmeTranspose::~CudaPmeTranspose() {
  deallocate_device<float2>(&d_data);
#ifndef P2P_ENABLE_3D
  deallocate_device<float2>(&d_buffer);
#endif
  deallocate_device< TransposeBatch<float2> >(&batchesZXY);
  deallocate_device< TransposeBatch<float2> >(&batchesYZX);
}
void CudaPmeTranspose::setDataPtrsYZX(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsYZX.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsYZX, invalid dataPtrsNew size");
  for (int iblock=0; iblock < nblock; iblock++) {
    dataPtrsYZX[iblock] = dataPtrsNew[iblock];
  }

  // Build one TransposeBatch per (permutation, block) pair on the host, then
  // upload the whole table to the device in a single copy.
  TransposeBatch<float2> *h_batchesYZX = new TransposeBatch<float2>[3*nblock];
  for (int iperm=0; iperm < 3; iperm++) {
    int isize_out;
    if (iperm == 0) {
      // output leading dimension for the first pencil orientation
    } else if (iperm == 1) {
      // output leading dimension for the second pencil orientation
    } else {
      // output leading dimension for the third pencil orientation
    }

    int max_nx = 0;
    for (int iblock=0; iblock < nblock; iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      float2* data_out;
      int width_out;
      if (dataPtrsYZX[iblock] == NULL) {
        // Local block: the transpose stays in this device's own buffer.
      } else {
        // Remote block: write into the buffer provided by the peer pencil.
        data_out = dataPtrsYZX[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      // batch input/output pointers and sizes are filled in from the block
      // geometry computed above
      h_batchesYZX[iperm*nblock + iblock] = batch;
    }

    max_nx_YZX[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesYZX, batchesYZX, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesYZX;
}
void CudaPmeTranspose::setDataPtrsZXY(std::vector<float2*>& dataPtrsNew, float2* data) {
  if (dataPtrsZXY.size() != dataPtrsNew.size())
    NAMD_bug("CudaPmeTranspose::setDataPtrsZXY, invalid dataPtrsNew size");
  for (int iblock=0; iblock < nblock; iblock++) {
    dataPtrsZXY[iblock] = dataPtrsNew[iblock];
  }

  // Same batch construction as setDataPtrsYZX, but for the XYZ -> ZXY transpose.
  TransposeBatch<float2> *h_batchesZXY = new TransposeBatch<float2>[3*nblock];
  for (int iperm=0; iperm < 3; iperm++) {
    int isize_out;
    if (iperm == 0) {
      // output leading dimension for the first pencil orientation
    } else if (iperm == 1) {
      // output leading dimension for the second pencil orientation
    } else {
      // output leading dimension for the third pencil orientation
    }

    int max_nx = 0;
    for (int iblock=0; iblock < nblock; iblock++) {

      int x0 = pos[iblock];
      int nx = pos[iblock+1] - x0;
      max_nx = std::max(max_nx, nx);

      float2* data_out;
      int width_out;
      if (dataPtrsZXY[iblock] == NULL) {
        // Local block: the transpose stays in this device's own buffer.
      } else {
        // Remote block: write into the buffer provided by the peer pencil.
        data_out = dataPtrsZXY[iblock];
        width_out = isize_out;
      }

      TransposeBatch<float2> batch;
      // batch input/output pointers and sizes are filled in from the block
      // geometry computed above
      h_batchesZXY[iperm*nblock + iblock] = batch;
    }

    max_nx_ZXY[iperm] = max_nx;
  }

  copy_HtoD< TransposeBatch<float2> >(h_batchesZXY, batchesZXY, 3*nblock, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  delete [] h_batchesZXY;
}
void CudaPmeTranspose::transposeXYZtoYZX(const float2* data) {
  switch(permutation) {
  // (cases launching batchTranspose_xyz_yzx for each pencil orientation)
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoYZX, invalid permutation");
  }
}

void CudaPmeTranspose::transposeXYZtoZXY(const float2* data) {
  switch(permutation) {
  // (cases launching batchTranspose_xyz_zxy for each pencil orientation)
  default:
    NAMD_bug("PmeTranspose::transposeXYZtoZXY, invalid permutation");
  }
}
void CudaPmeTranspose::waitStreamSynchronize() {
  cudaCheck(cudaStreamSynchronize(stream));
}
void CudaPmeTranspose::copyDataDeviceToHost(const int iblock, float2* h_data, const int h_dataSize) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, block index exceeds number of blocks");

  int x0 = pos[iblock];
  int nx = pos[iblock+1] - x0;

  int copySize  = jsize*ksize*nx;
  int copyStart = jsize*ksize*x0;

  if (copyStart + copySize > dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, dataSize exceeded");

  if (copySize > h_dataSize)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToHost, h_dataSize exceeded");

  copy_DtoH<float2>(d_data+copyStart, h_data, copySize, stream);
}
void CudaPmeTranspose::copyDataHostToDevice(const int iblock, float2* data_in, float2* data_out) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataHostToDevice, block index exceeds number of blocks");

  // Extent of this block in the destination pencil layout.
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  copy3D_HtoD<float2>(data_in, data_out,
      /* source/destination offsets and pitches */
      ni, nj, nk, stream);
}
#ifndef P2P_ENABLE_3D
void CudaPmeTranspose::copyDataDeviceToDevice(const int iblock, float2* data_out) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::copyDataDeviceToDevice, block index exceeds number of blocks");

  // Extent of this block in the destination pencil layout.
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  // Without 3D peer copies the block was staged contiguously in d_buffer.
  float2* data_in = d_buffer + i0*nj*nk;

  copy3D_DtoD<float2>(data_in, data_out,
      /* source/destination offsets and pitches */
      ni, nj, nk, stream);
}
#endif
float2* CudaPmeTranspose::getBuffer(const int iblock) {
  if (iblock >= nblock)
    NAMD_bug("CudaPmeTranspose::getBuffer, block index exceeds number of blocks");

  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation, iblock, jblock, kblock, i0, i1, j0, j1, k0, k1);

  int nj = j1-j0+1;
  int nk = k1-k0+1;

  return d_buffer + i0*nj*nk;
}
void CudaPmeTranspose::copyDataToPeerDeviceYZX(const int iblock, int deviceID_out,
    int permutation_out, float2* data_out) {
  // After the XYZ -> YZX transpose the block indices rotate cyclically.
  int iblock_out = jblock;
  int jblock_out = kblock;
  int kblock_out = iblock;
  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}

void CudaPmeTranspose::copyDataToPeerDeviceZXY(const int iblock, int deviceID_out,
    int permutation_out, float2* data_out) {
  // After the XYZ -> ZXY transpose the block indices rotate the other way.
  int iblock_out = kblock;
  int jblock_out = iblock;
  int kblock_out = jblock;
  copyDataToPeerDevice(iblock, iblock_out, jblock_out, kblock_out, deviceID_out, permutation_out, data_out);
}
void CudaPmeTranspose::copyDataToPeerDevice(const int iblock,
    const int iblock_out, const int jblock_out, const int kblock_out,
    int deviceID_out, int permutation_out, float2* data_out) {

  // Extent of the block in the destination device's pencil layout.
  int i0, i1, j0, j1, k0, k1;
  getBlockDim(pmeGrid, permutation_out, iblock_out, jblock_out, kblock_out, i0, i1, j0, j1, k0, k1);

  int ni = i1-i0+1;
  int nj = j1-j0+1;
  int nk = k1-k0+1;

  int isize_out = i1-i0+1;
  int jsize_out = j1-j0+1;

  int x0 = pos[iblock];
  float2* data_in = d_data + jsize*ksize*x0;

#ifndef P2P_ENABLE_3D
  // 3D peer copies unavailable: copy the block as one contiguous chunk.
  copy_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out, ni*nj*nk, stream);
#else
  copy3D_PeerDtoD<float2>(deviceID, deviceID_out, data_in, data_out,
      /* source/destination offsets and pitches */
      isize_out, jsize_out,
      ni, nj, nk, stream);
#endif
}
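// CudaPmeOneDevice runs the whole PME calculation on a single GPU: charge
// spreading, the forward FFT, the reciprocal-space scalar sum, the inverse FFT
// and force gathering all happen on one device and one stream. For alchemical
// simulations (FEP/TI) several charge grids are kept side by side in d_grids
// and their energies, virials and forces are later combined with
// lambda-dependent weights.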
CudaPmeOneDevice::CudaPmeOneDevice(PmeGrid pmeGrid_, int deviceID_, int deviceIndex_) :
  pmeGrid(pmeGrid_), deviceID(deviceID_), deviceIndex(deviceIndex_),
  natoms(0), d_atoms(0), d_forces(0),
  d_grids(0), gridsize(0),
  d_trans(0), transize(0),
  d_bm1(0), d_bm2(0), d_bm3(0),
  self_energy_alch_first_time(true),
  force_scaling_alch_first_time(true),
  selfEnergy(0), selfEnergy_FEP(0), selfEnergy_TI_1(0), selfEnergy_TI_2(0), m_step(0)
{
  cudaCheck(cudaSetDevice(deviceID));

#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
#endif

#ifdef NODEGROUP_FORCE_REGISTER
  DeviceData& devData = cpdata.ckLocalBranch()->devData[deviceIndex];
  devData.f_slow_size = natoms;
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // Each grid slab gets its own texture object, so the per-grid size must be a
  // multiple of the device's texture alignment.
  cudaDeviceProp deviceProp;
  cudaCheck(cudaGetDeviceProperties(&deviceProp, deviceID));
  const int texture_alignment = int(deviceProp.textureAlignment);
  if ((gridsize % texture_alignment) != 0) {
    gridsize = ((gridsize + texture_alignment - 1) / texture_alignment) * texture_alignment;
  }

  for (unsigned int iGrid = 0; iGrid < num_used_grids; iGrid++) {
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = (void*)(d_grids + iGrid * (size_t)gridsize);
    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
    resDesc.res.linear.desc.x = sizeof(float)*8;
    resDesc.res.linear.sizeInBytes = gridsize*sizeof(float);
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;
    cudaCheck(cudaCreateTextureObject(&gridTexObjArrays[iGrid], &resDesc, &texDesc, NULL));
  }
#endif

  // Compute the PME B-spline moduli in double precision, then store them on
  // the device in single precision.
  int k1 = pmeGrid.K1;
  int k2 = pmeGrid.K2;
  int k3 = pmeGrid.K3;
  double *bm1 = new double[k1];
  double *bm2 = new double[k2];
  double *bm3 = new double[k3];
  compute_b_moduli(bm1, k1, pmeGrid.order);
  compute_b_moduli(bm2, k2, pmeGrid.order);
  compute_b_moduli(bm3, k3, pmeGrid.order);
  float *bm1f = new float[k1];
  float *bm2f = new float[k2];
  float *bm3f = new float[k3];
  for (int i=0; i < k1; i++) bm1f[i] = (float) bm1[i];
  for (int i=0; i < k2; i++) bm2f[i] = (float) bm2[i];
  for (int i=0; i < k3; i++) bm3f[i] = (float) bm3[i];
  allocate_device<float>(&d_bm1, k1);
  allocate_device<float>(&d_bm2, k2);
  allocate_device<float>(&d_bm3, k3);
  copy_HtoD_sync<float>(bm1f, d_bm1, k1);
  copy_HtoD_sync<float>(bm2f, d_bm2, k2);
  copy_HtoD_sync<float>(bm3f, d_bm3, k3);
  delete [] bm1;
  delete [] bm2;
  delete [] bm3;
  delete [] bm1f;
  delete [] bm2f;
  delete [] bm3f;
}
CudaPmeOneDevice::~CudaPmeOneDevice() {
  deallocate_device<float4>(&d_atoms);
  deallocate_device<float3>(&d_forces);
  deallocate_device<float2>(&d_trans);
  deallocate_device<float>(&d_grids);

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#if defined(NAMD_CUDA) // only CUDA uses texture objects
  // Tear down the per-grid texture objects created in the constructor.
  for (unsigned int iGrid = 0; iGrid < num_used_grids; iGrid++) {
    cudaCheck(cudaDestroyTextureObject(gridTexObjArrays[iGrid]));
  }
#endif
#endif

  deallocate_device<float>(&d_bm1);
  deallocate_device<float>(&d_bm2);
  deallocate_device<float>(&d_bm3);
}
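// compute() drives one PME evaluation: spread the charges for every active
// grid, run the forward FFT, apply the reciprocal-space scalar sum, run the
// inverse FFT, gather forces, and (for alchemical runs) scale and merge the
// per-grid forces. Energy and virial accumulation is only enabled when
// requested for this step.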
void CudaPmeOneDevice::compute(const Lattice &lattice, int doEnergyVirial, int step) {
  double volume = lattice.volume();

  if (doEnergyVirial) {
    // clear the per-grid energy/virial accumulators (d_energyVirials) before the solve
  }

  // Outside of alchemical runs the Ewald self energy depends only on the
  // charges and kappa, so it is recomputed only when flagged.
  if (updateSelfEnergy && (sim_params.alchOn == false)) {
    // selfEnergy is recomputed here via compute_selfEnergy()
  }

  // Charge spreading and the forward FFT; the spread kernel receives the grid
  // dimensions both as ints and as single-precision floats:
  //   ..., float(k1), (float)k2, (float)k3, order3, ...

  // Reciprocal-space sum over the transformed charge grid (x is the
  // half-complex dimension of size k1/2 + 1).
  scalar_sum(true, k1, k2, k3, (k1/2 + 1), k2, k3,
      kappa, arx, ary, arz, brx, bry, brz, crx, cry, crz, volume,
      /* prefactors, grid offsets, energy/virial pointers, grid data, stream */);

  // Inverse FFT and force gathering follow; when requested, the per-grid
  // energies and virials are copied back and combined in finishReduction().
  if (doEnergyVirial) {
    // asynchronous copy of d_energyVirials back to h_energyVirials
  }

  scaleAndMergeForce(m_step);
}
void CudaPmeOneDevice::finishReduction(bool doEnergyVirial) {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  PatchData* patchData = cpdata.ckLocalBranch();
  NodeReduction* reduction = patchData->reduction;

  // Combine the per-grid energies and virials (lambda-weighted for FEP/TI)
  // and push the totals into the node-level reduction.
  double energy, energy_F, energy_TI_1, energy_TI_2;
  double virial[9];

  fprintf(stderr, "PME VIRIAL =\n"
      "  %g %g %g\n  %g %g %g\n  %g %g %g\n",
      virial[0], virial[1], virial[2], virial[3], virial[4],
      virial[5], virial[6], virial[7], virial[8]);

  (*reduction)[REDUCTION_VIRIAL_SLOW_XX] += virial[0];
  (*reduction)[REDUCTION_VIRIAL_SLOW_XY] += virial[1];
  (*reduction)[REDUCTION_VIRIAL_SLOW_XZ] += virial[2];
  (*reduction)[REDUCTION_VIRIAL_SLOW_YX] += virial[3];
  (*reduction)[REDUCTION_VIRIAL_SLOW_YY] += virial[4];
  (*reduction)[REDUCTION_VIRIAL_SLOW_YZ] += virial[5];
  (*reduction)[REDUCTION_VIRIAL_SLOW_ZX] += virial[6];
  (*reduction)[REDUCTION_VIRIAL_SLOW_ZY] += virial[7];
  (*reduction)[REDUCTION_VIRIAL_SLOW_ZZ] += virial[8];
}
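// The Ewald self-energy correction removes each charge's interaction with its
// own screening Gaussian; up to the Coulomb constant it is
//   E_self = -(kappa / sqrt(pi)) * sum_i q_i^2,
// so it depends only on the charges and kappa. For FEP/TI the sum is taken per
// alchemical partition and weighted by the electrostatic lambdas, which is
// what the wrapper kernels called below compute.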
void CudaPmeOneDevice::calcSelfEnergyAlch(int step) {
  // Recompute the alchemical self energies only when an electrostatic lambda
  // has changed (or on the first call).
  if (sim_params.alchFepOn) {
    if ((lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
        (lambda2Down != sim_params.getElecLambda(1.0 - alchLambda2)) ||
        self_energy_alch_first_time) {
      calcSelfEnergyFEPWrapper(d_selfEnergy, d_selfEnergy_FEP, selfEnergy, selfEnergy_FEP,
          d_atoms, d_partition, natoms,
          kappa, sim_params.alchDecouple, lambda1Up, lambda2Up, lambda1Down, lambda2Down,
          stream);
      self_energy_alch_first_time = false;
    }
  } else if (sim_params.alchThermIntOn) {
    if ((lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
        self_energy_alch_first_time) {
      calcSelfEnergyTIWrapper(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2,
          selfEnergy, selfEnergy_TI_1, selfEnergy_TI_2,
          d_atoms, d_partition, natoms,
          kappa, sim_params.alchDecouple, lambda1Up, lambda1Down,
          stream);
      self_energy_alch_first_time = false;
    }
  }
}
void CudaPmeOneDevice::scaleAndMergeForce(int step) {
  // Per-grid scaling factors used to merge the alchemical grids' forces into a
  // single force array; recomputed only when the lambdas change.
  if ((lambda1Down != sim_params.getElecLambda(1.0 - alchLambda1)) ||
      force_scaling_alch_first_time) {
    scale_factors[0] = lambda1Up;
    scale_factors[1] = lambda1Down;
    if (sim_params.alchDecouple) {
      scale_factors[2] = 1.0 - lambda1Up;
      scale_factors[3] = 1.0 - lambda1Down;
    }
    // The last grid carries the correction term shared by both partitions.
    scale_factors[num_used_grids-1] = (lambda1Up + lambda1Down - 1.0) * (-1.0);
    copy_HtoD<float>(scale_factors, d_scaling_factors, num_used_grids, stream);
    force_scaling_alch_first_time = false;
  }
  scaleAndMergeForceWrapper(d_forces, d_scaling_factors, num_used_grids, natoms, stream);
}
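// For FEP the total reciprocal-space energy is assembled from the per-grid
// results as
//   E = l_up*E0 + l_down*E1 + (1-l_up)*E2 + (1-l_down)*E3 - (l_up+l_down-1)*E4
// in the decoupled case (without decoupling, grid 2 plays the role of the
// correction term), and E_F uses the same combination with the
// comparison-window lambdas. The virials are combined with the same weights,
// expanding each 6-component symmetric tensor into the full 3x3 matrix.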
void CudaPmeOneDevice::scaleAndComputeFEPEnergyVirials(const EnergyVirial* energyVirials, int step,
    double& energy, double& energy_F, double (&virial)[9]) {
  double scale1 = 1.0;
  double scale2 = 1.0;
  energy = 0.0;
  energy_F = 0.0;
  for (unsigned int i = 0; i < 9; ++i) {
    virial[i] = 0.0;
  }

  // elecLambdaUp/Down and elecLambda2Up/Down are the electrostatic lambdas of
  // the two alchemical partitions for the current and comparison windows.

  // Grids 0 and 1: the "up" and "down" partitions.
  energy += energyVirials[0].energy * elecLambdaUp;
  energy_F += energyVirials[0].energy * elecLambda2Up;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_F += energyVirials[1].energy * elecLambda2Down;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;

  if (sim_params.alchDecouple) {
    // Grids 2 and 3: decoupled parts; grid 4: the shared correction term.
    energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
    energy_F += energyVirials[2].energy * (1.0 - elecLambda2Up);
    energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
    energy_F += energyVirials[3].energy * (1.0 - elecLambda2Down);
    virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
    virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
    virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
    virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
    virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
    virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
    virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
    virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
    virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
    virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
    virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
    virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
    virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
    virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
    virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
    virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
    virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
    virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);

    energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    energy_F += energyVirials[4].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
    virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  } else {
    // Without decoupling, grid 2 carries the shared correction term.
    energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    energy_F += energyVirials[2].energy * (-1.0 * (elecLambda2Up + elecLambda2Down - 1.0));
    virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  }
}
void CudaPmeOneDevice::scaleAndComputeTIEnergyVirials(const EnergyVirial* energyVirials, int step,
    double& energy, double& energy_TI_1, double& energy_TI_2, double (&virial)[9]) {
  double scale1 = 1.0;
  energy = 0.0;
  energy_TI_1 = 0.0;
  energy_TI_2 = 0.0;
  for (unsigned int i = 0; i < 9; ++i) {
    virial[i] = 0.0;
  }

  // Grids 0 and 1: the "up" and "down" partitions; the TI derivative terms
  // accumulate the unscaled per-partition energies.
  energy += energyVirials[0].energy * elecLambdaUp;
  energy += energyVirials[1].energy * elecLambdaDown;
  energy_TI_1 += energyVirials[0].energy;
  energy_TI_2 += energyVirials[1].energy;
  virial[0] += energyVirials[0].virial[0] * elecLambdaUp;
  virial[1] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[2] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[3] += energyVirials[0].virial[1] * elecLambdaUp;
  virial[4] += energyVirials[0].virial[3] * elecLambdaUp;
  virial[5] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[6] += energyVirials[0].virial[2] * elecLambdaUp;
  virial[7] += energyVirials[0].virial[4] * elecLambdaUp;
  virial[8] += energyVirials[0].virial[5] * elecLambdaUp;
  virial[0] += energyVirials[1].virial[0] * elecLambdaDown;
  virial[1] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[2] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[3] += energyVirials[1].virial[1] * elecLambdaDown;
  virial[4] += energyVirials[1].virial[3] * elecLambdaDown;
  virial[5] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[6] += energyVirials[1].virial[2] * elecLambdaDown;
  virial[7] += energyVirials[1].virial[4] * elecLambdaDown;
  virial[8] += energyVirials[1].virial[5] * elecLambdaDown;

  if (sim_params.alchDecouple) {
    // Grids 2 and 3: decoupled parts; grid 4: the shared correction term.
    energy += energyVirials[2].energy * (1.0 - elecLambdaUp);
    energy += energyVirials[3].energy * (1.0 - elecLambdaDown);
    energy_TI_1 += -1.0 * energyVirials[2].energy;
    energy_TI_2 += -1.0 * energyVirials[3].energy;
    virial[0] += energyVirials[2].virial[0] * (1.0 - elecLambdaUp);
    virial[1] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
    virial[2] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
    virial[3] += energyVirials[2].virial[1] * (1.0 - elecLambdaUp);
    virial[4] += energyVirials[2].virial[3] * (1.0 - elecLambdaUp);
    virial[5] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
    virial[6] += energyVirials[2].virial[2] * (1.0 - elecLambdaUp);
    virial[7] += energyVirials[2].virial[4] * (1.0 - elecLambdaUp);
    virial[8] += energyVirials[2].virial[5] * (1.0 - elecLambdaUp);
    virial[0] += energyVirials[3].virial[0] * (1.0 - elecLambdaDown);
    virial[1] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
    virial[2] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
    virial[3] += energyVirials[3].virial[1] * (1.0 - elecLambdaDown);
    virial[4] += energyVirials[3].virial[3] * (1.0 - elecLambdaDown);
    virial[5] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
    virial[6] += energyVirials[3].virial[2] * (1.0 - elecLambdaDown);
    virial[7] += energyVirials[3].virial[4] * (1.0 - elecLambdaDown);
    virial[8] += energyVirials[3].virial[5] * (1.0 - elecLambdaDown);
    energy += energyVirials[4].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    energy_TI_1 += -1.0 * energyVirials[4].energy;
    energy_TI_2 += -1.0 * energyVirials[4].energy;
    virial[0] += energyVirials[4].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[1] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[2] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[3] += energyVirials[4].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[4] += energyVirials[4].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[5] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[6] += energyVirials[4].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[7] += energyVirials[4].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[8] += energyVirials[4].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  } else {
    // Without decoupling, grid 2 carries the shared correction term.
    energy += energyVirials[2].energy * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    energy_TI_1 += -1.0 * energyVirials[2].energy;
    energy_TI_2 += -1.0 * energyVirials[2].energy;
    virial[0] += energyVirials[2].virial[0] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[1] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[2] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[3] += energyVirials[2].virial[1] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[4] += energyVirials[2].virial[3] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[5] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[6] += energyVirials[2].virial[2] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[7] += energyVirials[2].virial[4] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
    virial[8] += energyVirials[2].virial[5] * (-1.0 * (elecLambdaUp + elecLambdaDown - 1.0));
  }
}
// CudaPmeOneDevice::getShiftedGrid(const double x, const int grid) maps a
// scaled coordinate onto the grid; its body scales the wrapped coordinate by
// the grid size, e.g.:
//   double gw = w * grid;
int CudaPmeOneDevice::computeSharedMemoryPatchLevelSpreadCharge(
    const int numThreads, const int3 patchGridDim, const int order) {
  // Shared memory for the patch-level spread-charge kernel: the patch-local
  // grid, the per-thread B-spline theta storage (thetaBytes, proportional to
  // order * sizeof(float) per thread), and per-thread grid indices.
  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
  const int indexBytes = numThreads * sizeof(char4);
  return gridBytes + thetaBytes + indexBytes;
}
int CudaPmeOneDevice::computeSharedMemoryPatchLevelGatherForce(
    const int numThreads, const int3 patchGridDim, const int order) {
  // Shared memory for the patch-level gather-force kernel: the patch-local
  // grid plus the per-thread B-spline storage (thetaBytes).
  const int gridBytes = patchGridDim.x * patchGridDim.y * patchGridDim.z * sizeof(float);
  return gridBytes + thetaBytes;
}
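// For reference, a byte count like the ones above is what gets passed as the
// dynamic shared-memory argument of a kernel launch. An illustrative sketch
// (hypothetical kernel name, not the actual launch code in this file):
//
//   int sharedBytes = computeSharedMemoryPatchLevelSpreadCharge(numThreads, patchGridDim, order);
//   // opt in to more than 48 KB of dynamic shared memory if the device allows it
//   cudaFuncSetAttribute(spreadChargeKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedBytes);
//   spreadChargeKernel<<<numBlocks, numThreads, sharedBytes, stream>>>(/* args */);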
void CudaPmeOneDevice::checkPatchLevelSimParamCompatibility(const int order,
    const bool periodicY, const bool periodicZ) {
  // The patch-level (shared-memory) PME path is only used for 8th-order
  // interpolation with periodicity in y and z.
  bool use = true;
  use = use && (order == 8);
  use = use && (periodicY);
  use = use && (periodicZ);
  patchLevelPmeData.simulationCompatible = use;
}
void CudaPmeOneDevice::checkPatchLevelDeviceCompatibility() {
  // Worst-case patch grid dimensions used to size the shared-memory requests.
  const int3 constexprPatchGridDim = make_int3(
      PatchLevelPmeData::kPatchGridDim,
      PatchLevelPmeData::kPatchGridDim,
      PatchLevelPmeData::kPatchGridDimPad);
  patchLevelPmeData.spreadChargeSharedBytes = computeSharedMemoryPatchLevelSpreadCharge(
      PatchLevelPmeData::kNumThreads, constexprPatchGridDim, 8);
  patchLevelPmeData.gatherForceSharedBytes = computeSharedMemoryPatchLevelGatherForce(
      PatchLevelPmeData::kNumThreads, constexprPatchGridDim, 8);
}
void CudaPmeOneDevice::checkPatchLevelLatticeCompatibilityAndComputeOffsets(
    const Lattice &lattice, const int numPatches, const CudaLocalRecord* localRecords,
    double3* patchMin, double3* patchMax, double3* awayDists) {

  for (int i = 0; i < numPatches; i++) {
    const double3 pmin = patchMin[i];
    const double3 pmax = patchMax[i];
    double3 width = pmax - pmin;

    // Margin around each patch, matching the host-side patch-margin formula.
    double3 marginVal;
    marginVal.x = 0.5 * (awayDists[i].x - simParams->cutoff / sysdima);
    marginVal.y = 0.5 * (awayDists[i].y - simParams->cutoff / sysdimb);
    marginVal.z = 0.5 * (awayDists[i].z - simParams->cutoff / sysdimc);

    // Atoms may sit up to one margin outside the patch bounds.
    double3 minAtom = pmin - marginVal;
    double3 maxAtom = pmax + marginVal;

    // Patch grid extent: the span of grid points covered by the patch plus the
    // B-spline support of `order` points.
    gridWidth.x = gridMax.x - gridMin.x + order;
    gridWidth.y = gridMax.y - gridMin.y + order;
    gridWidth.z = gridMax.z - gridMin.z + order;
  }

  // Upload the per-patch grid offsets and wait for the copy to finish.
  copy_HtoD<int3>(h_patchGridOffsets, d_patchGridOffsets, numPatches, nullptr);
  cudaCheck(cudaStreamSynchronize(nullptr));
}