2 #define _USE_MATH_DEFINES
10 #include <hip/hip_runtime.h>
16 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// Device exponential, double-precision overload.
// NOTE(review): the body is not visible in this excerpt — presumably returns
// exp(x); confirm against the full source before relying on this.
50 __forceinline__ __device__
double expfunc(
const double x) {
// Device exponential, single-precision overload.
// NOTE(review): the body is not visible in this excerpt — presumably returns
// expf(x) or the fast __expf(x); confirm against the full source.
54 __forceinline__ __device__
float expfunc(
const float x) {
// scalar_sum_kernel: PME reciprocal-space "scalar sum". Each thread walks a
// strided subset of k-space grid elements, scales them by the B-spline
// modulus prefactors and the Ewald Gaussian, and (when calc_energy_virial)
// accumulates reciprocal-space energy and virial.
// NOTE(review): fragmented excerpt — many original lines (signature head with
// nfft1..3 and the T2* data argument, the main loop structure, shuffle
// bodies, the data[pos] update) are missing from this view; comments describe
// only what is shown.
63 template <
typename T,
typename T2,
bool calc_energy_virial,
bool orderXYZ,
bool doOrtho>
65 const int size1,
const int size2,
const int size3,
66 const T recip1x,
const T recip1y,
const T recip1z,
67 const T recip2x,
const T recip2y,
const T recip2z,
68 const T recip3x,
const T recip3y,
const T recip3z,
// Per-dimension B-spline modulus prefactor tables (staged into shared memory).
69 const T* prefac1,
const T* prefac2,
const T* prefac3,
// Host passes fac = pi^2/kappa^2 as pi_ewald and 1/(pi*volume) as piv_inv
// (see host-side scalar_sum wrapper below).
70 const T pi_ewald,
const T piv_inv,
71 const int k2_00,
const int k3_00,
73 double* __restrict__ energy_recip,
74 double* __restrict__ virial) {
// Dynamic shared memory holds the three prefactor tables back-to-back;
// CUDA and HIP spell the declaration differently (presumably under #if/#else,
// not visible here).
77 extern __shared__ T sh_prefac[];
79 HIP_DYNAMIC_SHARED( T, sh_prefac)
83 T* sh_prefac1 = (T *)&sh_prefac[0];
84 T* sh_prefac2 = (T *)&sh_prefac[nfft1];
85 T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
// Decompose the flat global thread index into (k1, k2, k3) grid coordinates.
// NOTE(review): blockIdx.x*blockDim.x is a 32-bit product — assumes the total
// thread count stays below 2^32; confirm against launch configuration.
88 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
89 int k3 = tid/(size1*size2);
90 tid -= k3*size1*size2;
92 int k1 = tid - k2*size1;
95 int pos = k1 + (k2 + k3*size2)*size1;
// Upper bounds include this rank's slab offsets (k2_00, k3_00).
102 const int lim2 = size2 + k2_00;
103 const int lim3 = size3 + k3_00;
// Per-iteration grid-wide stride, likewise decomposed into per-axis increments.
106 int tot_inc = blockDim.x*gridDim.x;
107 const int k3_inc = tot_inc/(size1*size2);
108 tot_inc -= k3_inc*size1*size2;
109 const int k2_inc = tot_inc/size1;
110 const int k1_inc = tot_inc - k2_inc*size1;
// The k = (0,0,0) term is excluded from the Ewald reciprocal sum.
113 if (k1 == 0 && k2 == 0 && k3 == 0) {
132 pos += k3_inc*size1*size2;
// Stage the prefactor tables into shared memory (loop heads and the
// __syncthreads() that must follow are not visible in this excerpt).
139 sh_prefac1[t] = prefac1[t];
144 sh_prefac2[t] = prefac2[t];
149 sh_prefac3[t] = prefac3[t];
// Per-thread virial accumulators, kept in double for accuracy.
156 double virial0 = 0.0;
157 double virial1 = 0.0;
158 double virial2 = 0.0;
159 double virial3 = 0.0;
160 double virial4 = 0.0;
161 double virial5 = 0.0;
// Nyquist indices. nfft1_half is forced to -1 for odd nfft1 so the
// k1 == nfft1_half half-weighting below can never trigger.
166 const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
167 const int nfft2_half = nfft2/2;
168 const int nfft3_half = nfft3/2;
// Map unsigned grid indices to signed ("wrapped") reciprocal indices.
174 int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
175 int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
// m = reciprocal-lattice vector for (k1, k2s, k3s).
183 m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
184 m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
185 m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
188 T msq = m1*m1 + m2*m2 + m3*m3;
189 T msq_inv = ((T)1)/msq;
// Combined B-spline structure factor; |q|^2 is doubled because only half the
// (conjugate-symmetric) grid is traversed, and boundary k1 planes that appear
// once get the compensating 0.5.
191 T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
192 T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
193 if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
194 T xp3 =
expfunc(-pi_ewald*msq)*piv_inv;
// Compile-time branch: energy/virial accumulation.
198 if (calc_energy_virial) {
200 T vir = ((T)2)*(pi_ewald + msq_inv);
202 virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
203 virial1 += (double)(fac*vir*m1*m2);
204 virial2 += (double)(fac*vir*m1*m3);
205 virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
206 virial4 += (double)(fac*vir*m2*m3);
207 virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
228 pos += k3_inc*size2*size1;
// Block-level reduction: first within each warp (shuffle loop), then across
// warps through the shared sh_ev staging area.
232 if (calc_energy_virial) {
233 const int tid = threadIdx.x & (warpSize-1);
234 const int base = (threadIdx.x/warpSize);
237 for (
int d=warpSize/2;d >= 1;d /= 2) {
// Warp leaders publish their partial sums to shared memory.
257 sh_ev[base].
energy = energy;
258 sh_ev[base].
virial[0] = virial0;
259 sh_ev[base].
virial[1] = virial1;
260 sh_ev[base].
virial[2] = virial2;
261 sh_ev[base].
virial[3] = virial3;
262 sh_ev[base].
virial[4] = virial4;
263 sh_ev[base].
virial[5] = virial5;
// First warp reloads the per-warp partials (zero for lanes past the last
// warp) and reduces them with a second shuffle pass.
267 energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
268 virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
269 virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
270 virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
271 virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
272 virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
273 virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
274 for (
int d=warpSize/2;d >= 1;d /= 2) {
// Thread 0 commits the block's contribution; the 0.5 undoes the factor of 2
// applied per grid element above.
292 if (threadIdx.x == 0) {
293 atomicAdd(energy_recip, energy*0.5);
300 atomicAdd(&virial[0], virial0);
301 atomicAdd(&virial[1], virial1);
302 atomicAdd(&virial[2], virial2);
303 atomicAdd(&virial[3], virial3);
304 atomicAdd(&virial[4], virial4);
305 atomicAdd(&virial[5], virial5);
// Atomically accumulate one spread-charge contribution into the grid.
// NOTE(review): the HIP guard `(HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3)`
// is false for HIP 4.0 (minor == 0) and any later major release with a small
// minor version. If atomicAddNoRet is intended for "HIP newer than 3.3" the
// condition should be (MAJOR > 3) || (MAJOR == 3 && MINOR > 3) — confirm
// against the full source and the targeted ROCm versions.
312 template <
typename T>
313 __forceinline__ __device__
void write_grid(
const float val,
const int ind,
315 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3))
// atomicAddNoRet: no-return-value atomic add (HIP-specific fast path).
316 atomicAddNoRet(&data[ind], (T)val);
318 atomicAdd(&data[ind], (T)val);
// Evaluate the order-`order` cardinal B-spline weights theta[0..order-1] at
// fractional offset w, via the standard stable order-raising recursion.
// NOTE(review): fragmented excerpt — the theta[1] seed and parts of the final
// order-raising pass are not visible here.
325 template <
typename T,
int order>
// Recursion seeds: highest weight starts at 0, lowest at 1-w.
328 theta[
order-1] = ((T)0);
330 theta[0] = ((T)1) - w;
// Raise the spline order step by step: k = 3 .. order-1.
333 for (
int k=3;k <=
order-1;k++) {
334 T div = ((T)1) / (T)(k-1);
335 theta[k-1] = div*w*theta[k-2];
337 for (
int j=1;j <= k-2;j++) {
338 theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
340 theta[0] = div*(((T)1) - w)*theta[0];
// Final pass lifts the spline to the requested order.
344 T div = ((T)1) / (T)(
order-1);
347 for (
int j=1;j <=
order-2;j++) {
351 theta[0] = div*(((T)1) - w)*theta[0];
// Evaluate B-spline weights (theta) and their derivatives (dtheta) for all
// three axes at once, given fractional offsets wx/wy/wz. Same recursion as
// calc_one_theta, vectorized over .x/.y/.z of the T3 element type.
// NOTE(review): fragmented excerpt — loop interiors of the final pass are not
// fully visible.
357 template <
typename T,
typename T3,
int order>
// Recursion seeds per axis.
360 theta[
order-1].x = ((T)0);
361 theta[
order-1].y = ((T)0);
362 theta[
order-1].z = ((T)0);
366 theta[0].x = ((T)1) - wx;
367 theta[0].y = ((T)1) - wy;
368 theta[0].z = ((T)1) - wz;
// Raise spline order: k = 3 .. order-1.
371 for (
int k=3;k <=
order-1;k++) {
372 T div = ((T)1) / (T)(k-1);
373 theta[k-1].x = div*wx*theta[k-2].x;
374 theta[k-1].y = div*wy*theta[k-2].y;
375 theta[k-1].z = div*wz*theta[k-2].z;
377 for (
int j=1;j <= k-2;j++) {
378 theta[k-j-1].x = div*((wx + j)*theta[k-j-2].
x + (k-j-wx)*theta[k-j-1].x);
379 theta[k-j-1].y = div*((wy + j)*theta[k-j-2].
y + (k-j-wy)*theta[k-j-1].y);
380 theta[k-j-1].z = div*((wz + j)*theta[k-j-2].
z + (k-j-wz)*theta[k-j-1].z);
382 theta[0].x = div*(((T)1) - wx)*theta[0].
x;
383 theta[0].y = div*(((T)1) - wy)*theta[0].
y;
384 theta[0].z = div*(((T)1) - wz)*theta[0].
z;
// Derivatives before the final order-raise: dtheta[j] = theta[j-1] - theta[j]
// (finite-difference property of B-splines), with dtheta[0] = -theta[0].
388 dtheta[0].x = -theta[0].x;
389 dtheta[0].y = -theta[0].y;
390 dtheta[0].z = -theta[0].z;
392 for (
int j=2;j <=
order;j++) {
393 dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
394 dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
395 dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
// Final order-raising pass for theta itself.
399 T div = ((T)1) / (T)(
order-1);
404 for (
int j=1;j <=
order-2;j++) {
410 theta[0].x = div*(((T)1) - wx)*theta[0].
x;
411 theta[0].y = div*(((T)1) - wy)*theta[0].
y;
412 theta[0].z = div*(((T)1) - wz)*theta[0].
z;
// spread_charge_kernel: interpolate each atom's charge onto an order^3
// neighborhood of the PME grid using B-spline weights. Phase 1 (threadIdx.y
// == 0): compute per-atom grid cell and theta weights into shared memory.
// Phase 2: all threads cooperatively scatter order^3 products per atom with
// atomic adds (write_grid).
// NOTE(review): fragmented excerpt — the signature head, shared-memory
// declarations, frxi/fryi/frzi computation, __syncthreads(), and y0/z0
// derivation are not visible here.
420 template <
typename AT,
int order,
bool periodicY,
bool periodicZ>
423 const int nfftx,
const int nffty,
const int nfftz,
424 const int xsize,
const int ysize,
const int zsize,
425 const int xdim,
const int y00,
const int z00,
// One atom per thread in row y == 0; pos_end clamps the last block.
440 const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
441 const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
443 if (pos < pos_end && threadIdx.y == 0) {
445 float4 xyzqi = xyzq[pos];
// Fractional coordinates scaled to grid units.
451 float frx = ((float)nfftx)*
x;
452 float fry = ((float)nffty)*
y;
453 float frz = ((float)nfftz)*
z;
459 float wx = frx - (float)frxi;
460 float wy = fry - (float)fryi;
461 float wz = frz - (float)frzi;
// Non-periodic axes on the rank owning the origin wrap overflow back.
463 if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
464 if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
// Publish grid cell (local to this rank's slab) and weights to shared memory.
466 sh_ix[threadIdx.x] = frxi;
467 sh_iy[threadIdx.x] = fryi - y00;
468 sh_iz[threadIdx.x] = frzi - z00;
// x-weights are pre-multiplied by the charge q.
472 calc_one_theta<float, order>(wx, theta);
474 for (
int i=0;i <
order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
476 calc_one_theta<float, order>(wy, theta);
478 for (
int i=0;i <
order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
480 calc_one_theta<float, order>(wz, theta);
482 for (
int i=0;i <
order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
// Phase 2: each group of order^3 threads handles one atom's stencil point
// (x0, y0, z0); groups stride over the block's atoms.
491 const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3;
492 const int x0 = tid %
order;
497 int iadd = blockDim.x*blockDim.y/order3;
498 int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
499 int iend = pos_end - blockIdx.x*blockDim.x;
500 for (;i < iend;i += iadd) {
501 int x = sh_ix[i] + x0;
502 int y = sh_iy[i] + y0;
503 int z = sh_iz[i] + z0;
// Periodic wrap (x always periodic) or skip out-of-slab points.
505 if (x >= nfftx) x -= nfftx;
507 if (periodicY && (y >= nffty)) y -= nffty;
508 if (!periodicY && (y < 0 || y >= ysize))
continue;
510 if (periodicZ && (z >= nfftz)) z -= nfftz;
511 if (!periodicZ && (z < 0 || z >= zsize))
continue;
514 int ind = x + xdim*(y + ysize*(
z));
// Atomic accumulate: q*thetax already folded into sh_thetax.
521 write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
// gather_force_store: write one atom's (fx, fy, fz) into the force array.
// Primary template below plus two specializations: float (SoA layout with
// `stride` between components, or xyz-interleaved via reinterpret_cast) and
// float3 (interleaved xyz). NOTE(review): fragmented excerpt — the #if/#else
// lines selecting between the interleaved and strided stores are not visible.
528 template <
typename T> __forceinline__ __device__
530 const int stride,
const int pos,
// float specialization.
535 template <> __forceinline__ __device__
537 const int stride,
const int pos,
// Interleaved store: force treated as packed xyz triples.
543 reinterpret_cast<float*
>(force)[pos * 3 + 0] = fx;
544 reinterpret_cast<float*
>(force)[pos * 3 + 1] = fy;
545 reinterpret_cast<float*
>(force)[pos * 3 + 2] = fz;
// Strided (SoA) store: x at pos, y at pos+stride, z at pos+2*stride.
548 force[pos+stride] = fy;
549 force[pos+stride*2] = fz;
// float3 specialization: always packed xyz triples.
554 template <> __forceinline__ __device__
556 const int stride,
const int pos,
562 reinterpret_cast<float*
>(force)[pos * 3 + 0] = fx;
563 reinterpret_cast<float*
>(force)[pos * 3 + 1] = fy;
564 reinterpret_cast<float*
>(force)[pos * 3 + 2] = fz;
// (Stray head of a separate definition at original line 574 — its body is not
// visible in this excerpt.)
574 template <
typename T,
int order>
// gather_force: interpolate forces back from the PME grid. Phase 1
// (threadIdx.y == 0): per-atom cell, charge, and theta/dtheta weights into
// shared memory. Phase 2: 8-thread groups each cover one octant of the
// order^3 stencil (nsc points per axis) and accumulate f1/f2/f3 partials.
// NOTE(review): fragmented excerpt — the signature head, shmem declaration,
// frxi/fryi/frzi computation, base-loop setup, the cross-thread f1/f2/f3
// reduction, and the #else of the __CUDA_ARCH__ guard's pairing are not
// visible here.
596 template <
typename CT,
typename FT,
int order,
bool periodicY,
bool periodicZ>
598 const int nfftx,
const int nffty,
const int nfftz,
599 const int xsize,
const int ysize,
const int zsize,
600 const int xdim,
const int y00,
const int z00,
604 const cudaTextureObject_t gridTexObj,
609 const int tid = threadIdx.x + threadIdx.y*blockDim.x;
// One atom per thread in row y == 0; pos_end clamps the last block.
614 const int pos = blockIdx.x*blockDim.x + threadIdx.x;
615 const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
618 if (pos < pos_end && threadIdx.y == 0) {
620 float4 xyzqi = xyzq[pos];
// Fractional coordinates scaled to grid units.
626 float frx = ((float)nfftx)*
x;
627 float fry = ((float)nffty)*
y;
628 float frz = ((float)nfftz)*
z;
634 float wx = frx - (float)frxi;
635 float wy = fry - (float)fryi;
636 float wz = frz - (float)frzi;
// Non-periodic axes on the rank owning the origin wrap overflow back.
638 if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
639 if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
641 shmem[threadIdx.x].ix = frxi;
642 shmem[threadIdx.x].iy = fryi - y00;
643 shmem[threadIdx.x].iz = frzi - z00;
644 shmem[threadIdx.x].charge = q;
// Weights and derivative weights for all three axes.
646 float3 theta_tmp[
order];
647 float3 dtheta_tmp[
order];
648 calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
651 for (
int i=0;i <
order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
654 for (
int i=0;i <
order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
657 for (
int i=0;i <
order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
660 for (
int i=0;i <
order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
663 for (
int i=0;i <
order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
666 for (
int i=0;i <
order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
// nsc = stencil points per axis per octant thread: order/2 for 4 and 6,
// 4 for order 8.
677 const int nsc = (
order == 4) ? 2 : ((
order == 6) ? 3 : 4);
// 8 threads per atom; (tx0, ty0, tz0) selects this thread's octant origin.
680 const int t = (tid % 8);
683 const int tz0 = (t / 4)*nsc;
684 const int ty0 = ((t / 2) % 2)*nsc;
685 const int tx0 = (t % 2)*nsc;
699 const int base_end = pos_end - blockIdx.x*blockDim.x;
// Walk the block's atoms; each group of 8 threads covers one atom per pass.
705 while (base < base_end) {
710 int ix0 = shmem[base].ix;
711 int iy0 = shmem[base].iy;
712 int iz0 = shmem[base].iz;
// Accumulate over this thread's nsc^3 stencil sub-cube.
716 for (
int i=0;i < nsc*nsc*nsc;i++) {
717 int tz = tz0 + (i/(nsc*nsc));
718 int ty = ty0 + ((i/nsc) % nsc);
719 int tx = tx0 + (i % nsc);
// Periodic wrap (x always periodic) or skip out-of-slab points.
724 if (ix >= nfftx) ix -= nfftx;
726 if (periodicY && (iy >= nffty)) iy -= nffty;
727 if (!periodicY && (iy < 0 || iy >= ysize))
continue;
729 if (periodicZ && (iz >= nfftz)) iz -= nfftz;
730 if (!periodicZ && (iz < 0 || iz >= zsize))
continue;
732 int ind = ix + (iy + iz*ysize)*xdim;
// Read the grid value through the read-only cache (SM35+/HIP) or, on older
// CUDA hardware, through the texture object.
734 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
735 float q0 =
__ldg(&data[ind]);
737 float q0 = tex1Dfetch<float>(gridTexObj, ind);
// Force partials: derivative weight on the differentiated axis, plain
// weights on the other two.
739 float thx0 = shmem[base].thetax[tx];
740 float thy0 = shmem[base].thetay[ty];
741 float thz0 = shmem[base].thetaz[tz];
742 float dthx0 = shmem[base].dthetax[tx];
743 float dthy0 = shmem[base].dthetay[ty];
744 float dthz0 = shmem[base].dthetaz[tz];
745 f1 += dthx0 * thy0 * thz0 * q0;
746 f2 += thx0 * dthy0 * thz0 * q0;
747 f3 += thx0 * thy0 * dthz0 * q0;
// Lane id within the 8-thread group (used by the reduction, not shown).
753 const int i = threadIdx.x & 7;
// Shrink the participating-lane mask as groups finish their atoms.
774 warp_mask =
WARP_BALLOT(warp_mask, (base < base_end) );
// Final per-atom store: scale the reduced gradients by -q and the grid
// dimensions to get forces in grid units.
779 if (pos < pos_end && threadIdx.y == 0) {
780 float f1 = shmem[threadIdx.x].f1;
781 float f2 = shmem[threadIdx.x].f2;
782 float f3 = shmem[threadIdx.x].f3;
783 float q = -shmem[threadIdx.x].charge;
787 float fx = q*f1*nfftx;
788 float fy = q*f2*nffty;
789 float fz = q*f3*nfftz;
790 gather_force_store<FT>(fx, fy, fz, stride, pos, force);
// Tile-based xyz -> yzx transpose helper: stages a tile through shared memory
// (`tile`, declared outside this excerpt) and writes it back with y fastest.
// NOTE(review): fragmented excerpt — the j-loop heads and the __syncthreads()
// between load and store are not visible here.
798 template <
typename T>
799 __device__ __forceinline__
801 const int x_in,
const int y_in,
const int z_in,
802 const int x_out,
const int y_out,
803 const int nx,
const int ny,
const int nz,
804 const int xsize_in,
const int ysize_in,
805 const int ysize_out,
const int zsize_out,
806 const T* data_in, T* data_out) {
// Bounds-checked cooperative tile load from xyz-ordered input.
813 if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
814 tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
819 const int z_out = z_in;
// Transposed store (note swapped threadIdx.x/.y when reading the tile).
821 if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
822 data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
// Kernel wrapper: computes per-thread tile coordinates (TILEDIM x TILEDIM
// tiles in x/y, one z-slice per blockIdx.z) and delegates to
// transpose_xyz_yzx_device.
828 template <
typename T>
830 const int nx,
const int ny,
const int nz,
831 const int xsize_in,
const int ysize_in,
832 const int ysize_out,
const int zsize_out,
833 const T* data_in, T* data_out) {
// Input coordinates: thread (x, y) within the tile.
835 int x_in = blockIdx.x *
TILEDIM + threadIdx.x;
836 int y_in = blockIdx.y *
TILEDIM + threadIdx.y;
837 int z_in = blockIdx.z + threadIdx.z;
// Output coordinates: x/y thread roles swapped for the transposed write.
839 int x_out = blockIdx.x *
TILEDIM + threadIdx.y;
840 int y_out = blockIdx.y *
TILEDIM + threadIdx.x;
842 transpose_xyz_yzx_device<T>(
847 ysize_out, zsize_out,
// Batched variant: blockIdx.z encodes both the z-slice (blockIdx.z % nz) and
// the batch index (blockIdx.z / nz); per-batch sizes and pointers come from a
// TransposeBatch entry (`batch`, loaded outside this excerpt via bi).
882 template <
typename T>
885 const int ny,
const int nz,
886 const int xsize_in,
const int ysize_in) {
888 int x_in = blockIdx.x *
TILEDIM + threadIdx.x;
889 int y_in = blockIdx.y *
TILEDIM + threadIdx.y;
890 int z_in = (blockIdx.z % nz) + threadIdx.z;
// Output coordinates: x/y thread roles swapped for the transposed write.
892 int x_out = blockIdx.x *
TILEDIM + threadIdx.y;
893 int y_out = blockIdx.y *
TILEDIM + threadIdx.x;
// Batch index.
895 int bi = blockIdx.z/nz;
899 int ysize_out = batch.ysize_out;
900 int zsize_out = batch.zsize_out;
901 T* data_in = batch.data_in;
902 T* data_out = batch.data_out;
904 transpose_xyz_yzx_device<T>(
909 ysize_out, zsize_out,
// Tile-based xyz -> zxy transpose helper: stages a (z, x) tile through shared
// memory and writes it back with z fastest.
// NOTE(review): fragmented excerpt — the k-loop heads and the __syncthreads()
// between load and store are not visible here.
976 template <
typename T>
977 __device__ __forceinline__
979 const int x_in,
const int y_in,
const int z_in,
980 const int x_out,
const int z_out,
981 const int nx,
const int ny,
const int nz,
982 const int xsize_in,
const int ysize_in,
983 const int zsize_out,
const int xsize_out,
984 const T* data_in, T* data_out) {
// Bounds-checked cooperative tile load from xyz-ordered input.
991 if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
992 tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
997 const int y_out = y_in;
// Transposed store (note swapped threadIdx.x/.y when reading the tile).
999 if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
1000 data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
// Kernel wrapper: TILEDIM x TILEDIM tiles in x/z, one y-slice per blockIdx.z;
// delegates to transpose_xyz_zxy_device.
1006 template <
typename T>
1008 const int nx,
const int ny,
const int nz,
1009 const int xsize_in,
const int ysize_in,
1010 const int zsize_out,
const int xsize_out,
1011 const T* data_in, T* data_out) {
1013 int x_in = blockIdx.x *
TILEDIM + threadIdx.x;
1014 int y_in = blockIdx.z + threadIdx.z;
1015 int z_in = blockIdx.y *
TILEDIM + threadIdx.y;
// Output coordinates: x/z thread roles swapped for the transposed write.
1017 int x_out = blockIdx.x *
TILEDIM + threadIdx.y;
1018 int z_out = blockIdx.y *
TILEDIM + threadIdx.x;
1020 transpose_xyz_zxy_device<T>(
1021 x_in, y_in, z_in, x_out, z_out,
1024 zsize_out, xsize_out,
// Batched xyz -> zxy variant: blockIdx.z encodes the y-slice (blockIdx.z % ny)
// and the batch index (blockIdx.z / ny); per-batch sizes and pointers come
// from a TransposeBatch entry (`batch`, loaded outside this excerpt via bi).
1037 template <
typename T>
1040 const int ny,
const int nz,
1041 const int xsize_in,
const int ysize_in) {
1043 int x_in = blockIdx.x *
TILEDIM + threadIdx.x;
1044 int y_in = (blockIdx.z % ny) + threadIdx.z;
1045 int z_in = blockIdx.y *
TILEDIM + threadIdx.y;
// Output coordinates: x/z thread roles swapped for the transposed write.
1047 int x_out = blockIdx.x *
TILEDIM + threadIdx.y;
1048 int z_out = blockIdx.y *
TILEDIM + threadIdx.x;
// Batch index.
1050 int bi = blockIdx.z/ny;
1054 int zsize_out = batch.zsize_out;
1055 int xsize_out = batch.xsize_out;
1056 T* data_in = batch.data_in;
1057 T* data_out = batch.data_out;
1059 transpose_xyz_zxy_device<T>(
1060 x_in, y_in, z_in, x_out, z_out,
1063 zsize_out, xsize_out,
// Host wrapper for spread_charge_kernel: picks block/grid dimensions and
// dispatches on interpolation order (4, 6, 8) x (periodicY, periodicZ).
// Unsupported orders fall through to an error (sprintf + presumably
// cudaNAMD_bug — the call itself is not visible in this excerpt).
// NOTE(review): fragmented excerpt — the function head (orig line ~1072),
// nthread assignments, the #else branches of the NAMD_CUDA guard, and the
// else-keywords between launches are not visible here.
1073 const int nfftx,
const int nffty,
const int nfftz,
1074 const int xsize,
const int ysize,
const int zsize,
1075 const int xdim,
const int y00,
const int z00,
1076 const bool periodicY,
const bool periodicZ,
1077 float* data,
const int order, cudaStream_t
stream) {
1079 dim3 nthread, nblock;
// Block dimensions differ between CUDA and HIP builds.
1084 #if defined(NAMD_CUDA)
// order == 4: one block row per nthread.x atoms.
1090 nblock.x = (numAtoms - 1)/nthread.x + 1;
1093 if (periodicY && periodicZ)
1094 spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1095 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1097 spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1098 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1100 spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1101 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1103 spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1104 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
// order == 6.
1111 nblock.x = (numAtoms - 1)/nthread.x + 1;
1114 if (periodicY && periodicZ)
1115 spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1116 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1118 spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1119 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1121 spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1122 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1124 spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1125 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
// order == 8.
1132 nblock.x = (numAtoms - 1)/nthread.x + 1;
1135 if (periodicY && periodicZ)
1136 spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1137 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1139 spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1140 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1142 spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1143 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1145 spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> (
atoms, numAtoms,
1146 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
// Unsupported interpolation order.
1151 sprintf(str,
"spread_charge, order %d not implemented",order);
// Host wrapper for scalar_sum_kernel: computes the Ewald factors
// fac = pi^2/kappa^2 and piv_inv = 1/(pi*volume), sizes the dynamic shared
// memory for the three prefactor tables (enlarged when energy/virial is
// requested — the addition inside the doEnergyVirial branch at orig 1177 is
// not visible here), and launches the kernel with the appropriate
// calc_energy_virial / orderXYZ template flags.
// NOTE(review): fragmented excerpt — nthread/nblock setup, the outer orderXYZ
// branch, and the else-keywords between launches are not visible.
1158 void scalar_sum(
const bool orderXYZ,
const int nfft1,
const int nfft2,
const int nfft3,
1159 const int size1,
const int size2,
const int size3,
const double kappa,
1160 const float recip1x,
const float recip1y,
const float recip1z,
1161 const float recip2x,
const float recip2y,
const float recip2z,
1162 const float recip3x,
const float recip3y,
const float recip3z,
1163 const double volume,
1164 const float* prefac1,
const float* prefac2,
const float* prefac3,
1165 const int k2_00,
const int k3_00,
1166 const bool doEnergyVirial,
double* energy,
double* virial,
float2* data,
// Shared memory: three prefactor tables back-to-back (see kernel).
1176 int shmem_size =
sizeof(float)*(nfft1 + nfft2 + nfft3);
1177 if (doEnergyVirial) {
// Ewald constants computed in double, narrowed to float for the kernel.
1181 float piv_inv = (float)(1.0/(
M_PI*volume));
1182 float fac = (float)(
M_PI*
M_PI/(kappa*kappa));
1184 if (doEnergyVirial) {
1186 scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1187 nfft1, nfft2, nfft3, size1, size2, size3,
1188 recip1x, recip1y, recip1z,
1189 recip2x, recip2y, recip2z,
1190 recip3x, recip3y, recip3z,
1191 prefac1, prefac2, prefac3,
1192 fac, piv_inv, k2_00, k3_00, data, energy, virial);
1194 scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1195 nfft1, nfft2, nfft3, size1, size2, size3,
1196 recip1x, recip1y, recip1z,
1197 recip2x, recip2y, recip2z,
1198 recip3x, recip3y, recip3z,
1199 prefac1, prefac2, prefac3,
1200 fac, piv_inv, k2_00, k3_00, data, energy, virial);
// No energy/virial requested: NULL output pointers.
1204 scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1205 nfft1, nfft2, nfft3, size1, size2, size3,
1206 recip1x, recip1y, recip1z,
1207 recip2x, recip2y, recip2z,
1208 recip3x, recip3y, recip3z,
1209 prefac1, prefac2, prefac3,
1210 fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1212 scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1213 nfft1, nfft2, nfft3, size1, size2, size3,
1214 recip1x, recip1y, recip1z,
1215 recip2x, recip2y, recip2z,
1216 recip3x, recip3y, recip3z,
1217 prefac1, prefac2, prefac3,
1218 fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
// Host wrapper for the gather_force kernel: fixed 32x2 thread blocks, one
// 32-atom slab per block row; dispatches on interpolation order (4, 6, 8) x
// (periodicY, periodicZ). Unsupported orders fall through to an error
// (sprintf + presumably cudaNAMD_bug — the call is not visible here).
// NOTE(review): fragmented excerpt — the function head (orig ~1226), the
// trailing launch arguments (data, gridTexObj, stride, force), and the
// else-keywords between launches are not visible.
1227 const int nfftx,
const int nffty,
const int nfftz,
1228 const int xsize,
const int ysize,
const int zsize,
1229 const int xdim,
const int y00,
const int z00,
1230 const bool periodicY,
const bool periodicZ,
1231 const float* data,
const int order, float3* force,
1233 const cudaTextureObject_t gridTexObj,
1237 dim3 nthread(32, 2, 1);
1238 dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
// order == 4.
1243 if (periodicY && periodicZ)
1244 gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> (
1245 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1253 gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> (
1254 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1262 gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> (
1263 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1271 gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> (
1272 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
// order == 6.
1282 if (periodicY && periodicZ)
1283 gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> (
1284 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1292 gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> (
1293 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1301 gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> (
1302 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1310 gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> (
1311 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
// order == 8.
1321 if (periodicY && periodicZ)
1322 gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> (
1323 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1331 gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> (
1332 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1340 gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> (
1341 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1349 gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> (
1350 atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
// Unsupported interpolation order.
1361 sprintf(str,
"gather_force, order %d not implemented",order);
// Host wrapper: launches transpose_xyz_yzx_kernel for float2 data.
// NOTE(review): fragmented excerpt — the function head and the
// numblock/numthread setup are not visible here.
1372 const int nx,
const int ny,
const int nz,
1373 const int xsize_in,
const int ysize_in,
1374 const int ysize_out,
const int zsize_out,
1379 transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1380 nx, ny, nz, xsize_in, ysize_in,
1381 ysize_out, zsize_out,
// Host wrapper: launches the batched xyz -> yzx transpose. Grid z packs
// nz slices x numBatches (decoded inside the kernel as z-slice and batch id).
// max_nx sizes the x tiling for the widest batch.
1392 const int max_nx,
const int ny,
const int nz,
1393 const int xsize_in,
const int ysize_in, cudaStream_t
stream) {
1396 dim3 numblock((max_nx-1)/
TILEDIM+1, (ny-1)/
TILEDIM+1, nz*numBatches);
1398 batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
// Host wrapper: launches transpose_xyz_zxy_kernel for float2 data.
// NOTE(review): fragmented excerpt — the function head and the
// numblock/numthread setup are not visible here.
1407 const int nx,
const int ny,
const int nz,
1408 const int xsize_in,
const int ysize_in,
1409 const int zsize_out,
const int xsize_out,
1414 transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1415 nx, ny, nz, xsize_in, ysize_in,
1416 zsize_out, xsize_out,
// Host wrapper: launches the batched xyz -> zxy transpose. Grid z packs
// ny slices x numBatches (decoded inside the kernel as y-slice and batch id).
// max_nx sizes the x tiling for the widest batch.
1427 const int max_nx,
const int ny,
const int nz,
1428 const int xsize_in,
const int ysize_in,
1432 dim3 numblock((max_nx-1)/
TILEDIM+1, (nz-1)/
TILEDIM+1, ny*numBatches);
1434 batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
__forceinline__ __device__ void gather_force_store(const float fx, const float fy, const float fz, const int stride, const int pos, T *force)
void batchTranspose_xyz_yzx(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const double kappa, const float recip1x, const float recip1y, const float recip1z, const float recip2x, const float recip2y, const float recip2z, const float recip3x, const float recip3y, const float recip3z, const double volume, const float *prefac1, const float *prefac2, const float *prefac3, const int k2_00, const int k3_00, const bool doEnergyVirial, double *energy, double *virial, float2 *data, cudaStream_t stream)
__forceinline__ __device__ void write_grid(const float val, const int ind, T *data)
static __thread atom * atoms
void transpose_xyz_zxy(const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
if(ComputeNonbondedUtil::goMethod==2)
__thread cudaStream_t stream
__forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta)
__global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3, const int size1, const int size2, const int size3, const T recip1x, const T recip1y, const T recip1z, const T recip2x, const T recip2y, const T recip2z, const T recip3x, const T recip3y, const T recip3z, const T *prefac1, const T *prefac2, const T *prefac3, const T pi_ewald, const T piv_inv, const int k2_00, const int k3_00, T2 *data, double *__restrict__ energy_recip, double *__restrict__ virial)
__global__ void transpose_xyz_zxy_kernel(const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
#define WARP_BALLOT(MASK, P)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
__forceinline__ __device__ void calc_one_theta(const T w, T *theta)
void batchTranspose_xyz_zxy(const int numBatches, TransposeBatch< float2 > *batches, const int max_nx, const int ny, const int nz, const int xsize_in, const int ysize_in, cudaStream_t stream)
void cudaNAMD_bug(const char *msg)
__global__ void gather_force(const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const float *data, const cudaTextureObject_t gridTexObj, const int stride, FT *force)
void spread_charge(const float4 *atoms, const int numAtoms, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, const bool periodicY, const bool periodicZ, float *data, const int order, cudaStream_t stream)
__device__ __forceinline__ void transpose_xyz_zxy_device(const int x_in, const int y_in, const int z_in, const int x_out, const int z_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int zsize_out, const int xsize_out, const T *data_in, T *data_out)
__device__ __forceinline__ void transpose_xyz_yzx_device(const int x_in, const int y_in, const int z_in, const int x_out, const int y_out, const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
void transpose_xyz_yzx(const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const float2 *data_in, float2 *data_out, cudaStream_t stream)
__global__ void spread_charge_kernel(const float4 *xyzq, const int ncoord, const int nfftx, const int nffty, const int nfftz, const int xsize, const int ysize, const int zsize, const int xdim, const int y00, const int z00, AT *data)
__forceinline__ __device__ void gather_force_store< float >(const float fx, const float fy, const float fz, const int stride, const int pos, float *force)
__forceinline__ __device__ void gather_force_store< float3 >(const float fx, const float fy, const float fz, const int stride, const int pos, float3 *force)
__forceinline__ __device__ double expfunc(const double x)
__global__ void batchTranspose_xyz_yzx_kernel(const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
__global__ void transpose_xyz_yzx_kernel(const int nx, const int ny, const int nz, const int xsize_in, const int ysize_in, const int ysize_out, const int zsize_out, const T *data_in, T *data_out)
__global__ void batchTranspose_xyz_zxy_kernel(const TransposeBatch< T > *batches, const int ny, const int nz, const int xsize_in, const int ysize_in)
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)