| version 1.1 | version 1.2 |
|---|---|
| | |
| double virial[6]; | double virial[6]; |
| }; | }; |
| | |
| | |
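Only the tail of the reduction record is visible above. Its shape can be reconstructed from the later uses (`sh_ev[t].energy`, `sh_ev[t].virial[0..5]`) — a sketch, not the verbatim source:

```cuda
// Reconstructed from usage elsewhere in this section; any members
// preceding energy are unknown and omitted here (an assumption).
struct RecipVirial_t {
  double energy;
  double virial[6];
};
```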
| | __forceinline__ __device__ double expfunc(const double x) { |
| | return exp(x); |
| | } |
| | |
| | __forceinline__ __device__ float expfunc(const float x) { |
| | return __expf(x); |
| | } |
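Version 1.2 routes the Ewald exponential through the overloaded `expfunc`, so the double-precision instantiation keeps the accurate `exp()` while the single-precision one gets the fast (but less accurate) hardware intrinsic `__expf()`. A minimal sketch of how the overloads resolve inside a templated kernel (the demo kernel below is illustrative, not from the source):

```cuda
__forceinline__ __device__ double expfunc(const double x) { return exp(x); }
__forceinline__ __device__ float  expfunc(const float x)  { return __expf(x); }

// Overload resolution picks exp() when T = double and the faster
// __expf() intrinsic when T = float -- no branches, no casts.
template <typename T>
__global__ void exp_demo(const T* in, T* out, const int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) out[i] = expfunc(in[i]);
}
```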
| | |
| // | // |
| // Performs scalar sum on data(nfft1, nfft2, nfft3) | // Performs scalar sum on data(nfft1, nfft2, nfft3) |
| // T = float or double | // T = float or double |
| // T2 = float2 or double2 | // T2 = float2 or double2 |
| // | // |
| template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ> | template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho> |
| __global__ void scalar_sum_ortho_kernel(const int nfft1, const int nfft2, const int nfft3, | __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3, |
| const int size1, const int size2, const int size3, | const int size1, const int size2, const int size3, |
| const int nf1, const int nf2, const int nf3, | const T recip1x, const T recip1y, const T recip1z, |
| const T recip11, const T recip22, const T recip33, | const T recip2x, const T recip2y, const T recip2z, |
| | const T recip3x, const T recip3y, const T recip3z, |
| const T* prefac1, const T* prefac2, const T* prefac3, | const T* prefac1, const T* prefac2, const T* prefac3, |
| const T fac, const T piv_inv, | const T pi_ewald, const T piv_inv, |
| const int k2_00, const int k3_00, | const int k2_00, const int k3_00, |
| T2* data, | T2* data, |
| double* __restrict__ energy_recip, | double* __restrict__ energy_recip, |
| |
| double virial4 = 0.0; | double virial4 = 0.0; |
| double virial5 = 0.0; | double virial5 = 0.0; |
| | |
| | // If nfft1 is odd, set nfft1_half to an impossible value so that |
| | // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )" |
| | // is never satisfied |
| | const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1; |
| | const int nfft2_half = nfft2/2; |
| | const int nfft3_half = nfft3/2; |
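The folded indices computed below map each FFT bin $k \in [0, n)$ to its signed wavevector component:

$$
k_s = \begin{cases} k, & k \le n/2 \\ k - n, & k > n/2, \end{cases}
$$

so $k_s \in [-n/2+1,\ n/2]$ for even $n$. For even `nfft1`, the $k_1 = 0$ and $k_1 = n_1/2$ planes contain their own conjugate partners in the half-spectrum, which is why `q2` is halved on them further down; for odd `nfft1` the sentinel value $-1$ makes that branch unreachable.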
| | |
| while (k3 < lim3) { | while (k3 < lim3) { |
| | |
| T2 q = data[pos]; | T2 q = data[pos]; |
| | |
| int m1 = k1; | int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2; |
| int m2 = k2; | int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3; |
| int m3 = k3; | |
| if (k1 >= nf1) m1 -= nfft1; | |
| if (k2 >= nf2) m2 -= nfft2; | |
| if (k3 >= nf3) m3 -= nfft3; | |
| | |
| T mhat1 = recip11*m1; | |
| T mhat2 = recip22*m2; | |
| T mhat3 = recip33*m3; | |
| | |
| T msq = mhat1*mhat1 + mhat2*mhat2 + mhat3*mhat3; | T m1, m2, m3; |
| T msq_inv = (T)1.0/msq; | if (doOrtho) { |
| | m1 = recip1x*k1; |
| // NOTE: check if it's faster to pre-calculate exp() | m2 = recip2y*k2s; |
| T eterm = exp(-fac*msq)*piv_inv*sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3]*msq_inv; | m3 = recip3z*k3s; |
| | |
| if (calc_energy_virial) { | |
| T tmp1 = eterm*(q.x*q.x + q.y*q.y); | |
| T vterm = ((T)2)*(fac + msq_inv); | |
| T tmp2 = tmp1*vterm; | |
| | |
| energy += (double)tmp1; | |
| virial0 += (double)(tmp1*(vterm*mhat1*mhat1 - ((T)1))); | |
| virial1 += (double)(tmp2*mhat1*mhat2); | |
| virial2 += (double)(tmp2*mhat1*mhat3); | |
| virial3 += (double)(tmp1*(vterm*mhat2*mhat2 - ((T)1))); | |
| virial4 += (double)(tmp2*mhat2*mhat3); | |
| virial5 += (double)(tmp1*(vterm*mhat3*mhat3 - ((T)1))); | |
| | |
| // The following is put into a separate if-block to avoid divergence within a warp |
| // and to save registers |
| if ((orderXYZ && k1 >= 1 && k1 < nfft1) || | |
| (!orderXYZ && k2 >= 1 && k2 < nfft2)) { | |
| | |
| int k1s, k2s, k3s; | |
| if (orderXYZ) { | |
| k1s = nfft1 - (k1+1) + 1; | |
| k2s = ((nfft2-(k2+1)+1) % nfft2); | |
| k3s = ((nfft3-(k3+1)+1) % nfft3); | |
| } else { | } else { |
| k1s = ((nfft1-(k1+1)+1) % nfft1); | m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s; |
| k2s = nfft2 - (k2+1) + 1; | m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s; |
| k3s = ((nfft3-(k3+1)+1) % nfft3); | m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s; |
| } | } |
| | |
| int m1s = k1s; | T msq = m1*m1 + m2*m2 + m3*m3; |
| int m2s = k2s; | T msq_inv = ((T)1)/msq; |
| int m3s = k3s; | |
| | T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3]; |
| if (k1s >= nf1) m1s -= nfft1; | T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3; |
| if (k2s >= nf2) m2s -= nfft2; | if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5); |
| if (k3s >= nf3) m3s -= nfft3; | T xp3 = expfunc(-pi_ewald*msq)*piv_inv; |
| | T C = xp3*msq_inv; |
| T mhat1s = recip11*m1s; | T theta = theta3*C; |
| T mhat2s = recip22*m2s; | |
| T mhat3s = recip33*m3s; | |
| | |
| T msqs = mhat1s*mhat1s + mhat2s*mhat2s + mhat3s*mhat3s; | |
| T msqs_inv = ((T)1)/msqs; | |
| | |
| T eterms = exp(-fac*msqs)*piv_inv*sh_prefac1[k1s]*sh_prefac2[k2s]*sh_prefac3[k3s]*msqs_inv; | |
| | |
| T tmp1s = eterms*(q.x*q.x + q.y*q.y); | |
| T vterms = ((T)2)*(fac + msqs_inv); | |
| T tmp2s = tmp1s*vterms; | |
| | |
| energy += (double)tmp1s; | |
| virial0 += (double)(tmp1s*(vterms*mhat1s*mhat1s - ((T)1))); | |
| virial1 += (double)(tmp2s*mhat1s*mhat2s); | |
| virial2 += (double)(tmp2s*mhat1s*mhat3s); | |
| virial3 += (double)(tmp1s*(vterms*mhat2s*mhat2s - ((T)1))); | |
| virial4 += (double)(tmp2s*mhat2s*mhat3s); | |
| virial5 += (double)(tmp1s*(vterms*mhat3s*mhat3s - ((T)1))); | |
| } | |
| | |
| | if (calc_energy_virial) { |
| | T fac = q2*C; |
| | T vir = ((T)2)*(pi_ewald + msq_inv); |
| | energy += fac; |
| | virial0 += (double)(fac*(vir*m1*m1 - ((T)1))); |
| | virial1 += (double)(fac*vir*m1*m2); |
| | virial2 += (double)(fac*vir*m1*m3); |
| | virial3 += (double)(fac*(vir*m2*m2 - ((T)1))); |
| | virial4 += (double)(fac*vir*m2*m3); |
| | virial5 += (double)(fac*(vir*m3*m3 - ((T)1))); |
| } | } |
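Reading the version 1.2 accumulation back into formulas ($\theta_3$ is the B-spline prefactor product `sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3]`, $\kappa$ the Ewald parameter with `pi_ewald` $= \pi^2/\kappa^2$, and $V$ the cell volume with `piv_inv` $= 1/(\pi V)$): each mode contributes

$$
E(\mathbf m) = \frac{2\,|q(\mathbf m)|^{2}\,\theta_3(\mathbf m)}{\pi V}\,\frac{e^{-\pi^{2}m^{2}/\kappa^{2}}}{m^{2}},
$$

halved on the self-conjugate $k_1$ planes, and the kernel accumulates

$$
E_{\text{recip}} = \tfrac12 \sum_{\mathbf m \neq 0} E(\mathbf m), \qquad
\Pi_{\alpha\beta} = -\tfrac12 \sum_{\mathbf m \neq 0} E(\mathbf m)\left[\,2\!\left(\frac{\pi^{2}}{\kappa^{2}} + \frac{1}{m^{2}}\right) m_\alpha m_\beta - \delta_{\alpha\beta}\right],
$$

with the factors $\tfrac12$ and $-\tfrac12$ applied at the final reduction step (see the `atomicAdd`/scaling code below).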
| | |
| q.x *= eterm; | q.x *= theta; |
| q.y *= eterm; | q.y *= theta; |
| data[pos] = q; | data[pos] = q; |
| | |
| // Increment position | // Increment position |
| |
| | |
| // Reduce energy and virial | // Reduce energy and virial |
| if (calc_energy_virial) { | if (calc_energy_virial) { |
| #if __CUDA_ARCH__ < 300 | |
| // Requires blockDim.x*sizeof(RecipVirial_t) bytes of shared memory | |
| volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac; | |
| // NOTE: this __syncthreads() is needed because we're using a single shared memory buffer | |
| __syncthreads(); | |
| sh_ev[threadIdx.x].energy = energy; | |
| sh_ev[threadIdx.x].virial[0] = virial0; | |
| sh_ev[threadIdx.x].virial[1] = virial1; | |
| sh_ev[threadIdx.x].virial[2] = virial2; | |
| sh_ev[threadIdx.x].virial[3] = virial3; | |
| sh_ev[threadIdx.x].virial[4] = virial4; | |
| sh_ev[threadIdx.x].virial[5] = virial5; | |
| __syncthreads(); | |
| #endif | |
| #if __CUDA_ARCH__ < 300 | |
| for (int d=1;d < blockDim.x;d *= 2) { | |
| int t = threadIdx.x + d; | |
| double energy_val = (t < blockDim.x) ? sh_ev[t].energy : 0.0; | |
| double virial0_val = (t < blockDim.x) ? sh_ev[t].virial[0] : 0.0; | |
| double virial1_val = (t < blockDim.x) ? sh_ev[t].virial[1] : 0.0; | |
| double virial2_val = (t < blockDim.x) ? sh_ev[t].virial[2] : 0.0; | |
| double virial3_val = (t < blockDim.x) ? sh_ev[t].virial[3] : 0.0; | |
| double virial4_val = (t < blockDim.x) ? sh_ev[t].virial[4] : 0.0; | |
| double virial5_val = (t < blockDim.x) ? sh_ev[t].virial[5] : 0.0; | |
| __syncthreads(); | |
| sh_ev[threadIdx.x].energy += energy_val; | |
| sh_ev[threadIdx.x].virial[0] += virial0_val; | |
| sh_ev[threadIdx.x].virial[1] += virial1_val; | |
| sh_ev[threadIdx.x].virial[2] += virial2_val; | |
| sh_ev[threadIdx.x].virial[3] += virial3_val; | |
| sh_ev[threadIdx.x].virial[4] += virial4_val; | |
| sh_ev[threadIdx.x].virial[5] += virial5_val; | |
| __syncthreads(); | |
| } | |
| #else | |
| const int tid = threadIdx.x & (warpSize-1); | const int tid = threadIdx.x & (warpSize-1); |
| const int base = (threadIdx.x/warpSize); | const int base = (threadIdx.x/warpSize); |
| volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac; | volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac; |
| |
| } | } |
| } | } |
| | |
| #endif | |
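The elided warp-level reduction (the `tid`/`base`/`sh_ev` setup is visible above) presumably follows the standard shuffle pattern of this CUDA generation. A sketch, using the pre-CUDA-9 `__shfl_down` API that the rest of this file uses; since that API has no portable `double` overload, the value is moved as two 32-bit halves (a common idiom of the era):

```cuda
// Shuffle a double down the warp as two 32-bit halves.
__forceinline__ __device__ double shfl_down_double(double v, int offset) {
  int hi = __double2hiint(v);
  int lo = __double2loint(v);
  hi = __shfl_down(hi, offset);
  lo = __shfl_down(lo, offset);
  return __hiloint2double(hi, lo);
}

// Tree-reduce one value across a warp; the full sum lands in lane 0,
// which would then write its partial into sh_ev[base] for a final
// cross-warp combine (an assumption about the elided code).
__forceinline__ __device__ double warp_reduce(double v) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    v += shfl_down_double(v, offset);
  return v;
}
```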
| | |
| if (threadIdx.x == 0) { | if (threadIdx.x == 0) { |
| #if __CUDA_ARCH__ < 300 | |
| energy = sh_ev[0].energy; | |
| virial0 = sh_ev[0].virial[0]; | |
| virial1 = sh_ev[0].virial[1]; | |
| virial2 = sh_ev[0].virial[2]; | |
| virial3 = sh_ev[0].virial[3]; | |
| virial4 = sh_ev[0].virial[4]; | |
| virial5 = sh_ev[0].virial[5]; | |
| #endif | |
| atomicAdd(energy_recip, energy*0.5); | atomicAdd(energy_recip, energy*0.5); |
| virial0 *= -0.5; | virial0 *= -0.5; |
| virial1 *= -0.5; | virial1 *= -0.5; |
| |
| | |
| } | } |
| | |
| #ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
| texture<float, 1, cudaReadModeElementType> gridTexRef; | |
| #endif | |
| | |
| template <typename T> | template <typename T> |
| __forceinline__ __device__ void write_grid(const float val, const int ind, | __forceinline__ __device__ void write_grid(const float val, const int ind, |
| T* data) { | T* data) { |
| |
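The body of `write_grid` is elided above. A minimal sketch of what it presumably does, assuming plain floating-point atomics (in the real code the `T`/`AT` accumulation type may be fixed-point instead):

```cuda
// Sketch (assumption): accumulate the interpolated charge into the grid.
template <typename T>
__forceinline__ __device__ void write_grid(const float val, const int ind,
                                           T* data) {
  atomicAdd(&data[ind], (T)val);
}
```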
| // | // |
| template <typename AT, int order, bool periodicY, bool periodicZ> | template <typename AT, int order, bool periodicY, bool periodicZ> |
| __global__ void | __global__ void |
| spread_charge_ortho_kernel(const float4 *xyzq, const int ncoord, | spread_charge_kernel(const float4 *xyzq, const int ncoord, |
| const float recip11, const float recip22, const float recip33, | |
| const int nfftx, const int nffty, const int nfftz, | const int nfftx, const int nffty, const int nfftz, |
| const int xsize, const int ysize, const int zsize, | const int xsize, const int ysize, const int zsize, |
| const int xdim, const int y00, const int z00, | const int xdim, const int y00, const int z00, |
| |
| float z = xyzqi.z; | float z = xyzqi.z; |
| float q = xyzqi.w; | float q = xyzqi.w; |
| | |
| // float w; | |
| // w = x*recip11 + 2.0f; | |
| // float frx = (float)(nfftx*(w - (floorf(w + 0.5f) - 0.5f))); | |
| // w = y*recip22 + 2.0f; | |
| // float fry = (float)(nffty*(w - (floorf(w + 0.5f) - 0.5f))); | |
| // w = z*recip33 + 2.0f; | |
| // float frz = (float)(nfftz*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| float frx = ((float)nfftx)*x; | float frx = ((float)nfftx)*x; |
| float fry = ((float)nffty)*y; | float fry = ((float)nffty)*y; |
| float frz = ((float)nfftz)*z; | float frz = ((float)nfftz)*z; |
| |
| | |
| if (tid < order*order*order) | if (tid < order*order*order) |
| write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data); | write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data); |
| | |
| // if (tid < order*order*order) | |
| // write_grid<AT>(1.0f, ind, data); | |
| | |
| // if (tid == 0) | |
| // write_grid<AT>(1.0f, ind, data); | |
| } | } |
| | |
| } | } |
| | |
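Note that the 1.2 kernels compute `frx = nfftx*x` directly, i.e. they expect coordinates already reduced to fractional form in $[0,1)$; the in-kernel wrapping of 1.1 (the commented-out `w = x*recip11 + 2.0f` lines above) is gone along with the `recip11..recip33` arguments. A hypothetical host-side helper showing the conversion the kernels now rely on (the helper and the row-vector convention for `recip1..recip3` are assumptions, not from the source):

```cuda
#include <cmath>

struct f3 { float x, y, z; };

// Convert a Cartesian position to fractional coordinates in [0,1),
// assuming recip1..recip3 are the rows of the reciprocal-cell matrix.
inline f3 cartToFrac(const f3 r, const f3 recip1, const f3 recip2,
                     const f3 recip3) {
  f3 f;
  f.x = recip1.x*r.x + recip1.y*r.y + recip1.z*r.z;
  f.y = recip2.x*r.x + recip2.y*r.y + recip2.z*r.z;
  f.z = recip3.x*r.x + recip3.y*r.y + recip3.z*r.z;
  f.x -= floorf(f.x);   // wrap into [0,1)
  f.y -= floorf(f.y);
  f.z -= floorf(f.z);
  return f;
}
```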
| /* | |
| #define PATCHSPLIT 1 | |
| | |
| // | |
| // Spreads the charge on the grid. Calculates theta and dtheta on the fly | |
| // blockDim.x = Number of atoms each block loads | |
| // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once | |
| // | |
| template <typename AT, int order> | |
| __global__ void | |
| spread_charge_ortho_kernel(const int4 *patch, const int npatch, | |
| const float4 *xyzq, const int ncoord, | |
| const float recip11, const float recip22, const float recip33, | |
| const int nfftx, const int nffty, const int nfftz, | |
| const int xsize, const int ysize, | |
| AT* data) { | |
| | |
| // Shared memory use: | |
| // order = 4: 1920 bytes | |
| // order = 6: 2688 bytes | |
| // order = 8: 3456 bytes | |
| __shared__ int sh_ix[32]; | |
| __shared__ int sh_iy[32]; | |
| __shared__ int sh_iz[32]; | |
| __shared__ float sh_thetax[order*32]; | |
| __shared__ float sh_thetay[order*32]; | |
| __shared__ float sh_thetaz[order*32]; | |
| | |
| // blockIdx.x = patch index | |
| int ipatch = blockIdx.x / PATCHSPLIT; | |
| int fpatch = blockIdx.x % PATCHSPLIT; | |
| if (ipatch >= npatch) return; | |
| | |
| int4 patchval = patch[ipatch]; | |
| int pos = (ipatch == 0) ? 0 : patch[ipatch-1].w; | |
| const int ox = patchval.x; | |
| const int oy = patchval.y; | |
| const int oz = patchval.z; | |
| int pos_end = patchval.w; | |
| int n = pos_end - pos; | |
| pos += n*fpatch/PATCHSPLIT; | |
| pos_end = pos + n*(fpatch+1)/PATCHSPLIT; | |
| | |
| // This block takes care of atoms in the range [pos, pos_end-1] | |
| for (;pos < pos_end;pos += blockDim.x) { | |
| | |
| __syncthreads(); | |
| | |
| if (pos + threadIdx.x < pos_end && threadIdx.y == 0) { | |
| | |
| float4 xyzqi = xyzq[pos + threadIdx.x]; | |
| float x = xyzqi.x; | |
| float y = xyzqi.y; | |
| float z = xyzqi.z; | |
| float q = xyzqi.w; | |
| | |
| float w; | |
| | |
| w = x*recip11 + 2.0f; | |
| float frx = (float)(nfftx*(w - (floorf(w + 0.5f) - 0.5f))); | |
| w = y*recip22 + 2.0f; | |
| float fry = (float)(nffty*(w - (floorf(w + 0.5f) - 0.5f))); | |
| w = z*recip33 + 2.0f; | |
| float frz = (float)(nfftz*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| int frxi = (int)frx; | |
| int fryi = (int)fry; | |
| int frzi = (int)frz; | |
| | |
| sh_ix[threadIdx.x] = frxi + ox; | |
| sh_iy[threadIdx.x] = fryi + oy; | |
| sh_iz[threadIdx.x] = frzi + oz; | |
| | |
| float wx = frx - (float)frxi; | |
| float wy = fry - (float)fryi; | |
| float wz = frz - (float)frzi; | |
| | |
| float theta[order]; | |
| | |
| calc_one_theta<float, order>(wx, theta); | |
| #pragma unroll | |
| for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i]; | |
| | |
| calc_one_theta<float, order>(wy, theta); | |
| #pragma unroll | |
| for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i]; | |
| | |
| calc_one_theta<float, order>(wz, theta); | |
| #pragma unroll | |
| for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i]; | |
| | |
| } | |
| | |
| __syncthreads(); | |
| | |
| // Grid point location; values of (x0, y0, z0) are in range 0..order-1 |
| // NOTE: Only tid=0...order*order*order-1 do any computation | |
| const int order3 = ((order*order*order-1)/warpSize + 1)*warpSize; | |
| const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1 | |
| const int x0 = tid % order; | |
| const int y0 = (tid / order) % order; | |
| const int z0 = tid / (order*order); | |
| | |
| // Loop over atoms | |
| // iadd = number of atoms to compute per iteration | |
| int iadd = blockDim.x*blockDim.y/order3; | |
| int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3; | |
| int iend = min(blockDim.x, pos_end-pos); | |
| for (;i < iend;i += iadd) { | |
| int x = sh_ix[i] + x0; | |
| int y = sh_iy[i] + y0; | |
| int z = sh_iz[i] + z0; | |
| | |
| if (x >= nfftx) x -= nfftx; | |
| if (y >= nffty) y -= nffty; | |
| if (z >= nfftz) z -= nfftz; | |
| | |
| // Get position on the grid | |
| int ind = x + xsize*(y + ysize*(z)); | |
| | |
| // Here we unroll the 6x6x6 loop with 216 threads. | |
| // NOTE: We use 7*32=224 threads to do this | |
| // Calculate interpolated charge value and store it to global memory | |
| if (tid < order*order*order) | |
| write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data); | |
| } | |
| } | |
| } | |
| */ | |
| | |
| //----------------------------------------------------------------------------------------- | //----------------------------------------------------------------------------------------- |
| // Generic version cannot be used | // Generic version cannot be used |
| template <typename T> __forceinline__ __device__ | template <typename T> __forceinline__ __device__ |
| |
| // blockDim.x*blockDim.y = Total number of threads per block | // blockDim.x*blockDim.y = Total number of threads per block |
| // | // |
| template <typename CT, typename FT, int order, bool periodicY, bool periodicZ> | template <typename CT, typename FT, int order, bool periodicY, bool periodicZ> |
| __global__ void gather_force_ortho(const float4 *xyzq, const int ncoord, | __global__ void gather_force(const float4 *xyzq, const int ncoord, |
| const int nfftx, const int nffty, const int nfftz, | const int nfftx, const int nffty, const int nfftz, |
| const int xsize, const int ysize, const int zsize, | const int xsize, const int ysize, const int zsize, |
| const int xdim, const int y00, const int z00, | const int xdim, const int y00, const int z00, |
| const float recip1, const float recip2, const float recip3, | // const float recip1, const float recip2, const float recip3, |
| const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350 | const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350 |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| const cudaTextureObject_t gridTexObj, | const cudaTextureObject_t gridTexObj, |
| #endif | |
| const int stride, | const int stride, |
| FT *force) { | FT *force) { |
| | |
| |
| | |
| // Shared memory | // Shared memory |
| __shared__ gather_t<CT, order> shmem[32]; | __shared__ gather_t<CT, order> shmem[32]; |
| #if __CUDA_ARCH__ < 300 | |
| __shared__ float3 shred_buf[32*2]; | |
| volatile float3 *shred = &shred_buf[(tid/8)*8]; | |
| #endif | |
| | |
| const int pos = blockIdx.x*blockDim.x + threadIdx.x; | const int pos = blockIdx.x*blockDim.x + threadIdx.x; |
| const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord); | const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord); |
| |
| float z = xyzqi.z; | float z = xyzqi.z; |
| float q = xyzqi.w; | float q = xyzqi.w; |
| | |
| // float w; | |
| // w = x*recip1 + 2.0f; | |
| // float frx = (float)(nfftx*(w - (floorf(w + 0.5f) - 0.5f))); | |
| // w = y*recip2 + 2.0f; | |
| // float fry = (float)(nffty*(w - (floorf(w + 0.5f) - 0.5f))); | |
| // w = z*recip3 + 2.0f; | |
| // float frz = (float)(nfftz*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| float frx = ((float)nfftx)*x; | float frx = ((float)nfftx)*x; |
| float fry = ((float)nffty)*y; | float fry = ((float)nffty)*y; |
| float frz = ((float)nfftz)*z; | float frz = ((float)nfftz)*z; |
| |
| #if __CUDA_ARCH__ >= 350 | #if __CUDA_ARCH__ >= 350 |
| float q0 = __ldg(&data[ind]); | float q0 = __ldg(&data[ind]); |
| #else | #else |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| float q0 = tex1Dfetch<float>(gridTexObj, ind); | float q0 = tex1Dfetch<float>(gridTexObj, ind); |
| #else | |
| float q0 = tex1Dfetch(gridTexRef, ind); | |
| #endif | |
| #endif | #endif |
| float thx0 = shmem[base].thetax[tx]; | float thx0 = shmem[base].thetax[tx]; |
| float thy0 = shmem[base].thetay[ty]; | float thy0 = shmem[base].thetay[ty]; |
| |
| //------------------------- | //------------------------- |
| | |
| // Reduce | // Reduce |
| #if __CUDA_ARCH__ >= 300 | |
| const int i = threadIdx.x & 7; | const int i = threadIdx.x & 7; |
| | |
| f1 += __shfl(f1, i+4, 8); | f1 += __shfl(f1, i+4, 8); |
| |
| shmem[base].f3 = f3; | shmem[base].f3 = f3; |
| } | } |
| | |
| #else | |
| const int i = threadIdx.x & 7; | |
| shred[i].x = f1; | |
| shred[i].y = f2; | |
| shred[i].z = f3; | |
| | |
| if (i < 4) { | |
| shred[i].x += shred[i+4].x; | |
| shred[i].y += shred[i+4].y; | |
| shred[i].z += shred[i+4].z; | |
| } | |
| | |
| if (i < 2) { | |
| shred[i].x += shred[i+2].x; | |
| shred[i].y += shred[i+2].y; | |
| shred[i].z += shred[i+2].z; | |
| } | |
| | |
| if (i == 0) { | |
| shmem[base].f1 = shred[0].x + shred[1].x; | |
| shmem[base].f2 = shred[0].y + shred[1].y; | |
| shmem[base].f3 = shred[0].z + shred[1].z; | |
| } | |
| #endif | |
| | |
| base += 8; | base += 8; |
| } | } |
| | |
| |
| float f2 = shmem[threadIdx.x].f2; | float f2 = shmem[threadIdx.x].f2; |
| float f3 = shmem[threadIdx.x].f3; | float f3 = shmem[threadIdx.x].f3; |
| float q = -shmem[threadIdx.x].charge; //*ccelec_float; | float q = -shmem[threadIdx.x].charge; //*ccelec_float; |
| float fx = q*recip1*f1*nfftx; | // float fx = q*recip1*f1*nfftx; |
| float fy = q*recip2*f2*nffty; | // float fy = q*recip2*f2*nffty; |
| float fz = q*recip3*f3*nfftz; | // float fz = q*recip3*f3*nfftz; |
| | float fx = q*f1*nfftx; |
| | float fy = q*f2*nffty; |
| | float fz = q*f3*nfftz; |
| gather_force_store<FT>(fx, fy, fz, stride, pos, force); | gather_force_store<FT>(fx, fy, fz, stride, pos, force); |
| } | } |
| | |
| } | } |
| | |
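Reading the gather loop back into a formula: per atom, the partial sums are $f_u = \sum_g \theta'_u\,\theta_v\,\theta_w\,Q(g)$ over the $\mathrm{order}^3$ stencil, so the stored force component along each *fractional* axis is

$$
F_u = -\,q\, n_u\, f_u, \qquad u \in \{x, y, z\},
$$

with $n_u$ the grid dimension. Version 1.1 folded the (diagonal, orthorhombic) factors `recip11..recip33` into $F_u$ in-kernel; version 1.2 comments them out, presumably deferring multiplication by the full reciprocal matrix to a later stage, consistent with the general triclinic support introduced elsewhere in this change.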
| /* | |
| // | |
| // Gathers forces from the grid | |
| // blockDim.x = Number of atoms each block loads | |
| // blockDim.x*blockDim.y = Total number of threads per block | |
| // | |
| template <typename CT, typename FT, int order> | |
| __global__ void gather_force_ortho(const int4* patch, const int npatch, | |
| const float4 *xyzq, const int ncoord, | |
| const int nfftx, const int nffty, const int nfftz, | |
| const int xsize, const int ysize, | |
| const float recip1, const float recip2, const float recip3, | |
| const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350 | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| const cudaTextureObject_t gridTexObj, | |
| #endif | |
| const int stride, | |
| FT *force) { | |
| | |
| const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63 | |
| | |
| // Shared memory | |
| __shared__ gather_t<CT, order> shmem[32]; | |
| #if __CUDA_ARCH__ < 300 | |
| __shared__ float3 shred_buf[32*2]; | |
| volatile float3 *shred = &shred_buf[(tid/8)*8]; | |
| #endif | |
| | |
| // blockIdx.x = patch index | |
| if (blockIdx.x >= npatch) return; | |
| | |
| int4 patchval = patch[blockIdx.x]; | |
| int pos = (blockIdx.x == 0) ? 0 : patch[blockIdx.x-1].w; | |
| const int ox = patchval.x; | |
| const int oy = patchval.y; | |
| const int oz = patchval.z; | |
| const int pos_end = patchval.w; | |
| | |
| // This block takes care of atoms in the range [pos, pos_end-1] | |
| for (;pos < pos_end;pos += blockDim.x) { | |
| | |
| __syncthreads(); | |
| | |
| if (pos + threadIdx.x < pos_end && threadIdx.y == 0) { | |
| | |
| float4 xyzqi = xyzq[pos]; | |
| float x = xyzqi.x; | |
| float y = xyzqi.y; | |
| float z = xyzqi.z; | |
| float q = xyzqi.w; | |
| | |
| float w; | |
| | |
| w = x*recip1 + 2.0f; | |
| float frx = (float)(nfftx*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| w = y*recip2 + 2.0f; | |
| float fry = (float)(nffty*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| w = z*recip3 + 2.0f; | |
| float frz = (float)(nfftz*(w - (floorf(w + 0.5f) - 0.5f))); | |
| | |
| int frxi = (int)frx; | |
| int fryi = (int)fry; | |
| int frzi = (int)frz; | |
| | |
| shmem[threadIdx.x].ix = frxi + ox; | |
| shmem[threadIdx.x].iy = fryi + oy; | |
| shmem[threadIdx.x].iz = frzi + oz; | |
| shmem[threadIdx.x].charge = q; | |
| | |
| float wx = frx - (float)frxi; | |
| float wy = fry - (float)fryi; | |
| float wz = frz - (float)frzi; | |
| | |
| float3 theta_tmp[order]; | |
| float3 dtheta_tmp[order]; | |
| calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp); | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x; | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y; | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z; | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x; | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y; | |
| | |
| #pragma unroll | |
| for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z; | |
| | |
| } | |
| __syncthreads(); | |
| | |
| // We divide the order x order x order cube into 8 sub-cubes. | |
| // Each of these sub-cubes is taken care of by a single thread |
| // The size of the sub-cubes is: | |
| // order=4 : 2x2x2 | |
| // order=6 : 3x3x3 | |
| // order=8 : 4x4x4 | |
| const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4); | |
| // Calculate the starting index on the sub-cube for this thread | |
| // tid = 0...63 | |
| const int t = (tid % 8); // sub-cube index (0...7) | |
| // t = (tx0 + ty0*2 + tz0*4)/nsc | |
| // (tx0, ty0, tz0) gives the starting index of the 3x3x3 sub-cube | |
| const int tz0 = (t / 4)*nsc; | |
| const int ty0 = ((t / 2) % 2)*nsc; | |
| const int tx0 = (t % 2)*nsc; | |
| | |
| // | |
| // Calculate forces for 32 atoms. We have 32*2 = 64 threads | |
| // Loop is iterated 4 times: | |
| // (iterations) | |
| // Threads 0...7 = atoms 0, 8, 16, 24 | |
| // Threads 8...15 = atoms 1, 9, 17, 25 | |
| // Threads 16...23 = atoms 2, 10, 18, 26 |
| // ... | |
| // Threads 56...63 = atoms 7, 15, 23, 31 | |
| // | |
| | |
| int base = tid/8; | |
| //const int base_end = pos_end - blockIdx.x*blockDim.x; | |
| const int base_end = min(blockDim.x, pos_end-pos); | |
| while (base < base_end) { | |
| | |
| float f1 = 0.0f; | |
| float f2 = 0.0f; | |
| float f3 = 0.0f; | |
| int ix0 = shmem[base].ix; | |
| int iy0 = shmem[base].iy; | |
| int iz0 = shmem[base].iz; | |
| | |
| // Each thread calculates a nsc x nsc x nsc sub-cube | |
| #pragma unroll | |
| for (int i=0;i < nsc*nsc*nsc;i++) { | |
| int tz = tz0 + (i/(nsc*nsc)); | |
| int ty = ty0 + ((i/nsc) % nsc); | |
| int tx = tx0 + (i % nsc); | |
| | |
| int ix = ix0 + tx; | |
| int iy = iy0 + ty; | |
| int iz = iz0 + tz; | |
| if (ix >= nfftx) ix -= nfftx; | |
| if (iy >= nffty) iy -= nffty; | |
| if (iz >= nfftz) iz -= nfftz; | |
| #if __CUDA_ARCH__ >= 350 | |
| float q0 = __ldg(&data[ix + (iy + iz*ysize)*xsize]); | |
| #else | |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| float q0 = tex1Dfetch<float>(gridTexObj, ix + (iy + iz*ysize)*xsize); | |
| #else | |
| float q0 = tex1Dfetch(gridTexRef, ix + (iy + iz*ysize)*xsize); | |
| #endif | |
| #endif | |
| float thx0 = shmem[base].thetax[tx]; | |
| float thy0 = shmem[base].thetay[ty]; | |
| float thz0 = shmem[base].thetaz[tz]; | |
| float dthx0 = shmem[base].dthetax[tx]; | |
| float dthy0 = shmem[base].dthetay[ty]; | |
| float dthz0 = shmem[base].dthetaz[tz]; | |
| f1 += dthx0 * thy0 * thz0 * q0; | |
| f2 += thx0 * dthy0 * thz0 * q0; | |
| f3 += thx0 * thy0 * dthz0 * q0; | |
| } | |
| | |
| //------------------------- | |
| | |
| // Reduce | |
| #if __CUDA_ARCH__ >= 300 | |
| const int i = threadIdx.x & 7; | |
| | |
| f1 += __shfl(f1, i+4, 8); | |
| f2 += __shfl(f2, i+4, 8); | |
| f3 += __shfl(f3, i+4, 8); | |
| | |
| f1 += __shfl(f1, i+2, 8); | |
| f2 += __shfl(f2, i+2, 8); | |
| f3 += __shfl(f3, i+2, 8); | |
| | |
| f1 += __shfl(f1, i+1, 8); | |
| f2 += __shfl(f2, i+1, 8); | |
| f3 += __shfl(f3, i+1, 8); | |
| | |
| if (i == 0) { | |
| shmem[base].f1 = f1; | |
| shmem[base].f2 = f2; | |
| shmem[base].f3 = f3; | |
| } | |
| | |
| #else | |
| const int i = threadIdx.x & 7; | |
| shred[i].x = f1; | |
| shred[i].y = f2; | |
| shred[i].z = f3; | |
| | |
| if (i < 4) { | |
| shred[i].x += shred[i+4].x; | |
| shred[i].y += shred[i+4].y; | |
| shred[i].z += shred[i+4].z; | |
| } | |
| | |
| if (i < 2) { | |
| shred[i].x += shred[i+2].x; | |
| shred[i].y += shred[i+2].y; | |
| shred[i].z += shred[i+2].z; | |
| } | |
| | |
| if (i == 0) { | |
| shmem[base].f1 = shred[0].x + shred[1].x; | |
| shmem[base].f2 = shred[0].y + shred[1].y; | |
| shmem[base].f3 = shred[0].z + shred[1].z; | |
| } | |
| #endif | |
| | |
| base += 8; | |
| } | |
| | |
| // Write forces | |
| __syncthreads(); | |
| if (pos < pos_end && threadIdx.y == 0) { | |
| float f1 = shmem[threadIdx.x].f1; | |
| float f2 = shmem[threadIdx.x].f2; | |
| float f3 = shmem[threadIdx.x].f3; | |
| float q = -shmem[threadIdx.x].charge;//*ccelec_float; | |
| float fx = q*recip1*f1*nfftx; | |
| float fy = q*recip2*f2*nffty; | |
| float fz = q*recip3*f3*nfftz; | |
| gather_force_store<FT>(fx, fy, fz, stride, pos, force); | |
| } | |
| } | |
| } | |
| */ | |
| | |
| /* | |
| // | |
| // Calculates sum of squared charge. Used in calculation of self energy | |
| // | |
| __global__ void calc_sum_charge_squared_kernel(const int ncoord, const float4* xyzq, | |
| double* __restrict__ sum_charge_squared) { | |
| // Shared memory | |
| // Required space: blockDim.x*sizeof(double) | |
| extern __shared__ double sh_q2[]; | |
| | |
| int i = threadIdx.x + blockIdx.x*blockDim.x; | |
| float q = 0.0f; | |
| if (i < ncoord) q = xyzq[i].w; | |
| sh_q2[threadIdx.x] = q*q; | |
| __syncthreads(); | |
| for(int d=1;d < blockDim.x;d *= 2) { | |
| int t = threadIdx.x + d; | |
| double q2_val = (t < blockDim.x) ? sh_q2[t] : 0.0; | |
| __syncthreads(); | |
| sh_q2[threadIdx.x] += q2_val; | |
| __syncthreads(); | |
| } | |
| if (threadIdx.x == 0) { | |
| atomicAdd(sum_charge_squared, sh_q2[0]); | |
| } | |
| | |
| } | |
| */ | |
| | |
| const int TILEDIM = 32; | const int TILEDIM = 32; |
| const int TILEROWS = 8; | const int TILEROWS = 8; |
| | |
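`TILEDIM`/`TILEROWS` set up the classic shared-memory tiled transpose used by the transpose kernels at the end of this section. For reference, the canonical pattern these constants imply (an illustrative 2D kernel, not the source's; the source transposes 3D `xyz` slabs rather than a plain matrix):

```cuda
// Illustrative tiled transpose: each 32x32 tile is staged through shared
// memory; the +1 column padding avoids shared-memory bank conflicts.
// Launch with dim3 nthread(TILEDIM, TILEROWS).
__global__ void transpose_demo(const float* in, float* out,
                               const int nx, const int ny) {
  __shared__ float tile[TILEDIM][TILEDIM+1];
  int x = blockIdx.x*TILEDIM + threadIdx.x;
  int y = blockIdx.y*TILEDIM + threadIdx.y;
  for (int j = 0; j < TILEDIM; j += TILEROWS)
    if (x < nx && y+j < ny)
      tile[threadIdx.y+j][threadIdx.x] = in[(y+j)*nx + x];
  __syncthreads();
  x = blockIdx.y*TILEDIM + threadIdx.x;   // transposed block origin
  y = blockIdx.x*TILEDIM + threadIdx.y;
  for (int j = 0; j < TILEDIM; j += TILEROWS)
    if (x < ny && y+j < nx)
      out[(y+j)*ny + x] = tile[threadIdx.x][threadIdx.y+j];
}
```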
| |
| //####################################################################################### | //####################################################################################### |
| //####################################################################################### | //####################################################################################### |
| | |
| void spread_charge_ortho(const float4 *atoms, const int numAtoms, | void spread_charge(const float4 *atoms, const int numAtoms, |
| const float recip11, const float recip22, const float recip33, | |
| const int nfftx, const int nffty, const int nfftz, | const int nfftx, const int nffty, const int nfftz, |
| const int xsize, const int ysize, const int zsize, | const int xsize, const int ysize, const int zsize, |
| const int xdim, const int y00, const int z00, | const int xdim, const int y00, const int z00, |
| |
| nblock.y = 1; | nblock.y = 1; |
| nblock.z = 1; | nblock.z = 1; |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| spread_charge_ortho_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicY) | else if (periodicY) |
| spread_charge_ortho_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicZ) | else if (periodicZ) |
| spread_charge_ortho_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else | else |
| spread_charge_ortho_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| break; | break; |
| | |
| |
| nblock.y = 1; | nblock.y = 1; |
| nblock.z = 1; | nblock.z = 1; |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| spread_charge_ortho_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicY) | else if (periodicY) |
| spread_charge_ortho_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicZ) | else if (periodicZ) |
| spread_charge_ortho_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else | else |
| spread_charge_ortho_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| break; | break; |
| | |
| |
| nblock.y = 1; | nblock.y = 1; |
| nblock.z = 1; | nblock.z = 1; |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| spread_charge_ortho_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicY) | else if (periodicY) |
| spread_charge_ortho_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else if (periodicZ) | else if (periodicZ) |
| spread_charge_ortho_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| else | else |
| spread_charge_ortho_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> | spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, recip11, recip22, recip33, | (atoms, numAtoms, |
| nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); | nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data); |
| break; | break; |
| | |
| default: | default: |
| char str[128]; | char str[128]; |
| sprintf(str, "spread_charge_ortho, order %d not implemented",order); | sprintf(str, "spread_charge, order %d not implemented",order); |
| cudaNAMD_bug(str); | cudaNAMD_bug(str); |
| } | } |
| cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |
| | |
| } | } |
| | |
| void scalar_sum_ortho(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, | void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3, |
| const int size1, const int size2, const int size3, const double kappa, | const int size1, const int size2, const int size3, const double kappa, |
| const float recip11, const float recip22, const float recip33, | const float recip1x, const float recip1y, const float recip1z, |
| | const float recip2x, const float recip2y, const float recip2z, |
| | const float recip3x, const float recip3y, const float recip3z, |
| | const double volume, |
| const float* prefac1, const float* prefac2, const float* prefac3, | const float* prefac1, const float* prefac2, const float* prefac3, |
| const int k2_00, const int k3_00, | const int k2_00, const int k3_00, |
| const bool doEnergyVirial, double* energy, double* virial, float2* data, | const bool doEnergyVirial, double* energy, double* virial, float2* data, |
| const int cuda_arch, cudaStream_t stream) { | cudaStream_t stream) { |
| | |
| // Best performance on cuda_arch = 200: |
| // energy & virial, 512x14: 102.7 (C2075) / 70.4 (K40c) |
| // without energy & virial, C2075, 768x12: 27.4 |
| int nthread = 512; | |
| int nblock = 10; | |
| | |
| if (cuda_arch < 300) { | |
| if (doEnergyVirial) { | |
| nthread = 512; | |
| nblock = 14; | |
| } else { | |
| nthread = 768; | |
| nblock = 12; | |
| } | |
| } else { | |
| if (doEnergyVirial) { | |
| nthread = 1024; | |
| nblock = 14; | |
| } else { | |
| nthread = 1024; | |
| nblock = 14; | |
| } | |
| } | |
| | |
| | int nthread = 1024; |
| | int nblock = 64; |
| | |
| int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3); | int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3); |
| if (doEnergyVirial) { | if (doEnergyVirial) { |
| if (cuda_arch < 300) { | |
| shmem_size = max(shmem_size, (int)(nthread*sizeof(RecipVirial_t))); | |
| } else { | |
| const int warpSize = 32; | const int warpSize = 32; |
| shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t))); | shmem_size = max(shmem_size, (int)((nthread/warpSize)*sizeof(RecipVirial_t))); |
| } | } |
| } | |
| | |
| double inv_vol = recip11*recip22*recip33; | float piv_inv = (float)(1.0/(M_PI*volume)); |
| float piv_inv = (float)(inv_vol/M_PI); | |
| float fac = (float)(M_PI*M_PI/(kappa*kappa)); | float fac = (float)(M_PI*M_PI/(kappa*kappa)); |
| | |
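In both versions the scalars passed to the kernel are

$$
\texttt{fac} = \frac{\pi^{2}}{\kappa^{2}}, \qquad \texttt{piv\_inv} = \frac{1}{\pi V}.
$$

Version 1.1 reconstructed $1/V$ as `recip11*recip22*recip33`, which is valid only when the reciprocal matrix is diagonal (an orthorhombic cell); version 1.2 takes the triclinic `volume` directly, consistent with dropping the `_ortho` suffix throughout this change.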
| int nf1 = nfft1/2 + (nfft1 % 2); | |
| int nf2 = nfft2/2 + (nfft2 % 2); | |
| int nf3 = nfft3/2 + (nfft3 % 2); | |
| | |
| if (doEnergyVirial) { | if (doEnergyVirial) { |
| if (orderXYZ) { | if (orderXYZ) { |
| scalar_sum_ortho_kernel<float, float2, true, true> <<< nblock, nthread, shmem_size, stream >>> | scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>> |
| (nfft1, nfft2, nfft3, size1, size2, size3, | (nfft1, nfft2, nfft3, size1, size2, size3, |
| nf1, nf2, nf3, recip11, recip22, recip33, | recip1x, recip1y, recip1z, |
| | recip2x, recip2y, recip2z, |
| | recip3x, recip3y, recip3z, |
| prefac1, prefac2, prefac3, | prefac1, prefac2, prefac3, |
| fac, piv_inv, k2_00, k3_00, data, energy, virial); | fac, piv_inv, k2_00, k3_00, data, energy, virial); |
| } else { | } else { |
| scalar_sum_ortho_kernel<float, float2, true, false> <<< nblock, nthread, shmem_size, stream >>> | scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>> |
| (nfft1, nfft2, nfft3, size1, size2, size3, | (nfft1, nfft2, nfft3, size1, size2, size3, |
| nf1, nf2, nf3, recip11, recip22, recip33, | recip1x, recip1y, recip1z, |
| | recip2x, recip2y, recip2z, |
| | recip3x, recip3y, recip3z, |
| prefac1, prefac2, prefac3, | prefac1, prefac2, prefac3, |
| fac, piv_inv, k2_00, k3_00, data, energy, virial); | fac, piv_inv, k2_00, k3_00, data, energy, virial); |
| } | } |
| } else { | } else { |
| if (orderXYZ) { | if (orderXYZ) { |
| scalar_sum_ortho_kernel<float, float2, false, true> <<< nblock, nthread, shmem_size, stream >>> | scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>> |
| (nfft1, nfft2, nfft3, size1, size2, size3, | (nfft1, nfft2, nfft3, size1, size2, size3, |
| nf1, nf2, nf3, recip11, recip22, recip33, | recip1x, recip1y, recip1z, |
| | recip2x, recip2y, recip2z, |
| | recip3x, recip3y, recip3z, |
| prefac1, prefac2, prefac3, | prefac1, prefac2, prefac3, |
| fac, piv_inv, k2_00, k3_00, data, NULL, NULL); | fac, piv_inv, k2_00, k3_00, data, NULL, NULL); |
| } else { | } else { |
| scalar_sum_ortho_kernel<float, float2, false, false> <<< nblock, nthread, shmem_size, stream >>> | scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>> |
| (nfft1, nfft2, nfft3, size1, size2, size3, | (nfft1, nfft2, nfft3, size1, size2, size3, |
| nf1, nf2, nf3, recip11, recip22, recip33, | recip1x, recip1y, recip1z, |
| | recip2x, recip2y, recip2z, |
| | recip3x, recip3y, recip3z, |
| prefac1, prefac2, prefac3, | prefac1, prefac2, prefac3, |
| fac, piv_inv, k2_00, k3_00, data, NULL, NULL); | fac, piv_inv, k2_00, k3_00, data, NULL, NULL); |
| } | } |
| |
| | |
| } | } |
| | |
| void gather_force_ortho(const float4 *atoms, const int numAtoms, | void gather_force(const float4 *atoms, const int numAtoms, |
| const float recip11, const float recip22, const float recip33, | // const float recip11, const float recip22, const float recip33, |
| const int nfftx, const int nffty, const int nfftz, | const int nfftx, const int nffty, const int nfftz, |
| const int xsize, const int ysize, const int zsize, | const int xsize, const int ysize, const int zsize, |
| const int xdim, const int y00, const int z00, | const int xdim, const int y00, const int z00, |
| const bool periodicY, const bool periodicZ, | const bool periodicY, const bool periodicZ, |
| const float* data, const int order, float3* force, | const float* data, const int order, float3* force, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| const cudaTextureObject_t gridTexObj, | const cudaTextureObject_t gridTexObj, |
| #endif | |
| cudaStream_t stream) { | cudaStream_t stream) { |
| | |
| dim3 nthread(32, 2, 1); | dim3 nthread(32, 2, 1); |
| |
| switch(order) { | switch(order) { |
| case 4: | case 4: |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| gather_force_ortho<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicY) | else if (periodicY) |
| gather_force_ortho<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicZ) | else if (periodicZ) |
| gather_force_ortho<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else | else |
| gather_force_ortho<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| break; | break; |
| | |
| case 6: | case 6: |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| gather_force_ortho<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicY) | else if (periodicY) |
| gather_force_ortho<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicZ) | else if (periodicZ) |
| gather_force_ortho<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else | else |
| gather_force_ortho<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| break; | break; |
| | |
| case 8: | case 8: |
| if (periodicY && periodicZ) | if (periodicY && periodicZ) |
| gather_force_ortho<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicY) | else if (periodicY) |
| gather_force_ortho<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else if (periodicZ) | else if (periodicZ) |
| gather_force_ortho<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| else | else |
| gather_force_ortho<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> | gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> |
| (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, | (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, |
| recip11, recip22, recip33, data, | // recip11, recip22, recip33, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | data, |
| gridTexObj, | gridTexObj, |
| #endif | |
| 1, force); | 1, force); |
| break; | break; |
| | |
| default: | default: |
| char str[128]; | char str[128]; |
| sprintf(str, "gather_force_ortho, order %d not implemented",order); | sprintf(str, "gather_force, order %d not implemented",order); |
| cudaNAMD_bug(str); | cudaNAMD_bug(str); |
| } | } |
| cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |
| | |
| } | } |
| | |
| #ifdef DISABLE_CUDA_TEXTURE_OBJECTS | |
| void bindGridTexture(float* data, int data_len) { | |
| gridTexRef.normalized = 0; | |
| gridTexRef.filterMode = cudaFilterModePoint; | |
| gridTexRef.addressMode[0] = cudaAddressModeClamp; | |
| gridTexRef.channelDesc.x = sizeof(float)*8; | |
| gridTexRef.channelDesc.y = 0; | |
| gridTexRef.channelDesc.z = 0; | |
| gridTexRef.channelDesc.w = 0; | |
| gridTexRef.channelDesc.f = cudaChannelFormatKindFloat; | |
| cudaCheck(cudaBindTexture(NULL, gridTexRef, data, data_len*sizeof(float))); | |
| } | |
| #endif | |
| | |
| /* | |
| void calc_sum_charge_squared(const float4 *atoms, const int numAtoms, double* sum_charge_squared, | |
| cudaStream_t stream) { | |
| | |
| int nthread = 256; | |
| int nblock = (numAtoms-1)/nthread+1; | |
| int shmem_size = nthread*sizeof(double); | |
| calc_sum_charge_squared_kernel<<< nblock, nthread, shmem_size, stream >>> | |
| (numAtoms, atoms, sum_charge_squared); | |
| cudaCheck(cudaGetLastError()); | |
| | |
| } | |
| */ | |
| | |
| // | // |
| // Transpose | // Transpose |
| // | // |