version 1.1 | version 1.2 |
---|---|
| |
const TileList* __restrict__ tileLists, | const TileList* __restrict__ tileLists, |
const int* __restrict__ tileJatomStart, | const int* __restrict__ tileJatomStart, |
const PatchPairRecord* __restrict__ patchPairs, | const PatchPairRecord* __restrict__ patchPairs, |
const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
const float4* __restrict__ xyzq, | const float4* __restrict__ xyzq, |
const float cutoff2, | const float cutoff2, |
const GBISParam<phase> param, | const GBISParam<phase> param, |
| |
int iatomSize = PPStmp.iatomSize; | int iatomSize = PPStmp.iatomSize; |
int jatomSize = PPStmp.jatomSize; | int jatomSize = PPStmp.jatomSize; |
| |
float shx = tmp.offsetXYZ.x*latticeX; | float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x; |
float shy = tmp.offsetXYZ.y*latticeY; | float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y; |
float shz = tmp.offsetXYZ.z*latticeZ; | float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z; |
| |
// Thread lane index within the warp (0...WARPSIZE-1) | // Thread lane index within the warp (0...WARPSIZE-1) |
const int wid = threadIdx.x % WARPSIZE; | const int wid = threadIdx.x % WARPSIZE; |
| |
// Phase 1 | // Phase 1 |
// | // |
void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize, |
const float latticeX, const float latticeY, const float latticeZ, const float a_cut, float* h_psiSum, | const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum, |
cudaStream_t stream) { | cudaStream_t stream) { |
| |
reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f); | reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f); |
| |
| |
GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> | GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> |
(tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), |
tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2, |
param, intRad0, intRadS, NULL, psiSum, NULL, NULL); | param, intRad0, intRadS, NULL, psiSum, NULL, NULL); |
| |
cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |
| |
// | // |
void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize, |
const bool doEnergy, const bool doSlow, | const bool doEnergy, const bool doSlow, |
const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
const float r_cut, const float scaling, const float kappa, const float smoothDist, | const float r_cut, const float scaling, const float kappa, const float smoothDist, |
const float epsilon_p, const float epsilon_s, | const float epsilon_p, const float epsilon_s, |
float4* d_forces, float* h_dEdaSum, cudaStream_t stream) { | float4* d_forces, float* h_dEdaSum, cudaStream_t stream) { |
| |
#define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \ | #define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \ |
<<< nblock, nthread, 0, stream >>> \ | <<< nblock, nthread, 0, stream >>> \ |
(tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \ | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \ |
tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), param.r_cut2, \ | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \ |
param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy()) | param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy()) |
| |
if (!doEnergy && !doSlow) CALL(false, false); | if (!doEnergy && !doSlow) CALL(false, false); |
| |
// Phase 3 | // Phase 3 |
// | // |
void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize, |
const float latticeX, const float latticeY, const float latticeZ, const float a_cut, | const float3 lata, const float3 latb, const float3 latc, const float a_cut, |
float4* d_forces, cudaStream_t stream) { | float4* d_forces, cudaStream_t stream) { |
| |
int nwarp = GBISKERNEL_NUM_WARP; | int nwarp = GBISKERNEL_NUM_WARP; |
| |
| |
GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> | GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> |
(tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), |
tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2, |
param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL); | param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL); |
| |
cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |