| version 1.1 | version 1.2 |
|---|
| |
| const TileList* __restrict__ tileLists, | const TileList* __restrict__ tileLists, |
| const int* __restrict__ tileJatomStart, | const int* __restrict__ tileJatomStart, |
| const PatchPairRecord* __restrict__ patchPairs, | const PatchPairRecord* __restrict__ patchPairs, |
| const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
| const float4* __restrict__ xyzq, | const float4* __restrict__ xyzq, |
| const float cutoff2, | const float cutoff2, |
| const GBISParam<phase> param, | const GBISParam<phase> param, |
| |
| int iatomSize = PPStmp.iatomSize; | int iatomSize = PPStmp.iatomSize; |
| int jatomSize = PPStmp.jatomSize; | int jatomSize = PPStmp.jatomSize; |
| | |
| float shx = tmp.offsetXYZ.x*latticeX; | float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x; |
| float shy = tmp.offsetXYZ.y*latticeY; | float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y; |
| float shz = tmp.offsetXYZ.z*latticeZ; | float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z; |
| | |
| // Warp index (0...warpsize-1) | // Warp index (0...warpsize-1) |
| const int wid = threadIdx.x % WARPSIZE; | const int wid = threadIdx.x % WARPSIZE; |
| |
| // Phase 1 | // Phase 1 |
| // | // |
| void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase1(CudaTileListKernel& tlKernel, const int atomStorageSize, |
| const float latticeX, const float latticeY, const float latticeZ, const float a_cut, float* h_psiSum, | const float3 lata, const float3 latb, const float3 latc, const float a_cut, float* h_psiSum, |
| cudaStream_t stream) { | cudaStream_t stream) { |
| | |
| reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f); | reallocate_device<float>(&psiSum, &psiSumSize, atomStorageSize, 1.2f); |
| |
| | |
| GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> | GBIS_Kernel<false, false, 1> <<< nblock, nthread, 0, stream >>> |
| (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), |
| tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2, |
| param, intRad0, intRadS, NULL, psiSum, NULL, NULL); | param, intRad0, intRadS, NULL, psiSum, NULL, NULL); |
| | |
| cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |
| |
| // | // |
| void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase2(CudaTileListKernel& tlKernel, const int atomStorageSize, |
| const bool doEnergy, const bool doSlow, | const bool doEnergy, const bool doSlow, |
| const float latticeX, const float latticeY, const float latticeZ, | const float3 lata, const float3 latb, const float3 latc, |
| const float r_cut, const float scaling, const float kappa, const float smoothDist, | const float r_cut, const float scaling, const float kappa, const float smoothDist, |
| const float epsilon_p, const float epsilon_s, | const float epsilon_p, const float epsilon_s, |
| float4* d_forces, float* h_dEdaSum, cudaStream_t stream) { | float4* d_forces, float* h_dEdaSum, cudaStream_t stream) { |
| |
| #define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \ | #define CALL(DOENERGY, DOSLOW) GBIS_Kernel<DOENERGY, DOSLOW, 2> \ |
| <<< nblock, nthread, 0, stream >>> \ | <<< nblock, nthread, 0, stream >>> \ |
| (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \ | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), \ |
| tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), param.r_cut2, \ | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), param.r_cut2, \ |
| param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy()) | param, bornRad, NULL, NULL, dEdaSum, d_forces, tlKernel.getTileListVirialEnergy()) |
| | |
| if (!doEnergy && !doSlow) CALL(false, false); | if (!doEnergy && !doSlow) CALL(false, false); |
| |
| // Phase 3 | // Phase 3 |
| // | // |
| void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize, | void CudaComputeGBISKernel::GBISphase3(CudaTileListKernel& tlKernel, const int atomStorageSize, |
| const float latticeX, const float latticeY, const float latticeZ, const float a_cut, | const float3 lata, const float3 latb, const float3 latc, const float a_cut, |
| float4* d_forces, cudaStream_t stream) { | float4* d_forces, cudaStream_t stream) { |
| | |
| int nwarp = GBISKERNEL_NUM_WARP; | int nwarp = GBISKERNEL_NUM_WARP; |
| |
| | |
| GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> | GBIS_Kernel<false, false, 3> <<< nblock, nthread, 0, stream >>> |
| (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), | (tlKernel.getNumTileListsGBIS(), tlKernel.getTileListsGBIS(), tlKernel.getTileJatomStartGBIS(), |
| tlKernel.getPatchPairs(), latticeX, latticeY, latticeZ, tlKernel.get_xyzq(), cutoff2, | tlKernel.getPatchPairs(), lata, latb, latc, tlKernel.get_xyzq(), cutoff2, |
| param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL); | param, intRad0, intRadS, dHdrPrefix, NULL, d_forces, NULL); |
| | |
| cudaCheck(cudaGetLastError()); | cudaCheck(cudaGetLastError()); |