| version 1.2 | version 1.3 |
|---|
| |
| //########################################################################### | //########################################################################### |
| | |
| CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, | CudaPmeKSpaceCompute::CudaPmeKSpaceCompute(PmeGrid pmeGrid, const int permutation, |
| const int jblock, const int kblock, double kappa, int cuda_arch, int deviceID, cudaStream_t stream) : | const int jblock, const int kblock, double kappa, int deviceID, cudaStream_t stream) : |
| PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa), | PmeKSpaceCompute(pmeGrid, permutation, jblock, kblock, kappa), |
| cuda_arch(cuda_arch), deviceID(deviceID), stream(stream) { | deviceID(deviceID), stream(stream) { |
| | |
| cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| | |
| |
| | |
| int nfft1, nfft2, nfft3; | int nfft1, nfft2, nfft3; |
| float *prefac1, *prefac2, *prefac3; | float *prefac1, *prefac2, *prefac3; |
| float recip1, recip2, recip3; | |
| | |
| Vector recip1v = lattice.a_r(); | BigReal volume = lattice.volume(); |
| Vector recip2v = lattice.b_r(); | Vector a_r = lattice.a_r(); |
| Vector recip3v = lattice.c_r(); | Vector b_r = lattice.b_r(); |
| double recip[9] = {recip1v.x, recip1v.y, recip1v.z, | Vector c_r = lattice.c_r(); |
| recip2v.x, recip2v.y, recip2v.z, | float recip1x, recip1y, recip1z; |
| recip3v.x, recip3v.y, recip3v.z}; | float recip2x, recip2y, recip2z; |
| | float recip3x, recip3y, recip3z; |
| | |
| if (permutation == Perm_Z_cX_Y) { | if (permutation == Perm_Z_cX_Y) { |
| // Z, X, Y | // Z, X, Y |
| |
| prefac1 = d_bm3; | prefac1 = d_bm3; |
| prefac2 = d_bm1; | prefac2 = d_bm1; |
| prefac3 = d_bm2; | prefac3 = d_bm2; |
| recip1 = (float)recip[8]; | recip1x = c_r.z; |
| recip2 = (float)recip[0]; | recip1y = c_r.x; |
| recip3 = (float)recip[4]; | recip1z = c_r.y; |
| | recip2x = a_r.z; |
| | recip2y = a_r.x; |
| | recip2z = a_r.y; |
| | recip3x = b_r.z; |
| | recip3y = b_r.x; |
| | recip3z = b_r.y; |
| } else if (permutation == Perm_cX_Y_Z) { | } else if (permutation == Perm_cX_Y_Z) { |
| // X, Y, Z | // X, Y, Z |
| nfft1 = pmeGrid.K1; | nfft1 = pmeGrid.K1; |
| |
| prefac1 = d_bm1; | prefac1 = d_bm1; |
| prefac2 = d_bm2; | prefac2 = d_bm2; |
| prefac3 = d_bm3; | prefac3 = d_bm3; |
| recip1 = (float)recip[0]; | recip1x = a_r.x; |
| recip2 = (float)recip[4]; | recip1y = a_r.y; |
| recip3 = (float)recip[8]; | recip1z = a_r.z; |
| | recip2x = b_r.x; |
| | recip2y = b_r.y; |
| | recip2z = b_r.z; |
| | recip3x = c_r.x; |
| | recip3y = c_r.y; |
| | recip3z = c_r.z; |
| } else { | } else { |
| NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation"); | NAMD_bug("CudaPmeKSpaceCompute::solve, invalid permutation"); |
| } | } |
| | |
| ortho = (recip[1] == 0.0 && recip[2] == 0.0 && recip[3] == 0.0 && | |
| recip[5] == 0.0 && recip[6] == 0.0 && recip[7] == 0.0); | |
| | |
| if (!ortho) | |
| cudaNAMD_bug("CudaPmeKSpaceCompute::solve, only orthorhombic boxes are currently supported"); | |
| | |
| // ncall++; | // ncall++; |
| // if (ncall == 1) { | // if (ncall == 1) { |
| // char filename[256]; | // char filename[256]; |
| |
| // Clear energy and virial array if needed | // Clear energy and virial array if needed |
| if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream); | if (doEnergyVirial) clear_device_array<EnergyVirial>(d_energyVirial, 1, stream); |
| | |
| scalar_sum_ortho(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa, | scalar_sum(permutation == Perm_cX_Y_Z, nfft1, nfft2, nfft3, size1, size2, size3, kappa, |
| recip1, recip2, recip3, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial, | recip1x, recip1y, recip1z, recip2x, recip2y, recip2z, recip3x, recip3y, recip3z, |
| &d_energyVirial->energy, d_energyVirial->virial, (float2*)data, cuda_arch, stream); | volume, prefac1, prefac2, prefac3, j0, k0, doEnergyVirial, |
| | &d_energyVirial->energy, d_energyVirial->virial, (float2*)data, |
| | stream); |
| | |
| // Copy energy and virial to host if needed | // Copy energy and virial to host if needed |
| if (doEnergyVirial) { | if (doEnergyVirial) { |
| |
| // cudaCheck(cudaStreamSynchronize(stream)); | // cudaCheck(cudaStreamSynchronize(stream)); |
| } | } |
| | |
| // if (ncall == 1) { | |
| // float2* h_data = new float2[size1*size2*size3]; | |
| // float2* d_data = (float2*)data; | |
| // copy_DtoH<float2>(d_data, h_data, size1*size2*size3, stream); | |
| // cudaCheck(cudaStreamSynchronize(stream)); | |
| // FILE *handle = fopen("datafs.txt", "w"); | |
| // for (int i=0;i < size1*size2*size3;i++) { | |
| // fprintf(handle, "%f %f\n", h_data[i].x, h_data[i].y); | |
| // } | |
| // fclose(handle); | |
| // delete [] h_data; | |
| // } | |
| | |
| //fprintf(stderr, "CudaPmeKSpaceCompute::solve()\n"); | |
| } | } |
| | |
| // void CudaPmeKSpaceCompute::waitEnergyAndVirial() { | // void CudaPmeKSpaceCompute::waitEnergyAndVirial() { |
| |
| } | } |
| | |
| void CudaPmeKSpaceCompute::getVirial(double *virial) { | void CudaPmeKSpaceCompute::getVirial(double *virial) { |
| if (ortho) { | |
| if (permutation == Perm_Z_cX_Y) { | if (permutation == Perm_Z_cX_Y) { |
| // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY | // h_energyVirial->virial is storing ZZ, ZX, ZY, XX, XY, YY |
| virial[0] = h_energyVirial->virial[3]; | virial[0] = h_energyVirial->virial[3]; |
| |
| virial[7] = h_energyVirial->virial[4]; | virial[7] = h_energyVirial->virial[4]; |
| virial[8] = h_energyVirial->virial[5]; | virial[8] = h_energyVirial->virial[5]; |
| } | } |
| } else { | |
| NAMD_bug("CudaPmeKSpaceCompute::getVirial, only orthorhombic boxes are currently supported"); | |
| } | |
| } | } |
| | |
| | |
| |
| // Class constructor | // Class constructor |
| // | // |
| CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid, | CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute(PmeGrid pmeGrid, |
| const int jblock, const int kblock, int cuda_arch, int deviceID, cudaStream_t stream) : | const int jblock, const int kblock, int deviceID, cudaStream_t stream) : |
| PmeRealSpaceCompute(pmeGrid, jblock, kblock), cuda_arch(cuda_arch), deviceID(deviceID), stream(stream) { | PmeRealSpaceCompute(pmeGrid, jblock, kblock), deviceID(deviceID), stream(stream) { |
| if (dataSize < xsize*ysize*zsize) | if (dataSize < xsize*ysize*zsize) |
| NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize"); | NAMD_bug("CudaPmeRealSpaceCompute::CudaPmeRealSpaceCompute, insufficient dataSize"); |
| cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| |
| // | // |
| void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) { | void CudaPmeRealSpaceCompute::spreadCharge(Lattice &lattice) { |
| cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| Vector recip1v = lattice.a_r(); | |
| Vector recip2v = lattice.b_r(); | |
| Vector recip3v = lattice.c_r(); | |
| double recip[9] = {recip1v.x, recip1v.y, recip1v.z, | |
| recip2v.x, recip2v.y, recip2v.z, | |
| recip3v.x, recip3v.y, recip3v.z}; | |
| | |
| // Clear grid | // Clear grid |
| clear_device_array<float>(data, xsize*ysize*zsize, stream); | clear_device_array<float>(data, xsize*ysize*zsize, stream); |
| | |
| bool ortho = (recip[1] == 0.0 && recip[2] == 0.0 && recip[3] == 0.0 && | spread_charge((const float4*)d_atoms, numAtoms, |
| recip[5] == 0.0 && recip[6] == 0.0 && recip[7] == 0.0); | pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize, |
| | |
| if (!ortho) { | |
| cudaNAMD_bug("CudaPmeRealSpaceCompute::spreadCharge, only orthorhombic boxes are currently supported"); | |
| } | |
| | |
| float recip1 = (float)recip[0]; | |
| float recip2 = (float)recip[4]; | |
| float recip3 = (float)recip[8]; | |
| | |
| spread_charge_ortho((const float4*)d_atoms, numAtoms, | |
| recip1, recip2, recip3, pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, xsize, ysize, zsize, | |
| xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1), | xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1), |
| data, pmeGrid.order, stream); | data, pmeGrid.order, stream); |
| | |
| |
| } | } |
| | |
| void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) { | void CudaPmeRealSpaceCompute::setupGridTexture(float* data, int data_len) { |
| if (cuda_arch >= 350 || (tex_data == data && tex_data_len == data_len)) return; | if (tex_data == data && tex_data_len == data_len) return; |
| tex_data = data; | tex_data = data; |
| tex_data_len = data_len; | tex_data_len = data_len; |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| // Use texture objects | // Use texture objects |
| cudaResourceDesc resDesc; | cudaResourceDesc resDesc; |
| memset(&resDesc, 0, sizeof(resDesc)); | memset(&resDesc, 0, sizeof(resDesc)); |
| |
| memset(&texDesc, 0, sizeof(texDesc)); | memset(&texDesc, 0, sizeof(texDesc)); |
| texDesc.readMode = cudaReadModeElementType; | texDesc.readMode = cudaReadModeElementType; |
| cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL)); | cudaCheck(cudaCreateTextureObject(&gridTexObj, &resDesc, &texDesc, NULL)); |
| #else | |
| bindGridTexture(data, data_len); | |
| #endif | |
| } | } |
| | |
| void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) { | void CudaPmeRealSpaceCompute::gatherForce(Lattice &lattice, CudaForce* force) { |
| |
| // Re-allocate force array if needed | // Re-allocate force array if needed |
| reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f); | reallocate_device<CudaForce>(&d_force, &d_forceCapacity, numAtoms, 1.5f); |
| | |
| Vector recip1v = lattice.a_r(); | gather_force((const float4*)d_atoms, numAtoms, |
| Vector recip2v = lattice.b_r(); | pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, |
| Vector recip3v = lattice.c_r(); | |
| double recip[9] = {recip1v.x, recip1v.y, recip1v.z, | |
| recip2v.x, recip2v.y, recip2v.z, | |
| recip3v.x, recip3v.y, recip3v.z}; | |
| | |
| bool ortho = (recip[1] == 0.0 && recip[2] == 0.0 && recip[3] == 0.0 && | |
| recip[5] == 0.0 && recip[6] == 0.0 && recip[7] == 0.0); | |
| | |
| if (!ortho) { | |
| cudaNAMD_bug("CudaPmeRealSpaceCompute::gatherForce, only orthorhombic boxes are currently supported"); | |
| } | |
| | |
| float recip1 = (float)recip[0]; | |
| float recip2 = (float)recip[4]; | |
| float recip3 = (float)recip[8]; | |
| | |
| gather_force_ortho((const float4*)d_atoms, numAtoms, | |
| recip1, recip2, recip3, pmeGrid.K1, pmeGrid.K2, pmeGrid.K3, | |
| xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1), | xsize, ysize, zsize, xsize, y0, z0, (pmeGrid.yBlocks == 1), (pmeGrid.zBlocks == 1), |
| data, pmeGrid.order, (float3*)d_force, | data, pmeGrid.order, (float3*)d_force, |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS | |
| gridTexObj, | gridTexObj, |
| #endif | |
| stream); | stream); |
| | |
| copy_DtoH<CudaForce>(d_force, force, numAtoms, stream); | copy_DtoH<CudaForce>(d_force, force, numAtoms, stream); |