#include <hipcub/hipcub.hpp>
#if __CUDACC_VER_MAJOR__ >= 11
#include <namd_cub/cub.cuh>
#include "ComputeGridForceCUDAKernel.h"
// CUDA needs everything in one compilation unit.
#include "GridforceGridCUDAKernel.h"

#ifdef NODEGROUP_FORCE_REGISTER

template<int T_DOENERGY, int T_DOVIRIAL, int BLOCKS>
__global__ void computeGridForceKernel(
  const GridforceGridCUDA& grid,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* __restrict gridForcedAtomIdxArr,
  const int* __restrict gridForcedAtomIdxMap,
  const float* __restrict d_gridded_charge,
  const float* __restrict d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* __restrict d_tbcatomic
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
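  // One thread handles one gridded atom; totaltb (the number of thread blocks
  // in this launch) is kept for the last-block check on d_tbcatomic below.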
  g_virial.xx = 0.0; g_virial.xy = 0.0; g_virial.xz = 0.0;
  g_virial.yx = 0.0; g_virial.yy = 0.0; g_virial.yz = 0.0;
  g_virial.zx = 0.0; g_virial.zy = 0.0; g_virial.zz = 0.0;

  Vector gfScale = grid.get_scale();
  // Wrap coordinates using the grid center.
  // FIXME: Going back to struct form for the position is suboptimal, but it
  // is a shortcut to something usable without rewriting all of the gridforce
  // code; otherwise we would have to SoA-ify both wrap_position and the
  // chain of functions under compute_VdV. That is a rat's nest of
  // computational code that would probably be better rewritten from scratch,
  // since performance does not appear to have been the overriding priority
  // in the original implementation. This modest violation of best practices
  // at least lets us compute all of the grid updates on the device, without
  // touching the host side between migration steps. Even using the GPU badly
  // is likely to be orders of magnitude faster than syncing back to the host
  // every step, as the current host-side scheme does.
  if(tid < numGriddedAtoms)
    int indexG = gridForcedAtomIdxMap[tid];
    int soaID = gridForcedAtomIdxArr[indexG];
    Position pos_i = Position(d_pos_x[soaID], d_pos_y[soaID], d_pos_z[soaID]);
    // about 5x faster, but the energy and forces differ slightly, likely
    // because the gapscale factor around the periodic boundary is not handled
    //#define USE_INTERPOLATE_FORCE_D
#ifdef USE_INTERPOLATE_FORCE_D
    Position pos_wrapped = grid.wrap_position(pos_i, lat);
    ForceEnergy gridForceEnergy = grid.interpolateForceD(pos_wrapped);
    f_x = gridForceEnergy.force.x * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    f_y = gridForceEnergy.force.y * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y;
    f_z = gridForceEnergy.force.z * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z;
    if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
      energy = gridForceEnergy.energy * d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x;
    // Evaluate the grid potential V and its gradient dV at the wrapped
    // position, then derive the force from them.
    Position pos = grid.wrap_position(pos_i, lat);
    int err = grid.compute_VdV(pos, Vval, dV);
    // Force force = scale * Tensor::diagonal(gfScale) * (-charge * dV);
    f_x = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.x * dV.x;
    f_y = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.y * dV.y;
    f_z = -d_gridded_charge[indexG] * d_gridded_scale[indexG] * gfScale.z * dV.z;
    if(T_DOENERGY && gfScale.x == gfScale.y && gfScale.x == gfScale.z)
      // only makes sense when scaling is isotropic
      energy += d_gridded_scale[indexG] * gfScale.x *
        (d_gridded_charge[indexG] * Vval);
#endif // use force interpolation
#ifdef ATOMIC_ADD_GRID_FORCES
    // If the grids run in different streams, they could race on a given
    // atom's force whenever an atom belongs to multiple grids. In practice
    // we run them in the same stream and grids usually do not overlap, so
    // the atomics are not necessary.
    // FYI: the performance difference is barely measurable.
    atomicAdd(&d_f_normal_x[soaID], f_x);
    atomicAdd(&d_f_normal_y[soaID], f_y);
    atomicAdd(&d_f_normal_z[soaID], f_z);
#else
    d_f_normal_x[soaID] += f_x;
    d_f_normal_y[soaID] += f_y;
    d_f_normal_z[soaID] += f_z;
#endif // ATOMIC_ADD_GRID_FORCES

    char3 t = d_transform[soaID];
    Position vpos = lat.reverse_transform(pos_i, t);
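    // Per-atom virial contribution: the outer product of the applied force
    // with the unwrapped (reverse-transformed) position, g_virial_ab = f_a * r_b.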
    g_virial.xx = f_x * vpos.x;
    g_virial.xy = f_x * vpos.y;
    g_virial.xz = f_x * vpos.z;
    g_virial.yx = f_y * vpos.x;
    g_virial.yy = f_y * vpos.y;
    g_virial.yz = f_y * vpos.z;
    g_virial.zx = f_z * vpos.x;
    g_virial.zy = f_z * vpos.y;
    g_virial.zz = f_z * vpos.z;

  // Reduce energy and virials
  typedef cub::BlockReduce<double, BLOCKS> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
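  // The same temp_storage is reused by each successive Sum() below; CUB
  // requires a __syncthreads() barrier between reductions that reuse it.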
  energy = BlockReduce(temp_storage).Sum(energy);
  g_netForce.x = BlockReduce(temp_storage).Sum(g_netForce.x);
  g_netForce.y = BlockReduce(temp_storage).Sum(g_netForce.y);
  g_netForce.z = BlockReduce(temp_storage).Sum(g_netForce.z);
  g_virial.xx = BlockReduce(temp_storage).Sum(g_virial.xx);
  g_virial.xy = BlockReduce(temp_storage).Sum(g_virial.xy);
  g_virial.xz = BlockReduce(temp_storage).Sum(g_virial.xz);
  g_virial.yx = BlockReduce(temp_storage).Sum(g_virial.yx);
  g_virial.yy = BlockReduce(temp_storage).Sum(g_virial.yy);
  g_virial.yz = BlockReduce(temp_storage).Sum(g_virial.yz);
  g_virial.zx = BlockReduce(temp_storage).Sum(g_virial.zx);
  g_virial.zy = BlockReduce(temp_storage).Sum(g_virial.zy);
  g_virial.zz = BlockReduce(temp_storage).Sum(g_virial.zz);
  if(threadIdx.x == 0){
    atomicAdd(d_netEnergy, energy);
    atomicAdd(&(d_netForce->x), g_netForce.x);
    atomicAdd(&(d_netForce->y), g_netForce.y);
    atomicAdd(&(d_netForce->z), g_netForce.z);
    atomicAdd(&(d_virial->xx), g_virial.xx);
    atomicAdd(&(d_virial->xy), g_virial.xy);
    atomicAdd(&(d_virial->xz), g_virial.xz);
    atomicAdd(&(d_virial->yx), g_virial.yx);
    atomicAdd(&(d_virial->yy), g_virial.yy);
    atomicAdd(&(d_virial->yz), g_virial.yz);
    atomicAdd(&(d_virial->zx), g_virial.zx);
    atomicAdd(&(d_virial->zy), g_virial.zy);
    atomicAdd(&(d_virial->zz), g_virial.zz);
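    // Last-block detection: atomicInc returns the previous counter value, so
    // the block that observes totaltb - 1 knows every other block has finished
    // its atomics and takes the branch below that publishes the totals.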
    unsigned int value = atomicInc(d_tbcatomic, totaltb);
    isLastBlockDone = (value == (totaltb - 1));
    if(threadIdx.x == 0){
      // Publish the final totals to host-mapped memory so the host can read
      // them without a separate device-to-host copy.
      h_netEnergy[0] = d_netEnergy[0];
      h_netForce->x = d_netForce->x;
      h_netForce->y = d_netForce->y;
      h_netForce->z = d_netForce->z;
      h_virial->xx = d_virial->xx;
      h_virial->xy = d_virial->xy;
      h_virial->xz = d_virial->xz;
      h_virial->yx = d_virial->yx;
      h_virial->yy = d_virial->yy;
      h_virial->yz = d_virial->yz;
      h_virial->zx = d_virial->zx;
      h_virial->zy = d_virial->zy;
      h_virial->zz = d_virial->zz;
void computeGridForce(
  const GridforceGridCUDA& theGrid,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const char3* __restrict d_transform,
  const int* gridForcedAtomIdxArr,
  const int* gridForcedAtomIdxMap,
  const float* d_gridded_charge,
  const float* d_gridded_scale,
  double* __restrict d_f_normal_x,
  double* __restrict d_f_normal_y,
  double* __restrict d_f_normal_z,
  double3* __restrict h_netForce,
  double3* __restrict d_netForce,
  double* __restrict h_netEnergy,
  double* __restrict d_netEnergy,
  cudaTensor* __restrict h_virial,
  cudaTensor* __restrict d_virial,
  const int numGriddedAtoms,
  unsigned int* d_tbcatomic,
  // Most grids are not that big, so their atoms can go in one block.
  // for (int idx = 0; idx < numGriddedAtoms; idx++)
  // TODO: use the block to replace the loop
  int options = doEnergy + (doVirial << 1);
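  // options packs the runtime flags into a template selector: bit 0 = doEnergy,
  // bit 1 = doVirial, so the switch below picks one of the four
  // computeGridForceKernel<doEnergy, doVirial, blocks> instantiations.
  // For illustration, options == 3 dispatches roughly as
  //   computeGridForceKernel<1, 1, blocks><<<grid, blocks, 0, stream>>>(theGrid, ...);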
  if(numGriddedAtoms > 1024)
    const int blocks = 128;
    const int grid = (numGriddedAtoms + blocks - 1) / blocks;

#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>(theGrid, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \

      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;
  { // relatively few gridded atoms, so they all fit in one thread block
    const int blocks = 1024;
#define CALL_DO_CALC(DOENERGY, DOVIRIAL) \
    computeGridForceKernel<DOENERGY, DOVIRIAL, blocks> \
      <<<grid, blocks, 0, stream>>>(theGrid, \
        gridForcedAtomIdxArr, \
        gridForcedAtomIdxMap, \

      case 0: CALL_DO_CALC(0, 0); break;
      case 1: CALL_DO_CALC(1, 0); break;
      case 2: CALL_DO_CALC(0, 1); break;
      case 3: CALL_DO_CALC(1, 1); break;

#endif // NODEGROUP_FORCE_REGISTER