// namd/doxygen/ComputeSMDCUDAKernel_8cu_source.html

#ifdef NAMD_CUDA
#if __CUDACC_VER_MAJOR__ >= 11
#include <cub/cub.cuh>
#else
#include <namd_cub/cub.cuh>
#endif
#endif

#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#define cub hipcub
#endif

#include "ComputeSMDCUDAKernel.h"
#include "ComputeCOMCudaKernel.h"
#include "HipDefines.h"

#ifdef NODEGROUP_FORCE_REGISTER


/*! Calculate SMD force and virial for large atom group (numSMDAtoms > 1024)
  Multiple thread blocks are launched to do this operation.
  The current COM (curCOM) must be calculated and passed to this function:
  it is read from h_curCOM, or, when T_MGPUON is set, taken as
  d_curCOM * inv_group_mass.

  Template flags:
    T_DOENERGY  also compute the restraint energy and the virial, accumulate
                the virial into d_virial, and let the last finishing block
                write h_extEnergy, h_extForce and h_extVirial (d_virial is
                zeroed afterwards).
    T_MGPUON    take the COM from d_curCOM; the last finishing block writes
                it to h_curCOM and zeroes d_curCOM.

  Each thread handles the atom smdAtomsSOAIndex[tid] (1D launch).
  The T_DOENERGY path uses cub::BlockReduce<double, 128>, so the kernel is
  presumably meant to be launched with 128 threads per block.
  tbcatomic[0] counts finished blocks and is reset to 0 by the last one.
  d_peerCOM is not referenced by this kernel. */
template<bool T_DOENERGY, bool T_MGPUON>
__global__ void computeSMDForceWithCOMKernel(
  const int                numSMDAtoms,
  const Lattice            lat,
  const double             inv_group_mass,
  const double             k,
  const double             k2,
  const double             velocity,
  const double3            direction,
  const int                currentTime,
  const double3            origCM,
  const float *  __restrict mass,
  const double*  __restrict pos_x,
  const double*  __restrict pos_y,
  const double*  __restrict pos_z,
  const char3*   __restrict transform,
  double*        __restrict f_normal_x,
  double*        __restrict f_normal_y,
  double*        __restrict f_normal_z,
  const int*     __restrict smdAtomsSOAIndex,
  cudaTensor*    __restrict d_virial,
  double3*       __restrict h_curCOM,
  double3*       __restrict d_curCOM,
  double3**      __restrict d_peerCOM,
  double*        __restrict h_extEnergy,
  double3*       __restrict h_extForce,
  cudaTensor*    __restrict h_extVirial,
  unsigned int*  __restrict tbcatomic)
{
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  // only ever set by thread 0 of a block; stays false in all other threads
  bool isLastBlockDone = 0;
  double3 group_f = {0, 0, 0};
  double energy = 0.0;
  double3 pos = {0, 0, 0};
  double3 f = {0, 0, 0};
  cudaTensor r_virial;
  // default: COM as passed in through h_curCOM
  double3 cm={h_curCOM->x, h_curCOM->y, h_curCOM->z};
  // per-thread virial contribution; stays zero for threads without an atom
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
  int SOAindex;

  // multi-GPU: d_curCOM is scaled by inv_group_mass to obtain the COM
  if(T_MGPUON)
    {
      cm.x = d_curCOM->x * inv_group_mass;
      cm.y = d_curCOM->y * inv_group_mass;
      cm.z = d_curCOM->z * inv_group_mass;
    }

  if(tid < numSMDAtoms){
    SOAindex = smdAtomsSOAIndex[tid];

    // uncoalesced memory access: too bad!
    double m = mass[SOAindex]; // Cast from float to double here
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // calculate the distance difference along direction
    double3 diffCOM;
    diffCOM.x = cm.x - origCM.x;
    diffCOM.y = cm.y - origCM.y;
    diffCOM.z = cm.z - origCM.z;
    double diff = diffCOM.x*direction.x + diffCOM.y*direction.y +
      diffCOM.z*direction.z;

    // Ok so we've calculated the new center of mass, now we can calculate the bias
    // (every thread recomputes the same group force from the same inputs)
    double preFactor = (velocity*currentTime - diff);
    group_f.x = k*preFactor*direction.x + k2*(diff*direction.x - diffCOM.x);
    group_f.y = k*preFactor*direction.y + k2*(diff*direction.y - diffCOM.y);
    group_f.z = k*preFactor*direction.z + k2*(diff*direction.z - diffCOM.z);

    // calculate the force on each atom (group force weighted by m / group mass)
    f.x = group_f.x * m * inv_group_mass;
    f.y = group_f.y * m * inv_group_mass;
    f.z = group_f.z * m * inv_group_mass;

    // apply the bias
    f_normal_x[SOAindex] += f.x ;
    f_normal_y[SOAindex] += f.y ;
    f_normal_z[SOAindex] += f.z ;
    if(T_DOENERGY){
      // energy for restraint along the direction
      energy = 0.5*k*preFactor*preFactor;
      // energy for transverse restraint
      energy += 0.5*k2*(diffCOM.x*diffCOM.x + diffCOM.y*diffCOM.y +
        diffCOM.z*diffCOM.z - diff*diff);
      // unwrap coordinates before calculating the virial
      char3 t = transform[SOAindex];
      pos = lat.reverse_transform(pos, t);
      // per-atom virial: outer product of force and unwrapped position
      r_virial.xx = f.x * pos.x;
      r_virial.xy = f.x * pos.y;
      r_virial.xz = f.x * pos.z;
      r_virial.yx = f.y * pos.x;
      r_virial.yy = f.y * pos.y;
      r_virial.yz = f.y * pos.z;
      r_virial.zx = f.z * pos.x;
      r_virial.zy = f.z * pos.y;
      r_virial.zz = f.z * pos.z;
    }
  }
  __syncthreads();

  if(T_DOENERGY){
    typedef cub::BlockReduce<double, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // One block-wide sum per tensor component. temp_storage is reused, so a
    // barrier separates consecutive reductions.
    r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
    __syncthreads();
    r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
    __syncthreads();
    r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
    __syncthreads();

    r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
    __syncthreads();
    r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
    __syncthreads();
    r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
    __syncthreads();

    r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
    __syncthreads();
    r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
    __syncthreads();
    r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
    __syncthreads();

    if(threadIdx.x == 0){
      // thread 0 adds this block's partial virial to the device accumulator
      atomicAdd(&(d_virial->xx), r_virial.xx);
      atomicAdd(&(d_virial->xy), r_virial.xy);
      atomicAdd(&(d_virial->xz), r_virial.xz);

      atomicAdd(&(d_virial->yx), r_virial.yx);
      atomicAdd(&(d_virial->yy), r_virial.yy);
      atomicAdd(&(d_virial->yz), r_virial.yz);

      atomicAdd(&(d_virial->zx), r_virial.zx);
      atomicAdd(&(d_virial->zy), r_virial.zy);
      atomicAdd(&(d_virial->zz), r_virial.zz);

      // fence, then count this block as finished; the block that reads back
      // totaltb - 1 is the last one to get here
      __threadfence();
      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
      isLastBlockDone = (value == (totaltb -1));
    }

    __syncthreads();
    // Last block will set the host values
    if(isLastBlockDone){
      if(threadIdx.x == 0){
        // energy and group_f are this thread's own copies computed above
        h_extEnergy[0] = energy;
        h_extForce->x  = group_f.x;
        h_extForce->y  = group_f.y;
        h_extForce->z  = group_f.z;

        // publish the accumulated virial
        h_extVirial->xx = d_virial->xx;
        h_extVirial->xy = d_virial->xy;
        h_extVirial->xz = d_virial->xz;
        h_extVirial->yx = d_virial->yx;
        h_extVirial->yy = d_virial->yy;
        h_extVirial->yz = d_virial->yz;
        h_extVirial->zx = d_virial->zx;
        h_extVirial->zy = d_virial->zy;
        h_extVirial->zz = d_virial->zz;
        //reset the device virial value
        d_virial->xx = 0;
        d_virial->xy = 0;
        d_virial->xz = 0;

        d_virial->yx = 0;
        d_virial->yy = 0;
        d_virial->yz = 0;

        d_virial->zx = 0;
        d_virial->zy = 0;
        d_virial->zz = 0;
      }
    }
  }
  else
    { // compute isLastBlockDone in the non energy steps
      if(threadIdx.x == 0){
   __threadfence();
   unsigned int value = atomicInc(&tbcatomic[0], totaltb);
   isLastBlockDone = (value == (totaltb -1));
      }
    }
  __syncthreads();
  // final cleanup, done by thread 0 of the last block in both energy and
  // non-energy steps
  if(isLastBlockDone){
    if(threadIdx.x == 0){
      if(T_MGPUON){
   // publish the COM that was used and clear d_curCOM
   h_curCOM->x = cm.x;
   h_curCOM->y = cm.y;
   h_curCOM->z = cm.z;
   d_curCOM->x = 0.0;
   d_curCOM->y = 0.0;
   d_curCOM->z = 0.0;
      }
      //resets atomic counter
      tbcatomic[0] = 0;
      __threadfence();
    }
  }
}


/*! Calculate SMD force, virial, and COM for small atom group (numSMDAtoms <= 1024)
  A single thread block is launched to do this operation.
  When T_MGPUON is false the current COM is calculated here with a block
  reduction; when it is true the COM is taken as d_curCM * inv_group_mass.

  Template flags:
    T_DOENERGY  also compute the restraint energy and the virial. Only in
                this case are h_curCM, h_extEnergy, h_extForce and
                h_extVirial written.
    T_MGPUON    skip the COM reduction, use d_curCM instead, and afterwards
                zero d_curCM and reset tbcatomic[0].

  The reductions use cub::BlockReduce<double, 1024>, so the kernel is
  presumably meant to be launched with 1024 threads in the block.
  tbcatomic is only touched when T_MGPUON is set.
  d_peerCOM is not referenced by this kernel. */
template<bool T_DOENERGY, bool T_MGPUON>
__global__ void computeSMDForceKernel(
  const int                numSMDAtoms,
  const Lattice            lat,
  const double             inv_group_mass,
  const double             k,
  const double             k2,
  const double             velocity,
  const double3            direction,
  const int                currentTime,
  const double3            origCM,
  const float * __restrict mass,
  const double* __restrict pos_x,
  const double* __restrict pos_y,
  const double* __restrict pos_z,
  const char3*  __restrict transform,
  double*       __restrict f_normal_x,
  double*       __restrict f_normal_y,
  double*       __restrict f_normal_z,
  const int*    __restrict smdAtomsSOAIndex,
  double3*      __restrict h_curCM,
  double3*      __restrict d_curCM,
  double3**     __restrict d_peerCOM,
  double*       __restrict h_extEnergy,
  double3*      __restrict h_extForce,
  cudaTensor*   __restrict h_extVirial,
  unsigned int*  __restrict tbcatomic)
{
  // written by thread 0, read by every thread after the barrier below
  __shared__ double3 group_f;
  __shared__ double energy;
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int totaltb = gridDim.x;
  // only ever set by thread 0 (and only when T_MGPUON)
  bool isLastBlockDone = 0;
  double m = 0;
  double3 cm = {0, 0, 0};
  double3 pos = {0, 0, 0};
  double3 f = {0, 0, 0};
  cudaTensor r_virial;
  // per-thread contributions; stay zero for threads without an atom
  r_virial.xx = 0.0; r_virial.xy = 0.0; r_virial.xz = 0.0;
  r_virial.yx = 0.0; r_virial.yy = 0.0; r_virial.yz = 0.0;
  r_virial.zx = 0.0; r_virial.zy = 0.0; r_virial.zz = 0.0;
  int SOAindex;
  // in the mGpuOn case the COM must be calculated across devices and passed in
  if(tid < numSMDAtoms){
    // First -> gather this thread's atom for the center of mass sum.
    // Every thread with an atom does this; thread zero finishes the COM
    // after the reduction below.
    SOAindex = smdAtomsSOAIndex[tid];
    m = mass[SOAindex]; // Cast from float to double here
    pos.x = pos_x[SOAindex];
    pos.y = pos_y[SOAindex];
    pos.z = pos_z[SOAindex];

    // unwrap the  coordinate to calculate COM
    // (the unwrapped position is also reused for the virial further down)
    char3 t = transform[SOAindex];
    pos = lat.reverse_transform(pos, t);
    if(!T_MGPUON)
      {
   // mass-weighted position: this thread's term of the COM sum
   cm.x = pos.x * m;
   cm.y = pos.y * m;
   cm.z = pos.z * m;
      }
  }
  // now reduce the values and add it to thread zero
  typedef cub::BlockReduce<double, 1024> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  if(!T_MGPUON){
    // temp_storage is reused, so a barrier separates consecutive reductions
    cm.x = BlockReduce(temp_storage).Sum(cm.x);
    __syncthreads();
    cm.y = BlockReduce(temp_storage).Sum(cm.y);
    __syncthreads();
    cm.z = BlockReduce(temp_storage).Sum(cm.z);
    __syncthreads();
  }
  // Calculate group force and acceleration
  if(threadIdx.x == 0){
    if(T_MGPUON)
      {
   // COM comes from d_curCM, scaled by the inverse group mass
   cm.x = d_curCM->x * inv_group_mass;
   cm.y = d_curCM->y * inv_group_mass;
   cm.z = d_curCM->z * inv_group_mass;
      }
    else
      {
   cm.x *= inv_group_mass; // calculates the current center of mass
   cm.y *= inv_group_mass; // calculates the current center of mass
   cm.z *= inv_group_mass; // calculates the current center of mass
      }
    // calculate the distance difference along direction
    double3 diffCOM;
    diffCOM.x = cm.x - origCM.x;
    diffCOM.y = cm.y - origCM.y;
    diffCOM.z = cm.z - origCM.z;
    double diff = diffCOM.x*direction.x + diffCOM.y*direction.y +
      diffCOM.z*direction.z;

    // Ok so we've calculated the new center of mass, now we can calculate the bias
    double preFactor = (velocity*currentTime - diff);
    group_f.x = k*preFactor*direction.x + k2*(diff*direction.x - diffCOM.x);
    group_f.y = k*preFactor*direction.y + k2*(diff*direction.y - diffCOM.y);
    group_f.z = k*preFactor*direction.z + k2*(diff*direction.z - diffCOM.z);
    if(T_DOENERGY) {
      // energy for restraint along the direction
      energy = 0.5*k*preFactor*preFactor;
      // energy for transverse restraint
      energy += 0.5*k2*(diffCOM.x*diffCOM.x + diffCOM.y*diffCOM.y +
        diffCOM.z*diffCOM.z - diff*diff);
    }
  }
  // make thread 0's shared group_f / energy visible to the whole block
  __syncthreads();

  if(tid < numSMDAtoms){
    // calculate the force on each atom (group force weighted by m / group mass)
    f.x = group_f.x * m * inv_group_mass;
    f.y = group_f.y * m * inv_group_mass;
    f.z = group_f.z * m * inv_group_mass;

    // apply the bias
    f_normal_x[SOAindex] += f.x ;
    f_normal_y[SOAindex] += f.y ;
    f_normal_z[SOAindex] += f.z ;
    if(T_DOENERGY){
      // per-atom virial: outer product of force and unwrapped position
      r_virial.xx = f.x * pos.x;
      r_virial.xy = f.x * pos.y;
      r_virial.xz = f.x * pos.z;
      r_virial.yx = f.y * pos.x;
      r_virial.yy = f.y * pos.y;
      r_virial.yz = f.y * pos.z;
      r_virial.zx = f.z * pos.x;
      r_virial.zy = f.z * pos.y;
      r_virial.zz = f.z * pos.z;
    }
  }
  if(T_MGPUON){
    if(threadIdx.x == 0){
      // count this block as finished; the block that reads back
      // totaltb - 1 is the last one
      __threadfence();
      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
      isLastBlockDone = (value == (totaltb -1));
    }
  }
  if(T_DOENERGY){
    // block-wide sum of each virial component, reusing temp_storage
    r_virial.xx = BlockReduce(temp_storage).Sum(r_virial.xx);
    __syncthreads();
    r_virial.xy = BlockReduce(temp_storage).Sum(r_virial.xy);
    __syncthreads();
    r_virial.xz = BlockReduce(temp_storage).Sum(r_virial.xz);
    __syncthreads();

    r_virial.yx = BlockReduce(temp_storage).Sum(r_virial.yx);
    __syncthreads();
    r_virial.yy = BlockReduce(temp_storage).Sum(r_virial.yy);
    __syncthreads();
    r_virial.yz = BlockReduce(temp_storage).Sum(r_virial.yz);
    __syncthreads();

    r_virial.zx = BlockReduce(temp_storage).Sum(r_virial.zx);
    __syncthreads();
    r_virial.zy = BlockReduce(temp_storage).Sum(r_virial.zy);
    __syncthreads();
    r_virial.zz = BlockReduce(temp_storage).Sum(r_virial.zz);
    __syncthreads();

    if(threadIdx.x == 0){
      // thread zero updates the value
      // NOTE(review): h_curCM is only refreshed on energy steps - confirm
      // that callers do not need it on other steps
      h_curCM->x = cm.x; // update current center of mass
      h_curCM->y = cm.y; // update current center of mass
      h_curCM->z = cm.z; // update current center of mass
      h_extEnergy[0] = energy;    // bias energy
      h_extForce->x  = group_f.x; // bias force
      h_extForce->y  = group_f.y;
      h_extForce->z  = group_f.z;

      // reduced virial tensor
      h_extVirial->xx = r_virial.xx;
      h_extVirial->xy = r_virial.xy;
      h_extVirial->xz = r_virial.xz;
      h_extVirial->yx = r_virial.yx;
      h_extVirial->yy = r_virial.yy;
      h_extVirial->yz = r_virial.yz;
      h_extVirial->zx = r_virial.zx;
      h_extVirial->zy = r_virial.zy;
      h_extVirial->zz = r_virial.zz;
    }
  }
  // last block cleans up
  if(T_MGPUON) {
    if(isLastBlockDone){
      if(threadIdx.x == 0){
   // zero out for next iteration
   d_curCM->x = 0.0;
   d_curCM->y = 0.0;
   d_curCM->z = 0.0;
   //resets atomic counter
   tbcatomic[0] = 0;
   __threadfence();
      }
    }
  }
}

/*! Compute SMD force and virial on group of atoms.
  Host-side launcher. Groups of up to 1024 atoms go through the single-block
  computeSMDForceKernel; larger groups go through
  computeSMDForceWithCOMKernel with 128-thread blocks. A COM pre-pass is
  enqueued first where needed, and the force kernel is instantiated for the
  requested doEnergy / mGpuOn combination. Every launch is enqueued on
  `stream`. */
void computeSMDForce(
  const Lattice     &lat,
  const double      inv_group_mass,
  const double      spring_constant,
  const double      transverse_spring_constant,
  const double      velocity,
  const double3     direction,
  const int         doEnergy,
  const int         currentTime,
  const bool        mGpuOn,
  const double3     origCM,
  const float*      d_mass,
  const double*     d_pos_x,
  const double*     d_pos_y,
  const double*     d_pos_z,
  const char3*      d_transform,
  double *          d_f_normal_x,
  double *          d_f_normal_y,
  double *          d_f_normal_z,
  const int         numSMDAtoms,
  const int*        d_smdAtomsSOAIndex,
  double3*          d_curCM,
  double3*          h_curCM,
  double3**         d_peerCOM,
  cudaTensor*       d_extVirial,
  double*           h_extEnergy,
  double3*          h_extForce,
  cudaTensor*       h_extVirial,
  unsigned int*     d_tbcatomic,
  const int         numDevices,
  const int         deviceIndex,
  cudaStream_t      stream)
{
  // Launch geometry: a single 1024-thread block for small groups, otherwise
  // enough 128-thread blocks to cover every SMD atom.
  const bool largeGroup = (numSMDAtoms > 1024);
  const int  nThreads   = largeGroup ? 128 : 1024;
  const int  nBlocks    = largeGroup ? (numSMDAtoms + nThreads - 1) / nThreads : 1;
  const bool wantEnergy = (doEnergy != 0);

  // Multi-block force kernel; expects the COM to have been produced already.
#define CALL_WITH_COM(DOENERGY, MGPUON) \
  computeSMDForceWithCOMKernel<DOENERGY, MGPUON> \
    <<<nBlocks, nThreads, 0, stream>>>( \
      numSMDAtoms, lat, inv_group_mass, spring_constant, \
      transverse_spring_constant, velocity, direction, currentTime, \
      origCM, d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform, \
      d_f_normal_x, d_f_normal_y, d_f_normal_z, d_smdAtomsSOAIndex, \
      d_extVirial, h_curCM, d_curCM, d_peerCOM, h_extEnergy, h_extForce, \
      h_extVirial, d_tbcatomic)

  // Single-block force kernel.
#define CALL(DOENERGY, MGPUON) \
  computeSMDForceKernel<DOENERGY, MGPUON> \
    <<<nBlocks, nThreads, 0, stream>>>( \
      numSMDAtoms, lat, inv_group_mass, spring_constant, \
      transverse_spring_constant, velocity, direction, currentTime, \
      origCM, d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform, \
      d_f_normal_x, d_f_normal_y, d_f_normal_z, d_smdAtomsSOAIndex, \
      h_curCM, d_curCM, d_peerCOM, h_extEnergy, h_extForce, h_extVirial, \
      d_tbcatomic)

  // COM pre-pass.
  if (mGpuOn) {
    // sum up the COMs across devices to this device (both group sizes)
    computeDistCOMKernelMgpu<<<nBlocks, nThreads, 0, stream>>>(
      d_peerCOM, d_curCM, numDevices);
  } else if (largeGroup) {
    // first calculate the COM for SMD group and store it in h_curCM;
    // the small-group kernel computes its own COM, so it needs no pre-pass
    computeCOMKernel<128><<<nBlocks, nThreads, 0, stream>>>(
      numSMDAtoms, inv_group_mass, lat,
      d_mass, d_pos_x, d_pos_y, d_pos_z,
      d_transform, d_smdAtomsSOAIndex,
      d_curCM, h_curCM, d_tbcatomic);
  }

  // Force kernel: exactly one template instantiation is launched.
  if (largeGroup) {
    if (wantEnergy) {
      if (mGpuOn) { CALL_WITH_COM(true, true); }
      else        { CALL_WITH_COM(true, false); }
    } else {
      if (mGpuOn) { CALL_WITH_COM(false, true); }
      else        { CALL_WITH_COM(false, false); }
    }
  } else {
    if (wantEnergy) {
      if (mGpuOn) { CALL(true, true); }
      else        { CALL(true, false); }
    } else {
      if (mGpuOn) { CALL(false, true); }
      else        { CALL(false, false); }
    }
  }
#undef CALL_WITH_COM
#undef CALL
}


/*! Enqueue initPeerCOMKernel on `stream` as a single block containing
  numDevices threads, forwarding all arguments unchanged. */
void initPeerCOMmgpu(
            const int numDevices,
            const int deviceIndex,
            double3** d_peerPool,
            double3*  d_peerCOM,
            cudaStream_t      stream)
{
  // one block; its thread count equals the number of devices
  const int numBlocks = 1;
  const int threadsPerBlock = numDevices;
  initPeerCOMKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
    numDevices, deviceIndex, d_peerPool, d_peerCOM);
}


/* called in earlier phase to handle multi device COM

   Zeroes d_peerCOM and then launches computeCOMKernelMgpu on `stream`:
   128-thread blocks covering all atoms when numSMDAtoms > 1024, otherwise a
   single 1024-thread block.

   The clear is enqueued with cudaMemsetAsync on `stream`, so it is ordered
   before the kernel whatever the stream's flags are. A plain cudaMemset runs
   on the default stream, which is not ordered against a non-blocking
   `stream` and otherwise forces an implicit synchronization.

   NOTE(review): d_peerCOM is presumably the buffer the kernel accumulates
   into through d_peer_curCM[deviceIndex] - confirm against the caller. */
void computeCOMSMDMgpu(
  const int         numSMDAtoms,
  const Lattice     &lat,
  const float*      d_mass,
  const double*     d_pos_x,
  const double*     d_pos_y,
  const double*     d_pos_z,
  const char3*      d_transform,
  const int*        d_smdAtomsSOAIndex,
  double3*          d_peerCOM,
  double3**         d_peer_curCM,
  unsigned int*     d_tbcatomic,
  const int         numDevices,
  const int         deviceIndex,
  cudaStream_t      stream)
{
  // block it up if large, otherwise all in one go
  const int blocks = (numSMDAtoms > 1024) ? 128 : 1024;
  const int grid = (numSMDAtoms > 1024) ? (numSMDAtoms + blocks - 1) / blocks : 1;
  // initialize the device memory to zero here, on the same stream as the
  // kernel so the clear is ordered before the launch
  cudaCheck(cudaMemsetAsync(d_peerCOM, 0, sizeof(double3), stream));
  if(numSMDAtoms >1024)
    // template argument matches the 128-thread block size chosen above
    computeCOMKernelMgpu<128><<<grid, blocks, 0, stream>>>(numSMDAtoms,
                                                      lat, d_mass,
                                                      d_pos_x, d_pos_y, d_pos_z,
                                                      d_transform,
                                                      d_smdAtomsSOAIndex,
                                                      d_peer_curCM,
                                                      numDevices,
                                                      deviceIndex,
                                                      d_tbcatomic);
  else
    // single block of 1024 threads
    computeCOMKernelMgpu<1024><<<grid, blocks, 0, stream>>>(numSMDAtoms,
                                                 lat, d_mass,
                                                 d_pos_x, d_pos_y, d_pos_z,
                                                 d_transform,
                                                 d_smdAtomsSOAIndex,
                                                 d_peer_curCM,
                                                 numDevices,
                                                 deviceIndex,
                                                 d_tbcatomic);
}

#endif // NODEGROUP_FORCE_REGISTER