
CUDASpatialSearch.cu

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDASpatialSearch.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.9 $      $Date: 2020/02/26 19:26:47 $
00014  *
00015  ***************************************************************************/
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <cuda.h>
00026 
00027 #if CUDART_VERSION < 4000
00028 #error The VMD MDFF feature requires CUDA 4.0 or later
00029 #endif
00030 
00031 #include "Inform.h"
00032 #include "utilities.h"
00033 #include "WKFThreads.h"
00034 #include "WKFUtils.h"
00035 #include "CUDAKernels.h" 
00036 #include "CUDASpatialSearch.h"
00037 #include "CUDASort.h"
00038 
00039 #include "AtomSel.h"
00040 #include "VMDApp.h"
00041 #include "MoleculeList.h"
00042 #include "VolumetricData.h"
00043 #include "VolMapCreate.h" // volmap_write_dx_file()
00044 
00045 #include <tcl.h>
00046 #include "TclCommands.h"
00047 
00048 #if 1
00049 #define CUERR { cudaError_t err; \
00050   if ((err = cudaGetLastError()) != cudaSuccess) { \
00051   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00052   printf("Thread aborting...\n"); \
00053   return NULL; }}
00054 #else
00055 #define CUERR
00056 #endif
00057 
00058 
00059 //
00060 // Restrict macro to make it easy to do perf tuning tests
00061 //
00062 #if 0
00063 #define RESTRICT __restrict__
00064 #else
00065 #define RESTRICT
00066 #endif
00067 
00068 #if __CUDA_ARCH__ >= 300
00069 #define MAXTHRSORTHASH 512
00070 #define MINBLOCKSORTHASH 4
00071 #elif __CUDA_ARCH__ >= 200
00072 #define MAXTHRSORTHASH 512
00073 #define MINBLOCKSORTHASH 1
00074 #else
00075 #define MAXTHRSORTHASH 512
00076 #define MINBLOCKSORTHASH 1
00077 #endif
00078 
00079 //
00080 // Linear-time density kernels that use spatial hashing of atoms 
00081 // into a uniform grid of atom bins to reduce the number of 
00082 // density computations by truncating the gaussian to a given radius
00083 // and only considering bins of atoms that fall within that radius.
00084 //
00085 
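//
// Illustrative sketch (not part of the original VMD source): with the
// gaussian truncated at a given cutoff radius, a density kernel only has
// to scan the bins whose cells can overlap the cutoff sphere around a
// grid point.  A hypothetical host-side helper showing how many
// neighboring cells that is in each +/- direction:
//
static int cells_within_cutoff(float cutoff, float gridspacing) {
  int ncells = (int) (cutoff / gridspacing);
  if (ncells * gridspacing < cutoff)
    ncells++;            // round up so the entire cutoff sphere is covered
  return ncells;         // e.g. cutoff == gridspacing yields a 3x3x3 scan
}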
00086 #define GRID_CELL_EMPTY 0xffffffff
00087 
00088 // calculate cell address as the hash value for each atom
00089 __global__ static void 
00090 // __launch_bounds__ ( MAXTHRSORTHASH, MINBLOCKSORTHASH )
00091 hashAtoms(unsigned int natoms,
00092           const float4 * RESTRICT xyzr,
00093           int3 numcells,
00094           float invgridspacing,
00095           unsigned int * RESTRICT atomIndex,
00096           unsigned int * RESTRICT atomHash) {
00097   unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00098   if (index >= natoms)
00099     return;
00100 
00101   float4 atom = xyzr[index]; // read atom coordinate and radius
00102 
00103   // compute cell index, clamped to fall within grid bounds
00104   int3 cell;
00105   cell.x = min(int(atom.x * invgridspacing), numcells.x-1);
00106   cell.y = min(int(atom.y * invgridspacing), numcells.y-1);
00107   cell.z = min(int(atom.z * invgridspacing), numcells.z-1);
00108 
00109   unsigned int hash = (cell.z * numcells.y * numcells.x) + 
00110                       (cell.y * numcells.x) + cell.x;
00111 
00112   atomIndex[index] = index; // original atom index
00113   atomHash[index] = hash;   // atoms hashed to cell address
00114 }
00115 
00116 
00117 // build cell lists and reorder atoms and colors using sorted atom index list
00118 __global__ static void
00119 // __launch_bounds__ ( MAXTHRSORTHASH, MINBLOCKSORTHASH )
00120 sortAtomsColorsGenCellLists(unsigned int natoms,
00121                             const float4 * RESTRICT xyzr_d,
00122                             const float4 * RESTRICT color_d,
00123                             const unsigned int *atomIndex_d,
00124                             const unsigned int *atomHash_d,
00125                             float4 * RESTRICT sorted_xyzr_d,
00126                             float4 * RESTRICT sorted_color_d,
00127                             uint2 * RESTRICT cellStartEnd_d) {
00128   extern __shared__ unsigned int hash_s[]; // blockSize + 1 elements
00129   unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00130   unsigned int hash;
00131 
00132   if (index < natoms) {
00133     hash = atomHash_d[index];
00134     hash_s[threadIdx.x+1] = hash; // use smem to avoid redundant loads
00135     if (index > 0 && threadIdx.x == 0) {
00136       // first thread in block must load neighbor particle hash
00137       hash_s[0] = atomHash_d[index-1];
00138     }
00139   }
00140 
00141   __syncthreads();
00142 
00143   if (index < natoms) {
00144     // Since atoms are sorted, if this atom has a different cell
00145     // than its predecessor, it is the first atom in its cell, and
00146     // its index marks the end of the previous cell.
00147     if (index == 0 || hash != hash_s[threadIdx.x]) {
00148       cellStartEnd_d[hash].x = index; // set start
00149       if (index > 0)
00150         cellStartEnd_d[hash_s[threadIdx.x]].y = index; // set end
00151     }
00152 
00153     if (index == natoms - 1) {
00154       cellStartEnd_d[hash].y = index + 1; // set end
00155     }
00156 
00157     // Reorder atoms according to sorted indices
00158     unsigned int sortedIndex = atomIndex_d[index];
00159     float4 pos = xyzr_d[sortedIndex];
00160     sorted_xyzr_d[index] = pos;
00161 
00162     // Reorder colors according to sorted indices, if provided
00163     if (color_d != NULL) {
00164       float4 col = color_d[sortedIndex];
00165       sorted_color_d[index] = col;
00166     }
00167   }
00168 }
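//
// Worked example (illustrative, not part of the original source) of the
// cell list construction above: for 5 atoms whose sorted cell hashes are
//   index:  0  1  2  3  4
//   hash:   2  2  5  5  5
// the kernel writes cellStartEnd_d[2] = {0, 2} and cellStartEnd_d[5] = {2, 5},
// while every other cell keeps the GRID_CELL_EMPTY sentinel set by the
// caller's cudaMemset().  Cell c therefore holds the sorted atoms in the
// half-open range [cellStartEnd_d[c].x, cellStartEnd_d[c].y).
//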
00169 
00170 
00171 
00172 // build cell lists and reorder atoms using sorted atom index list
00173 __global__ static void
00174 // __launch_bounds__ ( MAXTHRSORTHASH, MINBLOCKSORTHASH )
00175 sortAtomsGenCellLists(unsigned int natoms,
00176                       const float4 * RESTRICT xyzr_d,
00177                       const unsigned int *atomIndex_d,
00178                       const unsigned int *atomHash_d,
00179                       float4 * RESTRICT sorted_xyzr_d,
00180                       uint2 * RESTRICT cellStartEnd_d) {
00181   extern __shared__ unsigned int hash_s[]; // blockSize + 1 elements
00182   unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
00183   unsigned int hash;
00184 
00185   if (index < natoms) {
00186     hash = atomHash_d[index];
00187     hash_s[threadIdx.x+1] = hash; // use smem to avoid redundant loads
00188     if (index > 0 && threadIdx.x == 0) {
00189       // first thread in block must load neighbor particle hash
00190       hash_s[0] = atomHash_d[index-1];
00191     }
00192   }
00193 
00194   __syncthreads();
00195 
00196   if (index < natoms) {
00197     // Since atoms are sorted, if this atom has a different cell
00198     // than its predecessor, it is the first atom in its cell, and
00199     // its index marks the end of the previous cell.
00200     if (index == 0 || hash != hash_s[threadIdx.x]) {
00201       cellStartEnd_d[hash].x = index; // set start
00202       if (index > 0)
00203         cellStartEnd_d[hash_s[threadIdx.x]].y = index; // set end
00204     }
00205 
00206     if (index == natoms - 1) {
00207       cellStartEnd_d[hash].y = index + 1; // set end
00208     }
00209 
00210     // Reorder atoms according to sorted indices
00211     unsigned int sortedIndex = atomIndex_d[index];
00212     float4 pos = xyzr_d[sortedIndex];
00213     sorted_xyzr_d[index] = pos;
00214   }
00215 }
00216 
00217 
00218 
00219 int vmd_cuda_build_density_atom_grid(int natoms,
00220                                      const float4 * xyzr_d,
00221                                      const float4 * color_d,
00222                                      float4 * sorted_xyzr_d,
00223                                      float4 * sorted_color_d,
00224                                      unsigned int *atomIndex_d,
00225                                      unsigned int *atomHash_d,
00226                                      uint2 * cellStartEnd_d,
00227                                      int3 volsz,
00228                                      float invgridspacing) {
00229 
00230   // Compute block and grid sizes to use for various kernels
00231   dim3 hBsz(256, 1, 1);
00232 
00233   // If we have a very large atom count, we must either use
00234   // larger thread blocks, or use multi-dimensional grids of thread blocks.
00235   // We can use up to 65535 blocks in a 1-D grid, so 256-thread blocks
00236   // suffice for up to 16776960 atoms, and 512-thread blocks for up to
00237   // 33553920 atoms.  Beyond that, we have to use 2-D grids
00238   // and modified kernels.
00239   if (natoms > 16000000)
00240     hBsz.x = 512; // this gets us to roughly 33M atoms before a 2-D grid is needed
00241 
00242   dim3 hGsz(((natoms+hBsz.x-1) / hBsz.x), 1, 1);
00243 
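  // Illustrative sketch (not part of the original source) of the 2-D grid
  // sizing that the XXX notes below refer to: split the block count across
  // gridDim.x and gridDim.y so that neither dimension exceeds 65535.  The
  // kernels would also have to recompute a flat block index as
  // (blockIdx.y * gridDim.x + blockIdx.x), so this is left disabled.
#if 0
  unsigned int nblocks = (natoms + hBsz.x - 1) / hBsz.x;
  dim3 hGsz2d(nblocks > 65535 ? 65535 : nblocks,
              (nblocks + 65534) / 65535, 1);
#endif
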
00244   // Compute grid cell address as atom hash
00245   // XXX need to use 2-D indexing for large atom counts or we exceed the
00246   //     per-dimension 65535 block grid size limitation
00247   hashAtoms<<<hGsz, hBsz>>>(natoms, xyzr_d, volsz, invgridspacing,
00248                             atomIndex_d, atomHash_d);
00249 
00250   // Sort atom indices by their grid cell address
00251   // XXX no pre-allocated workspace yet...
00252   if (dev_radix_sort_by_key(atomHash_d, atomIndex_d, natoms, 
00253                             (unsigned int *) NULL, (unsigned int *) NULL,
00254                             NULL, 0, 0U, 0U) != 0) {
00255     // It is common to encounter thrust memory allocation issues, so
00256     // we have to catch thrown exceptions here; otherwise we're guaranteed
00257     // to eventually have a crash.  If we get a failure, we have to bomb
00258     // out entirely and fall back to the CPU.
00259     printf("dev_radix_sort_by_key() failed: %s line %d\n", __FILE__, __LINE__);
00260     return -1;
00261   }
00262 
00263   // Initialize all cells to empty
00264   int ncells = volsz.x * volsz.y * volsz.z;
00265   cudaMemset(cellStartEnd_d, GRID_CELL_EMPTY, ncells*sizeof(uint2));
00266 
00267   // Reorder atoms into sorted order and find start and end of each cell
00268   // XXX need to use 2-D indexing for large atom counts or we exceed the
00269   //     per-dimension 65535 block grid size limitation
00270   unsigned int smemSize = sizeof(unsigned int)*(hBsz.x+1);
00271   sortAtomsColorsGenCellLists<<<hGsz, hBsz, smemSize>>>(
00272                        natoms, xyzr_d, color_d, atomIndex_d, atomHash_d,
00273                        sorted_xyzr_d, sorted_color_d, cellStartEnd_d);
00274 
00275   // To gain a bit more performance we can disable detailed error checking
00276   // and use an all-or-nothing approach where errors fall through all
00277   // CUDA API calls until the end, and we only do cleanup at the end.
00278   cudaDeviceSynchronize();
00279   cudaError_t err = cudaGetLastError();
00280   if (err != cudaSuccess) {
00281     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
00282     return -1;
00283   }
00284 
00285   return 0;
00286 }
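
//
// Illustrative sketch (not part of the original VMD source): how a density
// kernel can consume the sorted atoms and cell lists built above.  For a
// grid point falling near cell 'cell', only the neighboring cells within
// the truncation radius need to be visited; empty cells are recognized by
// the GRID_CELL_EMPTY sentinel written by cudaMemset().  The function name
// and the simplified gaussian term are hypothetical.
//
__device__ static float
density_from_cell_sketch(int3 cell, int3 numcells,
                         const uint2 * RESTRICT cellStartEnd,
                         const float4 * RESTRICT sorted_xyzr,
                         float3 gridpt) {
  float density = 0.0f;
  unsigned int cellhash = (cell.z * numcells.y * numcells.x) +
                          (cell.y * numcells.x) + cell.x;
  uint2 atomstartend = cellStartEnd[cellhash];
  if (atomstartend.x != GRID_CELL_EMPTY) {
    // loop over the sorted atoms belonging to this cell
    for (unsigned int i = atomstartend.x; i < atomstartend.y; i++) {
      float4 atom = sorted_xyzr[i];        // coordinates and radius
      float dx = gridpt.x - atom.x;
      float dy = gridpt.y - atom.y;
      float dz = gridpt.z - atom.z;
      float r2 = dx*dx + dy*dy + dz*dz;
      density += __expf(-r2 / (atom.w * atom.w)); // simplified gaussian term
    }
  }
  return density;
}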
00287 
00288 
00289 
00290 int vmd_cuda_build_density_atom_grid(int natoms,
00291                                      const float4 * xyzr_d,
00292                                      float4 *& sorted_xyzr_d,
00293                                      uint2 *& cellStartEnd_d,
00294                                      int3 volsz,
00295                                      float invgridspacing) {
00296   // Allocate work buffers and output array for atom hashing and sorting
00297   unsigned int *atomIndex_d = NULL;
00298   unsigned int *atomHash_d = NULL;
00299 
00300   cudaMalloc((void**)&sorted_xyzr_d, natoms * sizeof(float4));
00301   cudaMalloc((void**)&atomIndex_d, natoms * sizeof(unsigned int));
00302   cudaMalloc((void**)&atomHash_d, natoms * sizeof(unsigned int));
00303 
00304   // Allocate arrays of cell atom list starting and ending addresses
00305   int ncells = volsz.x * volsz.y * volsz.z;
00306   cudaMalloc((void**)&cellStartEnd_d, ncells * sizeof(uint2));
00307 
00308   // Compute block and grid sizes to use for various kernels
00309   dim3 hBsz(256, 1, 1);
00310   dim3 hGsz(((natoms+hBsz.x-1) / hBsz.x), 1, 1);
00311 
00312   // Compute grid cell address as atom hash
00313   hashAtoms<<<hGsz, hBsz>>>(natoms, xyzr_d, volsz, invgridspacing,
00314                             atomIndex_d, atomHash_d);
00315 
00316   // Sort atom indices by their grid cell address
00317   // XXX no pre-allocated workspace yet...
00318   if (dev_radix_sort_by_key(atomHash_d, atomIndex_d, natoms,
00319                             (unsigned int *) NULL, (unsigned int *) NULL,
00320                             NULL, 0, 0U, 0U) != 0) {
00321     // It is common to encounter thrust memory allocation issues, so
00322     // we have to catch thrown exceptions here; otherwise we're guaranteed
00323     // to eventually have a crash.  If we get a failure, we have to bomb
00324     // out entirely and fall back to the CPU.
00325     printf("dev_radix_sort_by_key() failed: %s line %d\n", __FILE__, __LINE__);
00326 
00327     // free memory allocations 
00328     cudaFree(sorted_xyzr_d);
00329     cudaFree(atomIndex_d);
00330     cudaFree(atomHash_d);
00331     cudaFree(cellStartEnd_d);
00332 
00333     return -1;
00334   }
00335 
00336   // Initialize all cells to empty
00337   cudaMemset(cellStartEnd_d, GRID_CELL_EMPTY, ncells*sizeof(uint2));
00338 
00339   // Reorder atoms into sorted order and find start and end of each cell
00340   // XXX need to use 2-D indexing for large atom counts or we exceed the
00341   //     per-dimension 65535 block grid size limitation
00342   unsigned int smemSize = sizeof(unsigned int)*(hBsz.x+1);
00343   sortAtomsGenCellLists<<<hGsz, hBsz, smemSize>>>(
00344                        natoms, xyzr_d, atomIndex_d, atomHash_d, 
00345                        sorted_xyzr_d, cellStartEnd_d);
00346 
00347   // To gain a bit more performance we can disable detailed error checking
00348   // and use an all-or-nothing approach where errors fall through all
00349   // CUDA API calls until the end, and we only do cleanup at the end.
00350   cudaDeviceSynchronize();
00351   cudaError_t err = cudaGetLastError();
00352   if (err != cudaSuccess) {
00353     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
00354     return -1;
00355   }
00356 
00357   // Free temporary working buffers, caller must free the rest
00358   cudaFree(atomIndex_d);
00359   cudaFree(atomHash_d);
00360 
00361   return 0;
00362 }
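
//
// Illustrative usage sketch (not part of the original VMD source): a caller
// sizes the uniform grid from the bounding box of the atoms (assumed to be
// translated so coordinates start at the origin), invokes the allocating
// overload above, and frees the returned device buffers when done.  The
// function name, 'lengths', and 'gridspacing' are hypothetical placeholders.
//
static int build_grid_example(const float4 *xyzr_d, int natoms,
                              float3 lengths, float gridspacing) {
  int3 volsz;
  volsz.x = (int) (lengths.x / gridspacing) + 1; // pad by one cell
  volsz.y = (int) (lengths.y / gridspacing) + 1;
  volsz.z = (int) (lengths.z / gridspacing) + 1;

  float4 *sorted_xyzr_d = NULL;
  uint2 *cellStartEnd_d = NULL;
  if (vmd_cuda_build_density_atom_grid(natoms, xyzr_d, sorted_xyzr_d,
                                       cellStartEnd_d, volsz,
                                       1.0f / gridspacing))
    return -1; // grid construction failed, fall back to the CPU path

  // ... launch density kernels that consume sorted_xyzr_d / cellStartEnd_d ...

  cudaFree(sorted_xyzr_d);
  cudaFree(cellStartEnd_d);
  return 0;
}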
00363 
00364 
00365 
