CUDAMeasureRDF.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAMeasureRDF.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.28 $      $Date: 2020/02/26 20:16:56 $
00014  *
00015  ***************************************************************************/
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include <math.h>
00033 #include <cuda.h>
00034 
00035 #include "Inform.h"
00036 #include "utilities.h"
00037 #include "WKFThreads.h"
00038 #include "WKFUtils.h"
00039 #include "CUDAKernels.h" 
00040 
00041 //
00042 // the below parameters are important to tuning the performance of the code
00043 //
00044 #if 0
00045 // original (slow?) default values for all GPU types
00046 // no hardware support for atomic operations, so the shared memory 
00047 // tagging scheme is used.
00048 #define NBLOCK 128        // size of an RDF data block 
00049 #define MAXBIN 64         // maximum number of bins in a histogram
00050 #elif 1
00051 // optimal values for compute 1.0 (G80/G9x) devices 
00052 // no hardware support for atomic operations, so the shared memory 
00053 // tagging scheme is used.
00054 #define NBLOCK 32         // size of an RDF data block 
00055 #define MAXBIN 1024       // maximum number of bins in a histogram
00056 #elif 1
00057 // optimal values for compute 1.3 (GT200) devices 
00058 // uses atomic operations to increment histogram bins
00059 #define NBLOCK 320        // size of an RDF data block 
00060 #define MAXBIN 3072       // maximum number of bins in a histogram
00061 #define __USEATOMIC  1    // enable atomic increment operations
00062 #elif 1
00063 // optimal values for compute 2.0 (Fermi) devices 
00064 // uses atomic operations to increment histogram bins
00065 // uses Fermi's 3x larger shared memory (48KB) to reduce the number 
00066 // of passes required for computation of very large histograms
00067 #define NBLOCK 896        // size of an RDF data block 
00068 #define MAXBIN 8192       // maximum number of bins in a histogram
00069 #define __USEATOMIC  1    // enable atomic increment operations
00070 #endif
00071 
00072 #define NCUDABLOCKS 256   // the number of blocks to divide the shared memory 
00073                           // dimension into.  Performance increases up to ~8 
00074                           // and then levels off
00075 
00076 
00077 #define NBLOCKHIST 64     // the number of threads used in the final histogram
00078                           // kernel.  The summation of the histograms isn't 
00079                           // a bottleneck so this isn't a critical choice
00080 
00081 #define NCONSTBLOCK 5440  // the number of atoms in constant memory.
00082                           // Can be 4096 (or larger) up to capacity limits.
00083 
00084 #define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
00085 #define WARP_LOG_SIZE 5   // this is also correct for G80/GT200/Fermi GPUs
00086 
00087 #define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
00088                                       // we need to accumulate and start over
00089 
00090 
00091 // This routine prepares data for the calculation of the histogram.
00092 // It zeros out instances of the histograms in global memory.
00093 __global__ void init_hist(unsigned int* llhistg, // histograms
00094                           int maxbin) // number of bins per histogram
00095 {
00096 
00097 #ifndef __USEATOMIC
00098   int io; // index of atoms in selection
00099 #endif
00100 
00101   int iblock; // index of data blocks 
00102 
00103    unsigned int *llhist; // this pointer will point to the begining of 
00104   // each thread's instance of the histogram
00105 
00106   int nofblocks; // nofblocks is the number of full data blocks.  
00107   int nleftover; // nleftover is the dimension of the blocks left over 
00108   // after all full data blocks
00109   int maxloop; // the loop condition maximum
00110 
00111   // initialize llhists to point to an instance of the histogram for each
00112   // warp.  Zero out the integer (llhist) and floating point (histdev) copies
00113   // of the histogram in global memory
00114 #ifdef __USEATOMIC
00115   llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
00116   nofblocks=maxbin/NBLOCK;
00117   nleftover=maxbin-nofblocks*NBLOCK;
00118   maxloop=nofblocks*NBLOCK;
00119 
00120   for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00121     llhist[iblock]=0UL;
00122   }
00123   // take care of leftovers
00124   if (threadIdx.x < nleftover) {
00125     llhist[iblock]=0UL;
00126   }
00127   
00128 #else
00129   llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
00130   nofblocks=maxbin/NBLOCK;
00131   nleftover=maxbin-nofblocks*NBLOCK;
00132   maxloop=nofblocks*NBLOCK;
00133   int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;
00134 
00135   for (io=0; io < maxloop2; io+=maxbin) {
00136     for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00137       llhist[io + iblock]=0UL;
00138     }
00139     // take care of leftovers
00140     if (threadIdx.x < nleftover) {
00141       llhist[io + iblock]=0UL;
00142     }
00143   }
00144 #endif
00145 
00146   return;
00147 }
00148 
00149 
00150 
00151 /* This routine prepares data for the calculation of the histogram.
00152  * It zeros out the floating point histogram in global memory.
00153  */
00154 __global__ void init_hist_f(float* histdev) {
00155   histdev[threadIdx.x]=0.0f;
00156   return;
00157 }
00158 
00159 
00160 
00161 /* This routine prepares data for the calculation of the histogram.
00162  * It recenters the atom to a single periodic unit cell.  
00163  * It also chooses positions for the padding atoms such that 
00164  * they don't contribute to the histogram
00165  */
00166 __global__ void reimage_xyz(float* xyz,     // XYZ data in global memory.  
00167                             int natomsi,    // number of atoms
00168                             int natomsipad, // # atoms including padding atoms
00169                             float3 celld,   // the cell size of each frame
00170                             float rmax)     // dist. used to space padding atoms
00171 {
00172   int iblock; // index of data blocks 
00173 
00174   __shared__ float xyzi[3*NBLOCK]; // this array hold xyz data for the current 
00175   // data block.  
00176 
00177   int nofblocks; // ibin is an index associated with histogram 
00178   // bins.  nofblocks is the number of full data blocks.  nleftover is the 
00179   // dimension of the blocks left over after all full data blocks
00180 
00181   float *pxi; // these pointers point to the portion of xyz currently
00182   // being processed
00183 
00184   int threetimesid; // three times threadIdx.x
00185 
00186   float rtmp; // a temporary distance to be used in creating padding atoms
00187 
00188   int ifinalblock; // this is the index of the final block
00189 
00190   int loopmax; // maximum for the loop counter
00191 
00192   // initialize nofblocks, pxi, and threetimesid
00193   nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00194   loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00195   ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00196   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00197   threetimesid=3*threadIdx.x;
00198 
00199   // shift all atoms into the same unit cell centered at the origin
00200   for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00201     __syncthreads();
00202     //these reads from global memory should be coallesced
00203     xyzi[threadIdx.x         ]=pxi[iblock         ];
00204     xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00205     xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00206     __syncthreads();
00207 
00208     // Shift the xyz coordinates so that 0 <= xyz < celld.
00209     // The ?: line is necesary because of the less than convenient behavior
00210     // of fmod for negative xyz values
00211     xyzi[threetimesid] = fmodf(xyzi[threetimesid  ], celld.x);
00212     if (xyzi[threetimesid  ] < 0.0f) {
00213       xyzi[threetimesid  ] += celld.x;
00214     }
00215     xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
00216     if (xyzi[threetimesid+1] < 0.0f) {
00217       xyzi[threetimesid+1] += celld.y;
00218     }
00219     xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
00220     if (xyzi[threetimesid+2] < 0.0f) {
00221       xyzi[threetimesid+2] += celld.z;
00222     }
00223 
00224     // if this is the final block then we pad
00225     if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00226       // pad the xyz coordinates with atoms which are spaced out such that they
00227       // cannot contribute to the histogram.  Note that these atoms are NOT 
00228       // reimaged
00229       if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00230         rtmp = -((float)(threadIdx.x+1))*rmax;
00231         xyzi[threetimesid  ] = rtmp;
00232         xyzi[threetimesid+1] = rtmp;
00233         xyzi[threetimesid+2] = rtmp;
00234       }
00235     }
00236 
00237     __syncthreads();
00238 
00239     // store the xyz values back to global memory.  these stores are coallesced
00240     pxi[iblock         ]=xyzi[threadIdx.x         ];
00241     pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
00242     pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00243 
00244     // increment pxi to the next block of global memory to be processed
00245     //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
00246   }
00247 }
00248 
00249 
00250 // shift the "phantom" atoms so they don't interfere in non-periodic
00251 // calculations.
00252 __global__ void phantom_xyz(float* xyz,     // XYZ data in global memory.
00253                             int natomsi,    // number of atoms
00254                             int natomsipad, // # atoms including padding atoms
00255                             float minxyz,
00256                             float rmax)     // dist. used to space padding atoms
00257 {
00258   int iblock; // index of data blocks
00259 
00260   __shared__ float xyzi[3*NBLOCK]; // this array hold xyz data for the current
00261   // data block.
00262 
00263   int nofblocks; // ibin is an index associated with histogram
00264   // bins.  nofblocks is the number of full data blocks.  nleftover is the
00265   // dimension of the blocks left over after all full data blocks
00266 
00267   float *pxi; // these pointers point to the portion of xyz currently
00268   // being processed
00269 
00270   int threetimesid; // three times threadIdx.x
00271 
00272   float rtmp; // a temporary distance to be used in creating padding atoms
00273 
00274   int ifinalblock; // this is the index of the final block
00275 
00276   int loopmax; // maximum for the loop counter
00277 
00278   // initialize nofblocks, pxi, and threetimesid
00279   nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00280   loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00281   ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00282   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00283   threetimesid=3*threadIdx.x;
00284 
00285   // shift all atoms into the same unit cell centered at the origin
00286   for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00287     __syncthreads();
00288     //these reads from global memory should be coallesced
00289     xyzi[threadIdx.x         ]=pxi[iblock         ];
00290     xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00291     xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00292     __syncthreads();
00293 
00294     // if this is the final block then we pad
00295     if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00296       // pad the xyz coordinates with atoms which are spaced out such that they
00297       // cannot contribute to the histogram.  Note that these atoms are NOT
00298       // reimaged
00299       if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00300         rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
00301         xyzi[threetimesid  ] = rtmp;
00302         xyzi[threetimesid+1] = rtmp;
00303         xyzi[threetimesid+2] = rtmp;
00304       }
00305     }
00306 
00307     __syncthreads();
00308 
00309     // store the xyz values back to global memory.  these stores are coallesced
00310     pxi[iblock         ]=xyzi[threadIdx.x         ];
00311     pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
00312     pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00313 
00314     // increment pxi to the next block of global memory to be processed
00315     //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
00316   }
00317 }
00318 
00319 
00320 /* This kernel calculates a radial distribution function between 
00321  * two selections one selection is stored in its entirety in global memory.
00322  * The other is stored partially in constant memory.  This routine is 
00323  * called to calculate the rdf between all of selection 1 and the portion
00324  * of selection 2 in constant memory.  
00325  * Each element of selection 2 is associated with it's own thread.  To minimize
00326  * accesses to global memory selection 1 is divided into chunks of NBLOCK 
00327  * elements.  These chunks are loaded into shared memory simultaneously, 
00328  * and then then all possible pairs of these atoms with those in 
00329  * constant memory are calculated.
00330  */
00331 __constant__ static float xyzj[3*NCONSTBLOCK]; // this array stores the
00332 // coordinates of selection two in constant memory
00333 
00334 // this routine is called from the host to move coordinate data to constant mem
00335 void copycoordstoconstbuff(int natoms, const float* xyzh) {
00336   cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
00337 }
00338 
00339 
00340 // This subroutine is based on a similar routine in the CUDA SDK histogram256 
00341 // example.  It adds a count to the histogram in such a way that there are no 
00342 // collisions.  If several routines need to update the same element a tag 
00343 // applied to to the highest bits of that element is used to determine which 
00344 // elements have already updated it.  For devices with compute capability 1.2 
00345 // or higher this is not necessary as atomicAdds can be used.
00346 
00347 #ifndef __USEATOMIC
00348 __device__ void addData(volatile unsigned int *s_WarpHist, // the first element
00349                         // of the histogram to be operated on
00350                         unsigned int data,      // the bin to add a count to
00351                         unsigned int threadTag) // tag of the current thread
00352 {
00353   unsigned int count; // the count in the bin being operated on
00354 
00355   do {  
00356     count = s_WarpHist[data] & 0x07FFFFFFUL;
00357     count = threadTag | (count + 1);
00358     s_WarpHist[data] = count;
00359   } while(s_WarpHist[data] != count);
00360 }
00361 #endif
00362 
00363 
00364 // This is the routine that does all the real work.  It takes as input the
00365 // various atomic coordinates and produces one rdf per warp, as described 
00366 // above.  These rdfs will be summed later by calculate_hist.
00367 __global__ void calculate_rdf(int usepbc,     // periodic or non-periodic calc.
00368                               float* xyz,     // atom XYZ coords, gmem
00369                               int natomsi,    // # of atoms in sel1, gmem
00370                               int natomsj,    // # of atoms in sel2, cmem
00371                               float3 celld,   // cell size of each frame
00372                               unsigned int* llhistg, // histograms, gmem
00373                               int nbins,      // # of bins in each histogram
00374                               float rmin,     // minimum distance for histogram
00375                               float delr_inv) // recip. width of histogram bins
00376 {
00377   int io; // indices of the atoms in selection 2
00378   int iblock; //the block of selection 1 currently being processed.  
00379 
00380 #ifdef __USEATOMIC
00381   unsigned int *llhists1; // a pointer to the beginning of llhists
00382                           // which is operated on by the warp
00383 
00384   __shared__ unsigned int llhists[MAXBIN];  // array holds 1 histogram per warp.
00385 #else
00386   volatile unsigned int *llhists1; // a pointer to the beginning of llhists
00387                                    // which is operated on by the warp
00388 
00389   volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];  
00390   // this array will hold 1 histogram per warp.
00391 #endif
00392 
00393   __shared__ float xyzi[3*NBLOCK]; // coords for the portion of sel1 
00394                                    // being processed in shared memory
00395 
00396   float rxij, rxij2, rij; // registers holding intermediate values in
00397                           // calculating the distance between two atoms
00398 
00399   int ibin,nofblocksi;    // registers counters and upper limit in some loops
00400   int nleftover;          // # bins leftover after full blocks are processed
00401   float *pxi;             // pointer to the portion of sel1 being processed
00402   unsigned int joffset;           // the current offset in constant memory
00403   unsigned int loopmax, loopmax2; // the limit for several loops
00404   float rmin2=.0001/delr_inv;
00405 
00406 #ifndef __USEATOMIC
00407   // this thread tag is needed for the histograming method implemented in
00408   // addData above.
00409   const unsigned int threadTag = ((unsigned int)
00410                                  ( threadIdx.x & (THREADSPERWARP - 1)))
00411                                  << (32 - WARP_LOG_SIZE);
00412 #endif
00413 
00414   // initialize llhists1.  If atomics are used, then each block has its own 
00415   // copy of the histogram in shared memory, which will be stored to it's own 
00416   // place in global memory.  If atomics aren't used then we need 1 histogram 
00417   // per warp. The number of blocks is also set approprately so that the
00418   // histograms may be zeroed out.
00419 #ifdef __USEATOMIC
00420   llhists1=llhists;
00421   nofblocksi=nbins/NBLOCK;
00422   nleftover=nbins-nofblocksi*NBLOCK;
00423 #else
00424   llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
00425   nofblocksi=nbins/THREADSPERWARP;
00426   nleftover=nbins-nofblocksi*THREADSPERWARP;
00427 #endif
00428 
00429   // Here we also zero out the shared memory used in this routine.
00430 #ifdef __USEATOMIC
00431   loopmax = nofblocksi*NBLOCK;
00432   for (io=0; io<loopmax; io+=NBLOCK) {
00433     llhists1[io + threadIdx.x]=0UL;
00434   } 
00435   if (threadIdx.x < nleftover) {
00436     llhists1[io + threadIdx.x]=0UL;
00437   } 
00438    
00439 #else
00440   loopmax = nofblocksi*THREADSPERWARP;
00441   for (io=0; io<loopmax; io+=THREADSPERWARP) {
00442     llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00443   }  
00444   if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
00445     llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00446   } 
00447 #endif
00448 
00449   // initialize nofblocks and pxi
00450   nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00451   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00452 
00453   loopmax = 3 * natomsj;
00454   int idxt3 = 3 * threadIdx.x;
00455   int idxt3p1 = idxt3+1;
00456   int idxt3p2 = idxt3+2;
00457 
00458   loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
00459 
00460   // loop over all atoms in constant memory, using either 
00461   // periodic or non-periodic particle pair distance calculation
00462   if (usepbc) {
00463     for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00464       __syncthreads();
00465 
00466       // these loads from global memory should be coallesced
00467       xyzi[threadIdx.x         ]=pxi[iblock         ];
00468       xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00469       xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00470 
00471       __syncthreads();
00472 
00473       for (joffset=0; joffset<loopmax; joffset+=3) {
00474         // calculate the distance between two atoms.  rxij and rxij2 are reused
00475         // to minimize the number of registers used
00476         rxij=fabsf(xyzi[idxt3  ] - xyzj[joffset  ]);
00477         rxij2=celld.x - rxij;
00478         rxij=fminf(rxij, rxij2);
00479         rij=rxij*rxij;
00480         rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
00481         rxij2=celld.y - rxij;
00482         rxij=fminf(rxij, rxij2);
00483         rij+=rxij*rxij;
00484         rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
00485         rxij2=celld.z - rxij;
00486         rxij=fminf(rxij, rxij2);
00487         rij=sqrtf(rij + rxij*rxij);
00488 
00489         // increment the appropriate bin, don't add duplicates to the zeroth bin
00490         ibin=__float2int_rd((rij-rmin)*delr_inv);
00491         if (ibin<nbins && ibin>=0 && rij>rmin2) {
00492 #ifdef __USEATOMIC
00493           atomicAdd(llhists1+ibin, 1U);
00494 #else
00495           addData(llhists1, ibin, threadTag);
00496 #endif
00497         }
00498       } //joffset
00499     } //iblock
00500   } else { // non-periodic
00501     for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00502       __syncthreads();
00503 
00504       // these loads from global memory should be coallesced
00505       xyzi[threadIdx.x         ]=pxi[iblock         ];
00506       xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00507       xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00508 
00509       __syncthreads();
00510 
00511       // loop over all atoms in constant memory
00512       for (joffset=0; joffset<loopmax; joffset+=3) {
00513         // calculate the distance between two atoms.  rxij and rxij2 are reused
00514         // to minimize the number of registers used
00515         rxij=xyzi[idxt3  ] - xyzj[joffset  ];
00516         rij=rxij*rxij;
00517         rxij=xyzi[idxt3p1] - xyzj[joffset+1];
00518         rij+=rxij*rxij;
00519         rxij=xyzi[idxt3p2] - xyzj[joffset+2];
00520         rij=sqrtf(rij + rxij*rxij);
00521 
00522         // increment the appropriate bin, don't add duplicates to the zeroth bin
00523         ibin=__float2int_rd((rij-rmin)*delr_inv);
00524         if (ibin<nbins && ibin>=0 && rij>rmin2) {
00525 #ifdef __USEATOMIC
00526           atomicAdd(llhists1+ibin, 1U);
00527 #else
00528           addData(llhists1, ibin, threadTag);
00529 #endif
00530         }
00531       } //joffset
00532     } //iblock
00533   }
00534 
00535   __syncthreads();
00536 
00537 
00538   // below we store the histograms to global memory so that they may be summed
00539   // in another routine.  Setting nofblo
00540   nofblocksi=nbins/NBLOCK;
00541   nleftover=nbins-nofblocksi*NBLOCK;
00542                
00543 #ifdef __USEATOMIC
00544   // reinitialize llhists1 to point to global memory
00545   unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;
00546 
00547   // loop to add all elements to global memory
00548   loopmax = nofblocksi*NBLOCK;
00549   for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
00550     llhistg1[iblock] += llhists[iblock+threadIdx.x];
00551   }
00552   // take care of leftovers
00553   if (threadIdx.x < nleftover) {
00554     llhistg1[iblock] += llhists[iblock+threadIdx.x];
00555   }
00556 #else
00557   // reinitialize llhists1 to point to global memory
00558   unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;
00559 
00560   // loop to add all elements to global memory
00561   loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
00562   loopmax2 = nofblocksi * NBLOCK;
00563   for (io=0; io < loopmax; io+=MAXBIN) {
00564     for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
00565       llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
00566     }
00567     // take care of leftovers
00568     if (threadIdx.x < nleftover) {
00569       llhistg1[iblock] += llhists[io+iblock+threadIdx.x];
00570     }
00571   }
00572 #endif
00573 
00574   return;
00575 }
00576 
00577 
00578 #define ull2float __uint2float_rn
00579 
00580 /* This routine takes a series of integer histograms in llhist, 
00581  * stored in as integers, converts them to floats, multiplies 
00582  * them by appropriate factors, and sums them.  The result is 
00583  * stored in histdev.
00584  */
00585 __global__ void calculate_histogram(float* histdev, // histogram to return
00586                              unsigned int* llhistg, // the histograms stored
00587                               // in global memory.  There is one histogram
00588                               // per block (warp) if atomicAdds are (not) used
00589                                     int nbins) // stride between geometries in xyz.
00590 {
00591   int io;      // index for inner loop
00592   int iblock;  // index for outer loop  
00593   int maxloop; // maxima for loop conditions
00594 
00595   __shared__ unsigned int llhists[NBLOCKHIST]; // block of int histogram in final summation
00596 
00597   __shared__ float xi[NBLOCKHIST]; // smem for the floating point histograms.
00598   int nofblocks;                   // number of data blocks per histogram
00599 
00600   nofblocks=nbins/NBLOCKHIST;      // set number of blocks per histogram
00601   int nleftover=nbins-nofblocks*NBLOCKHIST;
00602   maxloop = nofblocks*NBLOCKHIST;  // set maxloop and maxloop2
00603 
00604   // loop over all of the blocks in a histogram
00605   for (iblock=0;iblock<maxloop;iblock+=NBLOCKHIST) {
00606     xi[threadIdx.x]=0.0f; // zero out xi
00607 
00608     // loop over the histograms created by calculate_rdf
00609 #ifdef __USEATOMIC
00610     for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00611 #else
00612     for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00613 #endif
00614       // load integer histogram data into shared memory (coalesced)
00615       llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00616 
00617       // shave off the thread tag that might remain from calculate_rdf
00618       llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00619 
00620       // convert to float
00621       xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00622     }
00623 
00624     // ... and store in global memory
00625     histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00626   }
00627 
00628   // take care of leftovers
00629   if (threadIdx.x < nleftover) {
00630     xi[threadIdx.x]=0.0f; // zero out xi
00631 
00632     // loop over the histograms created by calculate_rdf
00633 #ifdef __USEATOMIC
00634     for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00635 #else
00636     for (io=0; io < MAXBIN *(NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00637 #endif
00638       // load integer histogram data into shared memory (coalesced)
00639       llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00640 
00641       // shave off the thread tag that might remain from calculate_rdf
00642       llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00643 
00644       // convert to float
00645       xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00646     }
00647 
00648     // ... and store in global memory
00649     histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00650   }
00651 
00652   // calculate_hist out
00653   return;
00654 }
00655 
00656 
00657 /*
00658  * calculate_histogram_block
00659  * Compute one histogram block 
00660  */
00661 void calculate_histogram_block(int usepbc,
00662                                float *xyz, 
00663                                int natomsi, 
00664                                int natomsj,
00665                                float3 celld,
00666                                unsigned int* llhistg,
00667                                int nbins,
00668                                float rmin,
00669                                float delr_inv,
00670                                float *histdev,
00671                                int nblockhist) {
00672   // zero out the histograms in global memory
00673   init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);
00674       
00675   // calculate the histograms
00676   calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
00677                                          llhistg, nbins, rmin, delr_inv);
00678 
00679   // sum histograms and begin normalization by adjusting for the cell volume
00680   calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
00681 }
00682 
00683 
00684 
00685 /*
00686  * input parameters for rdf_thread() routine
00687  */
00688 typedef struct {
00689   int usepbc;        // periodic or non-periodic calculation
00690   int natoms1;       // number of atoms in selection 1.
00691   float* xyz;        // coordinates of first selection, [natoms1][3]
00692   int natoms2;       // number of atoms in selection 2.
00693   float** xyz2array; // coordinates of selection 2. [natoms2][3] (per-thread)
00694   float* cell;       // the cell x y and z dimensions [3]
00695   float** histarray; // the final histogram in host mem [nbins] (per-thread)
00696   int nbins;         // the number of bins in the histogram
00697   int nblockhist;    // the size of a block used in the
00698                      // reduction of the histogram
00699   float rmin;        // the minimum value of the first bin
00700   float delr;        // the width of each bin
00701   int nblocks;       // number of const blocks to compute all pairs histogram
00702   int nhblock;       // # NBLOCKHIST-sized blocks needed to calc whole histogram
00703 } rdfthrparms;
00704 
00705 
00706 /*  
00707  * rdf_thread -- multithreaded version of subroutine_rdf()
00708  * This routine is called from the CPU to calculate g(r) for a single frame
00709  * on the GPU.  Right now it performs a brute force O(N^2) calculations
00710  * (with all atomic pairs considered).  Future version may reduce the overall
00711  * work by performing a grid search.
00712  */
00713 static void * rdf_thread(void *voidparms) {
00714   rdfthrparms *parms = NULL;
00715   int threadid=0;
00716 
00717   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00718   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00719 
00720 #if 0
00721 printf("rdf thread[%d] lanched and running...\n", threadid);
00722 #endif
00723   cudaSetDevice(threadid);
00724 
00725   /*
00726    * copy in per-thread parameters
00727    */
00728   const int natoms1 = parms->natoms1;
00729   const float *xyz = parms->xyz;
00730   const int natoms2 = parms->natoms2;
00731   const float *cell = parms->cell;
00732   const int nbins = parms->nbins;
00733   const int nblockhist = parms->nblockhist;
00734   const float rmin = parms->rmin;
00735   const float delr = parms->delr;
00736   float *xyz2 = parms->xyz2array[threadid];
00737   float *hist = parms->histarray[threadid];
00738   const int nhblock = parms->nhblock;
00739   const int usepbc = parms->usepbc;
00740 
00741   float* xyzdev;         // pointer to xyz data in global memory
00742   float3 celld;          // cell x, y, and z dimensions
00743   float* histdev;        // pointer to float histogram in global memory
00744   unsigned int* llhistg; // pointer to int histograms in global memory
00745 
00746   int nconstblocks; // # full blocks to be sent to constant memory
00747   int nleftover;    // # elements in leftover block of const mem
00748   int nbigblocks;   // number of blocks required to prevent histogram overflow
00749   int ibigblock;    // the index of the current big block
00750   int nleftoverbig; // the number of elements in the final big block
00751   int ntmpbig;
00752   int nbinstmp;     // number of bins currently being processed
00753   int natoms1pad;   // number of atoms in selection 1 after padding
00754   int natoms2pad;   // number of atoms in selection 1 after padding
00755   float rmax;       // cutoff radius for atom contributions to histogram
00756 
00757   // natoms1pad is natoms1 rounded up to the next multiple of NBLOCK
00758   natoms1pad=natoms1;
00759   if (natoms1%NBLOCK!=0) {natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK;}
00760   natoms2pad=natoms2;
00761   if (natoms2%NBLOCK!=0) {natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK;}
00762 
00763   // allocate global memory for:
00764 
00765   // the xyz coordinates of selection 1
00766   cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));
00767 
00768   // the final copy of the histogram
00769   cudaMalloc((void**)&histdev, nbins*sizeof(float));
00770 
00771   // zero out floating point histogram
00772   int ihblock;         // loop counter for histogram blocks
00773   int maxloop=nbins/nblockhist;
00774   if (maxloop*nblockhist!=nbins) {maxloop=maxloop+1;}
00775   for (ihblock=0; ihblock<maxloop; ihblock++) {
00776     init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
00777   }
00778 
00779   // the global memory copies of the histogram
00780 #ifdef __USEATOMIC
00781   cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
00782 #else
00783   cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
00784              sizeof(int))/THREADSPERWARP);
00785 #endif
00786 
00787   // set the cell dimensions
00788   celld.x=cell[0];
00789   celld.y=cell[1];
00790   celld.z=cell[2];
00791 
00792   // set rmax.  this is the distance that the padding atoms must be from the 
00793   // unit cell AND that they must be from each other
00794   rmax = 1.1f * (rmin+((float)nbins)*delr);
00795 
00796   // send the second selection to the gpu for reimaging
00797   cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);
00798 
00799   if (usepbc) {
00800     // reimage atom coors into a single periodic cell
00801     reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
00802   } else {
00803     // shift phantom atoms so they don't interfere with non-periodic calc.
00804     phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
00805   }
00806 
00807   // now, unfortunately, we have to move it back because this selection 
00808   // will need to be sent to constant memory (done on the host side)
00809   cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);
00810 
00811   // send the xyz coords of selection 1 to the gpu
00812   cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);
00813 
00814   if (usepbc) {
00815     // reimage xyz coords back into the periodic box and zero all histograms
00816     reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
00817   } else {
00818     // shift phantom atoms so they don't interfere with non-periodic calc.
00819     phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
00820   }
00821 
00822   // all blocks except for the final one will have MAXBIN elements
00823   nbinstmp = MAXBIN; 
00824 
00825   // calc # of groups to divide sel2 into to fit it into constant memory
00826   nconstblocks=natoms2/NCONSTBLOCK;
00827   nleftover=natoms2-nconstblocks*NCONSTBLOCK;
00828 
00829   // set up the big blocks to avoid bin overflow
00830   ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
00831   nbigblocks = natoms1pad / ntmpbig;
00832   nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
00833 
00834   int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;
00835 
00836   // parallel loop over all "constblocks"
00837   wkf_tasktile_t tile; // next batch of work units
00838   while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00839 #if 0
00840 printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
00841 #endif
00842     int iconstblock = -1; // loop counter for constblocks, force update 
00843     ihblock = -1;     // initialize to sentinel value to force immediate update
00844     int workitem;     // current parallel work item
00845     for (workitem=tile.start; workitem<tile.end; workitem++) {
00846       // decode work item into ihblock and iconstblock indices
00847       ihblock = workitem % nhblock;
00848       int newiconstblock = workitem / nhblock;
00849       int blocksz;
00850 
00851       // When moving to the next constblock we must copy in new coordinates
00852       if (iconstblock != newiconstblock) {
00853         iconstblock = newiconstblock;
00854 
00855         // take care of the leftovers on the last block
00856         blocksz = (iconstblock == nconstblocks && nleftover != 0) 
00857                   ? nleftover : NCONSTBLOCK;
00858 
00859         // send a "constblock" of coords from selection 2 to const mem
00860         copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
00861       } 
00862 
00863 #if 0
00864 printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid, 
00865        iconstblock, ihblock);
00866 #endif
00867 
00868       // loop over blocks of histogram
00869       // minimum distance for the current histogram block
00870       // the first block starts at rmin
00871       float rmintmp=rmin + (MAXBIN * delr * ihblock);
00872 
00873       nbinstmp = MAXBIN;
00874 
00875       // the final block will have nbin%MAXBIN elements
00876       if (ihblock==(nhblock-1)) { 
00877         nbinstmp = nbins%MAXBIN;
00878         // if nbins is an even multiple of MAXBIN, the final block is full
00879         if (nbinstmp==0) { nbinstmp = MAXBIN;}
00880       }
00881 
00882       for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
00883         // take care of the leftovers on the last block
00884         int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0) 
00885                        ? nleftoverbig : ntmpbig;
00886 
00887         calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
00888                                   bblocksz, blocksz, celld,
00889                                   llhistg, nbinstmp, rmintmp, (1/delr),
00890                                   histdev+ihblock*MAXBIN, nblockhist);
00891       }
00892 
00893       cudaDeviceSynchronize();
00894     }
00895   } // end of parallel tile fetch while loop
00896 
00897   // retrieve g of r for this frame
00898   cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);
00899 
00900   cudaFree(xyzdev);  // free gpu global memory
00901   cudaFree(histdev);
00902   cudaFree(llhistg);
00903 
00904 #if 0
00905 printf("thread[%d] finished, releasing resources\n", threadid); 
00906 #endif
00907 
00908   return NULL;
00909 }
00910 
00911 
00912 int rdf_gpu(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
00913             int usepbc,                // periodic or non-periodic calc.
00914             int natoms1,               // array of the number of atoms in
00915                                        // selection 1 in each frame.
00916             float *xyz,                // coordinates of first selection.
00917                                        // [natoms1][3]
00918             int natoms2,               // array of the number of atoms in
00919                                        // selection 2 in each frame.
00920             float *xyz2,               // coordinates of selection 2.
00921                                        // [natoms2][3]
00922             float* cell,               // the cell x y and z dimensions [3]
00923             float* hist,               // the histograms, 1 per block
00924                                        // [ncudablocks][maxbin]
00925             int nbins,                 // the number of bins in the histogram
00926             float rmin,                // the minimum value of the first bin
00927             float delr)                // the width of each bin
00928 {
00929   int i, ibin;
00930 
00931   if (devpool == NULL) {
00932     return -1; // abort if no device pool exists
00933   }
00934   int numprocs = wkf_threadpool_get_workercount(devpool);
00935 
00936   // multi-thread buffers
00937   float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
00938   float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
00939   for (i=0; i<numprocs; i++) {
00940     xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00941     memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00942     histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
00943   }
00944 
00945   memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
00946 
00947   // setup thread parameters
00948   rdfthrparms parms;
00949 
00950   parms.usepbc = usepbc;
00951   parms.natoms1 = natoms1;
00952   parms.xyz = xyz;
00953   parms.natoms2 = natoms2;
00954   parms.cell = cell;
00955   parms.nbins = nbins;
00956   parms.nblockhist = NBLOCKHIST; // size of final reduction block 
00957   parms.rmin = rmin;
00958   parms.delr = delr;
00959   parms.xyz2array = xyz2array; // per-thread output data
00960   parms.histarray = histarray; // per-thread output data
00961 
00962   // calculate the number of blocks to divide the histogram into
00963   parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;
00964 
00965   // calc # of groups to divide sel2 into to fit it into constant memory
00966   int nconstblocks=parms.natoms2/NCONSTBLOCK;
00967   int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;
00968 
00969   // we have to do one extra cleanup pass if there are leftovers
00970   parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;
00971 
00972   int parblocks = parms.nblocks * parms.nhblock;
00973 #if 0
00974   printf("master thread: number of parallel work items: %d\n", parblocks);
00975 #endif
00976 
00977   // spawn child threads to do the work
00978   wkf_tasktile_t tile;
00979   tile.start=0;
00980   tile.end=parblocks;
00981   wkf_threadpool_sched_dynamic(devpool, &tile);
00982   wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);
00983 
00984   memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
00985   // collect independent thread results into final histogram buffer
00986   for (i=0; i<numprocs; i++) {
00987     for (ibin=0; ibin<nbins; ibin++) {
00988       hist[ibin] += histarray[i][ibin];
00989     }      
00990   }
00991 
00992   return 0;
00993 }
00994 
00995 
00996