Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

CUDAMeasureRDF.cu

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2007-2009 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: CUDAMeasureRDF.cu,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.25 $      $Date: 2011/01/18 15:32:28 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   CUDA accelerated RDF calculation
00019  *
00020  ***************************************************************************/
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <math.h>
00025 #include <cuda.h>
00026 
00027 #include "Inform.h"
00028 #include "utilities.h"
00029 #include "WKFThreads.h"
00030 #include "WKFUtils.h"
00031 #include "CUDAKernels.h" 
00032 
00033 //
00034 // the below parameters are important to tuning the performance of the code
00035 //
00036 #if 0
00037 // original (slow?) default values for all GPU types
00038 // no hardware support for atomic operations, so the shared memory 
00039 // tagging scheme is used.
00040 #define NBLOCK 128        // size of an RDF data block 
00041 #define MAXBIN 64         // maximum number of bins in a histogram
00042 #elif 1
00043 // optimal values for compute 1.0 (G80/G9x) devices 
00044 // no hardware support for atomic operations, so the shared memory 
00045 // tagging scheme is used.
00046 #define NBLOCK 32         // size of an RDF data block 
00047 #define MAXBIN 1024       // maximum number of bins in a histogram
00048 #elif 1
00049 // optimal values for compute 1.3 (GT200) devices 
00050 // uses atomic operations to increment histogram bins
00051 #define NBLOCK 320        // size of an RDF data block 
00052 #define MAXBIN 3072       // maximum number of bins in a histogram
00053 #define __USEATOMIC  1    // enable atomic increment operations
00054 #elif 1
00055 // optimal values for compute 2.0 (Fermi) devices 
00056 // uses atomic operations to increment histogram bins
00057 // uses Fermi's 3x larger shared memory (48KB) to reduce the number 
00058 // of passes required for computation of very large histograms
00059 #define NBLOCK 896        // size of an RDF data block 
00060 #define MAXBIN 8192       // maximum number of bins in a histogram
00061 #define __USEATOMIC  1    // enable atomic increment operations
00062 #endif
00063 
00064 #define NCUDABLOCKS 256   // the number of blocks to divide the shared memory 
00065                           // dimension into.  Performance increases up to ~8 
00066                           // and then levels off
00067 
00068 
00069 #define NBLOCKHIST 64     // the number of threads used in the final histogram
00070                           // kernel.  The summation of the histograms isn't 
00071                           // a bottleneck so this isn't a critical choice
00072 
00073 #define NCONSTBLOCK 5440  // the number of atoms in constant memory.
00074                           // Can be 4096 (or larger) up to capacity limits.
00075 
00076 #define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
00077 #define WARP_LOG_SIZE 5   // this is also correct for G80/GT200/Fermi GPUs
00078 
00079 #define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
00080                                       // we need to accumulate and start over
00081 
00082 
00083 // This routine prepares data for the calculation of the histogram.
00084 // It zeros out instances of the histograms in global memory.
00085 __global__ void init_hist(unsigned int* llhistg, // histograms
00086                           int maxbin) // number of bins per histogram
00087 {
00088 
00089 #ifndef __USEATOMIC
00090   int io; // index of atoms in selection
00091 #endif
00092 
00093   int iblock; // index of data blocks 
00094 
00095    unsigned int *llhist; // this pointer will point to the begining of 
00096   // each thread's instance of the histogram
00097 
00098   int nofblocks; // nofblocks is the number of full data blocks.  
00099   int nleftover; // nleftover is the dimension of the blocks left over 
00100   // after all full data blocks
00101   int maxloop; // the loop condition maximum
00102 
00103   // initialize llhists to point to an instance of the histogram for each
00104   // warp.  Zero out the integer (llhist) and floating point (histdev) copies
00105   // of the histogram in global memory
00106 #ifdef __USEATOMIC
00107   llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
00108   nofblocks=maxbin/NBLOCK;
00109   nleftover=maxbin-nofblocks*NBLOCK;
00110   maxloop=nofblocks*NBLOCK;
00111 
00112   for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00113     llhist[iblock]=0UL;
00114   }
00115   // take care of leftovers
00116   if (threadIdx.x < nleftover) {
00117     llhist[iblock]=0UL;
00118   }
00119   
00120 #else
00121   llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
00122   nofblocks=maxbin/NBLOCK;
00123   nleftover=maxbin-nofblocks*NBLOCK;
00124   maxloop=nofblocks*NBLOCK;
00125   int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;
00126 
00127   for (io=0; io < maxloop2; io+=maxbin) {
00128     for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00129       llhist[io + iblock]=0UL;
00130     }
00131     // take care of leftovers
00132     if (threadIdx.x < nleftover) {
00133       llhist[io + iblock]=0UL;
00134     }
00135   }
00136 #endif
00137 
00138   return;
00139 }
00140 
00141 
00142 
00143 /* This routine prepares data for the calculation of the histogram.
00144  * It zeros out the floating point histogram in global memory.
00145  */
00146 __global__ void init_hist_f(float* histdev) {
00147   histdev[threadIdx.x]=0.0f;
00148   return;
00149 }
00150 
00151 
00152 
00153 /* This routine prepares data for the calculation of the histogram.
00154  * It recenters the atom to a single periodic unit cell.  
00155  * It also chooses positions for the padding atoms such that 
00156  * they don't contribute to the histogram
00157  */
00158 __global__ void reimage_xyz(float* xyz,     // XYZ data in global memory.  
00159                             int natomsi,    // number of atoms
00160                             int natomsipad, // # atoms including padding atoms
00161                             float3 celld,   // the cell size of each frame
00162                             float rmax)     // dist. used to space padding atoms
00163 {
00164   int iblock; // index of data blocks 
00165 
00166   __shared__ float xyzi[3*NBLOCK]; // this array hold xyz data for the current 
00167   // data block.  
00168 
00169   int nofblocks; // ibin is an index associated with histogram 
00170   // bins.  nofblocks is the number of full data blocks.  nleftover is the 
00171   // dimension of the blocks left over after all full data blocks
00172 
00173   float *pxi; // these pointers point to the portion of xyz currently
00174   // being processed
00175 
00176   int threetimesid; // three times threadIdx.x
00177 
00178   float rtmp; // a temporary distance to be used in creating padding atoms
00179 
00180   int ifinalblock; // this is the index of the final block
00181 
00182   int loopmax; // maximum for the loop counter
00183 
00184   // initialize nofblocks, pxi, and threetimesid
00185   nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00186   loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00187   ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00188   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00189   threetimesid=3*threadIdx.x;
00190 
00191   // shift all atoms into the same unit cell centered at the origin
00192   for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00193     __syncthreads();
00194     //these reads from global memory should be coallesced
00195     xyzi[threadIdx.x         ]=pxi[iblock         ];
00196     xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00197     xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00198     __syncthreads();
00199 
00200     // Shift the xyz coordinates so that 0 <= xyz < celld.
00201     // The ?: line is necesary because of the less than convenient behavior
00202     // of fmod for negative xyz values
00203     xyzi[threetimesid] = fmodf(xyzi[threetimesid  ], celld.x);
00204     if (xyzi[threetimesid  ] < 0.0f) {
00205       xyzi[threetimesid  ] += celld.x;
00206     }
00207     xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
00208     if (xyzi[threetimesid+1] < 0.0f) {
00209       xyzi[threetimesid+1] += celld.y;
00210     }
00211     xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
00212     if (xyzi[threetimesid+2] < 0.0f) {
00213       xyzi[threetimesid+2] += celld.z;
00214     }
00215 
00216     // if this is the final block then we pad
00217     if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00218       // pad the xyz coordinates with atoms which are spaced out such that they
00219       // cannot contribute to the histogram.  Note that these atoms are NOT 
00220       // reimaged
00221       if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00222         rtmp = -((float)(threadIdx.x+1))*rmax;
00223         xyzi[threetimesid  ] = rtmp;
00224         xyzi[threetimesid+1] = rtmp;
00225         xyzi[threetimesid+2] = rtmp;
00226       }
00227     }
00228 
00229     __syncthreads();
00230 
00231     // store the xyz values back to global memory.  these stores are coallesced
00232     pxi[iblock         ]=xyzi[threadIdx.x         ];
00233     pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
00234     pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00235 
00236     // increment pxi to the next block of global memory to be processed
00237     //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
00238   }
00239 }
00240 
00241 
00242 // shift the "phantom" atoms so they don't interfere in non-periodic
00243 // calculations.
00244 __global__ void phantom_xyz(float* xyz,     // XYZ data in global memory.
00245                             int natomsi,    // number of atoms
00246                             int natomsipad, // # atoms including padding atoms
00247                             float minxyz,
00248                             float rmax)     // dist. used to space padding atoms
00249 {
00250   int iblock; // index of data blocks
00251 
00252   __shared__ float xyzi[3*NBLOCK]; // this array hold xyz data for the current
00253   // data block.
00254 
00255   int nofblocks; // ibin is an index associated with histogram
00256   // bins.  nofblocks is the number of full data blocks.  nleftover is the
00257   // dimension of the blocks left over after all full data blocks
00258 
00259   float *pxi; // these pointers point to the portion of xyz currently
00260   // being processed
00261 
00262   int threetimesid; // three times threadIdx.x
00263 
00264   float rtmp; // a temporary distance to be used in creating padding atoms
00265 
00266   int ifinalblock; // this is the index of the final block
00267 
00268   int loopmax; // maximum for the loop counter
00269 
00270   // initialize nofblocks, pxi, and threetimesid
00271   nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00272   loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00273   ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00274   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00275   threetimesid=3*threadIdx.x;
00276 
00277   // shift all atoms into the same unit cell centered at the origin
00278   for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00279     __syncthreads();
00280     //these reads from global memory should be coallesced
00281     xyzi[threadIdx.x         ]=pxi[iblock         ];
00282     xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00283     xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00284     __syncthreads();
00285 
00286     // if this is the final block then we pad
00287     if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00288       // pad the xyz coordinates with atoms which are spaced out such that they
00289       // cannot contribute to the histogram.  Note that these atoms are NOT
00290       // reimaged
00291       if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00292         rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
00293         xyzi[threetimesid  ] = rtmp;
00294         xyzi[threetimesid+1] = rtmp;
00295         xyzi[threetimesid+2] = rtmp;
00296       }
00297     }
00298 
00299     __syncthreads();
00300 
00301     // store the xyz values back to global memory.  these stores are coallesced
00302     pxi[iblock         ]=xyzi[threadIdx.x         ];
00303     pxi[iblock+  NBLOCK]=xyzi[threadIdx.x+  NBLOCK];
00304     pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00305 
00306     // increment pxi to the next block of global memory to be processed
00307     //pxi=pxi+3*NCUDABLOCKS*NBLOCK;
00308   }
00309 }
00310 
00311 
00312 /* This kernel calculates a radial distribution function between 
00313  * two selections one selection is stored in its entirety in global memory.
00314  * The other is stored partially in constant memory.  This routine is 
00315  * called to calculate the rdf between all of selection 1 and the portion
00316  * of selection 2 in constant memory.  
00317  * Each element of selection 2 is associated with it's own thread.  To minimize
00318  * accesses to global memory selection 1 is divided into chunks of NBLOCK 
00319  * elements.  These chunks are loaded into shared memory simultaneously, 
00320  * and then then all possible pairs of these atoms with those in 
00321  * constant memory are calculated.
00322  */
00323 __constant__ static float xyzj[3*NCONSTBLOCK]; // this array stores the
00324 // coordinates of selection two in constant memory
00325 
00326 // this routine is called from the host to move coordinate data to constant mem
00327 void copycoordstoconstbuff(int natoms, const float* xyzh) {
00328   cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
00329 }
00330 
00331 
00332 // This subroutine is based on a similar routine in the CUDA SDK histogram256 
00333 // example.  It adds a count to the histogram in such a way that there are no 
00334 // collisions.  If several routines need to update the same element a tag 
00335 // applied to to the highest bits of that element is used to determine which 
00336 // elements have already updated it.  For devices with compute capability 1.2 
00337 // or higher this is not necessary as atomicAdds can be used.
00338 
00339 #ifndef __USEATOMIC
00340 __device__ void addData(volatile unsigned int *s_WarpHist, // the first element
00341                         // of the histogram to be operated on
00342                         unsigned int data,      // the bin to add a count to
00343                         unsigned int threadTag) // tag of the current thread
00344 {
00345   unsigned int count; // the count in the bin being operated on
00346 
00347   do {  
00348     count = s_WarpHist[data] & 0x07FFFFFFUL;
00349     count = threadTag | (count + 1);
00350     s_WarpHist[data] = count;
00351   } while(s_WarpHist[data] != count);
00352 }
00353 #endif
00354 
00355 
00356 // This is the routine that does all the real work.  It takes as input the
00357 // various atomic coordinates and produces one rdf per warp, as described 
00358 // above.  These rdfs will be summed later by calculate_hist.
00359 __global__ void calculate_rdf(int usepbc,     // periodic or non-periodic calc.
00360                               float* xyz,     // atom XYZ coords, gmem
00361                               int natomsi,    // # of atoms in sel1, gmem
00362                               int natomsj,    // # of atoms in sel2, cmem
00363                               float3 celld,   // cell size of each frame
00364                               unsigned int* llhistg, // histograms, gmem
00365                               int nbins,      // # of bins in each histogram
00366                               float rmin,     // minimum distance for histogram
00367                               float delr_inv) // recip. width of histogram bins
00368 {
00369   int io; // indices of the atoms in selection 2
00370   int iblock; //the block of selection 1 currently being processed.  
00371 
00372 #ifdef __USEATOMIC
00373   unsigned int *llhists1; // a pointer to the beginning of llhists
00374                           // which is operated on by the warp
00375 
00376   __shared__ unsigned int llhists[MAXBIN];  // array holds 1 histogram per warp.
00377 #else
00378   volatile unsigned int *llhists1; // a pointer to the beginning of llhists
00379                                    // which is operated on by the warp
00380 
00381   volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];  
00382   // this array will hold 1 histogram per warp.
00383 #endif
00384 
00385   __shared__ float xyzi[3*NBLOCK]; // coords for the portion of sel1 
00386                                    // being processed in shared memory
00387 
00388   float rxij, rxij2, rij; // registers holding intermediate values in
00389                           // calculating the distance between two atoms
00390 
00391   int ibin,nofblocksi;    // registers counters and upper limit in some loops
00392   int nleftover;          // # bins leftover after full blocks are processed
00393   float *pxi;             // pointer to the portion of sel1 being processed
00394   unsigned int joffset;           // the current offset in constant memory
00395   unsigned int loopmax, loopmax2; // the limit for several loops
00396   float rmin2=.0001/delr_inv;
00397 
00398 #ifndef __USEATOMIC
00399   // this thread tag is needed for the histograming method implemented in
00400   // addData above.
00401   const unsigned int threadTag = ((unsigned int)
00402                                  ( threadIdx.x & (THREADSPERWARP - 1)))
00403                                  << (32 - WARP_LOG_SIZE);
00404 #endif
00405 
00406   // initialize llhists1.  If atomics are used, then each block has its own 
00407   // copy of the histogram in shared memory, which will be stored to it's own 
00408   // place in global memory.  If atomics aren't used then we need 1 histogram 
00409   // per warp. The number of blocks is also set approprately so that the
00410   // histograms may be zeroed out.
00411 #ifdef __USEATOMIC
00412   llhists1=llhists;
00413   nofblocksi=nbins/NBLOCK;
00414   nleftover=nbins-nofblocksi*NBLOCK;
00415 #else
00416   llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
00417   nofblocksi=nbins/THREADSPERWARP;
00418   nleftover=nbins-nofblocksi*THREADSPERWARP;
00419 #endif
00420 
00421   // Here we also zero out the shared memory used in this routine.
00422 #ifdef __USEATOMIC
00423   loopmax = nofblocksi*NBLOCK;
00424   for (io=0; io<loopmax; io+=NBLOCK) {
00425     llhists1[io + threadIdx.x]=0UL;
00426   } 
00427   if (threadIdx.x < nleftover) {
00428     llhists1[io + threadIdx.x]=0UL;
00429   } 
00430    
00431 #else
00432   loopmax = nofblocksi*THREADSPERWARP;
00433   for (io=0; io<loopmax; io+=THREADSPERWARP) {
00434     llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00435   }  
00436   if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
00437     llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00438   } 
00439 #endif
00440 
00441   // initialize nofblocks and pxi
00442   nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00443   pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00444 
00445   loopmax = 3 * natomsj;
00446   int idxt3 = 3 * threadIdx.x;
00447   int idxt3p1 = idxt3+1;
00448   int idxt3p2 = idxt3+2;
00449 
00450   loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
00451 
00452   // loop over all atoms in constant memory, using either 
00453   // periodic or non-periodic particle pair distance calculation
00454   if (usepbc) {
00455     for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00456       __syncthreads();
00457 
00458       // these loads from global memory should be coallesced
00459       xyzi[threadIdx.x         ]=pxi[iblock         ];
00460       xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00461       xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00462 
00463       __syncthreads();
00464 
00465       for (joffset=0; joffset<loopmax; joffset+=3) {
00466         // calculate the distance between two atoms.  rxij and rxij2 are reused
00467         // to minimize the number of registers used
00468         rxij=fabsf(xyzi[idxt3  ] - xyzj[joffset  ]);
00469         rxij2=celld.x - rxij;
00470         rxij=fminf(rxij, rxij2);
00471         rij=rxij*rxij;
00472         rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
00473         rxij2=celld.y - rxij;
00474         rxij=fminf(rxij, rxij2);
00475         rij+=rxij*rxij;
00476         rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
00477         rxij2=celld.z - rxij;
00478         rxij=fminf(rxij, rxij2);
00479         rij=sqrtf(rij + rxij*rxij);
00480 
00481         // increment the appropriate bin, don't add duplicates to the zeroth bin
00482         ibin=__float2int_rd((rij-rmin)*delr_inv);
00483         if (ibin<nbins && ibin>=0 && rij>rmin2) {
00484 #ifdef __USEATOMIC
00485           atomicAdd(llhists1+ibin, 1U);
00486 #else
00487           addData(llhists1, ibin, threadTag);
00488 #endif
00489         }
00490       } //joffset
00491     } //iblock
00492   } else { // non-periodic
00493     for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00494       __syncthreads();
00495 
00496       // these loads from global memory should be coallesced
00497       xyzi[threadIdx.x         ]=pxi[iblock         ];
00498       xyzi[threadIdx.x+  NBLOCK]=pxi[iblock+  NBLOCK];
00499       xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00500 
00501       __syncthreads();
00502 
00503       // loop over all atoms in constant memory
00504       for (joffset=0; joffset<loopmax; joffset+=3) {
00505         // calculate the distance between two atoms.  rxij and rxij2 are reused
00506         // to minimize the number of registers used
00507         rxij=xyzi[idxt3  ] - xyzj[joffset  ];
00508         rij=rxij*rxij;
00509         rxij=xyzi[idxt3p1] - xyzj[joffset+1];
00510         rij+=rxij*rxij;
00511         rxij=xyzi[idxt3p2] - xyzj[joffset+2];
00512         rij=sqrtf(rij + rxij*rxij);
00513 
00514         // increment the appropriate bin, don't add duplicates to the zeroth bin
00515         ibin=__float2int_rd((rij-rmin)*delr_inv);
00516         if (ibin<nbins && ibin>=0 && rij>rmin2) {
00517 #ifdef __USEATOMIC
00518           atomicAdd(llhists1+ibin, 1U);
00519 #else
00520           addData(llhists1, ibin, threadTag);
00521 #endif
00522         }
00523       } //joffset
00524     } //iblock
00525   }
00526 
00527   __syncthreads();
00528 
00529 
00530   // below we store the histograms to global memory so that they may be summed
00531   // in another routine.  Setting nofblo
00532   nofblocksi=nbins/NBLOCK;
00533   nleftover=nbins-nofblocksi*NBLOCK;
00534                
00535 #ifdef __USEATOMIC
00536   // reinitialize llhists1 to point to global memory
00537   unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;
00538 
00539   // loop to add all elements to global memory
00540   loopmax = nofblocksi*NBLOCK;
00541   for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
00542     llhistg1[iblock] += llhists[iblock+threadIdx.x];
00543   }
00544   // take care of leftovers
00545   if (threadIdx.x < nleftover) {
00546     llhistg1[iblock] += llhists[iblock+threadIdx.x];
00547   }
00548 #else
00549   // reinitialize llhists1 to point to global memory
00550   unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;
00551 
00552   // loop to add all elements to global memory
00553   loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
00554   loopmax2 = nofblocksi * NBLOCK;
00555   for (io=0; io < loopmax; io+=MAXBIN) {
00556     for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
00557       llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
00558     }
00559     // take care of leftovers
00560     if (threadIdx.x < nleftover) {
00561       llhistg1[iblock] += llhists[io+iblock+threadIdx.x];
00562     }
00563   }
00564 #endif
00565 
00566   return;
00567 }
00568 
00569 
00570 #define ull2float __uint2float_rn
00571 
00572 /* This routine takes a series of integer histograms in llhist, 
00573  * stored in as integers, converts them to floats, multiplies 
00574  * them by appropriate factors, and sums them.  The result is 
00575  * stored in histdev.
00576  */
00577 __global__ void calculate_histogram(float* histdev, // histogram to return
00578                              unsigned int* llhistg, // the histograms stored
00579                               // in global memory.  There is one histogram
00580                               // per block (warp) if atomicAdds are (not) used
00581                                     int nbins) // stride between geometries in xyz.
00582 {
00583   int io;      // index for inner loop
00584   int iblock;  // index for outer loop  
00585   int maxloop; // maxima for loop conditions
00586 
00587   __shared__ unsigned int llhists[NBLOCKHIST]; // block of int histogram in final summation
00588 
00589   __shared__ float xi[NBLOCKHIST]; // smem for the floating point histograms.
00590   int nofblocks;                   // number of data blocks per histogram
00591 
00592   nofblocks=nbins/NBLOCKHIST;      // set number of blocks per histogram
00593   int nleftover=nbins-nofblocks*NBLOCKHIST;
00594   maxloop = nofblocks*NBLOCKHIST;  // set maxloop and maxloop2
00595 
00596   // loop over all of the blocks in a histogram
00597   for (iblock=0;iblock<maxloop;iblock+=NBLOCKHIST) {
00598     xi[threadIdx.x]=0.0f; // zero out xi
00599 
00600     // loop over the histograms created by calculate_rdf
00601 #ifdef __USEATOMIC
00602     for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00603 #else
00604     for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00605 #endif
00606       // load integer histogram data into shared memory (coalesced)
00607       llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00608 
00609       // shave off the thread tag that might remain from calculate_rdf
00610       llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00611 
00612       // convert to float
00613       xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00614     }
00615 
00616     // ... and store in global memory
00617     histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00618   }
00619 
00620   // take care of leftovers
00621   if (threadIdx.x < nleftover) {
00622     xi[threadIdx.x]=0.0f; // zero out xi
00623 
00624     // loop over the histograms created by calculate_rdf
00625 #ifdef __USEATOMIC
00626     for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00627 #else
00628     for (io=0; io < MAXBIN *(NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00629 #endif
00630       // load integer histogram data into shared memory (coalesced)
00631       llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00632 
00633       // shave off the thread tag that might remain from calculate_rdf
00634       llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00635 
00636       // convert to float
00637       xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00638     }
00639 
00640     // ... and store in global memory
00641     histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00642   }
00643 
00644   // calculate_hist out
00645   return;
00646 }
00647 
00648 
00649 /*
00650  * calculate_histogram_block
00651  * Compute one histogram block 
00652  */
00653 void calculate_histogram_block(int usepbc,
00654                                float *xyz, 
00655                                int natomsi, 
00656                                int natomsj,
00657                                float3 celld,
00658                                unsigned int* llhistg,
00659                                int nbins,
00660                                float rmin,
00661                                float delr_inv,
00662                                float *histdev,
00663                                int nblockhist) {
00664   // zero out the histograms in global memory
00665   init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);
00666       
00667   // calculate the histograms
00668   calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
00669                                          llhistg, nbins, rmin, delr_inv);
00670 
00671   // sum histograms and begin normalization by adjusting for the cell volume
00672   calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
00673 }
00674 
00675 
00676 
00677 /*
00678  * input parameters for rdf_thread() routine
00679  */
00680 typedef struct {
00681   int usepbc;        // periodic or non-periodic calculation
00682   int natoms1;       // number of atoms in selection 1.
00683   float* xyz;        // coordinates of first selection, [natoms1][3]
00684   int natoms2;       // number of atoms in selection 2.
00685   float** xyz2array; // coordinates of selection 2. [natoms2][3] (per-thread)
00686   float* cell;       // the cell x y and z dimensions [3]
00687   float** histarray; // the final histogram in host mem [nbins] (per-thread)
00688   int nbins;         // the number of bins in the histogram
00689   int nblockhist;    // the size of a block used in the
00690                      // reduction of the histogram
00691   float rmin;        // the minimum value of the first bin
00692   float delr;        // the width of each bin
00693   int nblocks;       // number of const blocks to compute all pairs histogram
00694   int nhblock;       // # NBLOCKHIST-sized blocks needed to calc whole histogram
00695 } rdfthrparms;
00696 
00697 
00698 /*  
00699  * rdf_thread -- multithreaded version of subroutine_rdf()
00700  * This routine is called from the CPU to calculate g(r) for a single frame
00701  * on the GPU.  Right now it performs a brute force O(N^2) calculations
00702  * (with all atomic pairs considered).  Future version may reduce the overall
00703  * work by performing a grid search.
00704  */
00705 static void * rdf_thread(void *voidparms) {
00706   rdfthrparms *parms = NULL;
00707   int threadid=0;
00708 
00709   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00710   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00711 
00712 #if 0
00713 printf("rdf thread[%d] lanched and running...\n", threadid);
00714 #endif
00715   cudaSetDevice(threadid);
00716 
00717   /*
00718    * copy in per-thread parameters
00719    */
00720   const int natoms1 = parms->natoms1;
00721   const float *xyz = parms->xyz;
00722   const int natoms2 = parms->natoms2;
00723   const float *cell = parms->cell;
00724   const int nbins = parms->nbins;
00725   const int nblockhist = parms->nblockhist;
00726   const float rmin = parms->rmin;
00727   const float delr = parms->delr;
00728   float *xyz2 = parms->xyz2array[threadid];
00729   float *hist = parms->histarray[threadid];
00730   const int nhblock = parms->nhblock;
00731   const int usepbc = parms->usepbc;
00732 
00733   float* xyzdev;         // pointer to xyz data in global memory
00734   float3 celld;          // cell x, y, and z dimensions
00735   float* histdev;        // pointer to float histogram in global memory
00736   unsigned int* llhistg; // pointer to int histograms in global memory
00737 
00738   int nconstblocks; // # full blocks to be sent to constant memory
00739   int nleftover;    // # elements in leftover block of const mem
00740   int nbigblocks;   // number of blocks required to prevent histogram overflow
00741   int ibigblock;    // the index of the current big block
00742   int nleftoverbig; // the number of elements in the final big block
00743   int ntmpbig;
00744   int nbinstmp;     // number of bins currently being processed
00745   int natoms1pad;   // number of atoms in selection 1 after padding
00746   int natoms2pad;   // number of atoms in selection 1 after padding
00747   float rmax;       // cutoff radius for atom contributions to histogram
00748 
00749   // natoms1pad is natoms1 rounded up to the next multiple of NBLOCK
00750   natoms1pad=natoms1;
00751   if (natoms1%NBLOCK!=0) {natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK;}
00752   natoms2pad=natoms2;
00753   if (natoms2%NBLOCK!=0) {natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK;}
00754 
00755   // allocate global memory for:
00756 
00757   // the xyz coordinates of selection 1
00758   cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));
00759 
00760   // the final copy of the histogram
00761   cudaMalloc((void**)&histdev, nbins*sizeof(float));
00762 
00763   // zero out floating point histogram
00764   int ihblock;         // loop counter for histogram blocks
00765   int maxloop=nbins/nblockhist;
00766   if (maxloop*nblockhist!=nbins) {maxloop=maxloop+1;}
00767   for (ihblock=0; ihblock<maxloop; ihblock++) {
00768     init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
00769   }
00770 
00771   // the global memory copies of the histogram
00772 #ifdef __USEATOMIC
00773   cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
00774 #else
00775   cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
00776              sizeof(int))/THREADSPERWARP);
00777 #endif
00778 
00779   // set the cell dimensions
00780   celld.x=cell[0];
00781   celld.y=cell[1];
00782   celld.z=cell[2];
00783 
00784   // set rmax.  this is the distance that the padding atoms must be from the 
00785   // unit cell AND that they must be from each other
00786   rmax = 1.1f * (rmin+((float)nbins)*delr);
00787 
00788   // send the second selection to the gpu for reimaging
00789   cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);
00790 
00791   if (usepbc) {
00792     // reimage atom coors into a single periodic cell
00793     reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
00794   } else {
00795     // shift phantom atoms so they don't interfere with non-periodic calc.
00796     phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
00797   }
00798 
00799   // now, unfortunately, we have to move it back because this selection 
00800   // will need to be sent to constant memory (done on the host side)
00801   cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);
00802 
00803   // send the xyz coords of selection 1 to the gpu
00804   cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);
00805 
00806   if (usepbc) {
00807     // reimage xyz coords back into the periodic box and zero all histograms
00808     reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
00809   } else {
00810     // shift phantom atoms so they don't interfere with non-periodic calc.
00811     phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
00812   }
00813 
00814   // all blocks except for the final one will have MAXBIN elements
00815   nbinstmp = MAXBIN; 
00816 
00817   // calc # of groups to divide sel2 into to fit it into constant memory
00818   nconstblocks=natoms2/NCONSTBLOCK;
00819   nleftover=natoms2-nconstblocks*NCONSTBLOCK;
00820 
00821   // set up the big blocks to avoid bin overflow
00822   ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
00823   nbigblocks = natoms1pad / ntmpbig;
00824   nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
00825 
00826   int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;
00827 
00828   // parallel loop over all "constblocks"
00829   wkf_tasktile_t tile; // next batch of work units
00830   while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00831 #if 0
00832 printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
00833 #endif
00834     int iconstblock = -1; // loop counter for constblocks, force update 
00835     ihblock = -1;     // initialize to sentinel value to force immediate update
00836     int workitem;     // current parallel work item
00837     for (workitem=tile.start; workitem<tile.end; workitem++) {
00838       // decode work item into ihblock and iconstblock indices
00839       ihblock = workitem % nhblock;
00840       int newiconstblock = workitem / nhblock;
00841       int blocksz;
00842 
00843       // When moving to the next constblock we must copy in new coordinates
00844       if (iconstblock != newiconstblock) {
00845         iconstblock = newiconstblock;
00846 
00847         // take care of the leftovers on the last block
00848         blocksz = (iconstblock == nconstblocks && nleftover != 0) 
00849                   ? nleftover : NCONSTBLOCK;
00850 
00851         // send a "constblock" of coords from selection 2 to const mem
00852         copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
00853       } 
00854 
00855 #if 0
00856 printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid, 
00857        iconstblock, ihblock);
00858 #endif
00859 
00860       // loop over blocks of histogram
00861       // minimum distance for the current histogram block
00862       // the first block starts at rmin
00863       float rmintmp=rmin + (MAXBIN * delr * ihblock);
00864 
00865       nbinstmp = MAXBIN;
00866 
00867       // the final block will have nbin%MAXBIN elements
00868       if (ihblock==(nhblock-1)) { 
00869         nbinstmp = nbins%MAXBIN;
00870         // if nbins is an even multiple of MAXBIN, the final block is full
00871         if (nbinstmp==0) { nbinstmp = MAXBIN;}
00872       }
00873 
00874       for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
00875         // take care of the leftovers on the last block
00876         int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0) 
00877                        ? nleftoverbig : ntmpbig;
00878 
00879         calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
00880                                   bblocksz, blocksz, celld,
00881                                   llhistg, nbinstmp, rmintmp, (1/delr),
00882                                   histdev+ihblock*MAXBIN, nblockhist);
00883       }
00884 
00885       cudaThreadSynchronize();
00886     }
00887   } // end of parallel tile fetch while loop
00888 
00889   // retrieve g of r for this frame
00890   cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);
00891 
00892   cudaFree(xyzdev);  // free gpu global memory
00893   cudaFree(histdev);
00894   cudaFree(llhistg);
00895 
00896 #if 0
00897 printf("thread[%d] finished, releasing resources\n", threadid); 
00898 #endif
00899 
00900   return NULL;
00901 }
00902 
00903 
00904 int rdf_gpu(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
00905             int usepbc,                // periodic or non-periodic calc.
00906             int natoms1,               // array of the number of atoms in
00907                                        // selection 1 in each frame.
00908             float *xyz,                // coordinates of first selection.
00909                                        // [natoms1][3]
00910             int natoms2,               // array of the number of atoms in
00911                                        // selection 2 in each frame.
00912             float *xyz2,               // coordinates of selection 2.
00913                                        // [natoms2][3]
00914             float* cell,               // the cell x y and z dimensions [3]
00915             float* hist,               // the histograms, 1 per block
00916                                        // [ncudablocks][maxbin]
00917             int nbins,                 // the number of bins in the histogram
00918             float rmin,                // the minimum value of the first bin
00919             float delr)                // the width of each bin
00920 {
00921   int i, ibin;
00922 
00923   if (devpool == NULL) {
00924     return -1; // abort if no device pool exists
00925   }
00926   int numprocs = wkf_threadpool_get_workercount(devpool);
00927 
00928   // multi-thread buffers
00929   float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
00930   float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
00931   for (i=0; i<numprocs; i++) {
00932     xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00933     memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00934     histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
00935   }
00936 
00937   memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
00938 
00939   // setup thread parameters
00940   rdfthrparms parms;
00941 
00942   parms.usepbc = usepbc;
00943   parms.natoms1 = natoms1;
00944   parms.xyz = xyz;
00945   parms.natoms2 = natoms2;
00946   parms.cell = cell;
00947   parms.nbins = nbins;
00948   parms.nblockhist = NBLOCKHIST; // size of final reduction block 
00949   parms.rmin = rmin;
00950   parms.delr = delr;
00951   parms.xyz2array = xyz2array; // per-thread output data
00952   parms.histarray = histarray; // per-thread output data
00953 
00954   // calculate the number of blocks to divide the histogram into
00955   parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;
00956 
00957   // calc # of groups to divide sel2 into to fit it into constant memory
00958   int nconstblocks=parms.natoms2/NCONSTBLOCK;
00959   int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;
00960 
00961   // we have to do one extra cleanup pass if there are leftovers
00962   parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;
00963 
00964   int parblocks = parms.nblocks * parms.nhblock;
00965 #if 0
00966   printf("master thread: number of parallel work items: %d\n", parblocks);
00967 #endif
00968 
00969   // spawn child threads to do the work
00970   wkf_tasktile_t tile;
00971   tile.start=0;
00972   tile.end=parblocks;
00973   wkf_threadpool_sched_dynamic(devpool, &tile);
00974   wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);
00975 
00976   memset(hist, 0, nbins * sizeof(float)); // clear global histogram array
00977   // collect independent thread results into final histogram buffer
00978   for (i=0; i<numprocs; i++) {
00979     for (ibin=0; ibin<nbins; ibin++) {
00980       hist[ibin] += histarray[i][ibin];
00981     }      
00982   }
00983 
00984   return 0;
00985 }
00986 
00987 
00988 

Generated on Sat May 26 01:47:50 2012 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002