CUDAMDFF.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAMDFF.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.84 $      $Date: 2021/12/21 06:59:27 $
00014  *
00015  ***************************************************************************/
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 #include <cuda.h>
00032 #if CUDART_VERSION >= 9000
00033 #include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
00034 #endif
00035 #if CUDART_VERSION < 4000
00036 #error The VMD MDFF feature requires CUDA 4.0 or later
00037 #endif
00038 
00039 #include <float.h> // FLT_MAX etc
00040 
00041 
00042 #include "Inform.h"
00043 #include "utilities.h"
00044 #include "WKFThreads.h"
00045 #include "WKFUtils.h"
00046 #include "CUDAKernels.h" 
00047 #include "CUDASpatialSearch.h"
00048 
00049 #include "AtomSel.h"
00050 #include "VMDApp.h"
00051 #include "MoleculeList.h"
00052 #include "VolumetricData.h"
00053 #include "VolMapCreate.h" // volmap_write_dx_file()
00054 
00055 #include "CUDAMDFF.h"
00056 
00057 #include <tcl.h>
00058 #include "TclCommands.h"
00059 
00060 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00061 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00062 
00063 #if 1
00064 #define CUERR { cudaError_t err; \
00065   if ((err = cudaGetLastError()) != cudaSuccess) { \
00066   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00067   printf("Thread aborting...\n"); \
00068   return NULL; }}
00069 #else
00070 #define CUERR
00071 #endif
00072 
00073 
00074 
00075 static int check_gpu_compute20(cudaError_t &err) {
00076   cudaDeviceProp deviceProp;
00077   int dev;
00078   if (cudaGetDevice(&dev) != cudaSuccess)
00079     return -1;
00080   
00081   memset(&deviceProp, 0, sizeof(cudaDeviceProp));
00082   
00083   if (cudaGetDeviceProperties(&deviceProp, dev) != cudaSuccess) {
00084     err = cudaGetLastError(); // eat error so next CUDA op succeeds
00085     return -1;
00086   }
00087 
00088   // this code currently requires compute capability 2.0
00089   if (deviceProp.major < 2)
00090     return -1;
00091 
00092   return 0;
00093 }
00094 
00095 
00096 
00097 //
00098 // Various math operators for vector types not already
00099 // provided by the regular CUDA headers
00100 //
00101 // "+" operator
00102 inline __host__ __device__ float2 operator+(float2 a, float2 b) {
00103   return make_float2(a.x + b.x, a.y + b.y);
00104 }
00105 
00106 // "+=" operator
00107 inline __host__ __device__ void operator+=(float2 &a, float2 b) {
00108   a.x += b.x;
00109   a.y += b.y;
00110 }
00111 
00112 inline __host__ __device__ float4 operator+(float4 a, float4 b) {
00113   return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
00114 }
00115 
00116 // "+=" operator
00117 inline __host__ __device__ void operator+=(float4 &a, float4 b) {
00118   a.x += b.x;
00119   a.y += b.y;
00120   a.z += b.z;
00121   a.w += b.w;
00122 }
00123 
00124 
00125 
00126 //
00127 // density format conversion routines
00128 //
00129 
00130 // no-op conversion for float to float
00131 inline __device__ void convert_density(float & df, float df2) {
00132   df = df2;
00133 }
00134 
00135 // Convert float (32-bit) to half-precision (16-bit floating point) stored
00136 // into an unsigned short (16-bit integer type).
00137 inline __device__ void convert_density(unsigned short & dh, float df2) {
00138   dh = __float2half_rn(df2);
00139 }
00140 
00141 
00142 //
00143 // Restrict macro to make it easy to do perf tuning tests
00144 //
00145 #if 1 && (CUDART_VERSION >= 4000)
00146 #define RESTRICT __restrict__
00147 #else
00148 #define RESTRICT
00149 #endif
00150 
00151 
00152 // 
00153 // Parameters for linear-time range-limited gaussian density kernels
00154 //
00155 #define GGRIDSZ   8.0f
00156 #define GBLOCKSZX 8
00157 #define GBLOCKSZY 8
00158 
00159 #define GBLOCKSZZ    2
00160 #define GUNROLL      4
00161 
00162 #define TOTALBLOCKSZ (GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ)
00163 
00164 
00165 //
00166 // All of the current kernel versions require support for atomic ops,
00167 // so they only work with compute capability 2.x or greater...
00168 //
00169 
00170 //
00171 // global atomic counter
00172 //
00173 __device__ unsigned int tbcatomic[3] = {0, 0, 0};
00174 
00175 __device__ void reset_atomic_counter(unsigned int *counter) {
00176   counter[0] = 0;
00177   __threadfence();
00178 }
00179 
00180 
00181 //
00182 // completely inlined device function to calculate the min/max
00183 // bounds of the acceleration grid region that must be traversed to
00184 // satisfy the current potential grid point
00185 //
00186 inline __device__ void calc_ac_cell_ids(int3 &abmin, int3 &abmax,
00187                                         int3 acncells,
00188                                         float3 acoriginoffset,
00189                                         float gridspacing, 
00190                                         float acgridspacing,
00191                                         float invacgridspacing) {
00192   // compute ac grid index of lower corner minus gaussian radius
00193   abmin.x = (acoriginoffset.x + (blockIdx.x * blockDim.x) * gridspacing - acgridspacing) * invacgridspacing;
00194   abmin.y = (acoriginoffset.y + (blockIdx.y * blockDim.y) * gridspacing - acgridspacing) * invacgridspacing;
00195   abmin.z = (acoriginoffset.z + (blockIdx.z * blockDim.z * GUNROLL) * gridspacing - acgridspacing) * invacgridspacing;
00196 
00197   // compute ac grid index of upper corner plus gaussian radius
00198   abmax.x = (acoriginoffset.x + ((blockIdx.x+1) * blockDim.x) * gridspacing + acgridspacing) * invacgridspacing;
00199   abmax.y = (acoriginoffset.y + ((blockIdx.y+1) * blockDim.y) * gridspacing + acgridspacing) * invacgridspacing;
00200   abmax.z = (acoriginoffset.z + ((blockIdx.z+1) * blockDim.z * GUNROLL) * gridspacing + acgridspacing) * invacgridspacing;
00201 
00202   abmin.x = (abmin.x < 0) ? 0 : abmin.x;
00203   abmin.y = (abmin.y < 0) ? 0 : abmin.y;
00204   abmin.z = (abmin.z < 0) ? 0 : abmin.z;
00205   abmax.x = (abmax.x >= acncells.x-1) ? acncells.x-1 : abmax.x;
00206   abmax.y = (abmax.y >= acncells.y-1) ? acncells.y-1 : abmax.y;
00207   abmax.z = (abmax.z >= acncells.z-1) ? acncells.z-1 : abmax.z;
00208 }
00209 
00210 
00211 
00212 inline __device__ void calc_densities(int xindex, int yindex, int zindex,
00213                                       const float4 * RESTRICT sorted_xyzr, 
00214                                       int3 abmin,
00215                                       int3 abmax,
00216                                       int3 acncells,
00217                                       float3 acoriginoffset,
00218                                       const uint2 * RESTRICT cellStartEnd,
00219                                       float gridspacing, 
00220                                       float &densityval1
00221 #if GUNROLL >= 2
00222                                       ,float &densityval2
00223 #endif
00224 #if GUNROLL >= 4
00225                                       ,float &densityval3
00226                                       ,float &densityval4
00227 #endif
00228                                       ) {
00229   float coorx = acoriginoffset.x + gridspacing * xindex;
00230   float coory = acoriginoffset.y + gridspacing * yindex;
00231   float coorz = acoriginoffset.z + gridspacing * zindex;
00232 
00233   int acplanesz = acncells.x * acncells.y;
00234   int xab, yab, zab;
00235   for (zab=abmin.z; zab<=abmax.z; zab++) {
00236     for (yab=abmin.y; yab<=abmax.y; yab++) {
00237       for (xab=abmin.x; xab<=abmax.x; xab++) {
00238         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
00239         uint2 atomstartend = cellStartEnd[abcellidx];
00240         if (atomstartend.x != GRID_CELL_EMPTY) {
00241           unsigned int atomid;
00242           for (atomid=atomstartend.x; atomid<atomstartend.y; atomid++) {
00243             float4 atom = sorted_xyzr[atomid];
00244             float dx = coorx - atom.x;
00245             float dy = coory - atom.y;
00246             float dxy2 = dx*dx + dy*dy;
00247   
00248             float dz = coorz - atom.z;
00249             float r21 = (dxy2 + dz*dz) * atom.w;
00250             densityval1 += __expf(r21);
00251 
00252 #if GUNROLL >= 2
00253             float dz2 = dz + gridspacing;
00254             float r22 = (dxy2 + dz2*dz2) * atom.w;
00255             densityval2 += __expf(r22);
00256 #endif
00257 #if GUNROLL >= 4
00258             float dz3 = dz2 + gridspacing;
00259             float r23 = (dxy2 + dz3*dz3) * atom.w;
00260             densityval3 += __expf(r23);
00261 
00262             float dz4 = dz3 + gridspacing;
00263             float r24 = (dxy2 + dz4*dz4) * atom.w;
00264             densityval4 += __expf(r24);
00265 #endif
00266           }
00267         }
00268       }
00269     }
00270   }
00271 }
00272 
00273 
00274 
00275 #if GUNROLL == 1
00276 #define MAINDENSITYLOOP \
00277   float densityval1=0.0f; \
00278   calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00279                  acoriginoffset, cellStartEnd, gridspacing, densityval1);
00280 #elif GUNROLL == 2
00281 #define MAINDENSITYLOOP \
00282   float densityval1=0.0f; \
00283   float densityval2=0.0f; \
00284   calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00285                  acoriginoffset, cellStartEnd, gridspacing,                   \
00286                  densityval1, densityval2);
00287 #elif GUNROLL == 4
00288 #define MAINDENSITYLOOP \
00289   float densityval1=0.0f; \
00290   float densityval2=0.0f; \
00291   float densityval3=0.0f; \
00292   float densityval4=0.0f; \
00293   calc_densities(xindex, yindex, zindex, sorted_xyzr, abmin, abmax, acncells, \
00294                  acoriginoffset, cellStartEnd, gridspacing,                   \
00295                  densityval1, densityval2, densityval3, densityval4);
00296 #endif
00297 
00298 
00299 //
00300 // compute density map
00301 //
00302 __global__ static void gaussdensity_fast(int natoms,
00303                                          const float4 * RESTRICT sorted_xyzr, 
00304                                          int3 volsz,
00305                                          int3 acncells,
00306                                          float3 acoriginoffset,
00307                                          float acgridspacing,
00308                                          float invacgridspacing,
00309                                          const uint2 * RESTRICT cellStartEnd,
00310                                          float gridspacing, unsigned int z, 
00311                                          float * RESTRICT densitygrid) {
00312   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00313   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00314   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00315   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00316                          yindex * volsz.x + 
00317                          xindex;
00318   zindex += z;
00319 
00320   // early exit if this thread is outside of the grid bounds
00321   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00322     return;
00323 
00324   // compute ac grid index of lower corner minus gaussian radius
00325   int3 abmin, abmax;
00326   calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00327                    gridspacing, acgridspacing, invacgridspacing);
00328 
00329   // density loop macro
00330   MAINDENSITYLOOP
00331 
00332   densitygrid[outaddr            ] = densityval1;
00333 #if GUNROLL >= 2
00334   int planesz = volsz.x * volsz.y;
00335   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00336     densitygrid[outaddr +   planesz] = densityval2;
00337 #endif
00338 #if GUNROLL >= 4
00339   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00340     densitygrid[outaddr + 2*planesz] = densityval3;
00341   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00342     densitygrid[outaddr + 3*planesz] = densityval4;
00343 #endif
00344 }
00345 
00346 
00347 
00348 //
00349 // compute density differences
00350 //
00351 __global__ static void gaussdensity_diff(int natoms,
00352                                          const float4 * RESTRICT sorted_xyzr, 
00353                                          int3 volsz,
00354                                          int3 acncells,
00355                                          float3 acoriginoffset,
00356                                          float acgridspacing,
00357                                          float invacgridspacing,
00358                                          const uint2 * RESTRICT cellStartEnd,
00359                                          float gridspacing, unsigned int z, 
00360                                          int3 refvolsz,
00361                                          int3 refoffset,
00362                                          const float * RESTRICT refmap,
00363                                          float * RESTRICT diffmap) {
00364   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00365   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00366   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00367   unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y + 
00368                          (yindex+refoffset.y) * refvolsz.x + 
00369                          (xindex+refoffset.x);
00370   unsigned int outaddr = zindex * volsz.x * volsz.y + 
00371                          yindex * volsz.x + 
00372                          xindex;
00373   zindex += z;
00374 
00375   // early exit if this thread is outside of the grid bounds
00376   if (xindex >= volsz.x || yindex >= volsz.y || zindex >= volsz.z)
00377     return;
00378 
00379   // compute ac grid index of lower corner minus gaussian radius
00380   int3 abmin, abmax;
00381   calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00382                    gridspacing, acgridspacing, invacgridspacing);
00383 
00384   // density loop macro
00385   MAINDENSITYLOOP
00386 
00387   diffmap[outaddr            ] = refmap[refaddr               ] - densityval1;
00388 #if GUNROLL >= 2
00389   int planesz = volsz.x * volsz.y;
00390   int refplanesz = refvolsz.x * refvolsz.y;
00391   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00392     diffmap[outaddr +   planesz] = refmap[refaddr +   refplanesz] - densityval2;
00393 #endif
00394 #if GUNROLL >= 4
00395   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00396     diffmap[outaddr + 2*planesz] = refmap[refaddr + 2*refplanesz] - densityval3;
00397   if (++zindex < volsz.z) // map isn't always an even multiple of block size
00398     diffmap[outaddr + 3*planesz] = refmap[refaddr + 3*refplanesz] - densityval4;
00399 #endif
00400 }
00401 
00402 
00403 
00404 //
00405 // sum of absolute differences
00406 //
00407 __device__ float sumabsdiff_sumreduction(int tid, int totaltb, 
00408                                          float *sumabsdiffs_s,
00409                                          float *sumabsdiffs) {
00410   float sumabsdifftotal = 0.0f;
00411 
00412   // use precisely one warp to do the final reduction
00413   if (tid < warpSize) {
00414     for (int i=tid; i<totaltb; i+=warpSize) {
00415       sumabsdifftotal += sumabsdiffs[i];
00416     }
00417 
00418     // write to shared mem
00419     sumabsdiffs_s[tid] = sumabsdifftotal;
00420   }
00421   __syncthreads(); // all threads must hit syncthreads call...
00422 
00423   // perform intra-warp parallel reduction...
00424   // general loop version of parallel sum-reduction
00425   for (int s=warpSize>>1; s>0; s>>=1) {
00426     if (tid < s) {
00427       sumabsdiffs_s[tid] += sumabsdiffs_s[tid + s];
00428     }
00429     __syncthreads(); // all threads must hit syncthreads call...
00430   }
00431 
00432   return sumabsdiffs_s[0];
00433 }
00434 
00435 
00436 __global__ static void gaussdensity_sumabsdiff(int totaltb,
00437                                                int natoms,
00438                                                const float4 * RESTRICT sorted_xyzr, 
00439                                                int3 volsz,
00440                                                int3 acncells,
00441                                                float3 acoriginoffset,
00442                                                float acgridspacing,
00443                                                float invacgridspacing,
00444                                                const uint2 * RESTRICT cellStartEnd,
00445                                                float gridspacing, 
00446                                                unsigned int z, 
00447                                                int3 refvolsz,
00448                                                int3 refoffset,
00449                                                const float * RESTRICT refmap,
00450                                                float * RESTRICT sumabsdiff) {
00451   int tid = threadIdx.z*blockDim.x*blockDim.y + 
00452             threadIdx.y*blockDim.x + threadIdx.x;
00453 
00454 #if __CUDA_ARCH__ >= 200
00455   // setup shared variable
00456   __shared__ bool isLastBlockDone;
00457   if (tid == 0) 
00458     isLastBlockDone = 0;
00459   __syncthreads();
00460 #endif
00461 
00462   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00463   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00464   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00465   unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y +
00466                          (yindex+refoffset.y) * refvolsz.x +
00467                          (xindex+refoffset.x);
00468   zindex += z;
00469 
00470   float thread_sumabsdiff = 0.0f;
00471 
00472   // can't early exit if this thread is outside of the grid bounds,
00473   // so we store a diff of 0.0 instead.
00474   if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
00475     // compute ac grid index of lower corner minus gaussian radius
00476     int3 abmin, abmax;
00477     calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00478                      gridspacing, acgridspacing, invacgridspacing);
00479 
00480     // density loop macro
00481     MAINDENSITYLOOP
00482 
00483     thread_sumabsdiff += fabsf(refmap[refaddr               ] - densityval1);
00484 #if GUNROLL >= 2
00485     int refplanesz = refvolsz.x * refvolsz.y;
00486     if (++zindex < volsz.z) // map isn't always an even multiple of block size
00487       thread_sumabsdiff += fabsf(refmap[refaddr +   refplanesz] - densityval2);
00488 #endif
00489 #if GUNROLL >= 4
00490     if (++zindex < volsz.z) // map isn't always an even multiple of block size
00491       thread_sumabsdiff += fabsf(refmap[refaddr + 2*refplanesz] - densityval3);
00492     if (++zindex < volsz.z) // map isn't always an even multiple of block size
00493       thread_sumabsdiff += fabsf(refmap[refaddr + 3*refplanesz] - densityval4);
00494 #endif
00495   }
00496 
00497   // all threads write their local sums to shared memory...
00498   __shared__ float sumabsdiff_s[TOTALBLOCKSZ];
00499 
00500   sumabsdiff_s[tid] = thread_sumabsdiff;
00501   __syncthreads(); // all threads must hit syncthreads call...
00502 
00503   // use precisely one warp to do the thread-block-wide reduction
00504   if (tid < warpSize) {
00505     float tmp = 0.0f;
00506     for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00507       tmp += sumabsdiff_s[i];
00508     }
00509 
00510     // write to shared mem
00511     sumabsdiff_s[tid] = tmp;
00512   }
00513   __syncthreads(); // all threads must hit syncthreads call...
00514 
00515   // perform intra-warp parallel reduction...
00516   // general loop version of parallel sum-reduction
00517   for (int s=warpSize>>1; s>0; s>>=1) {
00518     if (tid < s) {
00519       sumabsdiff_s[tid] += sumabsdiff_s[tid + s];
00520     }
00521     __syncthreads(); // all threads must hit syncthreads call...
00522   }
00523 
00524   // check if we are the last thread block to finish and finalize results
00525   if (tid == 0) {   
00526     unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00527                        blockIdx.y * gridDim.x + blockIdx.x;
00528     sumabsdiff[bid] = sumabsdiff_s[0];
00529 #if __CUDA_ARCH__ >= 200
00530     __threadfence();
00531 
00532     unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00533     isLastBlockDone = (value == (totaltb - 1));
00534   }
00535   __syncthreads();
00536   if (isLastBlockDone) {
00537     float totalsumabsdiff = sumabsdiff_sumreduction(tid, totaltb, sumabsdiff_s, sumabsdiff); 
00538 
00539     if (tid == 0)
00540       sumabsdiff[totaltb] = totalsumabsdiff;
00541  
00542     reset_atomic_counter(&tbcatomic[0]);
00543 #else
00544     if (bid==0)
00545       sumabsdiff[totaltb] = 0.0f;
00546 #endif
00547   }
00548 }
00549 
00550 
00551 
00552 //
00553 // cross correlation
00554 //
00555 inline __device__ float calc_cc(float sums_ref, float sums_synth,
00556                                 float squares_ref, float squares_synth,
00557                                 int lsize, float lcc) {
00558   float cc = 0.0f;
00559 
00560   // detect and prevent cases that would cause division by zero
00561   // compute CC if at least one 1 pair of voxels was compared...
00562   if (lsize > 0) {
00563     float mean_ref     = sums_ref / lsize;
00564     float mean_synth   = sums_synth / lsize;
00565     float stddev_ref   = sqrtf(squares_ref / lsize - mean_ref*mean_ref);
00566     float stddev_synth = sqrtf(squares_synth / lsize - mean_synth*mean_synth);
00567 
00568     // To prevent corner cases that may contain only a single sample from
00569     // generating Inf or NaN correlation results, we test that the standard 
00570     // deviations are not only non-zero (thereby preventing Inf), but we go 
00571     // further, and test that the standard deviation values are 
00572     // greater than zero, as this type of comparison will return false 
00573     // in the presence of NaN, allowing us to handle that case better.
00574     // We can do the greater-than-zero comparison since it should not be
00575     // possible to get a negative result from the square root.
00576     if (stddev_ref > 0.0f && stddev_synth > 0.0f) {
00577       cc = (lcc - lsize*mean_ref*mean_synth) /
00578            (lsize * stddev_ref * stddev_synth);
00579 
00580       // report a CC of zero for blocks that have too few samples:
00581       // for now, we use a hard-coded threshold for testing purposes
00582       if (lsize < 32)
00583         cc = 0.0f; 
00584 
00585       // clamp out-of-range CC values, can be caused by too few samples...
00586       cc = (cc > -1.0f) ? cc : ((cc < 1.0f) ? cc : 1.0f);
00587     } else {
00588       cc = 0.0f;
00589     }
00590   }
00591 
00592   return cc;
00593 }
00594 
00595 
00596 
00597 // #define VMDUSESHUFFLE 1
00598 #if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
00599 // New warp shuffle-based CC sum reduction for Kepler and later GPUs.
00600 inline __device__ void cc_sumreduction(int tid, int totaltb, 
00601                                 float4 &total_cc_sums,
00602                                 float &total_lcc,
00603                                 int &total_lsize,
00604                                 float4 * RESTRICT tb_cc_sums,
00605                                 float * RESTRICT tb_lcc,
00606                                 int * RESTRICT tb_lsize) {
00607   total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00608   total_lcc = 0.0f;
00609   total_lsize = 0;
00610 
00611   // use precisely one warp to do the final reduction
00612   if (tid < warpSize) {
00613     for (int i=tid; i<totaltb; i+=warpSize) {
00614       total_cc_sums += tb_cc_sums[i];
00615       total_lcc     += tb_lcc[i];
00616       total_lsize   += tb_lsize[i];
00617     }
00618 
00619     // perform intra-warp parallel reduction...
00620     // general loop version of parallel sum-reduction
00621     for (int mask=warpSize/2; mask>0; mask>>=1) {
00622       total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
00623       total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
00624       total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
00625       total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
00626       total_lcc     += __shfl_xor_sync(0xffffffff, total_lcc, mask);
00627       total_lsize   += __shfl_xor_sync(0xffffffff, total_lsize, mask);
00628     }
00629   }
00630 }
00631 #else
00632 // shared memory based CC sum reduction 
00633 inline __device__ void cc_sumreduction(int tid, int totaltb, 
00634                                 float4 &total_cc_sums,
00635                                 float &total_lcc,
00636                                 int &total_lsize,
00637                                 float4 * RESTRICT tb_cc_sums,
00638                                 float * RESTRICT tb_lcc,
00639                                 int * RESTRICT tb_lsize) {
00640   total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00641   total_lcc = 0.0f;
00642   total_lsize = 0;
00643 
00644   // use precisely one warp to do the final reduction
00645   if (tid < warpSize) {
00646     for (int i=tid; i<totaltb; i+=warpSize) {
00647       total_cc_sums += tb_cc_sums[i];
00648       total_lcc     += tb_lcc[i];
00649       total_lsize   += tb_lsize[i];
00650     }
00651 
00652     // write to shared mem
00653     tb_cc_sums[tid] = total_cc_sums;
00654     tb_lcc[tid]     = total_lcc;
00655     tb_lsize[tid]   = total_lsize;
00656   }
00657   __syncthreads(); // all threads must hit syncthreads call...
00658 
00659   // perform intra-warp parallel reduction...
00660   // general loop version of parallel sum-reduction
00661   for (int s=warpSize>>1; s>0; s>>=1) {
00662     if (tid < s) {
00663       tb_cc_sums[tid] += tb_cc_sums[tid + s];
00664       tb_lcc[tid]     += tb_lcc[tid + s];
00665       tb_lsize[tid]   += tb_lsize[tid + s];
00666     }
00667     __syncthreads(); // all threads must hit syncthreads call...
00668   }
00669 
00670   total_cc_sums = tb_cc_sums[0];
00671   total_lcc = tb_lcc[0];
00672   total_lsize = tb_lsize[0];
00673 }
00674 #endif
00675 
00676 
00677 inline __device__ void thread_cc_sum(float ref, float density,
00678                                      float2 &thread_cc_means, 
00679                                      float2 &thread_cc_squares, 
00680                                      float &thread_lcc,
00681                                      int &thread_lsize) {
00682   if (!isnan(ref)) {
00683     thread_cc_means.x += ref;
00684     thread_cc_means.y += density;
00685     thread_cc_squares.x += ref*ref;
00686     thread_cc_squares.y += density*density;
00687     thread_lcc += ref * density;
00688     thread_lsize++;     
00689   }
00690 }
00691 
00692 
00693 //
00694 // A simple tool to detect cases where we're getting significant
00695 // floating point truncation when accumulating values...
00696 //
00697 #if 0
00698 #define CKACCUM(a, b)                                                   \
00699   {                                                                     \
00700      float tmp = a;                                                     \
00701      a += b;                                                            \
00702      float trunc = a - tmp;                                             \
00703      if (b > 1e-5f && (((b - trunc) / b) > 0.01))                       \
00704        printf("truncation: sum: %f incr: %f trunc: %f trunc_rem: %f\n", \
00705                a, b, trunc, (b-trunc));                                 \
00706   }                                                                     
00707 #else
00708 #define CKACCUM(a, b) \
00709   a+=b;
00710 #endif
00711 
00712 
00713 __global__ static void gaussdensity_cc(int totaltb,
00714                                        int natoms,
00715                                        const float4 *sorted_xyzr, 
00716                                        int3 volsz,
00717                                        int3 acncells,
00718                                        float3 acoriginoffset,
00719                                        float acgridspacing,
00720                                        float invacgridspacing,
00721                                        const uint2 * RESTRICT cellStartEnd,
00722                                        float gridspacing, 
00723                                        unsigned int z, 
00724                                        float threshold,
00725                                        int3 refvolsz,
00726                                        int3 refoffset,
00727                                        const float *refmap,
00728                                        float4 * RESTRICT tb_cc_sums,
00729                                        float * RESTRICT tb_lcc,
00730                                        int * RESTRICT tb_lsize,
00731                                        float * RESTRICT tb_CC) {
00732   int tid = threadIdx.z*blockDim.x*blockDim.y + 
00733             threadIdx.y*blockDim.x + threadIdx.x;
00734 
00735 #if __CUDA_ARCH__ >= 200
00736   // setup shared variable
00737   __shared__ bool isLastBlockDone;
00738   if (tid == 0) 
00739     isLastBlockDone = 0;
00740   __syncthreads();
00741 #endif
00742 
00743   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
00744   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
00745   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
00746   unsigned int refaddr = (zindex+refoffset.z) * refvolsz.x * refvolsz.y +
00747                          (yindex+refoffset.y) * refvolsz.x +
00748                          (xindex+refoffset.x);
00749   zindex += z;
00750 
00751   float2 thread_cc_means = make_float2(0.0f, 0.0f);
00752   float2 thread_cc_squares = make_float2(0.0f, 0.0f);
00753   float  thread_lcc = 0.0f;
00754   int    thread_lsize = 0;
00755 
00756   // can't early exit if this thread is outside of the grid bounds,
00757   // so we store a diff of 0.0 instead.
00758   if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
00759     // compute ac grid index of lower corner minus gaussian radius
00760     int3 abmin, abmax;
00761     calc_ac_cell_ids(abmin, abmax, acncells, acoriginoffset,
00762                      gridspacing, acgridspacing, invacgridspacing);
00763 
00764     // density loop macro
00765     MAINDENSITYLOOP
00766 
00767     // for each density value, compare vs. threshold, check grid bounds
00768     // since these maps aren't necessarily a multiple of the thread block
00769     // size and we're unrolled in the Z-dimension, and finally, if 
00770     // if accepted, continue with summation of the CC contributions 
00771     float ref;
00772     if (densityval1 >= threshold) {
00773       ref = refmap[refaddr               ];
00774       thread_cc_sum(ref, densityval1, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00775     }
00776 #if GUNROLL >= 2
00777     int refplanesz = refvolsz.x * refvolsz.y;
00778     if ((densityval2 >= threshold) && (++zindex < volsz.z)) { 
00779       ref = refmap[refaddr +   refplanesz];
00780       thread_cc_sum(ref, densityval2, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00781     }
00782 #endif
00783 #if GUNROLL >= 4
00784     if ((densityval3 >= threshold) && (++zindex < volsz.z)) { 
00785       ref = refmap[refaddr + 2*refplanesz];
00786       thread_cc_sum(ref, densityval3, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00787     }
00788     if ((densityval4 >= threshold) && (++zindex < volsz.z)) { 
00789       ref = refmap[refaddr + 3*refplanesz];
00790       thread_cc_sum(ref, densityval4, thread_cc_means, thread_cc_squares, thread_lcc, thread_lsize);
00791     }
00792 #endif
00793   }
00794 
00795 
00796 #if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
00797   // all threads write their local sums to shared memory...
00798   __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
00799   __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
00800   __shared__ float tb_lcc_s[TOTALBLOCKSZ];
00801   __shared__ int tb_lsize_s[TOTALBLOCKSZ];
00802 
00803   tb_cc_means_s[tid] = thread_cc_means;
00804   tb_cc_squares_s[tid] = thread_cc_squares;
00805   tb_lcc_s[tid] = thread_lcc;
00806   tb_lsize_s[tid] = thread_lsize;
00807   __syncthreads(); // all threads must hit syncthreads call...
00808 
00809   // use precisely one warp to do the thread-block-wide reduction
00810   if (tid < warpSize) {
00811     float2 tmp_cc_means = make_float2(0.0f, 0.0f);
00812     float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
00813     float tmp_lcc = 0.0f;
00814     int tmp_lsize = 0;
00815     for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00816       tmp_cc_means   += tb_cc_means_s[i];
00817       tmp_cc_squares += tb_cc_squares_s[i];
00818       tmp_lcc        += tb_lcc_s[i];
00819       tmp_lsize      += tb_lsize_s[i];
00820     }
00821 
00822     // perform intra-warp parallel reduction...
00823     // general loop version of parallel sum-reduction
00824     for (int mask=warpSize/2; mask>0; mask>>=1) {
00825       tmp_cc_means.x   += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
00826       tmp_cc_means.y   += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
00827       tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
00828       tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
00829       tmp_lcc          += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
00830       tmp_lsize        += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
00831     }
00832 
00833     // write per-thread-block partial sums to global memory,
00834     // if a per-thread-block CC output array is provided, write the 
00835     // local CC for this thread block out, and finally, check if we 
00836     // are the last thread block to finish, and finalize the overall
00837     // CC results for the entire grid of thread blocks.
00838     if (tid == 0) {   
00839       unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00840                          blockIdx.y * gridDim.x + blockIdx.x;
00841 
00842       tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
00843                                     tmp_cc_squares.x, tmp_cc_squares.y);
00844       tb_lcc[bid]     = tmp_lcc;
00845       tb_lsize[bid]   = tmp_lsize;
00846 
00847       if (tb_CC != NULL) {
00848         float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
00849                            tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
00850                            tb_lsize_s[0], tb_lcc_s[0]);
00851 
00852         // write local per-thread-block CC to global memory
00853         tb_CC[bid]   = cc;
00854       }
00855 
00856       __threadfence();
00857 
00858       unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00859       isLastBlockDone = (value == (totaltb - 1));
00860     }
00861   }
00862   __syncthreads();
00863 
00864   if (isLastBlockDone) {
00865     float4 total_cc_sums;
00866     float total_lcc;
00867     int total_lsize;
00868     cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
00869                     tb_cc_sums, tb_lcc, tb_lsize); 
00870 
00871     if (tid == 0) {
00872       tb_cc_sums[totaltb] = total_cc_sums;
00873       tb_lcc[totaltb] = total_lcc;
00874       tb_lsize[totaltb] = total_lsize;
00875     }
00876  
00877     reset_atomic_counter(&tbcatomic[0]);
00878   }
00879 
00880 #else
00881 
00882   // all threads write their local sums to shared memory...
00883   __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
00884   __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
00885   __shared__ float tb_lcc_s[TOTALBLOCKSZ];
00886   __shared__ int tb_lsize_s[TOTALBLOCKSZ];
00887 
00888   tb_cc_means_s[tid] = thread_cc_means;
00889   tb_cc_squares_s[tid] = thread_cc_squares;
00890   tb_lcc_s[tid] = thread_lcc;
00891   tb_lsize_s[tid] = thread_lsize;
00892   __syncthreads(); // all threads must hit syncthreads call...
00893 
00894   // use precisely one warp to do the thread-block-wide reduction
00895   if (tid < warpSize) {
00896     float2 tmp_cc_means = make_float2(0.0f, 0.0f);
00897     float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
00898     float tmp_lcc = 0.0f;
00899     int tmp_lsize = 0;
00900     for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
00901       tmp_cc_means   += tb_cc_means_s[i];
00902       tmp_cc_squares += tb_cc_squares_s[i];
00903       tmp_lcc        += tb_lcc_s[i];
00904       tmp_lsize      += tb_lsize_s[i];
00905     }
00906 
00907     // write to shared mem
00908     tb_cc_means_s[tid]   = tmp_cc_means;
00909     tb_cc_squares_s[tid] = tmp_cc_squares;
00910     tb_lcc_s[tid]        = tmp_lcc;
00911     tb_lsize_s[tid]      = tmp_lsize;
00912   }
00913   __syncthreads(); // all threads must hit syncthreads call...
00914 
00915   // perform intra-warp parallel reduction...
00916   // general loop version of parallel sum-reduction
00917   for (int s=warpSize>>1; s>0; s>>=1) {
00918     if (tid < s) {
00919       tb_cc_means_s[tid]   += tb_cc_means_s[tid + s];
00920       tb_cc_squares_s[tid] += tb_cc_squares_s[tid + s];
00921       tb_lcc_s[tid]        += tb_lcc_s[tid + s];
00922       tb_lsize_s[tid]      += tb_lsize_s[tid + s];
00923     }
00924     __syncthreads(); // all threads must hit syncthreads call...
00925   }
00926 //#endif
00927 
00928   // write per-thread-block partial sums to global memory,
00929   // if a per-thread-block CC output array is provided, write the 
00930   // local CC for this thread block out, and finally, check if we 
00931   // are the last thread block to finish, and finalize the overall
00932   // CC results for the entire grid of thread blocks.
00933   if (tid == 0) {   
00934     unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
00935                        blockIdx.y * gridDim.x + blockIdx.x;
00936     tb_cc_sums[bid] = make_float4(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
00937                                   tb_cc_squares_s[0].x, tb_cc_squares_s[0].y);
00938     tb_lcc[bid]     = tb_lcc_s[0];
00939     tb_lsize[bid]   = tb_lsize_s[0];
00940 
00941     if (tb_CC != NULL) {
00942       float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
00943                          tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
00944                          tb_lsize_s[0], tb_lcc_s[0]);
00945 
00946       // write local per-thread-block CC to global memory
00947       tb_CC[bid]   = cc;
00948     }
00949 
00950 #if __CUDA_ARCH__ >= 200
00951     __threadfence();
00952 
00953     unsigned int value = atomicInc(&tbcatomic[0], totaltb);
00954     isLastBlockDone = (value == (totaltb - 1));
00955   }
00956   __syncthreads();
00957   if (isLastBlockDone) {
00958     float4 total_cc_sums;
00959     float total_lcc;
00960     int total_lsize;
00961     cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
00962                     tb_cc_sums, tb_lcc, tb_lsize); 
00963 
00964     if (tid == 0) {
00965       tb_cc_sums[totaltb] = total_cc_sums;
00966       tb_lcc[totaltb] = total_lcc;
00967       tb_lsize[totaltb] = total_lsize;
00968     }
00969  
00970     reset_atomic_counter(&tbcatomic[0]);
00971 #else
00972     // code path for older GPUs that lack atomic ops
00973     if (bid == 0) {
00974       tb_cc_sums[totaltb] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00975       tb_lcc[totaltb] = 0.0f;
00976       tb_lsize[totaltb] = 0;
00977     }
00978 #endif
00979   }
00980 #endif
00981 }
00982 
00983 
00984 
00985 //
00986 // kernels for computing min/max/mean/stddev for a map
00987 //
00988 #if 0
00989 inline __device__ void mmms_reduction(int tid, int totaltb,
00990                                       float4 &total_mmms,
00991                                       int &total_lsize,
00992                                       float4 * RESTRICT tb_mmms,
00993                                       int * RESTRICT tb_lsize) {
00994   total_mmms = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
00995   total_lsize = 0;
00996 
00997   // use precisely one warp to do the final reduction
00998   if (tid < warpSize) {
00999     for (int i=tid; i<totaltb; i+=warpSize) {
01000       total_mmms += tb_mmms[i];
01001       total_lsize   += tb_lsize[i];
01002     }
01003 
01004     // write to shared mem
01005     tb_mmms[tid] = total_mmms;
01006     tb_lsize[tid]   = total_lsize;
01007   }
01008   __syncthreads(); // all threads must hit syncthreads call...
01009 
01010   // perform intra-warp parallel reduction...
01011   // general loop version of parallel sum-reduction
01012   for (int s=warpSize>>1; s>0; s>>=1) {
01013     if (tid < s) {
01014       tb_mmms[tid] += tb_mmms[tid + s];
01015       tb_lsize[tid]   += tb_lsize[tid + s];
01016     }
01017     __syncthreads(); // all threads must hit syncthreads call...
01018   }
01019 
01020   total_mmms = tb_mmms[0];
01021   total_lsize = tb_lsize[0];
01022 }
01023 
01024 
01025 inline __device__ void thread_mmms_reduce(float val,
01026                                           float2 &thread_minmax,
01027                                           float2 &thread_mean_square,
01028                                           int &thread_lsize) {
01029   if (!isnan(val)) {
01030     if (thread_lsize == 0) {
01031       thread_minmax.x = val;
01032       thread_minmax.y = val;
01033     } else {
01034       thread_minmax.x = fminf(thread_minmax.x, val);
01035       thread_minmax.y = fmaxf(thread_minmax.y, val);
01036     }
01037 
01038     thread_mean_square.x += val;
01039     thread_mean_square.y += val*val;
01040     thread_lsize++;    
01041   }
01042 }
01043 
01044 
01045 __global__ static void calc_minmax_mean_stddev(int totaltb,
01046                                                int3 volsz,
01047                                                float threshold,
01048                                                const float * RESTRICT map,
01049                                                float4 * RESTRICT tb_mmms,
01050                                                int * RESTRICT tb_lsize) {
01051   int tid = threadIdx.z*blockDim.x*blockDim.y + 
01052             threadIdx.y*blockDim.x + threadIdx.x;
01053 
01054 #if __CUDA_ARCH__ >= 200
01055   // setup shared variable
01056   __shared__ bool isLastBlockDone;
01057   if (tid == 0) 
01058     isLastBlockDone = 0;
01059   __syncthreads();
01060 #endif
01061 
01062   unsigned int xindex  = (blockIdx.x * blockDim.x) + threadIdx.x;
01063   unsigned int yindex  = (blockIdx.y * blockDim.y) + threadIdx.y;
01064   unsigned int zindex  = ((blockIdx.z * blockDim.z) + threadIdx.z) * GUNROLL;
01065   unsigned int mapaddr = zindex * volsz.x * volsz.y +
01066                          yindex * volsz.x +
01067                          xindex;
01068 
01069   float2 thread_minmax = make_float2(0.0f, 0.0f);
01070   float2 thread_mean_square = make_float2(0.0f, 0.0f);
01071   int    thread_lsize = 0;
01072 
01073   // can't early exit if this thread is outside of the grid bounds
01074   if (xindex < volsz.x && yindex < volsz.y && zindex < volsz.z) {
01075 
01076     // for each value, check grid bounds since these maps aren't necessarily 
01077     // a multiple of the thread block size and we're unrolled in the 
01078     // Z-dimension, and finally, if accepted, continue with reduction
01079     float val;
01080     val = map[mapaddr            ];
01081     thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01082 #if GUNROLL >= 2
01083     int planesz = volsz.x * volsz.y;
01084     if (++zindex < volsz.z) {
01085       val = map[mapaddr +   planesz];
01086       thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01087     }
01088 #endif
01089 #if GUNROLL >= 4
01090     if (++zindex < volsz.z) {
01091       val = map[mapaddr + 2*planesz];
01092       thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01093     }
01094     if (++zindex < volsz.z) {
01095       val = map[mapaddr + 3*planesz];
01096       thread_mmms_reduce(val, thread_minmax, thread_mean_square, thread_lsize);
01097     }
01098 #endif
01099   }
01100 
01101   // all threads write their local sums to shared memory...
01102   __shared__ float2 tb_minmax_s[TOTALBLOCKSZ];
01103   __shared__ float2 tb_mean_square_s[TOTALBLOCKSZ];
01104   __shared__ int tb_lsize_s[TOTALBLOCKSZ];
01105 
01106   tb_minmax_s[tid]      = thread_minmax;
01107   tb_mean_square_s[tid] = thread_mean_square;
01108   tb_lsize_s[tid]       = thread_lsize;
01109   __syncthreads(); // all threads must hit syncthreads call...
01110 
01111   // use precisely one warp to do the thread-block-wide reduction
01112   if (tid < warpSize) {
01113     float2 tmp_minmax = thread_minmax;
01114     float2 tmp_mean_square = make_float2(0.0f, 0.0f);
01115     int tmp_lsize = 0;
01116     for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
01117       tmp_minmax.x = fminf(tb_minmax_s[i].x, tmp_minmax.x);
01118       tmp_minmax.x = fmaxf(tb_minmax_s[i].y, tmp_minmax.y);
01119       tmp_mean_square += tb_mean_square_s[i];
01120       tmp_lsize       += tb_lsize_s[i];
01121     }
01122 
01123     // write to shared mem
01124     tb_minmax_s[tid]   = tmp_minmax;
01125     tb_mean_square_s[tid] = tmp_mean_square;
01126     tb_lsize_s[tid]      = tmp_lsize;
01127   }
01128   __syncthreads(); // all threads must hit syncthreads call...
01129 
01130   // perform intra-warp parallel reduction...
01131   // general loop version of parallel sum-reduction
01132   for (int s=warpSize>>1; s>0; s>>=1) {
01133     if (tid < s) {
01134       tb_minmax_s[tid]      += tb_minmax_s[tid + s];
01135       tb_mean_square_s[tid] += tb_mean_square_s[tid + s];
01136       tb_lsize_s[tid]       += tb_lsize_s[tid + s];
01137     }
01138     __syncthreads(); // all threads must hit syncthreads call...
01139   }
01140 
01141   // write per-thread-block partial sums to global memory,
01142   // check if we are the last thread block to finish, and finalize 
01143   // the overall results for the entire grid of thread blocks.
01144   if (tid == 0) {
01145     unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
01146                        blockIdx.y * gridDim.x + blockIdx.x;
01147     tb_mmms[bid] = make_float4(tb_minmax_s[0].x, tb_minmax_s[0].y,
01148                                tb_mean_square_s[0].x, tb_mean_square_s[0].y);
01149     tb_lsize[bid]   = tb_lsize_s[0];
01150 #if __CUDA_ARCH__ >= 200
01151     __threadfence();
01152 
01153     unsigned int value = atomicInc(&tbcatomic[0], totaltb);
01154     isLastBlockDone = (value == (totaltb - 1));
01155   }
01156   __syncthreads();
01157   if (isLastBlockDone) {
01158     float4 total_mmms;
01159     int total_lsize;
01160     mmms_reduction(tid, totaltb, total_mmms, total_lsize,
01161                    tb_mmms, tb_lsize);
01162 
01163     if (tid == 0) {
01164       tb_mmms[totaltb] = total_mmms;
01165       tb_lsize[totaltb] = total_lsize;
01166     }
01167 
01168     reset_atomic_counter(&tbcatomic[0]);
01169 #else
01170     // code path for older GPUs that lack atomic ops
01171     if (bid == 0) {
01172       tb_mmms[totaltb] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
01173       tb_lsize[totaltb] = 0;
01174     }
01175 #endif
01176   }
01177 }
01178 
01179 #endif
01180 
01181 
01182 static int vmd_cuda_build_accel(cudaError_t &err,
01183                                 const long int natoms,
01184                                 const int3 volsz,
01185                                 const float gausslim,
01186                                 const float radscale,
01187                                 const float maxrad,
01188                                 const float gridspacing,
01189                                 const float *xyzr_f, 
01190                                 float4 *& xyzr_d,
01191                                 float4 *& sorted_xyzr_d,
01192                                 uint2 *& cellStartEnd_d,
01193                                 float &acgridspacing,
01194                                 int3 &accelcells) {
01195   // compute grid spacing for the acceleration grid
01196   acgridspacing = gausslim * radscale * maxrad;
01197 
01198   // XXX allow override of value for testing
01199   if (getenv("VMDACGRIDSPACING") != NULL) {
01200     acgridspacing = atof(getenv("VMDACGRIDSPACING"));
01201   }
01202 
01203   accelcells.x = max(int((volsz.x*gridspacing) / acgridspacing), 1);
01204   accelcells.y = max(int((volsz.y*gridspacing) / acgridspacing), 1);
01205   accelcells.z = max(int((volsz.z*gridspacing) / acgridspacing), 1);
01206 #if 0
01207   printf("  Accel grid(%d, %d, %d) spacing %f\n",
01208          accelcells.x, accelcells.y, accelcells.z, acgridspacing);
01209 #endif
01210 
01211   // pack atom coordinates and radii into float4 types and copy to the device
01212   cudaMalloc((void**)&xyzr_d, natoms * sizeof(float4));
01213 
01214   // pre-process the atom coordinates and radii as needed
01215   // short-term fix until a new CUDA kernel takes care of this
01216   int i, i4;
01217   float4 *xyzr = (float4 *) malloc(natoms * sizeof(float4));
01218   float log2e = log2f(2.718281828f);
01219   for (i=0,i4=0; i<natoms; i++,i4+=4) {
01220     xyzr[i].x = xyzr_f[i4    ];
01221     xyzr[i].y = xyzr_f[i4 + 1];
01222     xyzr[i].z = xyzr_f[i4 + 2];
01223 
01224     float scaledrad = xyzr_f[i4 + 3] * radscale;
01225     float arinv = -1.0f * log2e / (2.0f*scaledrad*scaledrad);
01226 
01227     xyzr[i].w = arinv;
01228   }
01229   cudaMemcpy(xyzr_d, xyzr, natoms * sizeof(float4), cudaMemcpyHostToDevice);
01230   free(xyzr);
01231 
01232   // build uniform grid acceleration structure
01233   if (vmd_cuda_build_density_atom_grid(natoms, xyzr_d, sorted_xyzr_d, 
01234                                        cellStartEnd_d, 
01235                                        accelcells, 1.0f / acgridspacing) != 0) {
01236     cudaFree(xyzr_d);
01237     return -1;
01238   }
01239 
01240   return 0;
01241 }
01242 
01243 
01244 
01245 static int vmd_cuda_destroy_accel(float4 *& xyzr_d,
01246                                   float4 *& sorted_xyzr_d,
01247                                   uint2 *& cellStartEnd_d) {
01248   cudaFree(xyzr_d);
01249   xyzr_d = NULL;
01250 
01251   cudaFree(sorted_xyzr_d);
01252   sorted_xyzr_d = NULL;
01253 
01254   cudaFree(cellStartEnd_d);
01255   cellStartEnd_d = NULL;
01256 
01257   return 0;
01258 }
01259 
01260 
01261 //
01262 // Simple helper to calculate the CUDA kernel launch parameters for
01263 // the majority of the density kernel variants based on the dimensions
01264 // of the computed density map
01265 //
01266 static void vmd_cuda_kernel_launch_dims(int3 volsz, dim3 &Bsz, dim3 &Gsz) {
01267   Bsz = dim3(GBLOCKSZX, GBLOCKSZY, GBLOCKSZZ);
01268   Gsz = dim3((volsz.x+Bsz.x-1) / Bsz.x, 
01269              (volsz.y+Bsz.y-1) / Bsz.y,
01270              (volsz.z+(Bsz.z*GUNROLL)-1) / (Bsz.z * GUNROLL));
01271 
01272 #if 0
01273   printf("GPU kernel launch dimensions: Bsz: %dx%dx%d Gsz: %dx%dx%d\n",
01274          Bsz.x, Bsz.y, Bsz.z, Gsz.x, Gsz.y, Gsz.z);
01275 #endif
01276 }
01277 
01278 
01279 static int vmd_cuda_gaussdensity_calc(long int natoms,
01280                                       float3 acoriginoffset,
01281                                       int3 acvolsz,
01282                                       const float *xyzr_f, 
01283                                       float *volmap, 
01284                                       int3 volsz, float maxrad,
01285                                       float radscale, float gridspacing,
01286                                       float gausslim,
01287                                       int3 refvolsz,
01288                                       int3 refoffset,
01289                                       const float *refmap,
01290                                       float *diffmap, int verbose) {
01291   float4 *xyzr_d = NULL;
01292   float4 *sorted_xyzr_d = NULL;
01293   uint2 *cellStartEnd_d = NULL;
01294 
01295   wkf_timerhandle globaltimer = wkf_timer_create();
01296   wkf_timer_start(globaltimer);
01297 
01298   cudaError_t err;
01299   if (check_gpu_compute20(err) != 0) 
01300     return -1;
01301 
01302   // Allocate output arrays for the gaussian density map and 3-D texture map
01303   // We test for errors carefully here since this is the most likely place
01304   // for a memory allocation failure due to the size of the grid.
01305   unsigned long volmemsz = volsz.x * volsz.y * volsz.z * sizeof(float);
01306   float *devdensity = NULL;
01307   if (cudaMalloc((void**)&devdensity, volmemsz) != cudaSuccess) {
01308     err = cudaGetLastError(); // eat error so next CUDA op succeeds
01309     return -1;
01310   }
01311 
01312 #if 1
01313   float *devrefmap = NULL;
01314   float *devdiffmap = NULL;
01315   if (refmap && diffmap) {
01316     unsigned long refmemsz = refvolsz.x * refvolsz.y * refvolsz.z * sizeof(float);
01317     if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
01318       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01319       return -1;
01320     }
01321     cudaMemcpy(devrefmap, refmap, refmemsz, cudaMemcpyHostToDevice);
01322 
01323     if (cudaMalloc((void**)&devdiffmap, volmemsz) != cudaSuccess) {
01324       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01325       return -1;
01326     }
01327   }
01328 #endif
01329 
01330   int3 accelcells = make_int3(1, 1, 1);
01331   float acgridspacing = 0.0f;
01332   if (vmd_cuda_build_accel(err, natoms, acvolsz, gausslim, radscale, maxrad,
01333                            gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01334                            cellStartEnd_d, acgridspacing, accelcells) != 0) {
01335     return -1;
01336   }
01337 
01338   double sorttime = wkf_timer_timenow(globaltimer);
01339 
01340   dim3 Bsz, Gsz;
01341   vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01342 
01343   gaussdensity_fast<<<Gsz, Bsz, 0>>>(natoms, 
01344                                      sorted_xyzr_d,
01345                                      volsz,
01346                                      accelcells,
01347                                      acoriginoffset,
01348                                      acgridspacing,
01349                                      1.0f / acgridspacing,
01350                                      cellStartEnd_d,
01351                                      gridspacing, 0,
01352                                      devdensity);
01353   cudaDeviceSynchronize(); 
01354   double densitykerneltime = wkf_timer_timenow(globaltimer);
01355 
01356 #if 1
01357   if (devrefmap && devdiffmap) {
01358 #if 1
01359     if (verbose) {
01360       printf("CUDA: refvol(%d,%d,%d) refoffset(%d,%d,%d)\n",
01361              refvolsz.x, refvolsz.y, refvolsz.z,
01362              refoffset.x, refoffset.y, refoffset.z);
01363     }
01364 #endif
01365     gaussdensity_diff<<<Gsz, Bsz, 0>>>(natoms, 
01366                                        sorted_xyzr_d,
01367                                        volsz,
01368                                        accelcells,
01369                                        acoriginoffset,
01370                                        acgridspacing,
01371                                        1.0f / acgridspacing,
01372                                        cellStartEnd_d,
01373                                        gridspacing, 0,
01374                                        refvolsz,
01375                                        refoffset,
01376                                        devrefmap,
01377                                        devdiffmap);
01378     cudaDeviceSynchronize(); 
01379   }
01380 #endif
01381   double diffkerneltime = wkf_timer_timenow(globaltimer);
01382 
01383 
01384 #if 1
01385   //
01386   // sum of absolute differences
01387   //
01388   if (devrefmap) {
01389     unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01390     float *devsumabsdiffs = NULL;
01391     if (cudaMalloc((void**)&devsumabsdiffs, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01392       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01393       return -1;
01394     }
01395     gaussdensity_sumabsdiff<<<Gsz, Bsz, 0>>>(blockcount,
01396                                              natoms, 
01397                                              sorted_xyzr_d,
01398                                              volsz,
01399                                              accelcells,
01400                                              acoriginoffset,
01401                                              acgridspacing,
01402                                              1.0f / acgridspacing,
01403                                              cellStartEnd_d,
01404                                              gridspacing, 0,
01405                                              refvolsz,
01406                                              refoffset,
01407                                              devrefmap,
01408                                              devsumabsdiffs);
01409     cudaDeviceSynchronize(); 
01410 
01411     float *sumabsdiffs = new float[blockcount+1];
01412     cudaMemcpy(sumabsdiffs, devsumabsdiffs, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01413     cudaFree(devsumabsdiffs);
01414 
01415 #if 0
01416     printf("Map of block-wise sumabsdiffs:\n");
01417     for (int i=0; i<blockcount; i++) {
01418       printf("  tb[%d] absdiff %f\n", i, sumabsdiffs[i]);
01419     }
01420 #endif
01421 
01422 #if 1
01423     float sumabsdifftotal = 0.0f;
01424     for (int j=0; j<blockcount; j++) {
01425       sumabsdifftotal += sumabsdiffs[j];
01426     }
01427     if (verbose) {
01428       printf("Sum of absolute differences: %f\n", sumabsdifftotal);
01429       printf("Kernel sum of absolute differences: %f\n", sumabsdiffs[blockcount]);
01430     }
01431 #endif
01432 
01433     delete [] sumabsdiffs;
01434   }
01435 #endif
01436   double sumabsdiffkerneltime = wkf_timer_timenow(globaltimer);
01437 
01438 
01439 #if 1
01440   //
01441   // cross correlation
01442   //
01443   if (devrefmap) {
01444     unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01445     float4 *dev_cc_sums = NULL;
01446     float *dev_lcc = NULL;
01447     int *dev_lsize = NULL;
01448     float *dev_CC = NULL;
01449  
01450     if (cudaMalloc((void**)&dev_cc_sums, (blockcount+1) * sizeof(float4)) != cudaSuccess) {
01451       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01452       return -1;
01453     }
01454 
01455     if (cudaMalloc((void**)&dev_lcc, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01456       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01457       return -1;
01458     }
01459 
01460     if (cudaMalloc((void**)&dev_lsize, (blockcount+1) * sizeof(int)) != cudaSuccess) {
01461       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01462       return -1;
01463     }
01464 
01465     if (cudaMalloc((void**)&dev_CC, (blockcount+1) * sizeof(float)) != cudaSuccess) {
01466       err = cudaGetLastError(); // eat error so next CUDA op succeeds
01467       return -1;
01468     }
01469 
01470 
01471     gaussdensity_cc<<<Gsz, Bsz, 0>>>(blockcount,
01472                                      natoms, 
01473                                      sorted_xyzr_d,
01474                                      volsz,
01475                                      accelcells,
01476                                      acoriginoffset,
01477                                      acgridspacing,
01478                                      1.0f / acgridspacing,
01479                                      cellStartEnd_d,
01480                                      gridspacing, 0,
01481                                      -FLT_MAX, // accept all densities...
01482                                      refvolsz,
01483                                      refoffset,
01484                                      devrefmap,
01485                                      dev_cc_sums,
01486                                      dev_lcc,
01487                                      dev_lsize,
01488                                      dev_CC);
01489     cudaDeviceSynchronize(); 
01490 
01491     float4 *cc_sums = new float4[blockcount+1];
01492     cudaMemcpy(cc_sums, dev_cc_sums, (blockcount+1)*sizeof(float4), cudaMemcpyDeviceToHost);
01493     cudaFree(dev_cc_sums);
01494 
01495     float *lcc = new float[blockcount+1];
01496     cudaMemcpy(lcc, dev_lcc, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01497     cudaFree(dev_lcc);
01498 
01499     int *lsize = new int[blockcount+1];
01500     cudaMemcpy(lsize, dev_lsize, (blockcount+1)*sizeof(int), cudaMemcpyDeviceToHost);
01501     cudaFree(dev_lsize);
01502 
01503     float *CC = new float[blockcount+1];
01504     cudaMemcpy(CC, dev_CC, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01505     cudaFree(dev_CC);
01506 
01507 #if 0
01508     float4 tmp_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
01509     float tmp_lcc = 0.0f;
01510     int tmp_lsize = 0;
01511     for (int j=0; j<blockcount; j++) {
01512       tmp_cc_sums.x += cc_sums[j].x;
01513       tmp_cc_sums.y += cc_sums[j].y;
01514       tmp_cc_sums.z += cc_sums[j].z;
01515       tmp_cc_sums.w += cc_sums[j].w;
01516       tmp_lcc += lcc[j]; 
01517       tmp_lsize += lsize[j]; 
01518     }
01519 
01520     if (verbose) {
01521       printf("CC sums:\n");
01522       printf("    mean_ref: %f\n", tmp_cc_sums.x);
01523       printf("    mean_syn: %f\n", tmp_cc_sums.y);
01524       printf("    stddev_ref: %f\n", tmp_cc_sums.z);
01525       printf("    stddev_syn: %f\n", tmp_cc_sums.w);
01526       printf("    lcc: %f\n", tmp_lcc);
01527       printf("    lsize: %d\n", tmp_lsize);
01528 
01529       printf("Kernel CC sums:\n");
01530       printf("    mean_ref: %f\n", cc_sums[blockcount].x);
01531       printf("    mean_syn: %f\n", cc_sums[blockcount].y);
01532       printf("    stddev_ref: %f\n", cc_sums[blockcount].z);
01533       printf("    stddev_syn: %f\n", cc_sums[blockcount].w);
01534       printf("    lcc: %f\n", lcc[blockcount]);
01535       printf("    lsize: %d\n", lsize[blockcount]);
01536     }
01537 #endif
01538 
01539     float mean_ref     = cc_sums[blockcount].x / lsize[blockcount];
01540     float mean_synth   = cc_sums[blockcount].y / lsize[blockcount];
01541     float stddev_ref   = sqrtf(cc_sums[blockcount].z / lsize[blockcount] - mean_ref*mean_ref);
01542     float stddev_synth = sqrtf(cc_sums[blockcount].w / lsize[blockcount] - mean_synth*mean_synth);
01543     float cc = (lcc[blockcount] - lsize[blockcount]*mean_ref*mean_synth) / 
01544                (lsize[blockcount] * stddev_ref * stddev_synth);
01545 
01546 #if 1
01547     if (verbose) {
01548       printf("Final MDFF Kernel CC results:\n");
01549       printf("  mean_ref: %f\n", mean_ref);
01550       printf("  mean_synth: %f\n", mean_synth);
01551       printf("  stdev_ref: %f\n", stddev_ref);
01552       printf("  stdev_synth: %f\n", stddev_synth);
01553       printf("  CC: %f\n", cc);
01554     }
01555 #endif
01556 
01557     delete [] cc_sums;
01558     delete [] lcc;
01559     delete [] lsize;
01560   }
01561 #endif
01562 
01563 
01564   cudaMemcpy(volmap, devdensity, volmemsz, cudaMemcpyDeviceToHost);
01565   cudaFree(devdensity);
01566 #if 1
01567   if (devrefmap && devdiffmap) {
01568     cudaMemcpy(diffmap, devdiffmap, volmemsz, cudaMemcpyDeviceToHost);
01569     cudaFree(devrefmap);
01570     cudaFree(devdiffmap);
01571   }
01572 #endif
01573 
01574   // free uniform grid acceleration structure
01575   vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01576 
01577   err = cudaGetLastError();
01578 
01579   wkf_timer_stop(globaltimer);
01580   double totalruntime = wkf_timer_time(globaltimer);
01581   wkf_timer_destroy(globaltimer);
01582 
01583   if (err != cudaSuccess) { 
01584     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01585     return -1;
01586   }
01587 
01588   if (verbose) {
01589     printf("MDFF CC GPU runtime: %.3f [sort: %.3f density %.3f diff %.3f copy: %.3f]\n", 
01590            totalruntime, sorttime, densitykerneltime-sorttime, 
01591            diffkerneltime-densitykerneltime, totalruntime-densitykerneltime);
01592   }
01593 
01594   return 0;
01595 }
01596 
01597 
01598 static int vmd_cuda_cc_calc(long int natoms,
01599                             float3 acoriginoffset,
01600                             int3 acvolsz,
01601                             const float *xyzr_f, 
01602                             int3 volsz, float maxrad,
01603                             float radscale, float gridspacing,
01604                             float gausslim,
01605                             int3 refvolsz,
01606                             int3 refoffset,
01607                             const float *devrefmap,
01608                             float ccthreshdensity,
01609                             float &result_cc,
01610                             const float *origin,
01611                             VolumetricData **spatialccvol,
01612                             int verbose) {
01613   float4 *xyzr_d = NULL;
01614   float4 *sorted_xyzr_d = NULL;
01615   uint2 *cellStartEnd_d = NULL;
01616 
01617   wkf_timerhandle globaltimer = wkf_timer_create();
01618   wkf_timer_start(globaltimer);
01619 
01620   if (!devrefmap)
01621     return -1;
01622 
01623   cudaError_t err;
01624   if (check_gpu_compute20(err) != 0) 
01625     return -1;
01626 
01627   double refcopytime = wkf_timer_timenow(globaltimer);
01628 
01629   int3 accelcells = make_int3(1, 1, 1);
01630   float acgridspacing = 0.0f;
01631   if (vmd_cuda_build_accel(err, natoms, acvolsz, gausslim, radscale, maxrad,
01632                            gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01633                            cellStartEnd_d, acgridspacing, accelcells) != 0) {
01634     return -1;
01635   }
01636 
01637   double sorttime = wkf_timer_timenow(globaltimer);
01638 
01639   dim3 Bsz, Gsz;
01640   vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01641 
01642   //
01643   // cross correlation
01644   //
01645   unsigned int blockcount = Gsz.x * Gsz.y * Gsz.z;
01646   float4 *dev_cc_sums = NULL;
01647   float *dev_lcc = NULL;
01648   int *dev_lsize = NULL;
01649   float *dev_tbCC = NULL;
01650 
01651   cudaMalloc((void**)&dev_cc_sums, (blockcount+1) * sizeof(float4));
01652   cudaMalloc((void**)&dev_lcc, (blockcount+1) * sizeof(float));
01653 
01654   if (spatialccvol != NULL) {
01655     cudaMalloc((void**)&dev_tbCC, (blockcount+1) * sizeof(float));
01656   }
01657 
01658   if (cudaMalloc((void**)&dev_lsize, (blockcount+1) * sizeof(int)) != cudaSuccess) {
01659     err = cudaGetLastError(); // eat error so next CUDA op succeeds
01660     return -1;
01661   }
01662 
01663   gaussdensity_cc<<<Gsz, Bsz, 0>>>(blockcount,
01664                                    natoms, 
01665                                    sorted_xyzr_d,
01666                                    volsz,
01667                                    accelcells,
01668                                    acoriginoffset,
01669                                    acgridspacing,
01670                                    1.0f / acgridspacing,
01671                                    cellStartEnd_d,
01672                                    gridspacing, 0,
01673                                    ccthreshdensity,
01674                                    refvolsz,
01675                                    refoffset,
01676                                    devrefmap,
01677                                    dev_cc_sums,
01678                                    dev_lcc,
01679                                    dev_lsize,
01680                                    dev_tbCC);
01681   cudaDeviceSynchronize(); 
01682   double cctime = wkf_timer_timenow(globaltimer);
01683 
01684   float4 *cc_sums = new float4[blockcount+1];
01685   cudaMemcpy(cc_sums, dev_cc_sums, (blockcount+1)*sizeof(float4), cudaMemcpyDeviceToHost);
01686   cudaFree(dev_cc_sums);
01687 
01688   float *lcc = new float[blockcount+1];
01689   cudaMemcpy(lcc, dev_lcc, (blockcount+1)*sizeof(float), cudaMemcpyDeviceToHost);
01690   cudaFree(dev_lcc);
01691 
01692   int *lsize = new int[blockcount+1];
01693   cudaMemcpy(lsize, dev_lsize, (blockcount+1)*sizeof(int), cudaMemcpyDeviceToHost);
01694   cudaFree(dev_lsize);
01695 
01696   float mean_ref     = cc_sums[blockcount].x / lsize[blockcount];
01697   float mean_synth   = cc_sums[blockcount].y / lsize[blockcount];
01698   float stddev_ref   = sqrtf(cc_sums[blockcount].z / lsize[blockcount] - mean_ref*mean_ref);
01699   float stddev_synth = sqrtf(cc_sums[blockcount].w / lsize[blockcount] - mean_synth*mean_synth);
01700 
01701   float cc = 0.0f;
01702 
01703   // detect and prevent cases that would cause division by zero
01704   // compute CC if at least one 1 pair of voxels was compared...
01705   if (lsize[blockcount] > 0) {
01706     if (stddev_ref == 0.0f || stddev_synth == 0.0f) {
01707       printf("WARNING: Ill-conditioned CC calc. due to zero std. deviation:\n");
01708       printf("WARNING: stddev_ref: %f stddev_synth: %f\n", stddev_ref, stddev_synth);
01709       cc = 0.0f; 
01710     } else {
01711       cc = (lcc[blockcount] - lsize[blockcount]*mean_ref*mean_synth) / 
01712            (lsize[blockcount] * stddev_ref * stddev_synth);
01713     }
01714   }
01715 
01716   if (spatialccvol != NULL) {
01717     char mapname[64] = { "mdff spatial CC map" };
01718     float spxaxis[3] = {1.0f, 0.0f, 0.0f};
01719     float spyaxis[3] = {0.0f, 1.0f, 0.0f};
01720     float spzaxis[3] = {0.0f, 0.0f, 1.0f}; 
01721     spxaxis[0] = Bsz.x *  Gsz.x * gridspacing;
01722     spyaxis[1] = Bsz.y *  Gsz.y * gridspacing;
01723     spzaxis[2] = Bsz.z *  Gsz.z * gridspacing * GUNROLL;
01724 
01725     if (verbose) {
01726       printf("Spatial CC map: blockcount[%d]  x: %d y: %d z: %d  cnt: %d\n",
01727              blockcount, Gsz.x, Gsz.y, Gsz.z, (Gsz.x*Gsz.y*Gsz.z));
01728     }
01729 
01730     float *spatialccmap = new float[blockcount];
01731     cudaMemcpy(spatialccmap, dev_tbCC, blockcount*sizeof(float), cudaMemcpyDeviceToHost);
01732     cudaFree(dev_tbCC);
01733      
01734     *spatialccvol = new VolumetricData(mapname, origin, 
01735                                        spxaxis, spyaxis, spzaxis,
01736                                        Gsz.x, Gsz.y, Gsz.z, spatialccmap);
01737   }
01738 
01739 #if 0
01740   if (verbose) {
01741     printf("Final MDFF Kernel CC results:\n");
01742     printf("  GPU CC sum data:\n");
01743     printf("    mean_ref: %f\n", cc_sums[blockcount].x);
01744     printf("    mean_synth: %f\n", cc_sums[blockcount].y);
01745     printf("    squares_ref: %f\n", cc_sums[blockcount].z);
01746     printf("    squares_synth: %f\n", cc_sums[blockcount].w);
01747     printf("    lcc: %f\n", lcc[blockcount]);
01748     printf("    lsize: %d\n", lsize[blockcount]);
01749     printf("  Reduced/finalized GPU CC data:\n");
01750     printf("    mean_ref: %f\n", mean_ref);
01751     printf("    mean_synth: %f\n", mean_synth);
01752     printf("    stdev_ref: %f\n", stddev_ref);
01753     printf("    stdev_synth: %f\n", stddev_synth);
01754     printf("    voxels used: %d (total %d)\n", 
01755            lsize[blockcount], volsz.x*volsz.y*volsz.z);
01756     printf("  CC: %f\n", cc);
01757   }
01758 #endif
01759 
01760   result_cc = cc; // assign final result
01761 
01762   delete [] cc_sums;
01763   delete [] lcc;
01764   delete [] lsize;
01765 
01766   // free uniform grid acceleration structure
01767   vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01768 
01769   err = cudaGetLastError();
01770 
01771   wkf_timer_stop(globaltimer);
01772   double totalruntime = wkf_timer_time(globaltimer);
01773   wkf_timer_destroy(globaltimer);
01774 
01775   if (err != cudaSuccess) { 
01776     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01777     return -1;
01778   }
01779 
01780   if (verbose) {
01781     printf("MDFF CC GPU runtime: %.3f [refcopy: %.3f sort: %.3f cc: %.3f copy: %.3f]\n", 
01782            totalruntime, refcopytime, sorttime-refcopytime, cctime-sorttime, totalruntime-cctime);
01783   }
01784 
01785   return 0;
01786 }
01787 
01788 
01789 #if 0
01790 static int vmd_cuda_gaussdensity_sumabsdiff(long int natoms,
01791                                       const float *xyzr_f, 
01792                                       float *volmap, int3 volsz, float maxrad,
01793                                       float radscale, float gridspacing,
01794                                       float gausslim) {
01795   float4 *xyzr_d = NULL;
01796   float4 *sorted_xyzr_d = NULL;
01797   uint2 *cellStartEnd_d = NULL;
01798 
01799   wkf_timerhandle globaltimer = wkf_timer_create();
01800   wkf_timer_start(globaltimer);
01801 
01802   cudaError_t err;
01803   if (check_gpu_compute20(err) != 0) 
01804     return -1;
01805 
01806   // Allocate output arrays for the gaussian density map and 3-D texture map
01807   // We test for errors carefully here since this is the most likely place
01808   // for a memory allocation failure due to the size of the grid.
01809   unsigned long volmemsz = volsz.x * volsz.y * volsz.z * sizeof(float);
01810   float *devdensity = NULL;
01811   if (cudaMalloc((void**)&devdensity, volmemsz) != cudaSuccess) {
01812     err = cudaGetLastError(); // eat error so next CUDA op succeeds
01813     return -1;
01814   }
01815 
01816   int3 accelcells = make_int3(1, 1, 1);
01817   float acgridspacing = 0.0f;
01818   if (vmd_cuda_build_accel(err, natoms, volsz, gausslim, radscale, maxrad,
01819                            gridspacing, xyzr_f, xyzr_d, sorted_xyzr_d,
01820                            cellStartEnd_d, acgridspacing, accelcells) != 0) {
01821     return -1;
01822   }
01823 
01824   double sorttime = wkf_timer_timenow(globaltimer);
01825 
01826   dim3 Bsz, Gsz;
01827   vmd_cuda_kernel_launch_dims(volsz, Bsz, Gsz);
01828 
01829 #if 0
01830   int3 refoffset = make_int3(0, 0, 0);
01831   gaussdensity_sumabsdiff<<<Gsz, Bsz, 0>>>(natoms,
01832                                      sorted_xyzr_d,
01833                                      volsz,
01834                                      accelcells,
01835                                      acgridspacing,
01836                                      1.0f / acgridspacing,
01837                                      cellStartEnd_d,
01838                                      gridspacing, 0,
01839                                      refoffset,
01840 #if 1
01841                                      NULL, NULL);
01842 #else
01843                                      densitygrid,
01844                                      densitysumabsdiff);
01845 #endif
01846 #endif
01847 
01848   cudaDeviceSynchronize(); 
01849   double densitykerneltime = wkf_timer_timenow(globaltimer);
01850   cudaMemcpy(volmap, devdensity, volmemsz, cudaMemcpyDeviceToHost);
01851   cudaFree(devdensity);
01852 
01853   // free uniform grid acceleration structure
01854   vmd_cuda_destroy_accel(xyzr_d, sorted_xyzr_d, cellStartEnd_d);
01855 
01856   err = cudaGetLastError();
01857 
01858   wkf_timer_stop(globaltimer);
01859   double totalruntime = wkf_timer_time(globaltimer);
01860   wkf_timer_destroy(globaltimer);
01861 
01862   if (err != cudaSuccess) { 
01863     printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
01864     return -1;
01865   }
01866 
01867   printf("MDFF absdiff GPU runtime: %.3f [sort: %.3f density %.3f copy: %.3f]\n", 
01868          totalruntime, sorttime, densitykerneltime-sorttime, 
01869          totalruntime-densitykerneltime);
01870 
01871   return 0;
01872 }
01873 #endif
01874 
01875 
01876 static float gausslim_quality(int quality) {
01877   // set gaussian window size based on user-specified quality parameter
01878   float gausslim = 2.0f;
01879   switch (quality) {
01880     case 3: gausslim = 4.0f; break; // max quality
01881 
01882     case 2: gausslim = 3.0f; break; // high quality
01883 
01884     case 1: gausslim = 2.5f; break; // medium quality
01885 
01886     case 0:
01887     default: gausslim = 2.0f; // low quality
01888       break;
01889   }
01890 
01891   return gausslim;
01892 }
01893 
01894 
01895 
01896 static int calc_density_bounds(const AtomSel *sel, MoleculeList *mlist,
01897                                int verbose,
01898                                int quality, float radscale, float gridspacing,
01899                                float &maxrad, float *origin, 
01900                                float *xaxis, float *yaxis, float *zaxis,
01901                                int3 & volsz) {
01902   Molecule *m = mlist->mol_from_id(sel->molid());
01903   const float *atompos = sel->coordinates(mlist);
01904   const float *atomradii = m->extraflt.data("radius");
01905 
01906   vec_zero(origin);
01907   vec_zero(xaxis);
01908   vec_zero(yaxis);
01909   vec_zero(zaxis);
01910  
01911   // Query min/max atom radii for the entire molecule
01912   float minrad = 1.0f;
01913   maxrad = 1.0f;
01914   m->get_radii_minmax(minrad, maxrad);
01915 
01916   float fmin[3], fmax[3];
01917   minmax_selected_3fv_aligned(atompos, sel->on, sel->num_atoms,
01918                               sel->firstsel, sel->lastsel, fmin, fmax);
01919 
01920   float minx, miny, minz, maxx, maxy, maxz;
01921   minx = fmin[0]; miny = fmin[1]; minz = fmin[2];
01922   maxx = fmax[0]; maxy = fmax[1]; maxz = fmax[2];
01923 
01924   if (verbose) {
01925     printf("DenBBox: rminmax %f %f  min: %f %f %f  max: %f %f %f\n",
01926            minrad, maxrad, minx, miny, minz, maxx, maxy, maxz);
01927   }
01928 
01929   float mincoord[3], maxcoord[3];
01930   mincoord[0] = minx; mincoord[1] = miny; mincoord[2] = minz;
01931   maxcoord[0] = maxx; maxcoord[1] = maxy; maxcoord[2] = maxz;
01932 
01933   // crude estimate of the grid padding we require to prevent the
01934   // resulting isosurface from being clipped
01935   float gridpadding = radscale * maxrad * 1.70f;
01936   float padrad = gridpadding;
01937   padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad);
01938   gridpadding = MAX(gridpadding, padrad);
01939 
01940   if (verbose) {
01941     printf("MDFFden: R*%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n",
01942            radscale, gridspacing, gridpadding, minrad, maxrad);
01943   }
01944 
01945   mincoord[0] -= gridpadding;
01946   mincoord[1] -= gridpadding;
01947   mincoord[2] -= gridpadding;
01948   maxcoord[0] += gridpadding;
01949   maxcoord[1] += gridpadding;
01950   maxcoord[2] += gridpadding;
01951 
01952   // compute the real grid dimensions from the selected atoms
01953   volsz.x = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing);
01954   volsz.y = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing);
01955   volsz.z = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing);
01956 
01957   // recalc the grid dimensions from rounded/padded voxel counts
01958   xaxis[0] = (volsz.x-1) * gridspacing;
01959   yaxis[1] = (volsz.y-1) * gridspacing;
01960   zaxis[2] = (volsz.z-1) * gridspacing;
01961   maxcoord[0] = mincoord[0] + xaxis[0];
01962   maxcoord[1] = mincoord[1] + yaxis[1];
01963   maxcoord[2] = mincoord[2] + zaxis[2];
01964 
01965   if (verbose) {
01966     printf("  GridSZ: (%4d %4d %4d)  BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n",
01967            volsz.x, volsz.y, volsz.z,
01968            mincoord[0], mincoord[1], mincoord[2],
01969            maxcoord[0], maxcoord[1], maxcoord[2]);
01970   }
01971 
01972   vec_copy(origin, mincoord);
01973   return 0;
01974 }
01975 
01976 
01977 static int map_uniform_spacing(double xax, double yax, double zax,
01978                                int szx, int szy, int szz) {
01979   // compute grid spacing values
01980   double xdelta = xax / (szx - 1);
01981   double ydelta = yax / (szy - 1);
01982   double zdelta = zax / (szz - 1);
01983 
01984   if ((xdelta == ydelta) && (ydelta == zdelta))
01985     return 1;
01986 
01987   // if we have less than 0.00001% relative error, that seems okay to treat
01988   // as "uniform" spacing too.
01989   const double relerrlim = 1e-7;
01990   double dxydelta = fabs(xdelta-ydelta); 
01991   double dyzdelta = fabs(ydelta-zdelta); 
01992   double dxzdelta = fabs(xdelta-zdelta); 
01993   if (((dxydelta / xdelta) < relerrlim) && ((dxydelta / xdelta) < relerrlim) &&
01994       ((dyzdelta / ydelta) < relerrlim) && ((dyzdelta / zdelta) < relerrlim) &&
01995       ((dxzdelta / xdelta) < relerrlim) && ((dxzdelta / zdelta) < relerrlim))
01996     return 1;
01997  
01998   return 0;
01999 }
02000                            
02001 
02002 static int calc_density_bounds_overlapping_map(int verbose, float &gridspacing,
02003                       float *origin, float *xaxis, float *yaxis, float *zaxis,
02004                       int3 &volsz, int3 &refoffset, 
02005                       const VolumetricData * refmap) {
02006   // compute grid spacing values
02007   float xdelta = xaxis[0] / (volsz.x - 1);
02008   float ydelta = yaxis[1] / (volsz.y - 1);
02009   float zdelta = zaxis[2] / (volsz.z - 1);
02010 
02011   if (verbose) {
02012     printf("calc_overlap: delta %f %f %f, gs: %f vsz: %d %d %d\n",
02013            xdelta, ydelta, zdelta, gridspacing, volsz.x, volsz.y, volsz.z);
02014 
02015     if (gridspacing != xdelta) {
02016       printf("calc_overlap: WARNING grid spacing != ref map spacing: (%f != %f)\n", gridspacing, xdelta);
02017     }
02018   }
02019 
02020   float refxdelta = float(refmap->xaxis[0] / (refmap->xsize - 1));
02021   float refydelta = float(refmap->yaxis[1] / (refmap->ysize - 1));
02022   float refzdelta = float(refmap->zaxis[2] / (refmap->zsize - 1));
02023   if (verbose) {
02024     printf("calc_overlap: refdelta %f %f %f, gs: %f vsz: %d %d %d\n",
02025            refxdelta, refydelta, refzdelta, gridspacing, 
02026            refmap->xsize, refmap->ysize, refmap->zsize);
02027   }
02028 
02029   // compute new origin, floor of the refmap nearest voxel coord
02030   float fvoxorigin[3];
02031   fvoxorigin[0] = float((origin[0] - refmap->origin[0]) / refxdelta); 
02032   fvoxorigin[1] = float((origin[1] - refmap->origin[1]) / refydelta); 
02033   fvoxorigin[2] = float((origin[2] - refmap->origin[2]) / refzdelta); 
02034 
02035   refoffset.x = int((fvoxorigin[0] < 0) ? 0 : floor(fvoxorigin[0])); 
02036   refoffset.y = int((fvoxorigin[1] < 0) ? 0 : floor(fvoxorigin[1])); 
02037   refoffset.z = int((fvoxorigin[2] < 0) ? 0 : floor(fvoxorigin[2])); 
02038   if (verbose) {
02039     printf("calc_overlap: refoffset: %d %d %d\n", 
02040            refoffset.x, refoffset.y, refoffset.z);
02041   }
02042 
02043   float maxcorner[3];
02044   maxcorner[0] = origin[0] + xaxis[0];
02045   maxcorner[1] = origin[1] + yaxis[1];
02046   maxcorner[2] = origin[2] + zaxis[2];
02047 
02048   float refmaxcorner[3];
02049   refmaxcorner[0] = float(refmap->origin[0] + refmap->xaxis[0]);
02050   refmaxcorner[1] = float(refmap->origin[1] + refmap->yaxis[1]);
02051   refmaxcorner[2] = float(refmap->origin[2] + refmap->zaxis[2]);
02052 
02053   maxcorner[0] = (maxcorner[0] > refmaxcorner[0]) ? refmaxcorner[0] : maxcorner[0]; 
02054   maxcorner[1] = (maxcorner[1] > refmaxcorner[1]) ? refmaxcorner[1] : maxcorner[1]; 
02055   maxcorner[2] = (maxcorner[2] > refmaxcorner[2]) ? refmaxcorner[2] : maxcorner[2]; 
02056 
02057   origin[0] = float(refmap->origin[0] + refoffset.x * refxdelta);
02058   origin[1] = float(refmap->origin[1] + refoffset.y * refydelta);
02059   origin[2] = float(refmap->origin[2] + refoffset.z * refzdelta);
02060 
02061   volsz.x = (int) round((maxcorner[0] - origin[0]) / refxdelta)+1;
02062   volsz.y = (int) round((maxcorner[1] - origin[1]) / refydelta)+1;
02063   volsz.z = (int) round((maxcorner[2] - origin[2]) / refzdelta)+1;
02064 
02065   // compute new corner
02066   xaxis[0] = ((volsz.x-1) * refxdelta);
02067   yaxis[1] = ((volsz.y-1) * refydelta);
02068   zaxis[2] = ((volsz.z-1) * refzdelta);
02069 
02070   return 0;
02071 }
02072 
02073 
02074 
02075 static float *build_xyzr_from_coords(const AtomSel *sel, const float *atompos,
02076                                      const float *atomradii, const float *origin) {
02077   // build compacted lists of bead coordinates, radii, and colors
02078   int ind = sel->firstsel * 3;
02079   int ind4=0;
02080 
02081   float *xyzr = (float *) malloc(sel->selected * sizeof(float) * 4);
02082 
02083   // build compacted lists of atom coordinates and radii only
02084   int i;
02085   for (i=sel->firstsel; i <= sel->lastsel; i++) {
02086     if (sel->on[i]) {
02087       const float *fp = atompos + ind;
02088       xyzr[ind4    ] = fp[0]-origin[0];
02089       xyzr[ind4 + 1] = fp[1]-origin[1];
02090       xyzr[ind4 + 2] = fp[2]-origin[2];
02091       xyzr[ind4 + 3] = atomradii[i];
02092       ind4 += 4;
02093     }
02094     ind += 3;
02095   }
02096 
02097   return xyzr;
02098 }
02099                                
02100 
02101 
02102 static float *build_xyzr_from_sel(const AtomSel *sel, MoleculeList *mlist,
02103                                   const float *origin) {
02104   Molecule *m = mlist->mol_from_id(sel->molid());
02105   const float *atompos = sel->coordinates(mlist);
02106   const float *atomradii = m->extraflt.data("radius");
02107 
02108   float *xyzr = build_xyzr_from_coords(sel, atompos, atomradii, origin);
02109 
02110   return xyzr;
02111 }
02112 
02113 
02114 //
02115 // Compute a simulated density map for a given atom selection and
02116 // reference density map.
02117 //
02118 int vmd_cuda_calc_density(const AtomSel *sel, MoleculeList *mlist, 
02119                           int quality, float radscale, float gridspacing, 
02120                           VolumetricData ** synthvol, 
02121                           const VolumetricData * refmap,
02122                           VolumetricData ** diffvol, 
02123                           int verbose) {
02124   wkf_timerhandle timer = wkf_timer_create();
02125   wkf_timer_start(timer);
02126 
02127   float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02128   int3 volsz   = make_int3(1, 1, 1);
02129   int3 refsz   = make_int3(1, 1, 1);
02130   int3 acvolsz = make_int3(1, 1, 1);
02131   int3 refoffset = make_int3(0, 0, 0);
02132 
02133   if (refmap) {
02134     gridspacing = float(refmap->xaxis[0] / (refmap->xsize - 1));
02135     if (verbose) {
02136       printf("refmap gridspacing: %f\n", gridspacing);
02137     }
02138   } 
02139 
02140   calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02141                       maxrad, origin, xaxis, yaxis, zaxis, volsz);
02142   vec_copy(acorigin, origin);
02143   acvolsz = volsz;
02144 
02145   if (verbose) {
02146     printf("dmap init orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02147            origin[0], origin[1], origin[2],
02148            xaxis[0], yaxis[1], zaxis[2], 
02149            volsz.x, volsz.y, volsz.z);
02150   }
02151 
02152   if (refmap) {
02153     calc_density_bounds_overlapping_map(verbose, gridspacing, origin, 
02154                                         xaxis, yaxis, zaxis, volsz, 
02155                                         refoffset, refmap);
02156 
02157     if (verbose) {
02158       printf("ref  dvol orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02159              refmap->origin[0], refmap->origin[1], refmap->origin[2],
02160              refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2], 
02161              refmap->xsize, refmap->ysize, refmap->zsize);
02162 
02163       printf("dmap rmap orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02164              origin[0], origin[1], origin[2],
02165              xaxis[0], yaxis[1], zaxis[2], 
02166              volsz.x, volsz.y, volsz.z);
02167     }
02168   }
02169 
02170   float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02171                                       origin[1] - acorigin[1],
02172                                       origin[2] - acorigin[2]);
02173 
02174   float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02175   float gausslim = gausslim_quality(quality); // set gaussian cutoff radius
02176 
02177   double pretime = wkf_timer_timenow(timer);
02178   float *volmap = new float[volsz.x*volsz.y*volsz.z];
02179 
02180   int rc=0;
02181   if (!diffvol) {
02182     rc = vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz,
02183                                     xyzr, volmap, volsz, 
02184                                     maxrad, radscale, gridspacing, gausslim,
02185                                     refsz, refoffset, NULL, NULL,
02186                                     verbose);
02187     // fall back to CPU if GPU routines fail
02188     if (rc) {
02189       free(xyzr);
02190       delete [] volmap;
02191       return -1;
02192     }
02193   } else {
02194     // emit a difference map
02195     float *diffmap = new float[volsz.x*volsz.y*volsz.z];
02196 
02197     refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02198     rc = vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz,
02199                                     xyzr, volmap, volsz, 
02200                                     maxrad, radscale, gridspacing, gausslim,
02201                                     refsz, refoffset, refmap->data, diffmap,
02202                                     verbose);
02203 
02204     // fall back to CPU if GPU routines fail
02205     if (rc) {
02206       free(xyzr);
02207       delete [] volmap;
02208       delete [] diffmap;
02209       return -1;
02210     }
02211 
02212     char diffdataname[64] = { "mdff density difference map" };
02213     *diffvol = new VolumetricData(diffdataname, origin, xaxis, yaxis, zaxis,
02214                                   volsz.x, volsz.y, volsz.z, diffmap);
02215   }
02216 
02217   char dataname[64] = { "mdff synthetic density map" };
02218   *synthvol = new VolumetricData(dataname, origin, xaxis, yaxis, zaxis,
02219                               volsz.x, volsz.y, volsz.z, volmap);
02220 
02221   double voltime = wkf_timer_timenow(timer);
02222   free(xyzr);
02223 
02224   if (verbose) {
02225     char strmsg[1024];
02226     sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f vol:%.3f]",
02227             voltime, pretime, voltime-pretime);
02228     msgInfo << strmsg << sendmsg;
02229   }
02230   wkf_timer_destroy(timer);
02231 
02232   return 0; 
02233 }
02234 
02235 
02236 
02237 #if 0
02238 static int vmd_calc_cc(const AtomSel *sel, MoleculeList *mlist, 
02239                        int quality, float radscale, float gridspacing, 
02240                        const VolumetricData * refmap,
02241                        int verbose, float ccthreshdensity, float &result_cc) {
02242   wkf_timerhandle timer = wkf_timer_create();
02243   wkf_timer_start(timer);
02244 
02245   cudaError_t err = cudaGetLastError();
02246   if (err != cudaSuccess)
02247     return -1;
02248 
02249   if (!refmap)
02250     return -1;
02251 
02252   float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02253   int3 volsz   = make_int3(1, 1, 1);
02254   int3 refsz   = make_int3(1, 1, 1);
02255   int3 acvolsz = make_int3(1, 1, 1);
02256   int3 refoffset = make_int3(0, 0, 0);
02257 
02258   gridspacing = refmap->xaxis[0] / (refmap->xsize - 1);
02259   calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02260                       maxrad, origin, xaxis, yaxis, zaxis, volsz);
02261   vec_copy(acorigin, origin);
02262   acvolsz = volsz;
02263 
02264   calc_density_bounds_overlapping_map(verbose, gridspacing, origin, 
02265                                       xaxis, yaxis, zaxis, volsz, 
02266                                       refoffset, refmap);
02267 
02268   float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02269                                       origin[1] - acorigin[1],
02270                                       origin[2] - acorigin[2]);
02271 
02272   float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02273   float gausslim = gausslim_quality(quality); // set gaussian cutoff radius
02274 
02275   double pretime = wkf_timer_timenow(timer);
02276   refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02277   
02278   if (refoffset.x >= refsz.x || refoffset.y >= refsz.y || refoffset.z >= refsz.z) {
02279     printf("MDFF CC: no overlap between synthetic map and reference map!\n");
02280     return -1;
02281   }
02282 
02283   // copy the reference density map to the GPU before we begin using it for
02284   // subsequent calculations...
02285   float *devrefmap = NULL;
02286   unsigned long refmemsz = refsz.x * refsz.y * refsz.z * sizeof(float);
02287   if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
02288     err = cudaGetLastError(); // eat error so next CUDA op succeeds
02289     return -1;
02290   }
02291   cudaMemcpy(devrefmap, refmap->data, refmemsz, cudaMemcpyHostToDevice);
02292 
02293   result_cc = 0.0f;
02294   vmd_cuda_cc_calc(sel->selected, acoriginoffset, acvolsz, xyzr, 
02295                    volsz, maxrad, radscale, gridspacing, 
02296                    gausslim, refsz, refoffset, devrefmap, 
02297                    ccthreshdensity, result_cc, origin, NULL,
02298                    verbose);
02299   cudaFree(devrefmap);
02300 
02301   double cctime = wkf_timer_timenow(timer);
02302   free(xyzr);
02303 
02304   if (verbose) {
02305     char strmsg[1024];
02306     sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f cc:%.3f]",
02307             cctime, pretime, cctime-pretime);
02308     msgInfo << strmsg << sendmsg;
02309   }
02310   wkf_timer_destroy(timer);
02311 
02312   return 0;
02313 }
02314 
02315 #endif
02316 
02317 
02318 //
02319 // Fast single-pass algorithm for computing a synthetic density map
02320 // for a given atom selection and reference map, and compute the 
02321 // cross correlation, optionally saving a spatially localized
02322 // cross correlation map, difference map, and simulated density map.
02323 //
02324 int vmd_cuda_compare_sel_refmap(const AtomSel *sel, MoleculeList *mlist, 
02325                                 int quality, float radscale, float gridspacing, 
02326                                 const VolumetricData * refmap,
02327                                 VolumetricData **synthvol, 
02328                                 VolumetricData **diffvol, 
02329                                 VolumetricData **spatialccvol, 
02330                                 float *CC, float ccthreshdensity,
02331                                 int verbose) {
02332   wkf_timerhandle timer = wkf_timer_create();
02333   wkf_timer_start(timer);
02334 
02335   if (!map_uniform_spacing(refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2],
02336                            refmap->xsize, refmap->ysize, refmap->zsize)) {
02337     if (verbose)
02338       printf("mdffi cc: non-uniform grid spacing unimplemented on GPU, falling back to CPU!\n");
02339 
02340     return -1;
02341   }
02342 
02343   cudaError_t err = cudaGetLastError();
02344   if (err != cudaSuccess)
02345     return -1;
02346 
02347   if (!refmap)
02348     return -1;
02349 
02350   float maxrad, acorigin[3], origin[3], xaxis[3], yaxis[3], zaxis[3];
02351   int3 volsz   = make_int3(1, 1, 1);
02352   int3 refsz   = make_int3(1, 1, 1);
02353   int3 acvolsz = make_int3(1, 1, 1);
02354   int3 refoffset = make_int3(0, 0, 0);
02355 
02356   if (verbose) {
02357     printf("vmd_cuda_compare_sel_refmap():\n");
02358     printf("  refmap xaxis: %f %f %f   ",
02359            refmap->xaxis[0], 
02360            refmap->xaxis[1], 
02361            refmap->xaxis[2]);
02362     printf("  refmap size: %d %d %d\n",
02363            refmap->xsize, refmap->ysize, refmap->zsize);
02364     printf("  gridspacing (orig): %f  ", gridspacing);
02365   } 
02366   gridspacing = float(refmap->xaxis[0] / (refmap->xsize - 1));
02367   if (verbose) {
02368     printf("(new): %f\n", gridspacing);
02369   }
02370 
02371   // if we get a bad computed grid spacing, we have to bail out or
02372   // we will end up segfaulting...
02373   if (gridspacing == 0.0) {
02374     if (verbose)
02375       printf("GPU gridspacing is zero! bailing out!\n");
02376     return -1;
02377   }
02378 
02379   calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02380                       maxrad, origin, xaxis, yaxis, zaxis, volsz);
02381   vec_copy(acorigin, origin);
02382   acvolsz = volsz;
02383 
02384   calc_density_bounds_overlapping_map(verbose, gridspacing, origin, 
02385                                       xaxis, yaxis, zaxis, volsz, 
02386                                       refoffset, refmap);
02387 
02388   float3 acoriginoffset = make_float3(origin[0] - acorigin[0],
02389                                       origin[1] - acorigin[1],
02390                                       origin[2] - acorigin[2]);
02391 
02392   float *xyzr = build_xyzr_from_sel(sel, mlist, acorigin);
02393   float gausslim = gausslim_quality(quality); // set gaussian cutoff radius
02394 
02395   double pretime = wkf_timer_timenow(timer);
02396   refsz = make_int3(refmap->xsize, refmap->ysize, refmap->zsize);
02397 
02398   if (refoffset.x >= refsz.x || refoffset.y >= refsz.y || refoffset.z >= refsz.z) {
02399     printf("MDFF CC: no overlap between synthetic map and reference map!\n");
02400     return -1;
02401   }
02402 
02403   // copy the reference density map to the GPU before we begin using it for
02404   // subsequent calculations...
02405   float *devrefmap = NULL;
02406   unsigned long refmemsz = refsz.x * refsz.y * refsz.z * sizeof(float);
02407   if (cudaMalloc((void**)&devrefmap, refmemsz) != cudaSuccess) {
02408     err = cudaGetLastError(); // eat error so next CUDA op succeeds
02409     return -1;
02410   }
02411   cudaMemcpy(devrefmap, refmap->data, refmemsz, cudaMemcpyHostToDevice);
02412 
02413   if (CC != NULL) {
02414     vmd_cuda_cc_calc(sel->selected, acoriginoffset, acvolsz, xyzr, 
02415                      volsz, maxrad, radscale, gridspacing, 
02416                      gausslim, refsz, refoffset, devrefmap,
02417                      ccthreshdensity, *CC, origin, spatialccvol,
02418                      verbose);
02419   }
02420 
02421   // generate/save synthetic density map if requested
02422   float *synthmap = NULL;
02423   float *diffmap = NULL;
02424   if (synthvol != NULL || diffvol != NULL) {
02425     if (synthvol != NULL)
02426       synthmap = new float[volsz.x*volsz.y*volsz.z];
02427 
02428     if (diffvol != NULL)
02429       diffmap = new float[volsz.x*volsz.y*volsz.z];
02430 
02431     vmd_cuda_gaussdensity_calc(sel->selected, acoriginoffset, acvolsz, xyzr,
02432                                synthmap, volsz, maxrad, radscale, gridspacing,
02433                                gausslim, refsz, refoffset, refmap->data, diffmap,
02434                                verbose);
02435   }
02436 
02437 
02438   if (synthmap != NULL) {
02439     char mapname[64] = { "mdff synthetic density map" };
02440     *synthvol = new VolumetricData(mapname, origin, xaxis, yaxis, zaxis,
02441                                    volsz.x, volsz.y, volsz.z, synthmap);
02442   }
02443 
02444   if (diffmap != NULL) {
02445     char mapname[64] = { "MDFF density difference map" };
02446     *diffvol = new VolumetricData(mapname, origin, xaxis, yaxis, zaxis,
02447                                   volsz.x, volsz.y, volsz.z, diffmap);
02448   }
02449 
02450   cudaFree(devrefmap);
02451 
02452   double cctime = wkf_timer_timenow(timer);
02453   free(xyzr);
02454 
02455   if (verbose) {
02456     char strmsg[1024];
02457     sprintf(strmsg, "MDFF comp: %.3f [pre:%.3f cc:%.3f]",
02458             cctime, pretime, cctime-pretime);
02459     msgInfo << strmsg << sendmsg;
02460   }
02461   wkf_timer_destroy(timer);
02462 
02463   return 0; 
02464 }
02465 
02466 
02467 #if 0
02468 
02469 static int vmd_test_sumabsdiff(const AtomSel *sel, MoleculeList *mlist, 
02470                                int quality, float radscale, float gridspacing, 
02471                                const VolumetricData * refmap, int verbose) {
02472   wkf_timerhandle timer = wkf_timer_create();
02473   wkf_timer_start(timer);
02474 
02475   float maxrad, origin[3], xaxis[3], yaxis[3], zaxis[3];
02476   int3 volsz = make_int3(1, 1, 1);
02477   int3 refoffset = make_int3(0, 0, 0);
02478 
02479   calc_density_bounds(sel, mlist, verbose, quality, radscale, gridspacing,
02480                       maxrad, origin, xaxis, yaxis, zaxis, volsz);
02481 
02482   if (refmap) {
02483     printf("dmap init orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02484            origin[0], origin[1], origin[2],
02485            xaxis[0], yaxis[1], zaxis[2], 
02486            volsz.x, volsz.y, volsz.z);
02487 
02488     calc_density_bounds_overlapping_map(1, gridspacing, origin, 
02489                                         xaxis, yaxis, zaxis, volsz, 
02490                                         refoffset, refmap);
02491 
02492     printf("ref  dvol orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02493            refmap->origin[0], refmap->origin[1], refmap->origin[2],
02494            refmap->xaxis[0], refmap->yaxis[1], refmap->zaxis[2], 
02495            refmap->xsize, refmap->ysize, refmap->zsize);
02496 
02497     printf("dmap rmap orig: %f %f %f  axes: %f %f %f  sz: %d %d %d\n",
02498            origin[0], origin[1], origin[2],
02499            xaxis[0], yaxis[1], zaxis[2], 
02500            volsz.x, volsz.y, volsz.z);
02501   }
02502 
02503   float *xyzr = build_xyzr_from_sel(sel, mlist, origin);
02504   float gausslim = gausslim_quality(quality); // set gaussian cutoff radius
02505 
02506   double pretime = wkf_timer_timenow(timer);
02507 
02508 #if 0
02509   float *volmap = new float[volsz.x*volsz.y*volsz.z];
02510   char dataname[64] = { "mdff synthetic density map" };
02511 
02512   vmd_cuda_gaussdensity_calc(sel->selected, xyzr, volmap, volsz, 
02513                              maxrad, radscale, gridspacing, gausslim, verbose);
02514 
02515   *synthvol = new VolumetricData(dataname, origin, xaxis, yaxis, zaxis,
02516                                volsz.x, volsz.y, volsz.z, volmap);
02517 #endif
02518 
02519   double voltime = wkf_timer_timenow(timer);
02520   free(xyzr);
02521 
02522   if (verbose) {
02523     char strmsg[1024];
02524     sprintf(strmsg, "MDFFcc: %.3f [pre:%.3f vol:%.3f]",
02525             voltime, pretime, voltime-pretime);
02526     msgInfo << strmsg << sendmsg;
02527   }
02528   wkf_timer_destroy(timer);
02529 
02530   return 0; 
02531 }
02532 #endif
02533 
02534 
02535 
02536