CUDAMeasureQCP.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 /***************************************************************************
00009  * RCS INFORMATION:
00010  *
00011  *      $RCSfile: CUDAMeasureQCP.cu,v $
00012  *      $Author: johns $        $Locker:  $             $State: Exp $
00013  *      $Revision: 1.37 $      $Date: 2021/12/20 23:49:52 $
00014  *
00015  ***************************************************************************/
00026 #include <stdio.h>
00027 #include <stdlib.h>
00028 #include <string.h>
00029 #include <math.h>
00030 #include <cuda.h>
00031 
00032 #include "molfile_plugin.h"
00033 
00034 #include "Inform.h"
00035 #include "utilities.h"
00036 #include "WKFThreads.h"
00037 #include "WKFUtils.h"
00038 #include "CUDAKernels.h" 
00039 #include "Measure.h"
00040 
00041 //
00042 // Restrict macro to make it easy to do perf tuning tests
00043 //
00044 #if 1
00045 #define RESTRICT __restrict__
00046 #else
00047 #define RESTRICT
00048 #endif
00049 
00050 //#define VMDUSECUDAGDS 1
00051 #if defined(VMDUSECUDAGDS)
00052 #include </usr/local/gds-beta-0.7.1/lib/cufile.h>        // GPU-Direct Storage
00053 
00054 // direct calls to JS plugin for devel/testing until the plugin manager
00055 // and headers incorporate the new out-of-core GPU-direct I/O hooks.
00056 #define VMDJSPLUGININCLUDESRC 1
00057 #include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
00058 
00059 #if 1
00060 #define CUERR { cudaError_t err; \
00061   if ((err = cudaGetLastError()) != cudaSuccess) { \
00062   printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00063   printf("Thread aborting...\n"); \
00064   return NULL; }}
00065 #else
00066 #define CUERR
00067 #endif
00068 
00069 typedef struct {
00070   int nfiles;
00071   const char **trjfileset;
00072   jshandle **jshandles;
00073   CUfileHandle_t *cfh;
00074   int devcount;
00075   int natoms;
00076   const AtomSel *sel;
00077   int first;
00078   int last;
00079   int step;
00080   float *rmsdmat;
00081 } gpuqcprmsdmatoocthreadparms;
00082 #else
00083 typedef int fio_fd;
00084 #endif
00085 
00086 typedef struct {
00087   const AtomSel *sel;
00088   int first;
00089   int last;
00090   int step;
00091   float *rmsdmat;
00092   int padcnt;
00093   int framecrdsz;
00094   float *crds;
00095 } gpuqcprmsdthreadparms;
00096 
00097 
00098 //
00099 // Device global variable for block count used in single-pass
00100 // device-wide parallel reductions.
00101 //
00102 __device__ unsigned int glob_block_count = 0;
00103 
00104 
00105 //
00106 // Warp-wide sum reduction
00107 //
00108 template <typename T>
00109 __inline__ __device__ T warp_sum_reduction(T v) {
00110   for (int offset = warpSize/2; offset > 0; offset >>=1 ) {
00111 #if CUDART_VERSION >= 9000
00112     v += __shfl_down_sync(0xffffffff, v, offset);
00113 #else
00114     v += __shfl_down(v, offset); 
00115 #endif
00116   }
00117   return v;
00118 }
00119 
00120 
00121 //
00122 // Block-wide sum reduction
00123 // XXX would break with blockDim > 1024 (more than 32 warps) 
00124 //
00125 template <typename T>
00126 __inline__ __device__ T block_sum_reduction(T v) {
00127   static __shared__ T shared[32];
00128   int lane = threadIdx.x % warpSize;
00129   int wid = threadIdx.x / warpSize;
00130 
00131   v = warp_sum_reduction(v);
00132 
00133   __syncthreads(); // needed when called multiple times in a row
00134 
00135   if (lane == 0)
00136     shared[wid] = v;
00137 
00138   __syncthreads();
00139 
00140   if (threadIdx.x < blockDim.x / warpSize) {
00141     v = shared[lane];
00142   } else {
00143     v = 0;
00144   }
00145 
00146   if (wid == 0)
00147     v = warp_sum_reduction(v);
00148 
00149   return v;
00150 }
00151 
00152 
00153 // 
00154 // convert AOS (float3) coordinate layout to SOA (xxxx, yyyy, zzzz).
00155 // 
00156 __global__ static void 
00157 vmd_float3_aos_to_soa(int natoms, float3 *xyz,
00158                       float *crdx, float *crdy, float *crdz) {
00159   unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
00160   unsigned int tcnt = gridDim.x * blockDim.x;
00161   int i;
00162   for (i=tid; i<natoms; i+=tcnt) {
00163     float3 crd = xyz[i];
00164     crdx[i] = crd.x;
00165     crdy[i] = crd.y;
00166     crdz[i] = crd.z;
00167   }
00168 }
00169 
00170 
00171 // 
00172 // convert selected AOS (float3) coordinate layout to SOA (xxxx, yyyy, zzzz).
00173 // 
00174 __global__ static void 
00175 vmd_float3_aos_to_soa_selected(int natoms, int *idxmap, float3 *xyz,
00176                                float *crdx, float *crdy, float *crdz) {
00177   unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
00178   unsigned int tcnt = gridDim.x * blockDim.x;
00179   int i;
00180   for (i=tid; i<natoms; i+=tcnt) {
00181     int dst = idxmap[i];
00182     if (dst > 0) {
00183       float3 crd = xyz[i];
00184       crdx[dst] = crd.x;
00185       crdy[dst] = crd.y;
00186       crdz[dst] = crd.z;
00187     }
00188   }
00189 }
00190 
00191 
00192 int * vmd_gpu_selection_indexlist(const AtomSel *sel) {
00193   int i, j;
00194   int *cpuidxlist = new int[sel->selected];
00195 
00196   for (j=0,i=sel->firstsel; i<=sel->lastsel; i++) {
00197     if (sel->on[i]) {
00198       cpuidxlist[j] = i;
00199       j++; 
00200     }
00201   } 
00202 
00203   int *gpuidxlist = NULL;
00204   cudaMalloc((void**) &gpuidxlist, sel->selected * sizeof(int));
00205   cudaMemcpy(gpuidxlist, cpuidxlist, sel->selected * sizeof(int),
00206              cudaMemcpyHostToDevice);
00207   delete [] cpuidxlist;
00208 
00209   return gpuidxlist;
00210 }
00211 
00212 
00213 //
00214 // Device-wide QCP inner product kernel 
00215 //   Notes:  This kernel is designed to compute a QCP inner product for
00216 //   a single pairwise RMSD between two "large" structures, where "large"
00217 //   means that we have sufficient parallelism (due to the total number of
00218 //   atoms) to completely saturate GPU with work.  
00219 //   Since the algorithm does linear work and only a relatively small number 
00220 //   of FLOPS per atomic coordinate pair, this "large structure" case 
00221 //   is inherently memory bandwidth bound, at least in isolation.  There may
00222 //   an L2 cache performance benefit associated with scheduling back-to-back 
00223 //   trajectory frame pair RMSD calculations where one of the frames is held
00224 //   constant while looping over several others, but we could do much
00225 //   better by writing a special-case kernel that would incorporate looping
00226 //   internally within each thread block, thereby keeping one of the two 
00227 //   coordinate sets entirely resident in machine registers, so long as the
00228 //   problem was small enough to fit within the maximum CUDA 1-D grid size.
00229 //
00230 __global__ static void 
00231 vmd_qcp_innerprod_soa_devicewide(double *pr,
00232                                  const float * RESTRICT crdx1, 
00233                                  const float * RESTRICT crdy1, 
00234                                  const float * RESTRICT crdz1,
00235                                  const float * RESTRICT crdx2, 
00236                                  const float * RESTRICT crdy2, 
00237                                  const float * RESTRICT crdz2,
00238                                  const int natoms, 
00239                                  const float * RESTRICT weight) {
00240   unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
00241   unsigned int tcnt = gridDim.x * blockDim.x;
00242 
00243   __shared__ int isLastBlock[1];
00244   if(threadIdx.x == 0) {
00245     isLastBlock[0] = 0;
00246   }
00247   __syncthreads();
00248 
00249   double G, a0, a1, a2, a3, a4, a5, a6, a7, a8;
00250   G=a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00251 
00252   int start = tid; 
00253   int stop  = natoms;
00254   if (weight != NULL) {
00255     for (int i=start; i<stop; i+=tcnt) {
00256       float fw = weight[i];
00257       float fx1 = crdx1[i];
00258       float fy1 = crdy1[i];
00259       float fz1 = crdz1[i];
00260 
00261       float fx2 = crdx2[i];
00262       float fy2 = crdy2[i];
00263       float fz2 = crdz2[i];
00264 
00265       G += fw * ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));
00266 
00267       a0 += fx1 * fx2;
00268       a1 += fx1 * fy2;
00269       a2 += fx1 * fz2;
00270 
00271       a3 += fy1 * fx2;
00272       a4 += fy1 * fy2;
00273       a5 += fy1 * fz2;
00274 
00275       a6 += fz1 * fx2;
00276       a7 += fz1 * fy2;
00277       a8 += fz1 * fz2;
00278     }
00279   } else {
00280     for (int i=start; i<stop; i+=tcnt) {
00281       float fx1 = crdx1[i];
00282       float fy1 = crdy1[i];
00283       float fz1 = crdz1[i];
00284 
00285       float fx2 = crdx2[i];
00286       float fy2 = crdy2[i];
00287       float fz2 = crdz2[i];
00288 
00289       G += ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));
00290 
00291       a0 += fx1 * fx2;
00292       a1 += fx1 * fy2;
00293       a2 += fx1 * fz2;
00294 
00295       a3 += fy1 * fx2;
00296       a4 += fy1 * fy2;
00297       a5 += fy1 * fz2;
00298 
00299       a6 += fz1 * fx2;
00300       a7 += fz1 * fy2;
00301       a8 += fz1 * fz2;
00302     }
00303   }
00304 
00305   __syncthreads();
00306 
00307   G *= 0.5; 
00308 
00309   // block-wide sum reduction of inner product
00310   double bG  = block_sum_reduction(G);
00311   double ba0 = block_sum_reduction(a0);
00312   double ba1 = block_sum_reduction(a1);
00313   double ba2 = block_sum_reduction(a2);
00314   double ba3 = block_sum_reduction(a3);
00315   double ba4 = block_sum_reduction(a4);
00316   double ba5 = block_sum_reduction(a5);
00317   double ba6 = block_sum_reduction(a6);
00318   double ba7 = block_sum_reduction(a7);
00319   double ba8 = block_sum_reduction(a8);
00320  
00321   __syncthreads();
00322 
00323   // thread 0 in each block writes out per-block partial sums
00324   if (threadIdx.x == 0) {
00325     pr[(0*gridDim.x)+blockIdx.x] = ba0;
00326     pr[(1*gridDim.x)+blockIdx.x] = ba1;
00327     pr[(2*gridDim.x)+blockIdx.x] = ba2;
00328     pr[(3*gridDim.x)+blockIdx.x] = ba3;
00329     pr[(4*gridDim.x)+blockIdx.x] = ba4;
00330     pr[(5*gridDim.x)+blockIdx.x] = ba5;
00331     pr[(6*gridDim.x)+blockIdx.x] = ba6;
00332     pr[(7*gridDim.x)+blockIdx.x] = ba7;
00333     pr[(8*gridDim.x)+blockIdx.x] = ba8;
00334     pr[(9*gridDim.x)+blockIdx.x] = bG;
00335 
00336     __threadfence(); // ensure all prior memory writes post before continuing
00337 
00338     // increment atomic counter of number of blocks that have finished
00339     // their work, so that the last block to finish knows to do the final
00340     // parallel reduction of all of the individual per-block partial sums
00341     unsigned int old_block_count = atomicInc(&glob_block_count, gridDim.x);
00342     isLastBlock[0] = ( old_block_count == (gridDim.x -1) );
00343   }
00344 
00345   __syncthreads();
00346 
00347   // the last block performs the final reduction of all of the individual
00348   // per-block partial sums
00349   if (isLastBlock[0]) {
00350     glob_block_count = 0;
00351     __threadfence(); // ensure block_count memory write posts before continuing
00352 
00353     // each thread loops over all 10 items doing the individual reductions
00354     for (int l=0; l < 10; ++l) {
00355       double *pr_a = pr+(l*gridDim.x);
00356 
00357       // each thread reduces all 10 items
00358       float sum = 0; 
00359       for (int j = threadIdx.x; j < gridDim.x; j += blockDim.x) {
00360         sum += pr_a[j];
00361       }
00362 
00363       sum = block_sum_reduction(sum);
00364       if (threadIdx.x==0)
00365         pr_a[0]=sum;
00366     }
00367   }
00368 }
00369 
00370 
00371 //
00372 // Single-thread-block-per-structure-pair QCP inner product kernel 
00373 //   Notes:  This kernel is designed to compute QCP inner products for
00374 //   many pairwise RMSDs between "small" structures, where "small"
00375 //   means that we have sufficient parallelism (due to the total number of
00376 //   atoms) to supply a single CUDA thread block with sufficient work for
00377 //   roughly 256 threads per thread block.
00378 //   Since the algorithm does linear work and only a relatively small number 
00379 //   of FLOPS per atomic coordinate pair, the "small structure" case 
00380 //   is memory bandwidth bound in the worst case, but L1/L2 cache bandwidth
00381 //   bound in the best cases.  
00382 //   There is an opportunity to write a special case variant of this kernel
00383 //   that stores the atomic coordinates for one of the structures in machine
00384 //   registers such that they could be repeatedly reused, but this creates
00385 //   some additional challenges in performing parallel sum reductions.
00386 //
00387 __global__ static void 
00388 vmd_qcp_innerprod_soa_blockperpair(double *results,
00389                                    const float * RESTRICT crdx1, 
00390                                    const float * RESTRICT crdy1, 
00391                                    const float * RESTRICT crdz1,
00392                                    const float * RESTRICT crdx2, 
00393                                    const float * RESTRICT crdy2, 
00394                                    const float * RESTRICT crdz2,
00395                                    const int natoms, 
00396                                    const float * RESTRICT weight,
00397                                    const int num_structs) {
00398   unsigned int tid  = (blockIdx.x * blockDim.x) + threadIdx.x;
00399   unsigned int tcnt = gridDim.x * blockDim.x;
00400 
00401   double G, a0, a1, a2, a3, a4, a5, a6, a7, a8;
00402 
00403   // the grid of thread blocks loop over all of the structures
00404   for (int struct_id = blockIdx.x; struct_id < num_structs; struct_id+=gridDim.x) {
00405     G=a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00406     int struct_offset = struct_id * natoms * sizeof(float);
00407     int start = struct_offset + tid;
00408     int stop  = struct_offset + natoms;
00409     if (weight != NULL) {
00410       for (int i=start; i<stop; i+=tcnt) {
00411         float fw = weight[i];
00412         float fx1 = crdx1[i];
00413         float fy1 = crdy1[i];
00414         float fz1 = crdz1[i];
00415 
00416         float fx2 = crdx2[i];
00417         float fy2 = crdy2[i];
00418         float fz2 = crdz2[i];
00419 
00420         G += fw * ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));
00421 
00422         a0 += fx1 * fx2;
00423         a1 += fx1 * fy2;
00424         a2 += fx1 * fz2;
00425 
00426         a3 += fy1 * fx2;
00427         a4 += fy1 * fy2;
00428         a5 += fy1 * fz2;
00429 
00430         a6 += fz1 * fx2;
00431         a7 += fz1 * fy2;
00432         a8 += fz1 * fz2;
00433       }
00434     } else {
00435       for (int i=start; i<stop; i+=tcnt) {
00436         float fx1 = crdx1[i];
00437         float fy1 = crdy1[i];
00438         float fz1 = crdz1[i];
00439 
00440         float fx2 = crdx2[i];
00441         float fy2 = crdy2[i];
00442         float fz2 = crdz2[i];
00443 
00444         G += ((fx1*fx1 + fy1*fy1 + fz1*fz1) + (fx2*fx2 + fy2*fy2 + fz2*fz2));
00445 
00446         a0 += fx1 * fx2;
00447         a1 += fx1 * fy2;
00448         a2 += fx1 * fz2;
00449 
00450         a3 += fy1 * fx2;
00451         a4 += fy1 * fy2;
00452         a5 += fy1 * fz2;
00453 
00454         a6 += fz1 * fx2;
00455         a7 += fz1 * fy2;
00456         a8 += fz1 * fz2;
00457       }
00458     }
00459 
00460     __syncthreads();
00461 
00462     G *= 0.5; 
00463 
00464     // block-wide sum reduction of inner product
00465     double bG  = block_sum_reduction(G);
00466     double ba0 = block_sum_reduction(a0);
00467     double ba1 = block_sum_reduction(a1);
00468     double ba2 = block_sum_reduction(a2);
00469     double ba3 = block_sum_reduction(a3);
00470     double ba4 = block_sum_reduction(a4);
00471     double ba5 = block_sum_reduction(a5);
00472     double ba6 = block_sum_reduction(a6);
00473     double ba7 = block_sum_reduction(a7);
00474     double ba8 = block_sum_reduction(a8);
00475 
00476     __syncthreads();
00477 
00478     // thread 0 in each block writes out per-block partial sums
00479     if (threadIdx.x == 0) {
00480       int addr = struct_id * 10;
00481       results[addr    ] = ba0;
00482       results[addr + 1] = ba1;
00483       results[addr + 2] = ba2;
00484       results[addr + 3] = ba3;
00485       results[addr + 4] = ba4;
00486       results[addr + 5] = ba5;
00487       results[addr + 6] = ba6;
00488       results[addr + 7] = ba7;
00489       results[addr + 8] = ba8;
00490       results[addr + 9] = bG;
00491     }
00492   }
00493 }
00494 
00495 
00496 // compute lower-triangular indices i,j from linear array index
00497 static int idx2sub_tril(long N, long ind, long *J, long *I) {
00498   if (ind > (N*(N+1)/2)) {
00499     return -1; // out of bounds
00500   }
00501 
00502 #if 1
00503   long i = long(N - 2 - floor(sqrt(-8*ind + 4*N*(N-1)-7)/2.0 - 0.5));
00504   long j = ind + i + 1 - N*(N-1)/2 + (N-i)*((N-i)-1)/2;
00505 #elif 0
00506   long i, j;
00507   // j = ceil(0.5*(2*N+1 - sqrt(-8*ind + 1 + 4*N*(N+1))));
00508   // i = ind - (j-1)*N + j*(j-1)/2;
00509 
00510   // i = floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2);
00511   // XXX deal with ambiguous types for sqrt() on Solaris
00512   double tmp2np1 = 2*N+1;
00513   i = floor((tmp2np1 - sqrt(tmp2np1*tmp2np1 - 8.0*ind)) / 2);
00514   j = ind - N*i + i*(i-1)/2 + i;
00515 #endif
00516 
00517   *I = i;
00518   *J = j;
00519 
00520   return 0;
00521 }
00522 
00523 
00524 static void * measure_rmsdmat_qcp_thread(void *voidparms) {
00525   int threadid;
00526   gpuqcprmsdthreadparms *parms = NULL;
00527   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00528   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00529 
00530   //
00531   // copy in per-thread parameters
00532   //
00533   float *rmsdmat = parms->rmsdmat;
00534 
00535 #if 0
00536   int framecrdsz = parms->framecrdsz;
00537   float *crds = parms->crds;
00538 #endif
00539   const AtomSel *sel = parms->sel;
00540   int first  = parms->first;
00541   int last   = parms->last;
00542   int step   = parms->step;
00543 
00544 #if 1
00545   printf("QCP GPU[%d] running...\n", threadid);
00546 #endif
00547 
00548   int framecount = (last - first + 1) / step;
00549 
00550   wkf_tasktile_t tile;
00551   while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
00552     long idx;
00553 
00554     for (idx=tile.start; idx<tile.end; idx++) {
00555       long i, j;
00556 
00557       // compute i,j from idx...
00558       // only compute off-diagonal elements, so we use (framecount-1)
00559       if (idx2sub_tril(framecount-1, idx, &i, &j)) {
00560         printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
00561         break;
00562       }
00563 
00564       // calculate the (weighted) inner product of two structures
00565       double A[9];
00566       double E0 = 0;
00567 
00568 #if 0
00569       float *xj = crds + (j * 3 * framecrdsz);
00570       float *yj = xj + framecrdsz;
00571       float *zj = xj + framecrdsz*2;
00572 
00573       float *xi = crds + (i * 3 * framecrdsz);
00574       float *yi = xi + framecrdsz;
00575       float *zi = xi + framecrdsz*2;
00576 
00577       E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
00578                            sel->selected, NULL /* weight */);
00579 #endif
00580 
00581       // calculate the RMSD & rotational matrix
00582       FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
00583                               E0, sel->selected, -1);
00584     }
00585   }
00586 
00587   return NULL;
00588 }
00589 
00590 
00591 int qcp_soa_gpu(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
00592                 const AtomSel *sel, float *crds, int framecrdsz, int padcnt,
00593                 int first, int last, int step, int framecount,
00594                 float *rmsdmat) {
00595   //
00596   // copy in per-thread parameters
00597   //
00598   gpuqcprmsdthreadparms parms;
00599   memset(&parms, 0, sizeof(parms));
00600 
00601   parms.sel = sel;
00602   parms.rmsdmat = rmsdmat;
00603   parms.padcnt = padcnt;
00604   parms.framecrdsz = framecrdsz;
00605   parms.crds = crds;
00606   parms.first = first;
00607   parms.last = last;
00608   parms.step = step;
00609 
00610   // spawn child threads to do the work
00611   wkf_tasktile_t tile;
00612   tile.start=0;
00613   tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
00614 
00615 #if 1
00616   wkf_threadpool_sched_dynamic(devpool, &tile);
00617   wkf_threadpool_launch(devpool, measure_rmsdmat_qcp_thread, &parms, 1);
00618 #else
00619   int i, j;
00620   for (j=0; j<framecount; j++) {
00621     float *xj = crds + (j * 3 * framecrdsz);
00622     float *yj = xj + framecrdsz;
00623     float *zj = xj + framecrdsz*2;
00624     for (i=0; i<j; i++) {
00625       // calculate the (weighted) inner product of two structures
00626       double A[9];
00627 
00628       float *xi = crds + (i * 3 * framecrdsz);
00629       float *yi = xi + framecrdsz;
00630       float *zi = xi + framecrdsz*2;
00631 
00632       double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi,
00633                                   sel->selected, NULL /* weight */);
00634 
00635       // calculate the RMSD & rotational matrix
00636       FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i],
00637                               E0, sel->selected, -1);
00638     }
00639   }
00640 #endif
00641 
00642   return 0;
00643 }  
00644 
00645 
00646 
00647 #if defined(VMDUSECUDAGDS)
00648 #define VMDGDSMAXFRAMEBUF     8
00649 static void * measure_rmsdmat_qcp_ooc_thread(void *voidparms) {
00650   int threadid;
00651   gpuqcprmsdmatoocthreadparms *parms = NULL;
00652   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00653   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00654 
00655   //
00656   // copy in per-thread parameters
00657   //
00658   float *rmsdmat = parms->rmsdmat;
00659 
00660   int nfiles = parms->nfiles;
00661   int natoms = parms->natoms;
00662   const AtomSel *sel = parms->sel;
00663   int first  = parms->first;
00664   int last   = parms->last;
00665   int step   = parms->step;
00666 
00667   int usecufile = 1;
00668   fio_fd *hostfds = NULL;
00669   int pinhostiobuffer = 1;
00670   if (getenv("VMDGDSHOSTNOPIN")) {
00671     pinhostiobuffer=0; 
00672   }
00673 
00674   if (getenv("VMDGDSUSEHOST")) {
00675     usecufile=0;
00676     hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));
00677 
00678     int hostusedirectio = 1;
00679     if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
00680       hostusedirectio = 0;
00681  
00682     int openmode = FIO_READ;
00683     if (hostusedirectio) 
00684       openmode |= FIO_DIRECT;
00685 
00686     int i;
00687     for (i=0; i<nfiles; i++) {
00688       if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
00689         if (hostusedirectio) {
00690           printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n", 
00691                  threadid, parms->trjfileset[i]);
00692         } else {
00693           printf("Thr[%d] can't open file '%s'\n", 
00694                  threadid, parms->trjfileset[i]);
00695         }
00696         return NULL;
00697       }
00698 #if 0
00699       else {
00700         printf("Thr[%d]: fd[%d]: %d, '%s'\n", threadid, i, hostfds[i],
00701                parms->trjfileset[i]);
00702       }
00703 #endif
00704     }
00705   }
00706 
00707   if (hostfds && usecufile) {
00708     printf("Inconsistent cufile/hostfds state, aborting!\n");
00709     return NULL;
00710   }
00711 
00712   /* ensure we have a large enough allocation so we can align */
00713   /* the starting pointer to a blocksz page boundary          */
00714   long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
00715   long sz = 3L*sizeof(float)*natoms + blocksz;
00716 
00717   /* pad the allocation to an even multiple of the block size */
00718   size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
00719 
00720   int framecount = (last - first + 1) / step;
00721   int framesperfile = framecount / nfiles;
00722 
00723   if (threadid == 0) {
00724     printf("Thr[%2d] running frame: %d natoms: %d selected: %d\n", 
00725            framecount, threadid, natoms, sel->selected);
00726   }
00727 
00728   cudaError_t crc;
00729   cudaStream_t qcpstream;
00730   double *pr=NULL;
00731   float *devptr=NULL;
00732   float *hostptr=NULL;
00733   float *hostptr_unaligned=NULL;
00734 
00735   float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
00736   float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
00737   int *gpuidxlist = NULL;
00738   int multiframeio = 0;
00739   if (getenv("VMDGDSMULTIFRAME"))
00740     multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
00741   if (multiframeio > VMDGDSMAXFRAMEBUF)
00742     multiframeio = VMDGDSMAXFRAMEBUF;
00743 
00744   // set block sizes and counts for QCP calcs
00745   dim3 QCPBsz = dim3(256, 1, 1);
00746   dim3 QCPGsz = dim3((natoms + QCPBsz.x - 1) / QCPBsz.x, 1, 1);
00747 
00748   if (parms->devcount > 0) {
00749     long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;
00750 
00751     if (threadid == 0) {
00752       printf("Thr[%2d] Allocating QCP GPU timestep buf: %ld \n", 
00753              threadid, gpuallocsz);
00754     }
00755     crc = cudaMalloc((void**) &devptr, gpuallocsz);
00756 
00757     if (hostfds != NULL) {
00758       if (pinhostiobuffer) {
00759         crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
00760         CUERR
00761       } else {
00762         hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096, 
00763                                               (void**) &hostptr_unaligned);
00764       }
00765     }
00766 
00767     gpuidxlist = vmd_gpu_selection_indexlist(sel);
00768     long crdsz = sel->selected * sizeof(float);
00769     // 10x QCP sums
00770     crc = cudaMalloc((void**) &pr, 10 * sizeof(double) * (QCPGsz.x + 1));
00771 
00772     // atomic coord buffers
00773     crc = cudaMalloc((void**) &crdx1, crdsz);
00774     crc = cudaMalloc((void**) &crdy1, crdsz);
00775     crc = cudaMalloc((void**) &crdz1, crdsz);
00776     crc = cudaMalloc((void**) &crdx2, crdsz);
00777     crc = cudaMalloc((void**) &crdy2, crdsz);
00778     crc = cudaMalloc((void**) &crdz2, crdsz);
00779     if (crc != cudaSuccess) {
00780       printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
00781       return NULL; // XXX error handling needs to be done here
00782     }
00783 
00784     cudaStreamCreate(&qcpstream);
00785 
00786 #if defined(VMDUSECUDAGDS)
00787     cuFileBufRegister(devptr, gpuallocsz, 0);
00788 #endif
00789   }
00790 
00791 
00792 #if 1
00793 #define VMDGDSIOBENCHMARKONLY 1
00794   int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;
00795 
00796   wkf_tasktile_t tile;
00797   while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
00798     //
00799     // simple I/O + compute benchmarking...
00800     //
00801     int idx;
00802     for (idx=tile.start; idx<tile.end; idx++) {
00803       int myfileidx = (threadid / 4) % nfiles;
00804       int fileframeidx = idx % framesperfile;
00805 
00806       //
00807       // compute multi-frame or single-frame I/O offsets and sizes
00808       //
00809       long startoffset, foffset, readlen;
00810       read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms, 
00811                                      fileframeidx, 
00812                                      0, natoms, NULL,
00813                                      &startoffset, &foffset, &readlen);
00814       if (multiframeio) {
00815         // multi-frame reads use the same starting offset, but the
00816         // read length is computed from the first and last frames
00817         // in the group.
00818         long multistartoffset, multifoffset, multireadlen;
00819         read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms, 
00820                                        fileframeidx+multiframeio-1, 
00821                                        0, natoms, NULL,
00822                                        &multistartoffset, &multifoffset, 
00823                                        &multireadlen);
00824 
00825         multireadlen = (multifoffset + multireadlen) - foffset;
00826 
00827         //printf("** readlen: %ld  multireadlen: %ld\n", readlen, multireadlen);
00828         readlen = multireadlen;
00829         idx+=multiframeio-1; // add in the required increment...
00830       }
00831 
00832       //
00833       // perform the required I/O via GDS or by host kernel I/O
00834       //
00835       long ret=0;
00836       if (usecufile) {
00837         ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
00838       } else if (hostfds) {
00839         foffset=0;
00840         ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
00841         if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
00842         ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
00843         if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
00844         cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
00845         CUERR
00846       } else {
00847         printf("Inconsistent cufile/hostfds state, aborting!\n");
00848         return NULL;
00849       }
00850 
00851       // handle errors if they have occured
00852       if (ret < 0) {
00853         printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
00854         return NULL; // XXX error handling needs to be done here
00855       }
00856 
00857 #if 0
00858       float hostbuf[64];
00859       cudaMemcpy(hostbuf, devptr, sizeof(hostbuf), cudaMemcpyDeviceToHost);
00860       int l;
00861       for (l=0; l<10; l++) {
00862         printf("%.1f ", hostbuf[l]); 
00863       }
00864       printf("\n");
00865 #endif
00866 
00867       float rmsd=0;
00868 #if 0
00869       cudaDeviceSynchronize();
00870       CUERR // check for errors
00871 
00872       dim3 Bsz = dim3(256, 1, 1);
00873       dim3 Gsz = dim3((natoms + Bsz.x - 1) / Bsz.x, 1, 1);
00874       if (sel->selected == natoms) {
00875         vmd_float3_aos_to_soa<<<Gsz, Bsz, 0, qcpstream>>>(natoms, (float3 *) devptr, crdx1, crdy1, crdz1);
00876       } else {
00877         vmd_float3_aos_to_soa_selected<<<Gsz, Bsz, 0, qcpstream>>>(natoms, gpuidxlist, (float3 *) devptr, crdx1, crdy1, crdz1);
00878       }
00879 
00880       vmd_qcp_innerprod_soa_devicewide<<<QCPGsz, QCPBsz, 0, qcpstream>>>(pr, crdx1, crdy1, crdz1, crdx1, crdy1, crdz1, natoms, NULL);
00881 
00882       double pr_host[10];
00883       cudaMemcpy(pr_host, pr, 10 * sizeof(double), cudaMemcpyDeviceToHost);
00884 #if 0
00885       printf("Thr[%2d] pr_host %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f %.1f\n", 
00886              threadid, pr_host[0], pr_host[1], pr_host[2],
00887              pr_host[3], pr_host[4], pr_host[5],
00888              pr_host[6], pr_host[7], pr_host[8], pr_host[9]);
00889 #endif
00890 
00891       // calculate the RMSD & rotational matrix
00892       double E0 = pr_host[9];
00893       FastCalcRMSDAndRotation(NULL, pr_host, &rmsd, E0, sel->selected, -1);
00894       //rmsdmat[j*framecount + i] = rmsd;
00895 #endif
00896 
00897       if (verbose) {
00898         printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d RMSD: %4.1f cuFile len: %ld off: %ld\n", 
00899                threadid, myfileidx, tile.start, tile.end, idx, rmsd, 
00900                readlen, foffset); 
00901       }  
00902     }
00903   }
00904 
00905 #else
00906 
00907   // 
00908   // Real QCP loops
00909   // 
00910   int xfileidx = -1;
00911   int yfileidx = -1;
00912   wkf_tasktile_t tile;
00913   long lastx = -1, lasty = -1;
00914   while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
00915     int idx;
00916     for (idx=tile.start; idx<tile.end; idx++) {
00917 
00918       //
00919       // Perform I/O to get coordinates into the GPU
00920       //
00921 
00922       // compute i,j from idx...
00923       // only compute off-diagonal elements, so we use (framecount-1)
00924       long x=-1, y=-1, ret=0;
00925       if (idx2sub_tril(framecount-1, idx, &x, &y)) {
00926         printf("qcpthread[%d]: work idx %d out of triangle!\n", threadid, idx);
00927         break;
00928       }
00929 
00930       long startoffset, foffset, readlen;
00931 //printf("Thr[%2d] pair(%3ld x %3ld) cuFileRead(): offset %ld  readlen: %ld\n", 
00932 //       threadid, x, y, foffset, readlen);
00933 
00934       // if the "row" frame has changed, read in the new one
00935       if (lasty != y) {
00936         // XXX Hard-coded DGX-2 specific topology-file locality scheme.
00937         //     Topology determination should be entirely runtime-determined
00938         yfileidx = (threadid / 4) % nfiles;
00939         long fileframeidx = y % framesperfile;
00940         read_js_timestep_index_offsets(parms->jshandles[yfileidx], 
00941                                        natoms, fileframeidx, 
00942                                        0, natoms, NULL,
00943                                        &startoffset, &foffset, &readlen);
00944 printf("Thr[%2d] pair(%3ld x %3ld) cuFileRead(Y): offset %ld  readlen: %ld\n", 
00945        threadid, x, y, foffset, readlen);
00946         lasty = y;
00947       }
00948 
00949       // if no errs and the "column" frame has changed, read in the new one
00950       if (ret >= 0 && lastx != x) {
00951         // XXX Hard-coded DGX-2 specific topology-file locality scheme.
00952         //     Topology determination should be entirely runtime-determined
00953         xfileidx = (threadid / 4) % nfiles;
00954         long fileframeidx = x % framesperfile;
00955         read_js_timestep_index_offsets(parms->jshandles[xfileidx], 
00956                                        natoms, fileframeidx, 
00957                                        0, natoms, NULL,
00958                                        &startoffset, &foffset, &readlen);
00959 printf("Thr[%2d] pair(%3ld x %3ld) cuFileRead(X): offset %ld  readlen: %ld\n", 
00960        threadid, x, y, foffset, readlen);
00961         ret = cuFileRead(parms->cfh[xfileidx], (char *) devptr+blockpadsz, readlen, foffset);
00962           lastx = x;
00963       }
00964 
00965       // handle errors if they have occured
00966       if (ret < 0) {
00967         const char *descp = "unknown error code";
00968 #if 0
00969         // XXX this requires linkage with libcuda.so, which is
00970         //     against convention, so we avoid it for now
00971         if (cuGetErrorName(ret, &descp) != CUDA_SUCCESS)
00972           descp = "unknown cuda error";
00973 #endif
00974 
00975         printf("Thr[%2d] Error: cuFileRead(): %ld, '%s'\n", threadid, ret, descp);
00976         return NULL; // XXX error handling needs to be done here
00977       }
00978 
00979       //
00980       // Produce contiguous coordinates based on active atom selection
00981       //
00982 
00983       // 
00984       // Run QCP calculation(s)
00985       // 
00986       cudaDeviceSynchronize();
00987       CUERR // check for errors
00988 
00989       dim3 Bsz = dim3(256, 1, 1);
00990       dim3 Gsz = dim3((natoms + Bsz.x - 1) / Bsz.x, 1, 1);
00991       if (sel->selected == natoms) {
00992         vmd_float3_aos_to_soa<<<Gsz, Bsz, 0, qcpstream>>>(natoms, (float3 *) devptr, crdx1, crdy1, crdz1);
00993       } else {
00994         vmd_float3_aos_to_soa_selected<<<Gsz, Bsz, 0, qcpstream>>>(natoms, gpuidxlist, (float3 *) devptr, crdx1, crdy1, crdz1);
00995       }
00996 
00997       vmd_qcp_innerprod_soa_devicewide<<<QCPGsz, QCPBsz, 0, qcpstream>>>(pr, crdx1, crdy1, crdz1, crdx1, crdy1, crdz1, natoms, NULL);
00998 
00999 #if 0
01000       double pr_host[10];
01001       cudaMemcpy(pr_host, pr, 10 * sizeof(double), cudaMemcpyDeviceToHost);
01002 
01003       // calculate the RMSD & rotational matrix
01004       float rmsd=0;
01005       double E0 = pr_host[9];
01006       FastCalcRMSDAndRotation(NULL, pr_host, &rmsd, E0, sel->selected, -1);
01007       //rmsdmat[j*framecount + i] = rmsd;
01008 #endif
01009     }
01010   }
01011 #endif
01012 
01013   cudaFree(gpuidxlist);
01014   cudaFree(pr);
01015   cudaFree(crdx1);
01016   cudaFree(crdy1);
01017   cudaFree(crdz1);
01018   cudaFree(crdx2);
01019   cudaFree(crdy2);
01020   cudaFree(crdz2);
01021 
01022 #if defined(VMDUSECUDAGDS)
01023   if (usecufile) {
01024     cuFileBufDeregister(devptr);
01025   }
01026 #endif
01027 
01028   if (hostfds != NULL) {
01029     int i;
01030     for (i=0; i<nfiles; i++) {
01031       fio_fclose(hostfds[i]);
01032     }
01033     free(hostfds);
01034   }
01035 
01036   if (hostptr != NULL) {
01037     if (pinhostiobuffer) {
01038       cudaFreeHost(hostptr);
01039     } else {
01040       free(hostptr_unaligned);
01041     }
01042   }
01043 
01044   return NULL;
01045 }
01046 
01047 #endif
01048 
01049 
01050 int qcp_soa_gpu_ooc(wkf_threadpool_t *devpool, // VMD GPU worker thread pool
01051                     int nfiles, const char **trjfileset, 
01052                     const AtomSel *sel,
01053                     int first, int last, int step, float *rmsdmat) {
01054   printf("qcp_soa_gpu_ooc()\n");
01055   wkf_threadpool_t *bigpool = NULL;
01056 
01057 #if defined(VMDUSECUDAGDS)
01058   int devcount;
01059   cudaError_t crc = cudaGetDeviceCount(&devcount);
01060   printf("  GPU device count: %d\n", devcount);
01061   if (devcount==0)
01062     printf("  No GPU devices, continuing with host only...\n");
01063 
01064   CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
01065   CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t)); 
01066   memset(&cfh[0], 0, sizeof(cfh));
01067   memset(&cfhdesc[0], 0, sizeof(cfhdesc));
01068 
01069   int natomschk = 0;
01070   jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
01071   fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));
01072 
01073   int i;
01074   for (i=0; i<nfiles; i++) {
01075     const char *filename = trjfileset[i];
01076     printf("File[%d] GDS setup, opening '%s'\n", i, filename);
01077     jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
01078     if (!jshandles[i]) {
01079       printf("File[%d] open_js_read failed for file %s\n", i, filename);
01080       return -1; // deal with error handling later
01081     }
01082 
01083 #if vmdplugin_ABIVERSION > 17
01084     long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
01085     int filepgalignsz = 1;
01086     read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
01087     if (filepgalignsz != blocksz) {
01088       printf("File[%d] Plugin-returned page alignment size mismatch!\n", i);
01089     } else {
01090       printf("File[%d] Page alignment size: %d\n", i, filepgalignsz);
01091     }
01092 #endif
01093 
01094     read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0, 
01095                                    &directio_fds[i], NULL, NULL, NULL);
01096 
01097     cfhdesc[i].handle.fd = directio_fds[i]; // typedef of Unix FD
01098     cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
01099     CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);
01100 
01101     if (cferr.err != CU_FILE_SUCCESS) {
01102       printf("File[%d] cuFileImportExternalFile on fd %d failed!\n", 
01103              i, cfhdesc[i].handle.fd);
01104       return -1; // XXX error handling needs to be done here
01105     }
01106   }
01107 
01108 
01109   //
01110   // copy in per-thread parameters
01111   //
01112   gpuqcprmsdmatoocthreadparms parms;
01113   memset(&parms, 0, sizeof(parms));
01114   parms.devcount = devcount;
01115   parms.nfiles = nfiles;
01116   parms.trjfileset = trjfileset;
01117   parms.jshandles = jshandles;
01118   parms.cfh = cfh;
01119   parms.natoms = sel->num_atoms;
01120   parms.sel = sel;
01121   parms.rmsdmat = rmsdmat;
01122   parms.first = first;
01123   parms.last = last;
01124   parms.step = step;
01125 
01126   int framecount = nfiles * (last / step);
01127 
01128   // create timers
01129   wkf_timerhandle timer;
01130   timer=wkf_timer_create();
01131 
01132   // spawn child threads to do the work
01133   wkf_tasktile_t tile;
01134   tile.start=0;
01135 #if defined(VMDGDSIOBENCHMARKONLY)
01136   tile.end=framecount - 1; // first row only
01137 #elif 1
01138   tile.end=framecount*16 - 1; // first 16 rows only
01139 #else
01140   tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
01141 #endif
01142 
01143 printf("** qcp_soa_gpu_ooc(): tile start: %d  end: %d\n", tile.start, tile.end);
01144  
01145   int gdsthreadspergpu = 1;
01146   if (getenv("VMDGDSTHREADSPERGPU") != NULL) 
01147     gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));
01148 
01149 printf("** gdsthreadspergpu: %d\n", gdsthreadspergpu);
01150 
01151   if (gdsthreadspergpu > 1) {
01152     // XXX extra-large GPU device thread pool
01153     int workercount = devcount * gdsthreadspergpu;
01154 
01155     int *devlist = new int[workercount];
01156     int k;
01157     for (k=0; k<workercount; k++) {
01158       devlist[k] = k / gdsthreadspergpu; // XXX ignores VMD CUDA device masks
01159     }
01160 
01161     msgInfo << "Creating Multi-worker (" 
01162             << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
01163     bigpool=wkf_threadpool_create(workercount, devlist);
01164     delete [] devlist;
01165 
01166     // associate each worker thread with a specific GPU
01167     if (getenv("VMDCUDAVERBOSE") != NULL)
01168       wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
01169     else
01170       wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);
01171 
01172     // clear all available device memory on each of the GPUs
01173     wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);
01174 
01175     // XXX override which GPU device pool we're going to use
01176     devpool = bigpool;
01177   }
01178 
01179   // XXX affinitize GPU worker threads for best perf 
01180   wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);
01181 
01182   wkf_threadpool_sched_dynamic(devpool, &tile);
01183   wkf_timer_start(timer);
01184   wkf_threadpool_launch(devpool, measure_rmsdmat_qcp_ooc_thread, &parms, 1);
01185   wkf_timer_stop(timer);
01186 
01187   double runtime = wkf_timer_time(timer); 
01188   double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);
01189 
01190   printf("QCP natoms: %d, fsz: %ld, tsz: %ld\n", 
01191          sel->num_atoms, sel->num_atoms * 12L, 
01192          sel->num_atoms * 12L * (tile.end+1)); 
01193 
01194   int pinhostiobuffer = 1;
01195   if (getenv("VMDGDSHOSTNOPIN"))
01196     pinhostiobuffer=0; 
01197 
01198   int hostusedirectio = 1;
01199   if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
01200     hostusedirectio = 0;
01201 
01202   int usecufile=1;
01203   if (getenv("VMDGDSUSEHOST"))
01204     usecufile=0;
01205 
01206   if (usecufile) {
01207     printf("OOC I/O via GDS + cuFile\n");
01208   } else {
01209     printf("OOC I/O via host, %s APIs, %s memory buffers\n",
01210            (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
01211            (pinhostiobuffer) ? "pinned" : "unpinned");
01212   } 
01213 
01214   int multiframeio = 0;
01215   if (getenv("VMDGDSMULTIFRAME"))
01216     multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
01217   if (multiframeio > VMDGDSMAXFRAMEBUF)
01218     multiframeio = VMDGDSMAXFRAMEBUF;
01219   if (multiframeio) {
01220     printf("QCP GDS multi-frame read opt: %d frames per call, %ld bytes\n",
01221            multiframeio, 
01222            multiframeio * sel->num_atoms * 12L);
01223   }
01224 
01225   printf("QCP runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);
01226          
01227   for (i=0; i<nfiles; i++) { 
01228 #if defined(VMDUSECUDAGDS)
01229     cuFileHandleDeregister(cfh[i]);
01230 #endif
01231     close_js_read(jshandles[i]);
01232   }
01233 #endif
01234 
01235 #if defined(VMDUSECUDAGDS)
01236   if (cfh != NULL)
01237     free(cfh);
01238 
01239   if (cfhdesc != NULL)
01240      free(cfhdesc);
01241 
01242   if (jshandles != NULL)
01243     free(jshandles);
01244 
01245   if (directio_fds != NULL)
01246     free(directio_fds);
01247 #endif
01248 
01249   // if we created an extra large-thread-count-per-GPU thread pool, we 
01250   // need to destroy it here...
01251   if (bigpool != NULL) 
01252     wkf_threadpool_destroy(bigpool);
01253 
01254   return 0;
01255 }