
CUDAFastPBC.cu

/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAFastPBC.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.8 $       $Date: 2021/12/21 05:36:37 $
 *
 ***************************************************************************/
#include <stdio.h>

// Uses thrust for vector ops, various scan() reductions, etc.
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/device_vector.h>

#include "FastPBC.h"
00031 
//Compute the reciprocal of each box dimension on the device.
__global__ void inverseboxsize (float *boxsize, float* invboxsize) {
        int tid = threadIdx.x;
        if (tid < 3) {
                invboxsize[tid] = 1.0f / boxsize[tid];
        }
}
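// Usage note: this kernel is launched as inverseboxsize<<<1,4>>> throughout
// this file; the tid < 3 guard idles the extra thread, so one tiny block
// inverts all three box lengths on the device without a host round trip
// between frames.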

/*
// This is an inefficient kernel. Much slower than the one that replaced
// it below. (~100 us)
__global__ void repositionfragments(int fnum, float *pos, int *compoundmap,
                                    int *indexlist, float *boxsize,
                                    float *invboxsize) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int i, j, k;
        for (int l = tid; l < fnum; l+=blockDim.x * gridDim.x) {
                float ccenter[3];
                int lowbound = compoundmap[l];
                int highbound = compoundmap[l+1];
                i = indexlist[lowbound];
                //Use the first element within the compound as the center.
                for (j=0; j < 3; j++) {
                        ccenter[j] = pos[i*3+j];
                }
                //Move the compound, wrapping it to be within half a box dimension from the center.
                for (k = lowbound; k < highbound; k++ ) {
                        i = indexlist[k];
                        for (j=0; j < 3; j++) {
                                pos[i*3+j] = pos[i*3+j] - (rintf((pos[i*3+j] - ccenter[j]) * invboxsize[j]) * boxsize[j]);
                        }
                }
        }
}
*/

// Super-efficient kernel. ~8 us execution time
__global__ void repositionfragments(float *pos, int sellen, int *atomtofragmap,
                                    int *compoundmap, int *indexlist,
                                    float *boxsize, float *invboxsize) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < 3*sellen) {
                int idx = tid / 3;
                int dim = tid % 3;
                //The first atom of this atom's fragment serves as the reference point.
                int cidx = indexlist[compoundmap[atomtofragmap[idx]]];
                //printf("tid: %d, idx %d, dim %d, cidx %d\n", tid, idx, dim, cidx);
                float center = pos[3 * cidx + dim];
                pos[3*indexlist[idx]+dim] = pos[3*indexlist[idx]+dim]
                        - (rintf((pos[3*indexlist[idx]+dim] - center) * invboxsize[dim]) * boxsize[dim]);
        }
}
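// A worked example of the rintf() minimum-image shift used above, with
// illustrative numbers only (not from any particular system): for a box
// length of 10 A, an atom at x = 12.2, and a fragment reference atom at
// x = 3.0,
//   12.2 - rintf((12.2 - 3.0) * 0.1) * 10 = 12.2 - rintf(0.92) * 10 = 2.2
// which places the atom within half a box length of the reference, so the
// fragment stays contiguous across the periodic boundary.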


__global__ void wrapcompound(float *pos, int sellen, float *center,
                             int *atomtofragmap, int *indexlist,
                             float *boxsize, float *invboxsize,
                             float *fragcenters) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < 3*sellen) {
                int idx = tid / 3;
                int dim = tid % 3;
                int frag = atomtofragmap[idx];
                int aidx = indexlist[idx];
                pos[3*aidx+dim] = pos[3*aidx+dim]
                        - (rintf((fragcenters[3*frag+dim] - center[dim]) * invboxsize[dim]) * boxsize[dim]);
        }
}
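// Because the shift above is computed from the fragment's center rather than
// from the atom's own position, every atom in a fragment receives the same
// whole-box translation, so wrapping never splits a fragment across the
// periodic boundary the way the purely atomic wrap below can.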


__global__ void wrapatomic(float *pos, int sellen, float *center,
                           int *indexlist, float *boxsize, float *invboxsize) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < 3*sellen) {
                int idx = tid / 3;
                int dim = tid % 3;
                int aidx = indexlist[idx];
                pos[3*aidx+dim] = pos[3*aidx+dim]
                        - (rintf((pos[3*aidx+dim] - center[dim]) * invboxsize[dim]) * boxsize[dim]);
        }
}


__global__ void unwrapatomic(float *pos, float *prev, float *prevw, int sellen,
                             int *indexlist,
                             float *boxsize, float *invboxsize, float *oldboxsize, float *oldinvboxsize) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < 3*sellen) {
                int idx = tid / 3;
                int dim = tid % 3;
                int aidx = indexlist[idx];
                float tmp = pos[3*aidx+dim]; //Holds the wrapped position
                float disp = pos[3*aidx+dim] - prevw[3*aidx+dim]; //Displacement since the last wrapped frame
                pos[3*aidx+dim] = prev[3*aidx+dim] + disp - (rintf((disp) * invboxsize[dim]) * boxsize[dim])
                                - (rintf((prevw[3*aidx+dim]-prev[3*aidx+dim])*oldinvboxsize[dim]) * (boxsize[dim]-oldboxsize[dim]));
                prevw[3*aidx+dim] = tmp; //Save the wrapped position as prevw for the next frame.
        }
}
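// Derivation sketch of the update above: let u = prev[] (last frame's
// unwrapped position), w = prevw[] (last frame's wrapped position),
// p = pos[] (this frame's wrapped position), L the current box length, and
// L_old the previous one. The new unwrapped position is
//   u' = u + (p - w) - rintf((p - w) / L) * L - rintf((w - u) / L_old) * (L - L_old)
// The first rintf() term removes any whole-box jump in the frame-to-frame
// displacement; the second rescales the image offset accumulated between the
// wrapped and unwrapped histories when the box size changes between frames,
// as it does in constant-pressure trajectories.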


__global__ void fragmentperatom(int fnum, int *compoundmap,
                                int *atomtofragmap) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < fnum && tid != 0) {
                atomtofragmap[compoundmap[tid]] = 1;
        }
}
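// Illustrative example with hypothetical values: for 3 fragments spanning 7
// selected atoms with compoundmap = {0, 2, 5, 7}, this kernel marks the
// first atom of each fragment except fragment 0:
//   atomtofragmap = {0, 0, 1, 0, 0, 1, 0}
// and the thrust::inclusive_scan issued by the callers then turns those
// marks into a per-atom fragment index:
//   atomtofragmap = {0, 0, 1, 1, 1, 2, 2}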


// XXX this likely duplicates stuff from prototypes in CUDAMeasure
__global__ void measurecenter(float *pos, float *center, int len,
                              float *weights, int *weightidx, float *wscale) {
        __shared__ float reduce[96]; //96 is not an arbitrary number: it is divisible
        //by 3, which lets us use aligned memory accesses.
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        float mcenter = 0;
        //Since blockDim.x (96) is a multiple of 3, tid % 3 == threadIdx.x % 3;
        //computing dim outside the guard keeps it defined for the threads that
        //perform the final atomicAdd even when tid >= 3*len.
        int dim = threadIdx.x % 3;
        if (tid < 3*len) {
                int idx = tid / 3;
                mcenter = pos[3 * weightidx[idx] + dim] * weights[idx] * (*wscale);
        }
        reduce[threadIdx.x] = mcenter;
        __syncthreads();
        if (threadIdx.x < 48) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 48];
        }
        __syncthreads();
        if (threadIdx.x < 24) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 24];
        }
        __syncthreads();
        if (threadIdx.x < 12) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 12];
        }
        __syncthreads();
        if (threadIdx.x < 3) {
                mcenter = reduce[threadIdx.x] + reduce[threadIdx.x + 3] + reduce[threadIdx.x + 6] + reduce[threadIdx.x + 9];
                atomicAdd(&center[dim], mcenter);
        }
}
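// Layout of the reduction above: each 96-thread block covers 32 consecutive
// atoms as interleaved x,y,z contributions. Because 96, 48, 24, and 12 are
// all multiples of 3, every halving step adds same-dimension entries only
// (reduce[i] and reduce[i+48] share i % 3), so x, y, and z never mix. The
// final step folds the 12 survivors into 3 partial sums and accumulates them
// into the global center with one atomicAdd per dimension per block.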


// Differs from measurecenter only in how the weights are indexed.
// Here the expectation is that the full mass array has been passed,
// so we need to pick out only specific elements of the weight array.
__global__ void measurecenter_fullmass(float *pos, float *center, int len,
                                       float *weights, int *weightidx,
                                       float *wscale) {
        __shared__ float reduce[96]; //96 is not an arbitrary number: it is divisible
        //by 3, which lets us use aligned memory accesses.
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        float mcenter = 0;
        //As in measurecenter, dim is computed outside the guard so it is
        //defined for the threads that perform the final atomicAdd.
        int dim = threadIdx.x % 3;
        if (tid < 3*len) {
                int idx = tid / 3;
                int widx = weightidx[idx];
                mcenter = pos[3 * widx + dim] * weights[widx] * (*wscale);
        }
        reduce[threadIdx.x] = mcenter;
        __syncthreads();
        if (threadIdx.x < 48) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 48];
        }
        __syncthreads();
        if (threadIdx.x < 24) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 24];
        }
        __syncthreads();
        if (threadIdx.x < 12) {
                reduce[threadIdx.x] += reduce[threadIdx.x + 12];
        }
        __syncthreads();
        if (threadIdx.x < 3) {
                mcenter = reduce[threadIdx.x] + reduce[threadIdx.x + 3] + reduce[threadIdx.x + 6] + reduce[threadIdx.x + 9];
                atomicAdd(&center[dim], mcenter);
        }
}


// Harrumph. This kernel is inefficient: less inefficient than the prettier
// way of doing it, but it needs only one kernel launch, whereas the
// alternative required as many launches as there were fragments.
__global__ void computefragcenters(float *pos, float *centers, int fnum,
                                   float *weights, float *wscale,
                                   int *compoundmap, int *indexlist) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int i, j, k, f;
        float ccenter = 0;
        if (tid < 3*fnum) {
                f = tid / 3;
                j = tid % 3;
                int lowbound = compoundmap[f];
                int highbound = compoundmap[f+1];
                //Find the center of the compound.
                for (k = lowbound; k < highbound; k++ ) {
                        i = indexlist[k];
                        ccenter += pos[i*3+j] * weights[i] * wscale[f];
                }
                centers[3*f+j] = ccenter;
        }
}
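// The loop above computes a plain mass-weighted mean per fragment: with
// wscale[f] = 1 / (total fragment mass) as produced by fragwscale() below,
// each center component is sum_k( pos_k * mass_k ) / mass_total.
// Illustrative numbers (hypothetical): two atoms at x = 1.0 and x = 3.0 with
// masses 12 and 1 give (1.0*12 + 3.0*1) / 13 = 15/13 = ~1.15, the fragment's
// center of mass along x.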


__global__ void fragwscale(float *fragscales, float *massarr, int fragnum,
                           int *compoundmap, int *indexlist) {
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        int i, k, f;
        float fragmass = 0;
        if (tid < fragnum) {
                f = tid;
                int lowbound = compoundmap[f];
                int highbound = compoundmap[f+1];
                //Weigh the fragment.
                for (k = lowbound; k < highbound; k++ ) {
                        i = indexlist[k];
                        fragmass += massarr[i];
                }
                fragscales[tid] = 1.0f / fragmass;
        }
}

void fpbc_exec_unwrap(Molecule* mol, int first, int last, int sellen, int* indexlist) {
        Timestep *ts;
        int f;
        const int threads = 128;
        float *pos;
        float *gpupos;
        float *gpuprevu;
        float *gpuprevw;
        float boxsize[3];
        float *gpuboxsize;
        float *gpuinvboxsize;
        float *gpuoldboxsize;
        float *gpuoldinvboxsize;
        int *gpuindexlist;
        int blocks = (3*sellen + threads - 1) / threads;
        cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
        cudaHostRegister(boxsize, sizeof(float) * 3, 0);

        cudaMalloc((void**) &gpupos, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuprevw, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuprevu, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuinvboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuoldboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuoldinvboxsize, sizeof(float) * 3);
        cudaMallocHost((void**) &pos, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
        cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
        f = first;
        ts = mol->get_frame(f);
        //Load in the first frame.
        cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
        cudaMemcpyAsync(gpuprevw, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
        cudaMemcpyAsync(gpuprevu, gpuprevw, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
        boxsize[0] = ts->a_length;
        boxsize[1] = ts->b_length;
        boxsize[2] = ts->c_length;
        cudaMemcpyAsync(gpuoldboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
        inverseboxsize<<<1,4>>>(gpuoldboxsize, gpuoldinvboxsize);
        //Unwrap the remaining frames.
        for (f=first+1; f<=last; f++) {
                ts = mol->get_frame(f);
                boxsize[0] = ts->a_length;
                boxsize[1] = ts->b_length;
                boxsize[2] = ts->c_length;
                //Block here so that the pinned buffer isn't overwritten before it is written out to VMD memory.
                cudaDeviceSynchronize();
                //Buffer should be clear. Copy the VMD timestep to our own buffer.
                cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
                //Copy pinned host memory to the GPU.
                cudaMemcpyAsync(gpupos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
                cudaMemcpyAsync(gpuboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
                //Do math here.
                inverseboxsize<<<1,4>>>(gpuboxsize, gpuinvboxsize);
                unwrapatomic<<<blocks,threads>>>(gpupos, gpuprevu, gpuprevw, sellen, gpuindexlist, gpuboxsize, gpuinvboxsize, gpuoldboxsize, gpuoldinvboxsize);
                //Copy out to the pinned buffer.
                cudaMemcpyAsync(pos, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost);
                cudaMemcpyAsync(gpuprevu, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
                cudaMemcpyAsync(gpuoldboxsize, gpuboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
                cudaMemcpyAsync(gpuoldinvboxsize, gpuinvboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
                //Copy back to VMD.
                cudaMemcpyAsync(ts->pos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
        }
        //Wait for outstanding memory transfers.
        cudaDeviceSynchronize();

        //Cleanup
        cudaFree(gpupos);
        cudaFree(gpuprevw);
        cudaFree(gpuprevu);
        cudaFree(gpuboxsize);
        cudaFree(gpuinvboxsize);
        cudaFree(gpuoldboxsize);
        cudaFree(gpuoldinvboxsize);
        cudaFreeHost(pos);

        cudaHostUnregister(indexlist);
        cudaHostUnregister(boxsize);
        cudaFree(gpuindexlist);
        cudaDeviceSynchronize();
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
                //Print the CUDA error message and fall back to the CPU implementation.
                printf("CUDA error: %s\n", cudaGetErrorString(error));
                printf("Reverting to CPU algorithm\n");
                fpbc_exec_unwrap_cpu(mol, first, last, sellen, indexlist);
        }
}
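// Note on the copy pattern above: VMD timestep memory is pageable, so each
// frame is first staged host-to-host into the cudaMallocHost() pinned buffer
// 'pos'; the host-to-device and device-to-host copies against pinned memory
// can then run asynchronously and overlap with kernel execution. The
// cudaDeviceSynchronize() at the top of each iteration keeps the single
// pinned buffer from being overwritten before its contents have been copied
// back into the VMD timestep; the streamed entry points below use per-buffer
// cudaStreamSynchronize() calls for the same purpose.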


void fpbc_exec_wrapcompound(Molecule* mol, int first, int last, int fnum, int *compoundmap, int sellen, int* indexlist, float* weights, AtomSel* csel, float* center, float* massarr) {
        //Declare variables.
        int f, i, j;
        Timestep *ts;
        const int nStreams = 4;
        const int threads = 128;
        float *pos[nStreams];
        float *gpupos[nStreams];
        float boxsize[3];
        float *gpuboxsize[nStreams];
        float *gpucenters[nStreams];
        float *gpuinvboxsize[nStreams];
        float *gpuweights;
        float *wscale;
        float *gpufragweight;
        int *gpuweightidx;
        int *gpuindexlist;
        int *gpuatomtofragmap;
        int *gpucompoundmap;
        float *gpufragcenters[nStreams];
        int blocks_frag = (fnum + threads - 1) / threads;
        cudaStream_t stream[nStreams];
        //Allocate memory for static things (weight sums, maps, etc.)
        cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
        cudaMalloc((void**) &wscale, sizeof(float));
        cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
        cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
        cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
        cudaMalloc((void**) &gpuweights, sizeof(float) * mol->nAtoms);
        cudaMalloc((void**) &gpufragweight, sizeof(float) * fnum);
        cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
        cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
        //Unlike in wrapatomic, pass the full mass array to the GPU, since odds are we'll need it.
        cudaMemcpy(gpuweights, massarr, sizeof(float) * mol->nAtoms, cudaMemcpyHostToDevice);
        cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
        cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
        //Make the atomtofragmap by setting elements to 1 and then doing a scan.
        fragmentperatom<<<blocks_frag, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap); //Set up the per-atom fragment map.
        thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));
        //Get the mass per fragment (for scaling/finding the center of mass of everything).
        fragwscale<<<blocks_frag, threads>>>(gpufragweight, gpuweights, fnum, gpucompoundmap, gpuindexlist);
        cudaHostRegister(boxsize, sizeof(float)*3, 0);
        if (csel != NULL) {
                cudaMalloc((void**) &gpuweightidx, sizeof(int) * csel->selected);
                int *weightidx = new int[csel->selected];
                j=0;
                for (i=csel->firstsel; i<=csel->lastsel; i++) {
                        if (csel->on[i]) {
                                weightidx[j++] = i;
                        }
                }
                cudaMemcpy(gpuweightidx, weightidx, sizeof(int) * csel->selected, cudaMemcpyHostToDevice);
                thrust::device_vector<int> ids (thrust::device_ptr<int>(gpuweightidx), thrust::device_ptr<int>(gpuweightidx+csel->selected));
                thrust::device_vector<float> mass (thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights+mol->nAtoms));
                //Initialize the reduction with 0.0f: an integer 0 would make thrust
                //accumulate in int and truncate the masses.
                float tmp = 1.0f / thrust::reduce(thrust::make_permutation_iterator(mass.begin(), ids.begin()),
                        thrust::make_permutation_iterator(mass.end(), ids.end()), 0.0f, thrust::plus<float>());
                cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);
                delete [] weightidx;
        }
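        // The reduce above gathers mass[ids[i]] through a permutation iterator,
        // so the full per-atom mass array stays on the GPU while only the csel
        // subset is summed; wscale then holds 1 / (total mass of the centering
        // selection). A minimal sketch of the same gather-reduce idiom, with
        // hypothetical values (not from any real system):
        //   float hmass[3] = {12.0f, 1.0f, 16.0f};
        //   int hids[2] = {0, 2};
        //   thrust::device_vector<float> m(hmass, hmass + 3);
        //   thrust::device_vector<int> id(hids, hids + 2);
        //   float s = thrust::reduce(
        //           thrust::make_permutation_iterator(m.begin(), id.begin()),
        //           thrust::make_permutation_iterator(m.begin(), id.end()),
        //           0.0f, thrust::plus<float>());   // s == 28.0f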
        //Allocate memory and create streams for the per-frame buffers.
        for (f = 0; f < nStreams; f++) {
                cudaStreamCreate(&stream[f]);
                cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
                cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
                cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
                cudaMalloc((void**) &gpucenters[f], sizeof(float) * 3);
                cudaMemcpyAsync(gpucenters[f], center, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f]);
                cudaMalloc((void**) &gpufragcenters[f], sizeof(float) * 3 * fnum);
                cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
        }
        cudaDeviceSynchronize();
        //Start looping over the frames.
        int blocks = (3*sellen + threads - 1) / threads;
        blocks_frag = (3*fnum + threads - 1) / threads;
        for (f=first; f<=last; f++) {
                ts = mol->get_frame(f);
                boxsize[0] = ts->a_length;
                boxsize[1] = ts->b_length;
                boxsize[2] = ts->c_length;
                //Block here so that a buffer still in flight isn't overwritten.
                cudaStreamSynchronize(stream[f%nStreams]);
                //Buffer should be clear. Copy the VMD timestep to our own buffer.
                cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
                cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
                cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);
                //Do math here.
                inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
                if (csel != NULL) {
                        cudaMemsetAsync(gpucenters[f%nStreams], 0, 3 * sizeof(float), stream[f%nStreams]);
                        //Measure the center of the selection if one is provided. Put it into the 3-vector gpucenters.
                        //To exploit some of the symmetry of the problem, pick a blocksize that is a multiple of 3, and preferably
                        //also a multiple of the warp size (96 is good!)
                        measurecenter_fullmass<<<(3*csel->selected + 95) / 96, 96, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], gpucenters[f%nStreams], csel->selected, gpuweights, gpuweightidx, wscale);
                }
                //Fragment centers need to be determined.
                //TODO: make this not suck. At the moment, I think this is the biggest bottleneck.
                computefragcenters<<<blocks_frag,threads,0,stream[f%nStreams]>>>(gpupos[f%nStreams], gpufragcenters[f%nStreams], fnum, gpuweights, gpufragweight, gpucompoundmap, gpuindexlist);
                //Wrap.
                wrapcompound<<<blocks, threads, 0, stream[f%nStreams]>>> (gpupos[f%nStreams], sellen, gpucenters[f%nStreams], gpuatomtofragmap, gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams], gpufragcenters[f%nStreams]);
                //Copy back.
                cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);
                //Copy back to VMD.
                cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
        }
        //Cleanup
        //Wait for outstanding memory transfers.
        cudaDeviceSynchronize();
        //Free memory.
        cudaHostUnregister(boxsize);
        cudaHostUnregister(compoundmap);
        cudaHostUnregister(indexlist);
        cudaFree(gpucompoundmap);
        cudaFree(gpuindexlist);
        cudaFree(gpuatomtofragmap);
        for (f = 0; f < nStreams; f++) {
                cudaStreamDestroy(stream[f]);
                cudaFree(gpupos[f]);
                cudaFree(gpuboxsize[f]);
                cudaFree(gpuinvboxsize[f]);
                //Also release the per-stream center buffers.
                cudaFree(gpucenters[f]);
                cudaFree(gpufragcenters[f]);
                cudaFreeHost(pos[f]);
        }

        cudaFree(wscale);
        cudaFree(gpufragweight);
        cudaFree(gpuweights);
        if (csel != NULL) {
                cudaFree(gpuweightidx);
        }
        cudaDeviceSynchronize();
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
                //Print the CUDA error message and fall back to the CPU implementation.
                printf("CUDA error: %s\n", cudaGetErrorString(error));
                printf("Reverting to CPU algorithm\n");
                fpbc_exec_wrapcompound_cpu(mol, first, last, fnum, compoundmap, sellen, indexlist, weights, csel, center, massarr);
        }
}
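// Note on the four-stream round-robin above: frame f is assigned to
// stream[f % nStreams], each stream owning its own pinned buffer and device
// scratch space, so the copies and kernels of up to nStreams frames can be
// in flight simultaneously. The cudaStreamSynchronize() before a slot is
// reused is what guarantees that frame f's buffers are idle before frame
// f + nStreams claims them.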


void fpbc_exec_wrapatomic(Molecule* mol, int first, int last, int sellen, int* indexlist,
        float* weights, AtomSel* csel, float* center) {
        int f, i, j;
        Timestep *ts;
        const int nStreams = 4;
        const int threads = 128;
        float *gpupos[nStreams];
        float boxsize[3];
        float *pos[nStreams];
        float *gpuboxsize[nStreams];
        float *gpucenters[nStreams];
        float *gpuinvboxsize[nStreams];
        float *gpuweights;
        float *wscale;
        cudaMalloc((void**) &wscale, sizeof(float));
        int *gpuweightidx;
        int *gpuindexlist;
        //Prepare GPU memory and streams.
        cudaStream_t stream[nStreams];
        cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
        cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
        cudaHostRegister(center, sizeof(float) * 3, 0);
        cudaHostRegister(boxsize, sizeof(float) * 3, 0);
        for (f = 0; f < nStreams; f++) {
                cudaStreamCreate(&stream[f]);
                cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
                cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
                cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
                cudaMalloc((void**) &gpucenters[f], sizeof(float) * 3);
                cudaMemcpyAsync(gpucenters[f], center, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f]);
                cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
        }
        if (csel != NULL) {
                cudaMalloc((void**) &gpuweights, sizeof(float) * csel->selected);
                cudaMalloc((void**) &gpuweightidx, sizeof(int) * csel->selected);
                cudaMemcpy(gpuweights, weights, sizeof(float) * csel->selected, cudaMemcpyHostToDevice);
                int *weightidx = new int[csel->selected];
                j=0;
                for (i=csel->firstsel; i<=csel->lastsel; i++) {
                        if (csel->on[i]) {
                                weightidx[j++] = i;
                        }
                }
                cudaMemcpy(gpuweightidx, weightidx, sizeof(int) * csel->selected, cudaMemcpyHostToDevice);
                //Initialize the reduction with 0.0f so thrust accumulates in float
                //rather than truncating the masses to int.
                float tmp = 1.0f / thrust::reduce(thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights + csel->selected), 0.0f, thrust::plus<float>());
                cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);
                delete [] weightidx;
        }
        int blocks = (3*sellen + threads - 1) / threads;
        for (f=first; f<=last; f++) {
                ts = mol->get_frame(f);
                boxsize[0] = ts->a_length;
                boxsize[1] = ts->b_length;
                boxsize[2] = ts->c_length;
                //Block here so that a buffer still in flight isn't overwritten.
                cudaStreamSynchronize(stream[f%nStreams]);
                cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
                cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
                cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);
                //Do math here.
                inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
                if (csel != NULL) {
                        cudaMemsetAsync(gpucenters[f%nStreams], 0, 3 * sizeof(float), stream[f%nStreams]);
                        //Measure the center of the selection if one is provided. Put it into the 3-vector gpucenters.
                        //To exploit some of the symmetry of the problem, pick a blocksize that is a multiple of 3, and preferably
                        //also a multiple of the warp size (96 is good!)
                        measurecenter<<<(3*csel->selected + 95) / 96, 96, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], gpucenters[f%nStreams], csel->selected, gpuweights, gpuweightidx, wscale);
                }
                //Wrap.
                wrapatomic<<<blocks, threads, 0, stream[f%nStreams]>>> (gpupos[f%nStreams], sellen, gpucenters[f%nStreams], gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
                //Copy back.
                cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);
                cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
        }
        //Wait for outstanding memory transfers.
        cudaDeviceSynchronize();

        cudaHostUnregister(boxsize);
        cudaHostUnregister(center);
        cudaFree(gpuindexlist);
        for (f = 0; f < nStreams; f++) {
                cudaStreamDestroy(stream[f]);
                cudaFree(gpupos[f]);
                cudaFree(gpuboxsize[f]);
                cudaFree(gpuinvboxsize[f]);
                cudaFree(gpucenters[f]);
                cudaFreeHost(pos[f]);
        }

        if (csel != NULL) {
                cudaFree(gpuweights);
                cudaFree(gpuweightidx);
        }
        cudaDeviceSynchronize();
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
                //Print the CUDA error message and fall back to the CPU implementation.
                printf("CUDA error: %s\n", cudaGetErrorString(error));
                printf("Reverting to CPU algorithm\n");
                fpbc_exec_wrapatomic_cpu(mol, first, last, sellen, indexlist, weights, csel, center);
        }
}


void fpbc_exec_join(Molecule* mol, int first, int last, int fnum, int *compoundmap, int sellen, int* indexlist) {
        int f;

        Timestep *ts;
        const int nStreams = 4;
        const int threads = 128;
        float *gpupos[nStreams];
        float *pos[nStreams];
        float boxsize[3];
        float *gpuboxsize[nStreams];
        float *gpuinvboxsize[nStreams];
        int *gpucompoundmap;
        int *gpuatomtofragmap;
        int *gpuindexlist;
        int blocks = (fnum + threads - 1) / threads;
        cudaStream_t stream[nStreams];
        cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
        cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
        cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
        cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
        cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
        cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
        cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
        cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
        fragmentperatom<<<blocks, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap); //Set up the per-atom fragment map.
        thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));
        for (f = 0; f < nStreams; f++) {
                cudaStreamCreate(&stream[f]);
                cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
                cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
                cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
                cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
        }
        cudaHostRegister(boxsize, sizeof(float)*3, 0);

        //Make sure the gpuatomtofragmap is set before proceeding.
        cudaDeviceSynchronize();
        blocks = (3*sellen + threads - 1) / threads;
        for (f = first; f <= last; f++) {
                ts = mol->get_frame(f);
                boxsize[0] = ts->a_length;
                boxsize[1] = ts->b_length;
                boxsize[2] = ts->c_length;
                //Block here so that a buffer still in flight isn't overwritten.
                cudaStreamSynchronize(stream[f%nStreams]);
                cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
                //Copy pinned host memory to the GPU.
                cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
                cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);
                //Do math here.
                inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
                repositionfragments<<<blocks,threads, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], sellen, gpuatomtofragmap,
                        gpucompoundmap, gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
                //Copy back.
                cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);
                //Copy back to VMD.
                cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
        }
        //Wait for outstanding memory transfers.
        cudaDeviceSynchronize();

        cudaHostUnregister(boxsize);
        cudaHostUnregister(indexlist);
        cudaHostUnregister(compoundmap);
        cudaFree(gpucompoundmap);
        cudaFree(gpuindexlist);
        cudaFree(gpuatomtofragmap);
        for (f = 0; f < nStreams; f++) {
                cudaStreamDestroy(stream[f]);
                cudaFree(gpupos[f]);
                cudaFree(gpuboxsize[f]);
                cudaFree(gpuinvboxsize[f]);
                cudaFreeHost(pos[f]);
        }

        cudaDeviceSynchronize();
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
                //Print the CUDA error message and fall back to the CPU implementation.
                printf("CUDA error: %s\n", cudaGetErrorString(error));
                printf("Reverting to CPU algorithm\n");
                fpbc_exec_join_cpu(mol, first, last, fnum, compoundmap, sellen, indexlist);
        }
}


void fpbc_exec_recenter(Molecule* mol, int first, int last, int csellen, int* cindexlist, int fnum, int *compoundmap, int sellen, int* indexlist, float* weights, AtomSel* csel, float* massarr) {
        //The basic idea here is to pass the data back and forth only once while both unwrapping and rewrapping the trajectory.
        Timestep *ts;
        int f;
        const int threads = 128;
        float *pos;
        float *gpupos;
        float *gpuprevu;
        float *gpuprevw;
        float boxsize[3];
        float *gpuboxsize;
        float *gpucenters;
        float *gpuinvboxsize;
        float *gpuoldboxsize;
        float *gpuoldinvboxsize;
        float *gpufragcenters;
        float *wscale;
        cudaMalloc((void**) &wscale, sizeof(float));
        float *gpuweights;
        int *gpuweightidx;
        int *gpuindexlist;
        float *gpufragweight;
        int *gpuatomtofragmap;
        int *gpucompoundmap;
        cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
        cudaHostRegister(cindexlist, sizeof(int) * csellen, 0);
        cudaHostRegister(boxsize, sizeof(float) * 3, 0);
        int blocks = (3*sellen + threads - 1) / threads;
        //The unwrap operates on the centering selection, which may be larger
        //than the wrapped selection, so size its grid separately.
        int ublocks = (3*csellen + threads - 1) / threads;
        int blocks_frag = (fnum + threads - 1) / threads;
        cudaMalloc((void**) &gpupos, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuprevw, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuprevu, sizeof(float) * 3*mol->nAtoms);
        cudaMalloc((void**) &gpuboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuinvboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuoldboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpuoldinvboxsize, sizeof(float) * 3);
        cudaMalloc((void**) &gpucenters, sizeof(float) * 3);
        if (fnum) {
                cudaMalloc((void**) &gpufragcenters, sizeof(float) * 3 * fnum);
        }
        cudaMallocHost((void**) &pos, sizeof(float) * 3*mol->nAtoms);
        float tmp;
        cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);

        cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
        //Deal with computing the weighted center of mass.
        cudaMalloc((void**) &gpuweightidx, sizeof(int) * csellen);
        cudaMemcpy(gpuweightidx, cindexlist, sizeof(int) * csellen, cudaMemcpyHostToDevice);
        if (fnum) { //Compound runs only.
                cudaMalloc((void**) &gpuweights, sizeof(float) * mol->nAtoms);
                cudaMalloc((void**) &gpufragweight, sizeof(float) * fnum);
                cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
                cudaMemcpy(gpuweights, massarr, sizeof(float) * mol->nAtoms, cudaMemcpyHostToDevice);
                cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
                cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
                cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
                cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
                fragmentperatom<<<blocks_frag, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap); //Set up the per-atom fragment map.
                thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));
                //Get the mass per fragment (for scaling/finding the center of mass of everything).
                fragwscale<<<blocks_frag, threads>>>(gpufragweight, gpuweights, fnum, gpucompoundmap, gpuindexlist);
                thrust::device_vector<int> ids (thrust::device_ptr<int>(gpuweightidx), thrust::device_ptr<int>(gpuweightidx+csel->selected));
                thrust::device_vector<float> mass (thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights+mol->nAtoms));
                //Initialize the reduction with 0.0f so thrust accumulates in float.
                tmp = 1.0f / thrust::reduce(thrust::make_permutation_iterator(mass.begin(), ids.begin()),
                        thrust::make_permutation_iterator(mass.end(), ids.end()), 0.0f, thrust::plus<float>());
                cudaDeviceSynchronize();
        }
        else {
                cudaMalloc((void**) &gpuweights, sizeof(float) * csel->selected);
                cudaMemcpy(gpuweights, weights, sizeof(float) * csel->selected, cudaMemcpyHostToDevice);
                tmp = 1.0f / thrust::reduce(thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights + csel->selected), 0.0f, thrust::plus<float>());
        }
        cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);

        //Unwrap, recenter, and rewrap each frame in a single pass.
        blocks_frag = (3*fnum + threads - 1) / threads;
        for (f=first; f<=last; f++) {
                ts = mol->get_frame(f);
                boxsize[0] = ts->a_length;
                boxsize[1] = ts->b_length;
                boxsize[2] = ts->c_length;
                cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
                if ( f > first) {
                        cudaMemcpyAsync(gpuoldboxsize, gpuboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
                        cudaMemcpyAsync(gpuoldinvboxsize, gpuinvboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
                } else {
                        //Seed prevw/prevu with the first frame. This must be issued after
                        //the host-to-host staging copy above, so that pos already holds
                        //the frame's coordinates when this copy executes.
                        cudaMemcpyAsync(gpuprevw, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
                        cudaMemcpyAsync(gpuprevu, gpuprevw, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
                }

                cudaMemcpyAsync(gpupos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
                cudaMemcpyAsync(gpuboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
                //Do math here.
                inverseboxsize<<<1,4>>>(gpuboxsize, gpuinvboxsize);
                if (f > first) { //These are the ones that also need to be unwrapped.
                        //We must wait until the previous frame is done moving atoms around or loading. This part is inherently serial.
                        unwrapatomic<<<ublocks,threads>>>(gpupos, gpuprevu, gpuprevw, csellen, gpuweightidx, gpuboxsize, gpuinvboxsize, gpuoldboxsize, gpuoldinvboxsize);
                        cudaMemcpyAsync(gpuprevu, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
                }
                cudaMemsetAsync(gpucenters, 0, 3 * sizeof(float));
                //Compounding will have a non-zero fnum.
                if (fnum) {
                        measurecenter_fullmass<<<(3*csel->selected + 95) / 96, 96>>>(gpupos, gpucenters, csel->selected, gpuweights, gpuweightidx, wscale);
                        //Fragment centers must be recomputed for the current frame before
                        //wrapping, just as in fpbc_exec_wrapcompound.
                        computefragcenters<<<blocks_frag,threads>>>(gpupos, gpufragcenters, fnum, gpuweights, gpufragweight, gpucompoundmap, gpuindexlist);
                        //Wrap.
                        wrapcompound<<<blocks, threads>>> (gpupos, sellen, gpucenters, gpuatomtofragmap, gpuindexlist, gpuboxsize, gpuinvboxsize, gpufragcenters);
                }
                else {
                        measurecenter<<<(3*csel->selected + 95) / 96, 96>>>(gpupos, gpucenters, csel->selected, gpuweights, gpuweightidx, wscale);
                        //Wrap.
                        wrapatomic<<<blocks, threads>>> (gpupos, sellen, gpucenters, gpuindexlist, gpuboxsize, gpuinvboxsize);
                }
                //Copy out to the pinned buffer.
                cudaMemcpyAsync(pos, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost);
                //Copy back to VMD.
                cudaMemcpyAsync(ts->pos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
        }
        //Wait for outstanding memory transfers.
        cudaDeviceSynchronize();
        //Cleanup
        cudaFree(gpupos);
        cudaFree(gpuboxsize);
        cudaFree(gpuinvboxsize);
        cudaFree(gpucenters);
        if (fnum) {
                cudaFree(gpufragcenters);
        }
        cudaFreeHost(pos);
        cudaFree(gpuprevw);
        cudaFree(gpuprevu);
        cudaFree(gpuoldboxsize);
        cudaFree(gpuoldinvboxsize);

        cudaHostUnregister(indexlist);
        cudaHostUnregister(cindexlist);
        cudaHostUnregister(boxsize);
        cudaFree(gpuindexlist);
        cudaFree(gpuweightidx);
        cudaFree(gpuweights);
        cudaFree(wscale);
        if (fnum) {
                cudaFree(gpucompoundmap);
                cudaFree(gpuatomtofragmap);
                cudaFree(gpufragweight);
                cudaHostUnregister(compoundmap);
        }

        cudaDeviceSynchronize();
        cudaError_t error = cudaGetLastError();
        if (error != cudaSuccess) {
                //Print the CUDA error message and fall back to the CPU implementation.
                printf("CUDA error: %s\n", cudaGetErrorString(error));
                printf("Reverting to CPU algorithm\n");
                fpbc_exec_recenter_cpu(mol, first, last, csellen, cindexlist, fnum, compoundmap, sellen, indexlist, weights, csel, massarr);
        }
}
