#include <stdio.h>

#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/device_vector.h>

#include "FastPBC.h"
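
// Compute the reciprocal of each box dimension so the wrapping kernels can
// multiply instead of divide when applying minimum-image shifts.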
__global__ void inverseboxsize(float *boxsize, float *invboxsize) {
  int tid = threadIdx.x;
  if (tid < 3) {
    invboxsize[tid] = 1.0 / boxsize[tid];
  }
}
__global__ void repositionfragments(float *pos, int sellen, int *atomtofragmap,
                                    int *compoundmap, int *indexlist,
                                    float *boxsize, float *invboxsize) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < 3*sellen) {
    int idx = tid / 3;
    int dim = tid % 3;
    int cidx = indexlist[compoundmap[atomtofragmap[idx]]];

    float center = pos[3 * cidx + dim];
    pos[3*indexlist[idx]+dim] = pos[3*indexlist[idx]+dim]
      - (rintf((pos[3*indexlist[idx]+dim] - center) * invboxsize[dim]) * boxsize[dim]);
  }
}
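
// Wrap whole fragments: shift every selected atom by the multiple of the box
// length that moves its fragment's center of mass into the periodic image
// nearest the target center, so fragments are wrapped without being split.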
__global__ void wrapcompound(float *pos, int sellen, float *center,
                             int *atomtofragmap, int *indexlist,
                             float *boxsize, float *invboxsize,
                             float *fragcenters) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < 3*sellen) {
    int idx = tid / 3;
    int dim = tid % 3;
    int frag = atomtofragmap[idx];
    int aidx = indexlist[idx];
    pos[3*aidx+dim] = pos[3*aidx+dim]
      - (rintf((fragcenters[3*frag+dim] - center[dim]) * invboxsize[dim]) * boxsize[dim]);
  }
}
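
// Wrap individual atoms: shift each selected atom by whole box lengths so it
// ends up within half a box of the target center along every dimension.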
__global__ void wrapatomic(float *pos, int sellen, float *center,
                           int *indexlist, float *boxsize, float *invboxsize) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < 3*sellen) {
    int idx = tid / 3;
    int dim = tid % 3;
    int aidx = indexlist[idx];
    pos[3*aidx+dim] = pos[3*aidx+dim]
      - (rintf((pos[3*aidx+dim] - center[dim]) * invboxsize[dim]) * boxsize[dim]);
  }
}
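
// Unwrap trajectories: remove periodic jumps by comparing each atom's wrapped
// position with the previous frame's wrapped position (prevw) and adding the
// minimum-image displacement onto the previous unwrapped position (prev).
// The second rintf term compensates for box size changes between frames.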
__global__ void unwrapatomic(float *pos, float *prev, float *prevw, int sellen,
                             int *indexlist,
                             float *boxsize, float *invboxsize,
                             float *oldboxsize, float *oldinvboxsize) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < 3*sellen) {
    int idx = tid / 3;
    int dim = tid % 3;
    int aidx = indexlist[idx];
    float tmp = pos[3*aidx+dim];
    float disp = pos[3*aidx+dim] - prevw[3*aidx+dim];
    pos[3*aidx+dim] = prev[3*aidx+dim] + disp - (rintf((disp) * invboxsize[dim]) * boxsize[dim])
      - (rintf((prevw[3*aidx+dim]-prev[3*aidx+dim])*oldinvboxsize[dim]) * (boxsize[dim]-oldboxsize[dim]));
    prevw[3*aidx+dim] = tmp;
  }
}
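
// Mark fragment boundaries: write a 1 at the first atom of every fragment
// except the first; an inclusive scan over this array (done on the host side
// with thrust) then gives each selected atom the index of its fragment.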
__global__ void fragmentperatom(int fnum, int *compoundmap,
                                int *atomtofragmap) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < fnum && tid != 0) {
    atomtofragmap[compoundmap[tid]] = 1;
  }
}
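
// Weighted center of the centering selection, using a weight array indexed by
// position within the selection. Each block of 96 threads reduces its partial
// x/y/z sums in shared memory and atomically adds them into center[]; the
// launch must use a block size of exactly 96 (a multiple of 3) for the
// reduction pattern below to be valid.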
__global__ void measurecenter(float *pos, float *center, int len,
                              float *weights, int *weightidx, float *wscale) {
  __shared__ float reduce[96];

  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  float mcenter = 0;
  int dim;
  if (tid < 3*len) {
    int idx = tid / 3;
    dim = tid % 3;
    mcenter = pos[3 * weightidx[idx] + dim] * weights[idx] * (*wscale);
  }
  reduce[threadIdx.x] = mcenter;
  __syncthreads();
  if (threadIdx.x < 48) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 48];
  }
  __syncthreads();
  if (threadIdx.x < 24) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 24];
  }
  __syncthreads();
  if (threadIdx.x < 12) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 12];
  }
  __syncthreads();
  if (threadIdx.x < 3) {
    mcenter = reduce[threadIdx.x] + reduce[threadIdx.x + 3] + reduce[threadIdx.x + 6] + reduce[threadIdx.x + 9];
    atomicAdd(&center[dim], mcenter);
  }
}
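
// Same block-wise center-of-mass reduction as measurecenter, but the weight
// array is indexed by atom index (a full per-atom mass array) rather than by
// position within the selection.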
__global__ void measurecenter_fullmass(float *pos, float *center, int len,
                                       float *weights, int *weightidx,
                                       float *wscale) {
  __shared__ float reduce[96];

  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  float mcenter = 0;
  int dim;
  if (tid < 3*len) {
    int idx = tid / 3;
    dim = tid % 3;
    int widx = weightidx[idx];
    mcenter = pos[3 * widx + dim] * weights[widx] * (*wscale);
  }
  reduce[threadIdx.x] = mcenter;
  __syncthreads();
  if (threadIdx.x < 48) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 48];
  }
  __syncthreads();
  if (threadIdx.x < 24) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 24];
  }
  __syncthreads();
  if (threadIdx.x < 12) {
    reduce[threadIdx.x] += reduce[threadIdx.x + 12];
  }
  __syncthreads();
  if (threadIdx.x < 3) {
    mcenter = reduce[threadIdx.x] + reduce[threadIdx.x + 3] + reduce[threadIdx.x + 6] + reduce[threadIdx.x + 9];
    atomicAdd(&center[dim], mcenter);
  }
}
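
// Compute the weighted center of each fragment by looping over the atoms the
// compound map assigns to it; wscale[f] holds the reciprocal of the fragment's
// total mass, so the result is a true per-fragment center of mass.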
__global__ void computefragcenters(float *pos, float *centers, int fnum,
                                   float *weights, float *wscale,
                                   int *compoundmap, int *indexlist) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int i, j, k, f;
  float ccenter = 0;
  if (tid < 3*fnum) {
    f = tid / 3;
    j = tid % 3;
    int lowbound = compoundmap[f];
    int highbound = compoundmap[f+1];

    for (k = lowbound; k < highbound; k++) {
      i = indexlist[k];
      ccenter += pos[i*3+j] * weights[i] * wscale[f];
    }
    centers[3*f+j] = ccenter;
  }
}
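
// Precompute the reciprocal of each fragment's total mass, used as the
// per-fragment weight scale in computefragcenters.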
__global__ void fragwscale(float *fragscales, float *massarr, int fragnum,
                           int *compoundmap, int *indexlist) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  int i, k, f;
  float fragmass = 0;
  if (tid < fragnum) {
    f = tid;
    int lowbound = compoundmap[f];
    int highbound = compoundmap[f+1];

    for (k = lowbound; k < highbound; k++) {
      i = indexlist[k];
      fragmass += massarr[i];
    }
    fragscales[tid] = 1.0 / fragmass;
  }
}
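
// Unwrap a selection over frames [first, last]: frame "first" is the
// reference, and every later frame is corrected on the GPU by the
// minimum-image displacement from the previous frame. Positions are staged
// through a pinned host buffer; on any CUDA error the CPU fallback is used.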
void fpbc_exec_unwrap(Molecule* mol, int first, int last, int sellen, int* indexlist) {
  Timestep *ts;
  int f;
  const int threads = 128;
  float *pos;
  float *gpupos;
  float *gpuprevu;
  float *gpuprevw;
  float boxsize[3];
  float *gpuboxsize;
  float *gpuinvboxsize;
  float *gpuoldboxsize;
  float *gpuoldinvboxsize;
  int *gpuindexlist;
  int blocks = (3*sellen + threads - 1) / threads;
  cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
  cudaHostRegister(boxsize, sizeof(float) * 3, 0);

  cudaMalloc((void**) &gpupos, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuprevw, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuprevu, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuinvboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuoldboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuoldinvboxsize, sizeof(float) * 3);
  cudaMallocHost((void**) &pos, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
  cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
  f = first;
  ts = mol->get_frame(f);

  cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
  cudaMemcpyAsync(gpuprevw, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
  cudaMemcpyAsync(gpuprevu, gpuprevw, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
  boxsize[0] = ts->a_length;
  boxsize[1] = ts->b_length;
  boxsize[2] = ts->c_length;
  cudaMemcpyAsync(gpuoldboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);
  inverseboxsize<<<1,4>>>(gpuoldboxsize, gpuoldinvboxsize);

  for (f=first+1; f<=last; f++) {
    ts = mol->get_frame(f);
    boxsize[0] = ts->a_length;
    boxsize[1] = ts->b_length;
    boxsize[2] = ts->c_length;

    cudaDeviceSynchronize();

    cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);

    cudaMemcpyAsync(gpupos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(gpuboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);

    inverseboxsize<<<1,4>>>(gpuboxsize, gpuinvboxsize);
    unwrapatomic<<<blocks,threads>>>(gpupos, gpuprevu, gpuprevw, sellen, gpuindexlist, gpuboxsize, gpuinvboxsize, gpuoldboxsize, gpuoldinvboxsize);

    cudaMemcpyAsync(pos, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost);
    cudaMemcpyAsync(gpuprevu, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
    cudaMemcpyAsync(gpuoldboxsize, gpuboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
    cudaMemcpyAsync(gpuoldinvboxsize, gpuinvboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);

    cudaMemcpyAsync(ts->pos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
  }

  cudaDeviceSynchronize();

  cudaFree(gpupos);
  cudaFree(gpuprevw);
  cudaFree(gpuprevu);
  cudaFree(gpuboxsize);
  cudaFree(gpuinvboxsize);
  cudaFree(gpuoldboxsize);
  cudaFree(gpuoldinvboxsize);
  cudaFreeHost(pos);

  cudaHostUnregister(indexlist);
  cudaHostUnregister(boxsize);
  cudaFree(gpuindexlist);
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    printf("Reverting to CPU algorithm\n");
    fpbc_exec_unwrap_cpu(mol, first, last, sellen, indexlist);
  }
}
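
// Wrap fragments over frames [first, last]. The compound map and index list
// are expanded on the GPU into a per-atom fragment map and per-fragment
// inverse masses, then each frame is processed in one of four CUDA streams:
// determine the wrapping center (recomputed from csel if given, otherwise the
// caller-supplied center), compute fragment centers of mass, and wrap whole
// fragments toward that center. Falls back to the CPU path on any CUDA error.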
void fpbc_exec_wrapcompound(Molecule* mol, int first, int last, int fnum, int *compoundmap, int sellen, int* indexlist, float* weights, AtomSel* csel, float* center, float* massarr) {

  int f, i, j;
  Timestep *ts;
  const int nStreams = 4;
  const int threads = 128;
  float *pos[nStreams];
  float *gpupos[nStreams];
  float boxsize[3];
  float *gpuboxsize[nStreams];
  float *gpucenters[nStreams];
  float *gpuinvboxsize[nStreams];
  float *gpuweights;
  float *wscale;
  float *gpufragweight;
  int *gpuweightidx;
  int *gpuindexlist;
  int *gpuatomtofragmap;
  int *gpucompoundmap;
  float *gpufragcenters[nStreams];
  int blocks_frag = (fnum + threads - 1) / threads;
  cudaStream_t stream[nStreams];

  cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
  cudaMalloc((void**) &wscale, sizeof(float));
  cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
  cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
  cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
  cudaMalloc((void**) &gpuweights, sizeof(float) * mol->nAtoms);
  cudaMalloc((void**) &gpufragweight, sizeof(float) * fnum);
  cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
  cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
  cudaMemcpy(gpuweights, massarr, sizeof(float) * mol->nAtoms, cudaMemcpyHostToDevice);
  cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
  cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);

  fragmentperatom<<<blocks_frag, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap);
  thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));

  fragwscale<<<blocks_frag, threads>>>(gpufragweight, gpuweights, fnum, gpucompoundmap, gpuindexlist);
  cudaHostRegister(boxsize, sizeof(float)*3, 0);
  if (csel != NULL) {
    cudaMalloc((void**) &gpuweightidx, sizeof(int) * csel->selected);
    int *weightidx = new int[csel->selected];
    j = 0;
    for (i=csel->firstsel; i<=csel->lastsel; i++) {
      if (csel->on[i]) {
        weightidx[j++] = i;
      }
    }
    cudaMemcpy(gpuweightidx, weightidx, sizeof(int) * csel->selected, cudaMemcpyHostToDevice);
    thrust::device_vector<int> ids (thrust::device_ptr<int>(gpuweightidx), thrust::device_ptr<int>(gpuweightidx+csel->selected));
    thrust::device_vector<float> mass (thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights+mol->nAtoms));
    float tmp = 1.0f / thrust::reduce(thrust::make_permutation_iterator(mass.begin(), ids.begin()),
                                      thrust::make_permutation_iterator(mass.end(), ids.end()), 0.0f, thrust::plus<float>());
    cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);
    delete [] weightidx;
  }

  for (f = 0; f < nStreams; f++) {
    cudaStreamCreate(&stream[f]);
    cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
    cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
    cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
    cudaMalloc((void**) &gpucenters[f], sizeof(float) * 3);
    cudaMemcpyAsync(gpucenters[f], center, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f]);
    cudaMalloc((void**) &gpufragcenters[f], sizeof(float) * 3 * fnum);
    cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
  }
  cudaDeviceSynchronize();

  int blocks = (3*sellen + threads - 1) / threads;
  blocks_frag = (3*fnum + threads - 1) / threads;
  for (f=first; f<=last; f++) {
    ts = mol->get_frame(f);
    boxsize[0] = ts->a_length;
    boxsize[1] = ts->b_length;
    boxsize[2] = ts->c_length;

    cudaStreamSynchronize(stream[f%nStreams]);

    cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
    cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
    cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);

    inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
    if (csel != NULL) {
      cudaMemsetAsync(gpucenters[f%nStreams], 0, 3 * sizeof(float), stream[f%nStreams]);

      measurecenter_fullmass<<<(3*csel->selected + 95) / 96, 96, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], gpucenters[f%nStreams], csel->selected, gpuweights, gpuweightidx, wscale);
    }

    computefragcenters<<<blocks_frag, threads, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], gpufragcenters[f%nStreams], fnum, gpuweights, gpufragweight, gpucompoundmap, gpuindexlist);

    wrapcompound<<<blocks, threads, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], sellen, gpucenters[f%nStreams], gpuatomtofragmap, gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams], gpufragcenters[f%nStreams]);

    cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);

    cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
  }

  cudaDeviceSynchronize();

  cudaHostUnregister(boxsize);
  cudaHostUnregister(compoundmap);
  cudaHostUnregister(indexlist);
  cudaFree(gpucompoundmap);
  cudaFree(gpuindexlist);
  cudaFree(gpuatomtofragmap);
  for (f = 0; f < nStreams; f++) {
    cudaStreamDestroy(stream[f]);
    cudaFree(gpupos[f]);
    cudaFree(gpuboxsize[f]);
    cudaFree(gpuinvboxsize[f]);
    cudaFree(gpucenters[f]);      // also release the per-stream center buffers allocated above
    cudaFree(gpufragcenters[f]);  // and the per-stream fragment-center buffers
    cudaFreeHost(pos[f]);
  }

  cudaFree(wscale);
  cudaFree(gpufragweight);
  cudaFree(gpuweights);
  if (csel != NULL) {
    cudaFree(gpuweightidx);
  }
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    printf("Reverting to CPU algorithm\n");
    fpbc_exec_wrapcompound_cpu(mol, first, last, fnum, compoundmap, sellen, indexlist, weights, csel, center, massarr);
  }
}
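
// Wrap individual atoms over frames [first, last], streaming four frames at a
// time. The wrapping center is either recomputed per frame from the weighted
// centering selection (csel) or taken from the caller-supplied center array.
// Falls back to the CPU implementation on any CUDA error.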
void fpbc_exec_wrapatomic(Molecule* mol, int first, int last, int sellen, int* indexlist,
                          float* weights, AtomSel* csel, float* center) {
  int f, i, j;
  Timestep *ts;
  const int nStreams = 4;
  const int threads = 128;
  float *gpupos[nStreams];
  float boxsize[3];
  float *pos[nStreams];
  float *gpuboxsize[nStreams];
  float *gpucenters[nStreams];
  float *gpuinvboxsize[nStreams];
  float *gpuweights;
  float *wscale;
  cudaMalloc((void**) &wscale, sizeof(float));
  int *gpuweightidx;
  int *gpuindexlist;

  cudaStream_t stream[nStreams];
  cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
  cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
  cudaHostRegister(center, sizeof(float) * 3, 0);
  cudaHostRegister(boxsize, sizeof(float) * 3, 0);
  for (f = 0; f < nStreams; f++) {
    cudaStreamCreate(&stream[f]);
    cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
    cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
    cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
    cudaMalloc((void**) &gpucenters[f], sizeof(float) * 3);
    cudaMemcpyAsync(gpucenters[f], center, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f]);
    cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
  }
  if (csel != NULL) {
    cudaMalloc((void**) &gpuweights, sizeof(float) * csel->selected);
    cudaMalloc((void**) &gpuweightidx, sizeof(int) * csel->selected);
    cudaMemcpy(gpuweights, weights, sizeof(float) * csel->selected, cudaMemcpyHostToDevice);
    int *weightidx = new int[csel->selected];
    j = 0;
    for (i=csel->firstsel; i<=csel->lastsel; i++) {
      if (csel->on[i]) {
        weightidx[j++] = i;
      }
    }
    cudaMemcpy(gpuweightidx, weightidx, sizeof(int) * csel->selected, cudaMemcpyHostToDevice);
    float tmp = 1.0f / thrust::reduce(thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights + csel->selected), 0.0f, thrust::plus<float>());
    cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);
    delete [] weightidx;
  }
  int blocks = (3*sellen + threads - 1) / threads;
  for (f=first; f<=last; f++) {
    ts = mol->get_frame(f);
    boxsize[0] = ts->a_length;
    boxsize[1] = ts->b_length;
    boxsize[2] = ts->c_length;

    cudaStreamSynchronize(stream[f%nStreams]);
    cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
    cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
    cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);

    inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
    if (csel != NULL) {
      cudaMemsetAsync(gpucenters[f%nStreams], 0, 3 * sizeof(float), stream[f%nStreams]);

      measurecenter<<<(3*csel->selected + 95) / 96, 96, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], gpucenters[f%nStreams], csel->selected, gpuweights, gpuweightidx, wscale);
    }

    wrapatomic<<<blocks, threads, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], sellen, gpucenters[f%nStreams], gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);

    cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);
    cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
  }

  cudaDeviceSynchronize();

  cudaHostUnregister(boxsize);
  cudaHostUnregister(center);
  cudaFree(gpuindexlist);
  for (f = 0; f < nStreams; f++) {
    cudaStreamDestroy(stream[f]);
    cudaFree(gpupos[f]);
    cudaFree(gpuboxsize[f]);
    cudaFree(gpuinvboxsize[f]);
    cudaFree(gpucenters[f]);
    cudaFreeHost(pos[f]);
  }

  if (csel != NULL) {
    cudaFree(gpuweights);
    cudaFree(gpuweightidx);
  }
  cudaFree(wscale);  // wscale is allocated unconditionally above, so release it here
  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    printf("Reverting to CPU algorithm\n");
    fpbc_exec_wrapatomic_cpu(mol, first, last, sellen, indexlist, weights, csel, center);
  }
}
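
// Join (re-assemble) fragments over frames [first, last] without wrapping:
// build the per-atom fragment map once, then run repositionfragments on each
// frame across four CUDA streams. Falls back to the CPU path on any CUDA
// error.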
void fpbc_exec_join(Molecule* mol, int first, int last, int fnum, int *compoundmap, int sellen, int* indexlist) {
  int f;

  Timestep *ts;
  const int nStreams = 4;
  const int threads = 128;
  float *gpupos[nStreams];
  float *pos[nStreams];
  float boxsize[3];
  float *gpuboxsize[nStreams];
  float *gpuinvboxsize[nStreams];
  int *gpucompoundmap;
  int *gpuatomtofragmap;
  int *gpuindexlist;
  int blocks = (fnum + threads - 1) / threads;
  cudaStream_t stream[nStreams];
  cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
  cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
  cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
  cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);
  cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
  cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
  cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
  cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);
  fragmentperatom<<<blocks, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap);
  thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));
  for (f = 0; f < nStreams; f++) {
    cudaStreamCreate(&stream[f]);
    cudaMalloc((void**) &gpupos[f], sizeof(float) * 3*mol->nAtoms);
    cudaMalloc((void**) &gpuboxsize[f], sizeof(float) * 3);
    cudaMalloc((void**) &gpuinvboxsize[f], sizeof(float) * 3);
    cudaMallocHost(&pos[f], sizeof(float) * 3*mol->nAtoms);
  }
  cudaHostRegister(boxsize, sizeof(float)*3, 0);

  cudaDeviceSynchronize();
  blocks = (3*sellen + threads - 1) / threads;
  for (f = first; f <= last; f++) {
    ts = mol->get_frame(f);
    boxsize[0] = ts->a_length;
    boxsize[1] = ts->b_length;
    boxsize[2] = ts->c_length;

    cudaStreamSynchronize(stream[f%nStreams]);
    cudaMemcpyAsync(pos[f%nStreams], ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);

    cudaMemcpyAsync(gpupos[f%nStreams], pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice, stream[f%nStreams]);
    cudaMemcpyAsync(gpuboxsize[f%nStreams], boxsize, sizeof(float)*3, cudaMemcpyHostToDevice, stream[f%nStreams]);

    inverseboxsize<<<1,4,0,stream[f%nStreams]>>>(gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);
    repositionfragments<<<blocks, threads, 0, stream[f%nStreams]>>>(gpupos[f%nStreams], sellen, gpuatomtofragmap,
                                                                    gpucompoundmap, gpuindexlist, gpuboxsize[f%nStreams], gpuinvboxsize[f%nStreams]);

    cudaMemcpyAsync(pos[f%nStreams], gpupos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost, stream[f%nStreams]);

    cudaMemcpyAsync(ts->pos, pos[f%nStreams], sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost, stream[f%nStreams]);
  }

  cudaDeviceSynchronize();

  cudaHostUnregister(boxsize);
  cudaHostUnregister(indexlist);
  cudaHostUnregister(compoundmap);
  cudaFree(gpucompoundmap);
  cudaFree(gpuindexlist);
  cudaFree(gpuatomtofragmap);
  for (f = 0; f < nStreams; f++) {
    cudaStreamDestroy(stream[f]);
    cudaFree(gpupos[f]);
    cudaFree(gpuboxsize[f]);
    cudaFree(gpuinvboxsize[f]);
    cudaFreeHost(pos[f]);
  }

  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    printf("Reverting to CPU algorithm\n");
    // Fall back to the CPU implementation (assumed declared in FastPBC.h
    // alongside the other *_cpu fallbacks) rather than recursing into this
    // GPU routine, which could loop forever on a persistent error.
    fpbc_exec_join_cpu(mol, first, last, fnum, compoundmap, sellen, indexlist);
  }
}
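
// Recenter a selection over frames [first, last]: unwrap the centering
// selection against the previous frame, measure its center of mass, and then
// wrap the full selection about that center, per fragment when a compound map
// is supplied (fnum > 0) or atom-by-atom otherwise. Falls back to the CPU
// implementation on any CUDA error.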
void fpbc_exec_recenter(Molecule* mol, int first, int last, int csellen, int* cindexlist, int fnum, int *compoundmap, int sellen, int* indexlist, float* weights, AtomSel* csel, float* massarr) {

  Timestep *ts;
  int f;
  const int threads = 128;
  float *pos;
  float *gpupos;
  float *gpuprevu;
  float *gpuprevw;
  float boxsize[3];
  float *gpuboxsize;
  float *gpucenters;
  float *gpuinvboxsize;
  float *gpuoldboxsize;
  float *gpuoldinvboxsize;
  float *gpufragcenters;
  float *wscale;
  cudaMalloc((void**) &wscale, sizeof(float));
  float *gpuweights;
  int *gpuweightidx;
  int *gpuindexlist;
  float *gpufragweight;
  int *gpuatomtofragmap;
  int *gpucompoundmap;
  cudaHostRegister(indexlist, sizeof(int) * sellen, 0);
  cudaHostRegister(cindexlist, sizeof(int) * csellen, 0);
  cudaHostRegister(boxsize, sizeof(float) * 3, 0);
  int blocks = (3*sellen + threads - 1) / threads;
  int blocks_frag = (fnum + threads - 1) / threads;
  cudaMalloc((void**) &gpupos, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuprevw, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuprevu, sizeof(float) * 3*mol->nAtoms);
  cudaMalloc((void**) &gpuboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuinvboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuoldboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpuoldinvboxsize, sizeof(float) * 3);
  cudaMalloc((void**) &gpucenters, sizeof(float) * 3);
  if (fnum) {
    cudaMalloc((void**) &gpufragcenters, sizeof(float) * 3 * fnum);
  }
  cudaMallocHost((void**) &pos, sizeof(float) * 3*mol->nAtoms);
  float tmp;
  cudaMalloc((void**) &gpuindexlist, sizeof(int) * sellen);

  cudaMemcpy(gpuindexlist, indexlist, sizeof(int) * sellen, cudaMemcpyHostToDevice);

  cudaMalloc((void**) &gpuweightidx, sizeof(int) * csellen);
  cudaMemcpy(gpuweightidx, cindexlist, sizeof(int) * csellen, cudaMemcpyHostToDevice);
  if (fnum) {
    cudaMalloc((void**) &gpuweights, sizeof(float) * mol->nAtoms);
    cudaMalloc((void**) &gpufragweight, sizeof(float) * fnum);
    cudaHostRegister(compoundmap, sizeof(int) * (fnum+1), 0);
    cudaMemcpy(gpuweights, massarr, sizeof(float) * mol->nAtoms, cudaMemcpyHostToDevice);
    cudaMalloc((void**) &gpucompoundmap, sizeof(int) * (fnum+1));
    cudaMemcpy(gpucompoundmap, compoundmap, sizeof(int) * (fnum+1), cudaMemcpyHostToDevice);
    cudaMalloc((void**) &gpuatomtofragmap, sizeof(int) * sellen);
    cudaMemset(gpuatomtofragmap, 0, sizeof(int) * sellen);
    fragmentperatom<<<blocks_frag, threads>>>(fnum, gpucompoundmap, gpuatomtofragmap);
    thrust::inclusive_scan(thrust::device_ptr<int>(gpuatomtofragmap), thrust::device_ptr<int>(gpuatomtofragmap + sellen), thrust::device_ptr<int>(gpuatomtofragmap));

    fragwscale<<<blocks_frag, threads>>>(gpufragweight, gpuweights, fnum, gpucompoundmap, gpuindexlist);
    thrust::device_vector<int> ids (thrust::device_ptr<int>(gpuweightidx), thrust::device_ptr<int>(gpuweightidx+csel->selected));
    thrust::device_vector<float> mass (thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights+mol->nAtoms));
    tmp = 1.0f / thrust::reduce(thrust::make_permutation_iterator(mass.begin(), ids.begin()),
                                thrust::make_permutation_iterator(mass.end(), ids.end()), 0.0f, thrust::plus<float>());
    cudaDeviceSynchronize();
  }
  else {
    cudaMalloc((void**) &gpuweights, sizeof(float) * csel->selected);
    cudaMemcpy(gpuweights, weights, sizeof(float) * csel->selected, cudaMemcpyHostToDevice);
    tmp = 1.0f / thrust::reduce(thrust::device_ptr<float>(gpuweights), thrust::device_ptr<float>(gpuweights + csel->selected), 0.0f, thrust::plus<float>());
  }
  cudaMemcpy(wscale, &tmp, sizeof(float), cudaMemcpyHostToDevice);

  blocks_frag = (3*fnum + threads - 1) / threads;
  for (f=first; f<=last; f++) {
    ts = mol->get_frame(f);
    if (f > first) {
      cudaMemcpyAsync(gpuoldboxsize, gpuboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
      cudaMemcpyAsync(gpuoldinvboxsize, gpuinvboxsize, sizeof(float) * 3, cudaMemcpyDeviceToDevice);
    } else {
      cudaMemcpyAsync(gpuprevw, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
      cudaMemcpyAsync(gpuprevu, gpuprevw, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
    }

    boxsize[0] = ts->a_length;
    boxsize[1] = ts->b_length;
    boxsize[2] = ts->c_length;
    cudaMemcpyAsync(pos, ts->pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
    cudaMemcpyAsync(gpupos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToDevice);
    cudaMemcpyAsync(gpuboxsize, boxsize, sizeof(float) * 3, cudaMemcpyHostToDevice);

    inverseboxsize<<<1,4>>>(gpuboxsize, gpuinvboxsize);
    if (f > first) {
      unwrapatomic<<<blocks,threads>>>(gpupos, gpuprevu, gpuprevw, csellen, gpuweightidx, gpuboxsize, gpuinvboxsize, gpuoldboxsize, gpuoldinvboxsize);
      cudaMemcpyAsync(gpuprevu, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToDevice);
    }
    cudaMemsetAsync(gpucenters, 0, 3 * sizeof(float));

    if (fnum) {
      measurecenter_fullmass<<<(3*csel->selected + 95) / 96, 96>>>(gpupos, gpucenters, csel->selected, gpuweights, gpuweightidx, wscale);

      wrapcompound<<<blocks, threads>>>(gpupos, sellen, gpucenters, gpuatomtofragmap, gpuindexlist, gpuboxsize, gpuinvboxsize, gpufragcenters);
    }
    else {
      measurecenter<<<(3*csel->selected + 95) / 96, 96>>>(gpupos, gpucenters, csel->selected, gpuweights, gpuweightidx, wscale);

      wrapatomic<<<blocks, threads>>>(gpupos, sellen, gpucenters, gpuindexlist, gpuboxsize, gpuinvboxsize);
    }

    cudaMemcpyAsync(pos, gpupos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyDeviceToHost);

    cudaMemcpyAsync(ts->pos, pos, sizeof(float) * 3*mol->nAtoms, cudaMemcpyHostToHost);
  }

  cudaDeviceSynchronize();

  cudaFree(gpupos);
  cudaFree(gpuboxsize);
  cudaFree(gpuinvboxsize);
  cudaFree(gpucenters);
  if (fnum) {
    cudaFree(gpufragcenters);
  }
  cudaFreeHost(pos);
  cudaFree(gpuprevw);
  cudaFree(gpuprevu);
  cudaFree(gpuoldboxsize);
  cudaFree(gpuoldinvboxsize);

  cudaHostUnregister(indexlist);
  cudaHostUnregister(cindexlist);
  cudaHostUnregister(boxsize);
  cudaFree(gpuindexlist);
  cudaFree(gpuweightidx);
  cudaFree(gpuweights);
  cudaFree(wscale);
  if (fnum) {
    cudaFree(gpucompoundmap);
    cudaFree(gpuatomtofragmap);
    cudaFree(gpufragweight);
    cudaHostUnregister(compoundmap);
  }

  cudaDeviceSynchronize();
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    printf("Reverting to CPU algorithm\n");
    fpbc_exec_recenter_cpu(mol, first, last, csellen, cindexlist, fnum, compoundmap, sellen, indexlist, weights, csel, massarr);
  }
}