00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <math.h>
00025 #include <cuda.h>
00026
00027 #include "Inform.h"
00028 #include "utilities.h"
00029 #include "WKFThreads.h"
00030 #include "WKFUtils.h"
00031 #include "CUDAKernels.h"
00032
00033
00034
00035
00036 #if 0
00037
00038
00039
00040 #define NBLOCK 128 // size of an RDF data block
00041 #define MAXBIN 64 // maximum number of bins in a histogram
00042 #elif 1
00043
00044
00045
00046 #define NBLOCK 32 // size of an RDF data block
00047 #define MAXBIN 1024 // maximum number of bins in a histogram
00048 #elif 1
00049
00050
00051 #define NBLOCK 320 // size of an RDF data block
00052 #define MAXBIN 3072 // maximum number of bins in a histogram
00053 #define __USEATOMIC 1 // enable atomic increment operations
00054 #elif 1
00055
00056
00057
00058
00059 #define NBLOCK 896 // size of an RDF data block
00060 #define MAXBIN 8192 // maximum number of bins in a histogram
00061 #define __USEATOMIC 1 // enable atomic increment operations
00062 #endif
00063
00064 #define NCUDABLOCKS 256 // the number of blocks to divide the shared memory
00065
00066
00067
00068
00069 #define NBLOCKHIST 64 // the number of threads used in the final histogram
00070
00071
00072
00073 #define NCONSTBLOCK 5440 // the number of atoms in constant memory.
00074
00075
00076 #define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
00077 #define WARP_LOG_SIZE 5 // this is also correct for G80/GT200/Fermi GPUs
00078
00079 #define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
00080
00081
00082
00083
00084
00085 __global__ void init_hist(unsigned int* llhistg,
00086 int maxbin)
00087 {
00088
00089 #ifndef __USEATOMIC
00090 int io;
00091 #endif
00092
00093 int iblock;
00094
00095 unsigned int *llhist;
00096
00097
00098 int nofblocks;
00099 int nleftover;
00100
00101 int maxloop;
00102
00103
00104
00105
00106 #ifdef __USEATOMIC
00107 llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
00108 nofblocks=maxbin/NBLOCK;
00109 nleftover=maxbin-nofblocks*NBLOCK;
00110 maxloop=nofblocks*NBLOCK;
00111
00112 for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00113 llhist[iblock]=0UL;
00114 }
00115
00116 if (threadIdx.x < nleftover) {
00117 llhist[iblock]=0UL;
00118 }
00119
00120 #else
00121 llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
00122 nofblocks=maxbin/NBLOCK;
00123 nleftover=maxbin-nofblocks*NBLOCK;
00124 maxloop=nofblocks*NBLOCK;
00125 int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;
00126
00127 for (io=0; io < maxloop2; io+=maxbin) {
00128 for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00129 llhist[io + iblock]=0UL;
00130 }
00131
00132 if (threadIdx.x < nleftover) {
00133 llhist[io + iblock]=0UL;
00134 }
00135 }
00136 #endif
00137
00138 return;
00139 }
00140
00141
00142
00143
00144
00145
00146 __global__ void init_hist_f(float* histdev) {
00147 histdev[threadIdx.x]=0.0f;
00148 return;
00149 }
00150
00151
00152
00153
00154
00155
00156
00157
00158 __global__ void reimage_xyz(float* xyz,
00159 int natomsi,
00160 int natomsipad,
00161 float3 celld,
00162 float rmax)
00163 {
00164 int iblock;
00165
00166 __shared__ float xyzi[3*NBLOCK];
00167
00168
00169 int nofblocks;
00170
00171
00172
00173 float *pxi;
00174
00175
00176 int threetimesid;
00177
00178 float rtmp;
00179
00180 int ifinalblock;
00181
00182 int loopmax;
00183
00184
00185 nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00186 loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00187 ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00188 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00189 threetimesid=3*threadIdx.x;
00190
00191
00192 for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00193 __syncthreads();
00194
00195 xyzi[threadIdx.x ]=pxi[iblock ];
00196 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00197 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00198 __syncthreads();
00199
00200
00201
00202
00203 xyzi[threetimesid] = fmodf(xyzi[threetimesid ], celld.x);
00204 if (xyzi[threetimesid ] < 0.0f) {
00205 xyzi[threetimesid ] += celld.x;
00206 }
00207 xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
00208 if (xyzi[threetimesid+1] < 0.0f) {
00209 xyzi[threetimesid+1] += celld.y;
00210 }
00211 xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
00212 if (xyzi[threetimesid+2] < 0.0f) {
00213 xyzi[threetimesid+2] += celld.z;
00214 }
00215
00216
00217 if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00218
00219
00220
00221 if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00222 rtmp = -((float)(threadIdx.x+1))*rmax;
00223 xyzi[threetimesid ] = rtmp;
00224 xyzi[threetimesid+1] = rtmp;
00225 xyzi[threetimesid+2] = rtmp;
00226 }
00227 }
00228
00229 __syncthreads();
00230
00231
00232 pxi[iblock ]=xyzi[threadIdx.x ];
00233 pxi[iblock+ NBLOCK]=xyzi[threadIdx.x+ NBLOCK];
00234 pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00235
00236
00237
00238 }
00239 }
00240
00241
00242
00243
00244 __global__ void phantom_xyz(float* xyz,
00245 int natomsi,
00246 int natomsipad,
00247 float minxyz,
00248 float rmax)
00249 {
00250 int iblock;
00251
00252 __shared__ float xyzi[3*NBLOCK];
00253
00254
00255 int nofblocks;
00256
00257
00258
00259 float *pxi;
00260
00261
00262 int threetimesid;
00263
00264 float rtmp;
00265
00266 int ifinalblock;
00267
00268 int loopmax;
00269
00270
00271 nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00272 loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00273 ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00274 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00275 threetimesid=3*threadIdx.x;
00276
00277
00278 for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00279 __syncthreads();
00280
00281 xyzi[threadIdx.x ]=pxi[iblock ];
00282 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00283 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00284 __syncthreads();
00285
00286
00287 if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00288
00289
00290
00291 if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00292 rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
00293 xyzi[threetimesid ] = rtmp;
00294 xyzi[threetimesid+1] = rtmp;
00295 xyzi[threetimesid+2] = rtmp;
00296 }
00297 }
00298
00299 __syncthreads();
00300
00301
00302 pxi[iblock ]=xyzi[threadIdx.x ];
00303 pxi[iblock+ NBLOCK]=xyzi[threadIdx.x+ NBLOCK];
00304 pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00305
00306
00307
00308 }
00309 }
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323 __constant__ static float xyzj[3*NCONSTBLOCK];
00324
00325
00326
00327 void copycoordstoconstbuff(int natoms, const float* xyzh) {
00328 cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
00329 }
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339 #ifndef __USEATOMIC
00340 __device__ void addData(volatile unsigned int *s_WarpHist,
00341
00342 unsigned int data,
00343 unsigned int threadTag)
00344 {
00345 unsigned int count;
00346
00347 do {
00348 count = s_WarpHist[data] & 0x07FFFFFFUL;
00349 count = threadTag | (count + 1);
00350 s_WarpHist[data] = count;
00351 } while(s_WarpHist[data] != count);
00352 }
00353 #endif
00354
00355
00356
00357
00358
00359 __global__ void calculate_rdf(int usepbc,
00360 float* xyz,
00361 int natomsi,
00362 int natomsj,
00363 float3 celld,
00364 unsigned int* llhistg,
00365 int nbins,
00366 float rmin,
00367 float delr_inv)
00368 {
00369 int io;
00370 int iblock;
00371
00372 #ifdef __USEATOMIC
00373 unsigned int *llhists1;
00374
00375
00376 __shared__ unsigned int llhists[MAXBIN];
00377 #else
00378 volatile unsigned int *llhists1;
00379
00380
00381 volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];
00382
00383 #endif
00384
00385 __shared__ float xyzi[3*NBLOCK];
00386
00387
00388 float rxij, rxij2, rij;
00389
00390
00391 int ibin,nofblocksi;
00392 int nleftover;
00393 float *pxi;
00394 unsigned int joffset;
00395 unsigned int loopmax, loopmax2;
00396 float rmin2=.0001/delr_inv;
00397
00398 #ifndef __USEATOMIC
00399
00400
00401 const unsigned int threadTag = ((unsigned int)
00402 ( threadIdx.x & (THREADSPERWARP - 1)))
00403 << (32 - WARP_LOG_SIZE);
00404 #endif
00405
00406
00407
00408
00409
00410
00411 #ifdef __USEATOMIC
00412 llhists1=llhists;
00413 nofblocksi=nbins/NBLOCK;
00414 nleftover=nbins-nofblocksi*NBLOCK;
00415 #else
00416 llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
00417 nofblocksi=nbins/THREADSPERWARP;
00418 nleftover=nbins-nofblocksi*THREADSPERWARP;
00419 #endif
00420
00421
00422 #ifdef __USEATOMIC
00423 loopmax = nofblocksi*NBLOCK;
00424 for (io=0; io<loopmax; io+=NBLOCK) {
00425 llhists1[io + threadIdx.x]=0UL;
00426 }
00427 if (threadIdx.x < nleftover) {
00428 llhists1[io + threadIdx.x]=0UL;
00429 }
00430
00431 #else
00432 loopmax = nofblocksi*THREADSPERWARP;
00433 for (io=0; io<loopmax; io+=THREADSPERWARP) {
00434 llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00435 }
00436 if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
00437 llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00438 }
00439 #endif
00440
00441
00442 nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00443 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00444
00445 loopmax = 3 * natomsj;
00446 int idxt3 = 3 * threadIdx.x;
00447 int idxt3p1 = idxt3+1;
00448 int idxt3p2 = idxt3+2;
00449
00450 loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
00451
00452
00453
00454 if (usepbc) {
00455 for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00456 __syncthreads();
00457
00458
00459 xyzi[threadIdx.x ]=pxi[iblock ];
00460 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00461 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00462
00463 __syncthreads();
00464
00465 for (joffset=0; joffset<loopmax; joffset+=3) {
00466
00467
00468 rxij=fabsf(xyzi[idxt3 ] - xyzj[joffset ]);
00469 rxij2=celld.x - rxij;
00470 rxij=fminf(rxij, rxij2);
00471 rij=rxij*rxij;
00472 rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
00473 rxij2=celld.y - rxij;
00474 rxij=fminf(rxij, rxij2);
00475 rij+=rxij*rxij;
00476 rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
00477 rxij2=celld.z - rxij;
00478 rxij=fminf(rxij, rxij2);
00479 rij=sqrtf(rij + rxij*rxij);
00480
00481
00482 ibin=__float2int_rd((rij-rmin)*delr_inv);
00483 if (ibin<nbins && ibin>=0 && rij>rmin2) {
00484 #ifdef __USEATOMIC
00485 atomicAdd(llhists1+ibin, 1U);
00486 #else
00487 addData(llhists1, ibin, threadTag);
00488 #endif
00489 }
00490 }
00491 }
00492 } else {
00493 for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00494 __syncthreads();
00495
00496
00497 xyzi[threadIdx.x ]=pxi[iblock ];
00498 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00499 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00500
00501 __syncthreads();
00502
00503
00504 for (joffset=0; joffset<loopmax; joffset+=3) {
00505
00506
00507 rxij=xyzi[idxt3 ] - xyzj[joffset ];
00508 rij=rxij*rxij;
00509 rxij=xyzi[idxt3p1] - xyzj[joffset+1];
00510 rij+=rxij*rxij;
00511 rxij=xyzi[idxt3p2] - xyzj[joffset+2];
00512 rij=sqrtf(rij + rxij*rxij);
00513
00514
00515 ibin=__float2int_rd((rij-rmin)*delr_inv);
00516 if (ibin<nbins && ibin>=0 && rij>rmin2) {
00517 #ifdef __USEATOMIC
00518 atomicAdd(llhists1+ibin, 1U);
00519 #else
00520 addData(llhists1, ibin, threadTag);
00521 #endif
00522 }
00523 }
00524 }
00525 }
00526
00527 __syncthreads();
00528
00529
00530
00531
00532 nofblocksi=nbins/NBLOCK;
00533 nleftover=nbins-nofblocksi*NBLOCK;
00534
00535 #ifdef __USEATOMIC
00536
00537 unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;
00538
00539
00540 loopmax = nofblocksi*NBLOCK;
00541 for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
00542 llhistg1[iblock] += llhists[iblock+threadIdx.x];
00543 }
00544
00545 if (threadIdx.x < nleftover) {
00546 llhistg1[iblock] += llhists[iblock+threadIdx.x];
00547 }
00548 #else
00549
00550 unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;
00551
00552
00553 loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
00554 loopmax2 = nofblocksi * NBLOCK;
00555 for (io=0; io < loopmax; io+=MAXBIN) {
00556 for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
00557 llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
00558 }
00559
00560 if (threadIdx.x < nleftover) {
00561 llhistg1[iblock] += llhists[io+iblock+threadIdx.x];
00562 }
00563 }
00564 #endif
00565
00566 return;
00567 }
00568
00569
00570 #define ull2float __uint2float_rn
00571
00572
00573
00574
00575
00576
00577 __global__ void calculate_histogram(float* histdev,
00578 unsigned int* llhistg,
00579
00580
00581 int nbins)
00582 {
00583 int io;
00584 int iblock;
00585 int maxloop;
00586
00587 __shared__ unsigned int llhists[NBLOCKHIST];
00588
00589 __shared__ float xi[NBLOCKHIST];
00590 int nofblocks;
00591
00592 nofblocks=nbins/NBLOCKHIST;
00593 int nleftover=nbins-nofblocks*NBLOCKHIST;
00594 maxloop = nofblocks*NBLOCKHIST;
00595
00596
00597 for (iblock=0;iblock<maxloop;iblock+=NBLOCKHIST) {
00598 xi[threadIdx.x]=0.0f;
00599
00600
00601 #ifdef __USEATOMIC
00602 for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00603 #else
00604 for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00605 #endif
00606
00607 llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00608
00609
00610 llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00611
00612
00613 xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00614 }
00615
00616
00617 histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00618 }
00619
00620
00621 if (threadIdx.x < nleftover) {
00622 xi[threadIdx.x]=0.0f;
00623
00624
00625 #ifdef __USEATOMIC
00626 for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00627 #else
00628 for (io=0; io < MAXBIN *(NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00629 #endif
00630
00631 llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00632
00633
00634 llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00635
00636
00637 xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00638 }
00639
00640
00641 histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00642 }
00643
00644
00645 return;
00646 }
00647
00648
00649
00650
00651
00652
00653 void calculate_histogram_block(int usepbc,
00654 float *xyz,
00655 int natomsi,
00656 int natomsj,
00657 float3 celld,
00658 unsigned int* llhistg,
00659 int nbins,
00660 float rmin,
00661 float delr_inv,
00662 float *histdev,
00663 int nblockhist) {
00664
00665 init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);
00666
00667
00668 calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
00669 llhistg, nbins, rmin, delr_inv);
00670
00671
00672 calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
00673 }
00674
00675
00676
00677
00678
00679
00680 typedef struct {
00681 int usepbc;
00682 int natoms1;
00683 float* xyz;
00684 int natoms2;
00685 float** xyz2array;
00686 float* cell;
00687 float** histarray;
00688 int nbins;
00689 int nblockhist;
00690
00691 float rmin;
00692 float delr;
00693 int nblocks;
00694 int nhblock;
00695 } rdfthrparms;
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705 static void * rdf_thread(void *voidparms) {
00706 rdfthrparms *parms = NULL;
00707 int threadid=0;
00708
00709 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00710 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00711
00712 #if 0
00713 printf("rdf thread[%d] lanched and running...\n", threadid);
00714 #endif
00715 cudaSetDevice(threadid);
00716
00717
00718
00719
00720 const int natoms1 = parms->natoms1;
00721 const float *xyz = parms->xyz;
00722 const int natoms2 = parms->natoms2;
00723 const float *cell = parms->cell;
00724 const int nbins = parms->nbins;
00725 const int nblockhist = parms->nblockhist;
00726 const float rmin = parms->rmin;
00727 const float delr = parms->delr;
00728 float *xyz2 = parms->xyz2array[threadid];
00729 float *hist = parms->histarray[threadid];
00730 const int nhblock = parms->nhblock;
00731 const int usepbc = parms->usepbc;
00732
00733 float* xyzdev;
00734 float3 celld;
00735 float* histdev;
00736 unsigned int* llhistg;
00737
00738 int nconstblocks;
00739 int nleftover;
00740 int nbigblocks;
00741 int ibigblock;
00742 int nleftoverbig;
00743 int ntmpbig;
00744 int nbinstmp;
00745 int natoms1pad;
00746 int natoms2pad;
00747 float rmax;
00748
00749
00750 natoms1pad=natoms1;
00751 if (natoms1%NBLOCK!=0) {natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK;}
00752 natoms2pad=natoms2;
00753 if (natoms2%NBLOCK!=0) {natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK;}
00754
00755
00756
00757
00758 cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));
00759
00760
00761 cudaMalloc((void**)&histdev, nbins*sizeof(float));
00762
00763
00764 int ihblock;
00765 int maxloop=nbins/nblockhist;
00766 if (maxloop*nblockhist!=nbins) {maxloop=maxloop+1;}
00767 for (ihblock=0; ihblock<maxloop; ihblock++) {
00768 init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
00769 }
00770
00771
00772 #ifdef __USEATOMIC
00773 cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
00774 #else
00775 cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
00776 sizeof(int))/THREADSPERWARP);
00777 #endif
00778
00779
00780 celld.x=cell[0];
00781 celld.y=cell[1];
00782 celld.z=cell[2];
00783
00784
00785
00786 rmax = 1.1f * (rmin+((float)nbins)*delr);
00787
00788
00789 cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);
00790
00791 if (usepbc) {
00792
00793 reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
00794 } else {
00795
00796 phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
00797 }
00798
00799
00800
00801 cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);
00802
00803
00804 cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);
00805
00806 if (usepbc) {
00807
00808 reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
00809 } else {
00810
00811 phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
00812 }
00813
00814
00815 nbinstmp = MAXBIN;
00816
00817
00818 nconstblocks=natoms2/NCONSTBLOCK;
00819 nleftover=natoms2-nconstblocks*NCONSTBLOCK;
00820
00821
00822 ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
00823 nbigblocks = natoms1pad / ntmpbig;
00824 nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
00825
00826 int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;
00827
00828
00829 wkf_tasktile_t tile;
00830 while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00831 #if 0
00832 printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
00833 #endif
00834 int iconstblock = -1;
00835 ihblock = -1;
00836 int workitem;
00837 for (workitem=tile.start; workitem<tile.end; workitem++) {
00838
00839 ihblock = workitem % nhblock;
00840 int newiconstblock = workitem / nhblock;
00841 int blocksz;
00842
00843
00844 if (iconstblock != newiconstblock) {
00845 iconstblock = newiconstblock;
00846
00847
00848 blocksz = (iconstblock == nconstblocks && nleftover != 0)
00849 ? nleftover : NCONSTBLOCK;
00850
00851
00852 copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
00853 }
00854
00855 #if 0
00856 printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid,
00857 iconstblock, ihblock);
00858 #endif
00859
00860
00861
00862
00863 float rmintmp=rmin + (MAXBIN * delr * ihblock);
00864
00865 nbinstmp = MAXBIN;
00866
00867
00868 if (ihblock==(nhblock-1)) {
00869 nbinstmp = nbins%MAXBIN;
00870
00871 if (nbinstmp==0) { nbinstmp = MAXBIN;}
00872 }
00873
00874 for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
00875
00876 int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0)
00877 ? nleftoverbig : ntmpbig;
00878
00879 calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
00880 bblocksz, blocksz, celld,
00881 llhistg, nbinstmp, rmintmp, (1/delr),
00882 histdev+ihblock*MAXBIN, nblockhist);
00883 }
00884
00885 cudaThreadSynchronize();
00886 }
00887 }
00888
00889
00890 cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);
00891
00892 cudaFree(xyzdev);
00893 cudaFree(histdev);
00894 cudaFree(llhistg);
00895
00896 #if 0
00897 printf("thread[%d] finished, releasing resources\n", threadid);
00898 #endif
00899
00900 return NULL;
00901 }
00902
00903
00904 int rdf_gpu(wkf_threadpool_t *devpool,
00905 int usepbc,
00906 int natoms1,
00907
00908 float *xyz,
00909
00910 int natoms2,
00911
00912 float *xyz2,
00913
00914 float* cell,
00915 float* hist,
00916
00917 int nbins,
00918 float rmin,
00919 float delr)
00920 {
00921 int i, ibin;
00922
00923 if (devpool == NULL) {
00924 return -1;
00925 }
00926 int numprocs = wkf_threadpool_get_workercount(devpool);
00927
00928
00929 float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
00930 float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
00931 for (i=0; i<numprocs; i++) {
00932 xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00933 memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00934 histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
00935 }
00936
00937 memset(hist, 0, nbins * sizeof(float));
00938
00939
00940 rdfthrparms parms;
00941
00942 parms.usepbc = usepbc;
00943 parms.natoms1 = natoms1;
00944 parms.xyz = xyz;
00945 parms.natoms2 = natoms2;
00946 parms.cell = cell;
00947 parms.nbins = nbins;
00948 parms.nblockhist = NBLOCKHIST;
00949 parms.rmin = rmin;
00950 parms.delr = delr;
00951 parms.xyz2array = xyz2array;
00952 parms.histarray = histarray;
00953
00954
00955 parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;
00956
00957
00958 int nconstblocks=parms.natoms2/NCONSTBLOCK;
00959 int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;
00960
00961
00962 parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;
00963
00964 int parblocks = parms.nblocks * parms.nhblock;
00965 #if 0
00966 printf("master thread: number of parallel work items: %d\n", parblocks);
00967 #endif
00968
00969
00970 wkf_tasktile_t tile;
00971 tile.start=0;
00972 tile.end=parblocks;
00973 wkf_threadpool_sched_dynamic(devpool, &tile);
00974 wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);
00975
00976 memset(hist, 0, nbins * sizeof(float));
00977
00978 for (i=0; i<numprocs; i++) {
00979 for (ibin=0; ibin<nbins; ibin++) {
00980 hist[ibin] += histarray[i][ibin];
00981 }
00982 }
00983
00984 return 0;
00985 }
00986
00987
00988