00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #include "utilities.h"
00027 #include "VMDThreads.h"
00028 #include "CUDAKernels.h"
00029
00030 typedef struct {
00031 int threadid;
00032 int threadcount;
00033 float* atoms;
00034 float* grideners;
00035 long int numplane;
00036 long int numcol;
00037 long int numpt;
00038 long int natoms;
00039 float gridspacing;
00040 } enthrparms;
00041
00042
00043 static void * cudaenergythread(void *);
00044
00045 #if 1
00046 #define CUERR { cudaError_t err; \
00047 if ((err = cudaGetLastError()) != cudaSuccess) { \
00048 printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
00049 printf("Thread aborting...\n"); \
00050 return NULL; }}
00051 #else
00052 #define CUERR
00053 #endif
00054
00055
00056
00057
00058
00059 #define MAXATOMS 4000
00060 __constant__ static float4 atominfo[MAXATOMS];
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086 #if 1
00087
00088
00089
00090
00091
00092
00093 #define UNROLLX 8
00094 #define UNROLLY 1
00095 #define BLOCKSIZEX 8 // make large enough to allow coalesced global mem ops
00096 #define BLOCKSIZEY 8 // make as small as possible for finer granularity
00097
00098 #else
00099
00100
00101
00102
00103
00104
00105 #define UNROLLX 4
00106 #define UNROLLY 1
00107 #define BLOCKSIZEX 16 // make large enough to allow coalesced global mem ops
00108 #define BLOCKSIZEY 16 // make as small as possible for finer granularity
00109
00110 #endif
00111
00112 #define BLOCKSIZE (BLOCKSIZEX*BLOCKSIZEY)
00113
00114
00115 #if UNROLLX == 8
00116 #define FLOPSPERATOMEVAL (59.0/8.0)
00117 #elif UNROLLX == 4
00118 #define FLOPSPERATOMEVAL (31.0/4.0)
00119 #endif
00120
00121 __global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
00122 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00123 + threadIdx.x;
00124 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00125 unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00126 + xindex;
00127
00128 float coory = gridspacing * yindex;
00129 float coorx = gridspacing * xindex;
00130
00131 float energyvalx1=0.0f;
00132 float energyvalx2=0.0f;
00133 float energyvalx3=0.0f;
00134 float energyvalx4=0.0f;
00135 #if UNROLLX > 4
00136 float energyvalx5=0.0f;
00137 float energyvalx6=0.0f;
00138 float energyvalx7=0.0f;
00139 float energyvalx8=0.0f;
00140 #endif
00141
00142 float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153 int atomid;
00154 for (atomid=0; atomid<numatoms; atomid++) {
00155 float dy = coory - atominfo[atomid].y;
00156 float dyz2 = (dy * dy) + atominfo[atomid].z;
00157
00158 float dx1 = coorx - atominfo[atomid].x;
00159 float dx2 = dx1 + gridspacing_coalesce;
00160 float dx3 = dx2 + gridspacing_coalesce;
00161 float dx4 = dx3 + gridspacing_coalesce;
00162 #if UNROLLX > 4
00163 float dx5 = dx4 + gridspacing_coalesce;
00164 float dx6 = dx5 + gridspacing_coalesce;
00165 float dx7 = dx6 + gridspacing_coalesce;
00166 float dx8 = dx7 + gridspacing_coalesce;
00167 #endif
00168
00169 energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
00170 energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
00171 energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
00172 energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
00173 #if UNROLLX > 4
00174 energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
00175 energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
00176 energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
00177 energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
00178 #endif
00179 }
00180
00181 energygrid[outaddr ] += energyvalx1;
00182 energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
00183 energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
00184 energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
00185 #if UNROLLX > 4
00186 energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
00187 energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
00188 energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
00189 energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
00190 #endif
00191 }
00192
00193
00194
00195
00196 #define TILESIZEX BLOCKSIZEX*UNROLLX
00197 #define TILESIZEY BLOCKSIZEY*UNROLLY
00198 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00199 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00200
00201 int copyatomstoconstbuf(const float *atoms, float *atompre,
00202 int count, float zplane) {
00203 if (count > MAXATOMS) {
00204 printf("Atom count exceeds constant buffer storage capacity\n");
00205 return -1;
00206 }
00207
00208 int i;
00209 for (i=0; i<count*4; i+=4) {
00210 atompre[i ] = atoms[i ];
00211 atompre[i + 1] = atoms[i + 1];
00212 float dz = zplane - atoms[i + 2];
00213 atompre[i + 2] = dz*dz;
00214 atompre[i + 3] = atoms[i + 3];
00215 }
00216
00217 cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
00218 CUERR
00219
00220 return 0;
00221 }
00222
00223 int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
00224 long int numplane, long int numcol, long int numpt,
00225 float gridspacing) {
00226 int i;
00227 enthrparms *parms;
00228 vmd_thread_t * threads;
00229 vmd_timerhandle globaltimer;
00230 double totalruntime;
00231
00232 #if defined(VMDTHREADS)
00233 int numprocs = vmd_thread_numprocessors();
00234 #else
00235 int numprocs = 1;
00236 #endif
00237
00238 int deviceCount = 0;
00239 if (vmd_cuda_num_devices(&deviceCount))
00240 return -1;
00241 if (deviceCount < 1)
00242 return -1;
00243
00244
00245
00246 if (deviceCount < numprocs) {
00247 numprocs = deviceCount;
00248 }
00249
00250 printf("Using %d CUDA GPUs\n", numprocs);
00251
00252
00253 threads = (vmd_thread_t *) calloc(numprocs * sizeof(vmd_thread_t), 1);
00254
00255
00256 parms = (enthrparms *) malloc(numprocs * sizeof(enthrparms));
00257 for (i=0; i<numprocs; i++) {
00258 parms[i].threadid = i;
00259 parms[i].threadcount = numprocs;
00260 parms[i].atoms = atoms;
00261 parms[i].grideners = grideners;
00262 parms[i].numplane = numplane;
00263 parms[i].numcol = numcol;
00264 parms[i].numpt = numpt;
00265 parms[i].natoms = natoms;
00266 parms[i].gridspacing = gridspacing;
00267 }
00268
00269 printf("GPU padded grid size: %d x %d x %d\n",
00270 (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00271 (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00272 numplane);
00273
00274 globaltimer = vmd_timer_create();
00275 vmd_timer_start(globaltimer);
00276
00277 #if defined(VMDTHREADS)
00278
00279
00280
00281 for (i=0; i<numprocs; i++) {
00282 vmd_thread_create(&threads[i], cudaenergythread, &parms[i]);
00283 }
00284
00285
00286 for (i=0; i<numprocs; i++) {
00287 vmd_thread_join(threads[i], NULL);
00288 }
00289 #else
00290
00291 cudaenergythread((void *) &parms[0]);
00292 #endif
00293
00294
00295 vmd_timer_stop(globaltimer);
00296 totalruntime = vmd_timer_time(globaltimer);
00297 vmd_timer_destroy(globaltimer);
00298
00299 double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00300 printf(" %g billion atom evals/second, %g GFLOPS\n",
00301 atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00302
00303
00304 free(parms);
00305 free(threads);
00306
00307 return 0;
00308 }
00309
00310
00311
00312
00313
00314 static void * cudaenergythread(void *voidparms) {
00315 dim3 volsize, Gsz, Bsz;
00316 float *devenergy = NULL;
00317 float *hostenergy = NULL;
00318 float *atomprebuf = NULL;
00319 int atomprebufpinned = 1;
00320
00321 enthrparms *parms = (enthrparms *) voidparms;
00322
00323
00324
00325 const float *atoms = parms->atoms;
00326 float* grideners = parms->grideners;
00327 const long int numplane = parms->numplane;
00328 const long int numcol = parms->numcol;
00329 const long int numpt = parms->numpt;
00330 const long int natoms = parms->natoms;
00331 const float gridspacing = parms->gridspacing;
00332 const int threadid = parms->threadid;
00333 const int threadcount = parms->threadcount;
00334 double lasttime, totaltime;
00335
00336 cudaSetDevice(threadid);
00337 CUERR
00338
00339
00340 volsize.x = (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00341 volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00342 volsize.z = 1;
00343
00344
00345 Bsz.x = BLOCKSIZEX;
00346 Bsz.y = BLOCKSIZEY;
00347 Bsz.z = 1;
00348 Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00349 Gsz.y = volsize.y / (Bsz.y * UNROLLY);
00350 Gsz.z = 1;
00351 int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00352
00353 printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
00354 vmd_timerhandle timer = vmd_timer_create();
00355 vmd_timer_start(timer);
00356 msgtimer * msgt = msg_timer_create(5);
00357
00358
00359
00360 #define DMABUFPADSIZE (32 * 1024)
00361
00362
00363
00364 if (atomprebufpinned) {
00365
00366 if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
00367 printf("Pinned atom copy buffer allocation failed!\n");
00368 atomprebufpinned=0;
00369 }
00370 }
00371
00372
00373
00374 if (!atomprebufpinned) {
00375
00376 atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
00377 if (atomprebuf == NULL) {
00378 printf("Atom copy buffer allocation failed!\n");
00379 return NULL;
00380 }
00381 }
00382
00383
00384
00385 cudaMalloc((void**)&devenergy, volmemsz);
00386 CUERR
00387
00388 hostenergy = (float *) malloc(volmemsz);
00389
00390
00391 int iterations=0;
00392 int k;
00393 for (k=threadid; k<numplane; k+= threadcount) {
00394 int y;
00395 int atomstart;
00396 float zplane = k * (float) gridspacing;
00397
00398
00399 for (y=0; y<numcol; y++) {
00400 long eneraddr = k*numcol*numpt + y*numpt;
00401 memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
00402 }
00403
00404
00405 cudaMemcpy(devenergy, hostenergy, volmemsz, cudaMemcpyHostToDevice);
00406 CUERR
00407
00408 lasttime = vmd_timer_timenow(timer);
00409 for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00410 iterations++;
00411 int runatoms;
00412 int atomsremaining = natoms - atomstart;
00413 if (atomsremaining > MAXATOMS)
00414 runatoms = MAXATOMS;
00415 else
00416 runatoms = atomsremaining;
00417
00418
00419 if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
00420 return NULL;
00421
00422
00423 cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
00424 CUERR
00425 }
00426
00427
00428 cudaMemcpy(hostenergy, devenergy, volmemsz, cudaMemcpyDeviceToHost);
00429 CUERR
00430
00431
00432 for (y=0; y<numcol; y++) {
00433 long eneraddr = k*numcol*numpt + y*numpt;
00434 memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
00435 }
00436
00437 totaltime = vmd_timer_timenow(timer);
00438 if (msg_timer_timeout(msgt)) {
00439
00440 printf("thread[%d] plane %d/%ld time %.2f, elapsed %.1f, est. total: %.1f\n",
00441 threadid, k, numplane,
00442 totaltime - lasttime, totaltime,
00443 totaltime * numplane / (k+1));
00444 }
00445 }
00446
00447 vmd_timer_destroy(timer);
00448 msg_timer_destroy(msgt);
00449 free(hostenergy);
00450 cudaFree(devenergy);
00451 CUERR
00452
00453
00454 if (atomprebufpinned) {
00455 if (cudaFreeHost(atomprebuf) != cudaSuccess) {
00456 printf("Pinned atom buffer deallocation failed!\n");
00457 return NULL;
00458 }
00459 } else {
00460 free(atomprebuf);
00461 }
00462 return NULL;
00463 }
00464
00465
00466
00467