00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #include "Inform.h"
00027 #include "utilities.h"
00028 #include "WKFThreads.h"
00029 #include "WKFUtils.h"
00030 #include "CUDAKernels.h"
00031
00032 typedef struct {
00033 float* atoms;
00034 float* grideners;
00035 long int numplane;
00036 long int numcol;
00037 long int numpt;
00038 long int natoms;
00039 float gridspacing;
00040 } enthrparms;
00041
00042
00043 static void * cudaenergythread(void *);
00044
00045 #if 1
00046 #define CUERR { cudaError_t err; \
00047 if ((err = cudaGetLastError()) != cudaSuccess) { \
00048 printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
00049 printf("Thread aborting...\n"); \
00050 return NULL; }}
00051 #else
00052 #define CUERR
00053 #endif
00054
00055
00056
00057
00058
00059 #define MAXATOMS 4000
00060 __constant__ static float4 atominfo[MAXATOMS];
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086 #if 1
00087
00088
00089
00090
00091
00092
00093 #define UNROLLX 8
00094 #define UNROLLY 1
00095 #define BLOCKSIZEX 8 // make large enough to allow coalesced global mem ops
00096 #define BLOCKSIZEY 8 // make as small as possible for finer granularity
00097
00098 #else
00099
00100
00101
00102
00103
00104
00105 #define UNROLLX 4
00106 #define UNROLLY 1
00107 #define BLOCKSIZEX 16 // make large enough to allow coalesced global mem ops
00108 #define BLOCKSIZEY 16 // make as small as possible for finer granularity
00109
00110 #endif
00111
00112 #define BLOCKSIZE (BLOCKSIZEX*BLOCKSIZEY)
00113
00114
00115 #if UNROLLX == 8
00116 #define FLOPSPERATOMEVAL (59.0/8.0)
00117 #elif UNROLLX == 4
00118 #define FLOPSPERATOMEVAL (31.0/4.0)
00119 #endif
00120
00121 __global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
00122 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00123 + threadIdx.x;
00124 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00125 unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00126 + xindex;
00127
00128 float coory = gridspacing * yindex;
00129 float coorx = gridspacing * xindex;
00130
00131 float energyvalx1=0.0f;
00132 float energyvalx2=0.0f;
00133 float energyvalx3=0.0f;
00134 float energyvalx4=0.0f;
00135 #if UNROLLX > 4
00136 float energyvalx5=0.0f;
00137 float energyvalx6=0.0f;
00138 float energyvalx7=0.0f;
00139 float energyvalx8=0.0f;
00140 #endif
00141
00142 float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153 int atomid;
00154 for (atomid=0; atomid<numatoms; atomid++) {
00155 float dy = coory - atominfo[atomid].y;
00156 float dyz2 = (dy * dy) + atominfo[atomid].z;
00157
00158 float dx1 = coorx - atominfo[atomid].x;
00159 float dx2 = dx1 + gridspacing_coalesce;
00160 float dx3 = dx2 + gridspacing_coalesce;
00161 float dx4 = dx3 + gridspacing_coalesce;
00162 #if UNROLLX > 4
00163 float dx5 = dx4 + gridspacing_coalesce;
00164 float dx6 = dx5 + gridspacing_coalesce;
00165 float dx7 = dx6 + gridspacing_coalesce;
00166 float dx8 = dx7 + gridspacing_coalesce;
00167 #endif
00168
00169 energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
00170 energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
00171 energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
00172 energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
00173 #if UNROLLX > 4
00174 energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
00175 energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
00176 energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
00177 energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
00178 #endif
00179 }
00180
00181 energygrid[outaddr ] += energyvalx1;
00182 energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
00183 energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
00184 energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
00185 #if UNROLLX > 4
00186 energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
00187 energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
00188 energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
00189 energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
00190 #endif
00191 }
00192
00193
00194
00195
00196 #define TILESIZEX BLOCKSIZEX*UNROLLX
00197 #define TILESIZEY BLOCKSIZEY*UNROLLY
00198 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00199 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00200
00201 static int copyatomstoconstbuf(const float *atoms, float *atompre,
00202 int count, float zplane) {
00203 if (count > MAXATOMS) {
00204 printf("Atom count exceeds constant buffer storage capacity\n");
00205 return -1;
00206 }
00207
00208 int i;
00209 for (i=0; i<count*4; i+=4) {
00210 atompre[i ] = atoms[i ];
00211 atompre[i + 1] = atoms[i + 1];
00212 float dz = zplane - atoms[i + 2];
00213 atompre[i + 2] = dz*dz;
00214 atompre[i + 3] = atoms[i + 3];
00215 }
00216
00217 cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
00218 CUERR
00219
00220 return 0;
00221 }
00222
00223 int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
00224 long int numplane, long int numcol, long int numpt,
00225 float gridspacing) {
00226 enthrparms parms;
00227 wkf_timerhandle globaltimer;
00228 double totalruntime;
00229 int rc=0;
00230
00231 int numprocs = wkf_thread_numprocessors();
00232 int deviceCount = 0;
00233 if (vmd_cuda_num_devices(&deviceCount))
00234 return -1;
00235 if (deviceCount < 1)
00236 return -1;
00237
00238
00239
00240 if (deviceCount < numprocs) {
00241 numprocs = deviceCount;
00242 }
00243
00244 printf("Using %d CUDA GPUs\n", numprocs);
00245 printf("GPU padded grid size: %d x %d x %d\n",
00246 (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00247 (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00248 numplane);
00249
00250 parms.atoms = atoms;
00251 parms.grideners = grideners;
00252 parms.numplane = numplane;
00253 parms.numcol = numcol;
00254 parms.numpt = numpt;
00255 parms.natoms = natoms;
00256 parms.gridspacing = gridspacing;
00257
00258 globaltimer = wkf_timer_create();
00259 wkf_timer_start(globaltimer);
00260
00261
00262 wkf_tasktile_t tile;
00263 tile.start=0;
00264 tile.end=numplane;
00265 rc = wkf_threadlaunch(numprocs, &parms, cudaenergythread, &tile);
00266
00267
00268 wkf_timer_stop(globaltimer);
00269 totalruntime = wkf_timer_time(globaltimer);
00270 wkf_timer_destroy(globaltimer);
00271
00272 if (!rc) {
00273 double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00274 printf(" %g billion atom evals/second, %g GFLOPS\n",
00275 atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00276 } else {
00277 msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
00278 msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00279 }
00280 return rc;
00281 }
00282
00283
00284
00285
00286
00287 static void * cudaenergythread(void *voidparms) {
00288 dim3 volsize, Gsz, Bsz;
00289 float *devenergy = NULL;
00290 float *hostenergy = NULL;
00291 float *atomprebuf = NULL;
00292 enthrparms *parms = NULL;
00293 int threadid=0;
00294 int atomprebufpinned = 0;
00295
00296 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00297 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00298
00299
00300
00301
00302 const float *atoms = parms->atoms;
00303 float* grideners = parms->grideners;
00304 const long int numplane = parms->numplane;
00305 const long int numcol = parms->numcol;
00306 const long int numpt = parms->numpt;
00307 const long int natoms = parms->natoms;
00308 const float gridspacing = parms->gridspacing;
00309 double lasttime, totaltime;
00310
00311 cudaError_t rc;
00312 rc = cudaSetDevice(threadid);
00313 if (rc != cudaSuccess) {
00314 #if CUDART_VERSION >= 2010
00315 rc = cudaGetLastError();
00316 if (rc != cudaErrorSetOnActiveProcess)
00317 return NULL;
00318 #else
00319 cudaGetLastError();
00320
00321 #endif
00322 }
00323
00324
00325 volsize.x = (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00326 volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00327 volsize.z = 1;
00328
00329
00330 Bsz.x = BLOCKSIZEX;
00331 Bsz.y = BLOCKSIZEY;
00332 Bsz.z = 1;
00333 Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00334 Gsz.y = volsize.y / (Bsz.y * UNROLLY);
00335 Gsz.z = 1;
00336 int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00337
00338 printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
00339 wkf_timerhandle timer = wkf_timer_create();
00340 wkf_timer_start(timer);
00341 wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00342
00343
00344
00345 #define DMABUFPADSIZE (32 * 1024)
00346
00347
00348
00349 if (atomprebufpinned) {
00350
00351 if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
00352 printf("Pinned atom copy buffer allocation failed!\n");
00353 atomprebufpinned=0;
00354 }
00355 }
00356
00357
00358
00359 if (!atomprebufpinned) {
00360
00361 atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
00362 if (atomprebuf == NULL) {
00363 printf("Atom copy buffer allocation failed!\n");
00364 return NULL;
00365 }
00366 }
00367
00368
00369
00370 cudaMalloc((void**)&devenergy, volmemsz);
00371 CUERR
00372
00373 hostenergy = (float *) malloc(volmemsz);
00374
00375
00376 int iterations=0;
00377 int computedplanes=0;
00378 wkf_tasktile_t tile;
00379 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00380 int k;
00381 for (k=tile.start; k<tile.end; k++) {
00382 int y;
00383 int atomstart;
00384 float zplane = k * (float) gridspacing;
00385 computedplanes++;
00386
00387
00388 for (y=0; y<numcol; y++) {
00389 long eneraddr = k*numcol*numpt + y*numpt;
00390 memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
00391 }
00392
00393
00394 cudaMemcpy(devenergy, hostenergy, volmemsz, cudaMemcpyHostToDevice);
00395 CUERR
00396
00397 lasttime = wkf_timer_timenow(timer);
00398 for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00399 iterations++;
00400 int runatoms;
00401 int atomsremaining = natoms - atomstart;
00402 if (atomsremaining > MAXATOMS)
00403 runatoms = MAXATOMS;
00404 else
00405 runatoms = atomsremaining;
00406
00407
00408 if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
00409 return NULL;
00410
00411
00412 cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
00413 CUERR
00414 }
00415
00416
00417 cudaMemcpy(hostenergy, devenergy, volmemsz, cudaMemcpyDeviceToHost);
00418 CUERR
00419
00420
00421 for (y=0; y<numcol; y++) {
00422 long eneraddr = k*numcol*numpt + y*numpt;
00423 memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
00424 }
00425
00426 totaltime = wkf_timer_timenow(timer);
00427 if (wkf_msg_timer_timeout(msgt)) {
00428
00429 printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00430 threadid, k, numplane, computedplanes,
00431 totaltime - lasttime, totaltime,
00432 totaltime * numplane / (k+1));
00433 }
00434 }
00435 }
00436
00437 wkf_timer_destroy(timer);
00438 wkf_msg_timer_destroy(msgt);
00439 free(hostenergy);
00440 cudaFree(devenergy);
00441 CUERR
00442
00443
00444 if (atomprebufpinned) {
00445 if (cudaFreeHost(atomprebuf) != cudaSuccess) {
00446 printf("Pinned atom buffer deallocation failed!\n");
00447 return NULL;
00448 }
00449 } else {
00450 free(atomprebuf);
00451 }
00452 return NULL;
00453 }
00454
00455
00456
00457