00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <cuda.h>
00025
00026 #include "Inform.h"
00027 #include "utilities.h"
00028 #include "WKFThreads.h"
00029 #include "WKFUtils.h"
00030 #include "CUDAKernels.h"
00031
00032 typedef struct {
00033 float* atoms;
00034 float* grideners;
00035 long int numplane;
00036 long int numcol;
00037 long int numpt;
00038 long int natoms;
00039 float gridspacing;
00040 } enthrparms;
00041
00042
00043 static void * cudaenergythread(void *);
00044
00045 #if 1
00046 #define CUERR { cudaError_t err; \
00047 if ((err = cudaGetLastError()) != cudaSuccess) { \
00048 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00049 printf("Thread aborting...\n"); \
00050 return NULL; }}
00051 #else
00052 #define CUERR
00053 #endif
00054
00055
00056
00057
00058
00059 #define MAXATOMS 4000
00060 __constant__ static float4 atominfo[MAXATOMS];
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086 #if 1
00087
00088
00089
00090
00091
00092
00093 #define UNROLLX 8
00094 #define UNROLLY 1
00095 #define BLOCKSIZEX 8 // make large enough to allow coalesced global mem ops
00096 #define BLOCKSIZEY 8 // make as small as possible for finer granularity
00097
00098 #else
00099
00100
00101
00102
00103
00104
00105 #define UNROLLX 4
00106 #define UNROLLY 1
00107 #define BLOCKSIZEX 16 // make large enough to allow coalesced global mem ops
00108 #define BLOCKSIZEY 16 // make as small as possible for finer granularity
00109
00110 #endif
00111
00112 #define BLOCKSIZE (BLOCKSIZEX*BLOCKSIZEY)
00113
00114
00115 #if UNROLLX == 8
00116 #define FLOPSPERATOMEVAL (59.0/8.0)
00117 #elif UNROLLX == 4
00118 #define FLOPSPERATOMEVAL (31.0/4.0)
00119 #endif
00120
00121 __global__ static void cenergy(int numatoms, float gridspacing, float * energygrid) {
00122 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00123 + threadIdx.x;
00124 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00125 unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00126 + xindex;
00127
00128 float coory = gridspacing * yindex;
00129 float coorx = gridspacing * xindex;
00130
00131 float energyvalx1=0.0f;
00132 float energyvalx2=0.0f;
00133 float energyvalx3=0.0f;
00134 float energyvalx4=0.0f;
00135 #if UNROLLX > 4
00136 float energyvalx5=0.0f;
00137 float energyvalx6=0.0f;
00138 float energyvalx7=0.0f;
00139 float energyvalx8=0.0f;
00140 #endif
00141
00142 float gridspacing_coalesce = gridspacing * BLOCKSIZEX;
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153 int atomid;
00154 for (atomid=0; atomid<numatoms; atomid++) {
00155 float dy = coory - atominfo[atomid].y;
00156 float dyz2 = (dy * dy) + atominfo[atomid].z;
00157
00158 float dx1 = coorx - atominfo[atomid].x;
00159 float dx2 = dx1 + gridspacing_coalesce;
00160 float dx3 = dx2 + gridspacing_coalesce;
00161 float dx4 = dx3 + gridspacing_coalesce;
00162 #if UNROLLX > 4
00163 float dx5 = dx4 + gridspacing_coalesce;
00164 float dx6 = dx5 + gridspacing_coalesce;
00165 float dx7 = dx6 + gridspacing_coalesce;
00166 float dx8 = dx7 + gridspacing_coalesce;
00167 #endif
00168
00169 energyvalx1 += atominfo[atomid].w * rsqrtf(dx1*dx1 + dyz2);
00170 energyvalx2 += atominfo[atomid].w * rsqrtf(dx2*dx2 + dyz2);
00171 energyvalx3 += atominfo[atomid].w * rsqrtf(dx3*dx3 + dyz2);
00172 energyvalx4 += atominfo[atomid].w * rsqrtf(dx4*dx4 + dyz2);
00173 #if UNROLLX > 4
00174 energyvalx5 += atominfo[atomid].w * rsqrtf(dx5*dx5 + dyz2);
00175 energyvalx6 += atominfo[atomid].w * rsqrtf(dx6*dx6 + dyz2);
00176 energyvalx7 += atominfo[atomid].w * rsqrtf(dx7*dx7 + dyz2);
00177 energyvalx8 += atominfo[atomid].w * rsqrtf(dx8*dx8 + dyz2);
00178 #endif
00179 }
00180
00181 energygrid[outaddr ] += energyvalx1;
00182 energygrid[outaddr+1*BLOCKSIZEX] += energyvalx2;
00183 energygrid[outaddr+2*BLOCKSIZEX] += energyvalx3;
00184 energygrid[outaddr+3*BLOCKSIZEX] += energyvalx4;
00185 #if UNROLLX > 4
00186 energygrid[outaddr+4*BLOCKSIZEX] += energyvalx5;
00187 energygrid[outaddr+5*BLOCKSIZEX] += energyvalx6;
00188 energygrid[outaddr+6*BLOCKSIZEX] += energyvalx7;
00189 energygrid[outaddr+7*BLOCKSIZEX] += energyvalx8;
00190 #endif
00191 }
00192
00193
00194
00195
00196 #define TILESIZEX BLOCKSIZEX*UNROLLX
00197 #define TILESIZEY BLOCKSIZEY*UNROLLY
00198 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00199 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00200
00201 static int copyatomstoconstbuf(const float *atoms, float *atompre,
00202 int count, float zplane) {
00203 if (count > MAXATOMS) {
00204 printf("Atom count exceeds constant buffer storage capacity\n");
00205 return -1;
00206 }
00207
00208 int i;
00209 for (i=0; i<count*4; i+=4) {
00210 atompre[i ] = atoms[i ];
00211 atompre[i + 1] = atoms[i + 1];
00212 float dz = zplane - atoms[i + 2];
00213 atompre[i + 2] = dz*dz;
00214 atompre[i + 3] = atoms[i + 3];
00215 }
00216
00217 cudaMemcpyToSymbol(atominfo, atompre, count * 4 * sizeof(float), 0);
00218
00219 return 0;
00220 }
00221
00222 int vmd_cuda_vol_cpotential(long int natoms, float* atoms, float* grideners,
00223 long int numplane, long int numcol, long int numpt,
00224 float gridspacing) {
00225 enthrparms parms;
00226 wkf_timerhandle globaltimer;
00227 double totalruntime;
00228 int rc=0;
00229
00230 int numprocs = wkf_thread_numprocessors();
00231 int deviceCount = 0;
00232 if (vmd_cuda_num_devices(&deviceCount))
00233 return -1;
00234 if (deviceCount < 1)
00235 return -1;
00236
00237
00238
00239 if (deviceCount < numprocs) {
00240 numprocs = deviceCount;
00241 }
00242
00243 printf("Using %d CUDA GPUs\n", numprocs);
00244 printf("GPU padded grid size: %d x %d x %d\n",
00245 (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK),
00246 (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK),
00247 numplane);
00248
00249 parms.atoms = atoms;
00250 parms.grideners = grideners;
00251 parms.numplane = numplane;
00252 parms.numcol = numcol;
00253 parms.numpt = numpt;
00254 parms.natoms = natoms;
00255 parms.gridspacing = gridspacing;
00256
00257 globaltimer = wkf_timer_create();
00258 wkf_timer_start(globaltimer);
00259
00260
00261 wkf_tasktile_t tile;
00262 tile.start=0;
00263 tile.end=numplane;
00264 rc = wkf_threadlaunch(numprocs, &parms, cudaenergythread, &tile);
00265
00266
00267 wkf_timer_stop(globaltimer);
00268 totalruntime = wkf_timer_time(globaltimer);
00269 wkf_timer_destroy(globaltimer);
00270
00271 if (!rc) {
00272 double atomevalssec = ((double) numplane * numcol * numpt * natoms) / (totalruntime * 1000000000.0);
00273 printf(" %g billion atom evals/second, %g GFLOPS\n",
00274 atomevalssec, atomevalssec * FLOPSPERATOMEVAL);
00275 } else {
00276 msgWarn << "A GPU encountered an unrecoverable error." << sendmsg;
00277 msgWarn << "Calculation will continue using the main CPU." << sendmsg;
00278 }
00279 return rc;
00280 }
00281
00282
00283
00284
00285
00286 static void * cudaenergythread(void *voidparms) {
00287 dim3 volsize, Gsz, Bsz;
00288 float *devenergy = NULL;
00289 float *hostenergy = NULL;
00290 float *atomprebuf = NULL;
00291 enthrparms *parms = NULL;
00292 int threadid=0;
00293 int atomprebufpinned = 0;
00294
00295 wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00296 wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00297
00298
00299
00300
00301 const float *atoms = parms->atoms;
00302 float* grideners = parms->grideners;
00303 const long int numplane = parms->numplane;
00304 const long int numcol = parms->numcol;
00305 const long int numpt = parms->numpt;
00306 const long int natoms = parms->natoms;
00307 const float gridspacing = parms->gridspacing;
00308 double lasttime, totaltime;
00309
00310 cudaError_t rc;
00311 rc = cudaSetDevice(threadid);
00312 if (rc != cudaSuccess) {
00313 #if CUDART_VERSION >= 2010
00314 rc = cudaGetLastError();
00315 if (rc != cudaErrorSetOnActiveProcess)
00316 return NULL;
00317 #else
00318 cudaGetLastError();
00319
00320 #endif
00321 }
00322
00323
00324 volsize.x = (numpt + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00325 volsize.y = (numcol + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00326 volsize.z = 1;
00327
00328
00329 Bsz.x = BLOCKSIZEX;
00330 Bsz.y = BLOCKSIZEY;
00331 Bsz.z = 1;
00332 Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00333 Gsz.y = volsize.y / (Bsz.y * UNROLLY);
00334 Gsz.z = 1;
00335 int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00336
00337 printf("Thread %d started for CUDA device %d...\n", threadid, threadid);
00338 wkf_timerhandle timer = wkf_timer_create();
00339 wkf_timer_start(timer);
00340 wkfmsgtimer * msgt = wkf_msg_timer_create(5);
00341
00342
00343
00344 #define DMABUFPADSIZE (32 * 1024)
00345
00346
00347
00348 if (atomprebufpinned) {
00349
00350 if (cudaMallocHost((void**) &atomprebuf, MAXATOMS*4*sizeof(float) + DMABUFPADSIZE) != cudaSuccess) {
00351 printf("Pinned atom copy buffer allocation failed!\n");
00352 atomprebufpinned=0;
00353 }
00354 }
00355
00356
00357
00358 if (!atomprebufpinned) {
00359
00360 atomprebuf = (float *) malloc(MAXATOMS * 4 * sizeof(float) + DMABUFPADSIZE);
00361 if (atomprebuf == NULL) {
00362 printf("Atom copy buffer allocation failed!\n");
00363 return NULL;
00364 }
00365 }
00366
00367
00368
00369 cudaMalloc((void**)&devenergy, volmemsz);
00370 CUERR
00371
00372 hostenergy = (float *) malloc(volmemsz);
00373
00374
00375 int iterations=0;
00376 int computedplanes=0;
00377 wkf_tasktile_t tile;
00378 while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00379 int k;
00380 for (k=tile.start; k<tile.end; k++) {
00381 int y;
00382 int atomstart;
00383 float zplane = k * (float) gridspacing;
00384 computedplanes++;
00385
00386
00387 for (y=0; y<numcol; y++) {
00388 long eneraddr = k*numcol*numpt + y*numpt;
00389 memcpy(&hostenergy[y*volsize.x], &grideners[eneraddr], numpt * sizeof(float));
00390 }
00391
00392
00393 cudaMemcpy(devenergy, hostenergy, volmemsz, cudaMemcpyHostToDevice);
00394 CUERR
00395
00396 lasttime = wkf_timer_timenow(timer);
00397 for (atomstart=0; atomstart<natoms; atomstart+=MAXATOMS) {
00398 iterations++;
00399 int runatoms;
00400 int atomsremaining = natoms - atomstart;
00401 if (atomsremaining > MAXATOMS)
00402 runatoms = MAXATOMS;
00403 else
00404 runatoms = atomsremaining;
00405
00406
00407 if (copyatomstoconstbuf(atoms + 4*atomstart, atomprebuf, runatoms, zplane))
00408 return NULL;
00409
00410
00411 cenergy<<<Gsz, Bsz, 0>>>(runatoms, gridspacing, devenergy);
00412 CUERR
00413 }
00414
00415
00416 cudaMemcpy(hostenergy, devenergy, volmemsz, cudaMemcpyDeviceToHost);
00417 CUERR
00418
00419
00420 for (y=0; y<numcol; y++) {
00421 long eneraddr = k*numcol*numpt + y*numpt;
00422 memcpy(&grideners[eneraddr], &hostenergy[y*volsize.x], numpt * sizeof(float));
00423 }
00424
00425 totaltime = wkf_timer_timenow(timer);
00426 if (wkf_msg_timer_timeout(msgt)) {
00427
00428 printf("thread[%d] plane %d/%ld (%d computed) time %.2f, elapsed %.1f, est. total: %.1f\n",
00429 threadid, k, numplane, computedplanes,
00430 totaltime - lasttime, totaltime,
00431 totaltime * numplane / (k+1));
00432 }
00433 }
00434 }
00435
00436 wkf_timer_destroy(timer);
00437 wkf_msg_timer_destroy(msgt);
00438 free(hostenergy);
00439 cudaFree(devenergy);
00440 CUERR
00441
00442
00443 if (atomprebufpinned) {
00444 if (cudaFreeHost(atomprebuf) != cudaSuccess) {
00445 printf("Pinned atom buffer deallocation failed!\n");
00446 return NULL;
00447 }
00448 } else {
00449 free(atomprebuf);
00450 }
00451 return NULL;
00452 }
00453
00454
00455
00456