00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <string.h>
00025 #include <math.h>
00026 #include <cuda.h>
00027
00028 #include "WKFThreads.h"
00029 #include "CUDAKernels.h"
00030 #include "OrbitalJIT.h"
00031
00032
00033 #if CUDART_VERSION >= 2030
00034 #define USE_PINNED_MEMORY 1
00035
00036
00037 #endif
00038
00039 #if 1
00040 #define CUERR { cudaError_t err; \
00041 if ((err = cudaGetLastError()) != cudaSuccess) { \
00042 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00043 printf("Thread aborting...\n"); \
00044 return NULL; }}
00045 #else
00046 #define CUERR
00047 #endif
00048
00049 #define ANGS_TO_BOHR 1.8897259877218677f
00050
00051 typedef union flint_t {
00052 float f;
00053 int i;
00054 } flint;
00055
00056 typedef struct {
00057 int numatoms;
00058 const float *wave_f;
00059 int num_wave_f;
00060 const float *basis_array;
00061 int num_basis;
00062 const float *atompos;
00063 const int *atom_basis;
00064 const int *num_shells_per_atom;
00065 const int *num_prim_per_shell;
00066 const int *shell_types;
00067 int num_shells;
00068 const int *numvoxels;
00069 float voxelsize;
00070 const float *origin;
00071 int density;
00072 float *orbitalgrid;
00073 } orbthrparms;
00074
00075
00076 static void * cudaorbitalthread(void *);
00077
00078
00079 #define UNROLLX 1
00080 #define UNROLLY 1
00081 #define BLOCKSIZEX 8
00082 #define BLOCKSIZEY 8
00083 #define BLOCKSIZE BLOCKSIZEX * BLOCKSIZEY
00084
00085
00086 #define TILESIZEX BLOCKSIZEX*UNROLLX
00087 #define TILESIZEY BLOCKSIZEY*UNROLLY
00088 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00089 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00090
00091 #define MEMCOALESCE 384
00092
00093
00094 #define S_SHELL 0
00095 #define P_SHELL 1
00096 #define D_SHELL 2
00097 #define F_SHELL 3
00098 #define G_SHELL 4
00099 #define H_SHELL 5
00100
00101
00102
00103
00104 #define MAX_ATOM_SZ 256
00105
00106 #define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
00107 __constant__ static float const_atompos[MAX_ATOMPOS_SZ * 3];
00108
00109 #define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
00110 __constant__ static int const_atom_basis[MAX_ATOM_BASIS_SZ];
00111
00112 #define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
00113 __constant__ static int const_num_shells_per_atom[MAX_ATOMSHELL_SZ];
00114
00115 #define MAX_BASIS_SZ 6144
00116 __constant__ static float const_basis_array[MAX_BASIS_SZ];
00117
00118 #define MAX_SHELL_SZ 1024
00119 __constant__ static int const_num_prim_per_shell[MAX_SHELL_SZ];
00120 __constant__ static int const_shell_types[MAX_SHELL_SZ];
00121
00122 #define MAX_WAVEF_SZ 6144
00123 __constant__ static float const_wave_f[MAX_WAVEF_SZ];
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136 #if defined(VMDMOJIT) && defined(VMDMOJITSRC)
00137 #include VMDMOJITSRC
00138 #endif
00139
00140
00141
00142
00143
00144
00145 __global__ static void cuorbitalconstmem(int numatoms,
00146 float voxelsize,
00147 float originx,
00148 float originy,
00149 float grid_z,
00150 int density,
00151 float * orbitalgrid) {
00152 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00153 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00154 unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00155 float grid_x = originx + voxelsize * xindex;
00156 float grid_y = originy + voxelsize * yindex;
00157
00158
00159 int at;
00160 int prim, shell;
00161
00162
00163 float value = 0.0f;
00164
00165
00166 int ifunc = 0;
00167 int shell_counter = 0;
00168
00169
00170 for (at = 0; at < numatoms; at++) {
00171
00172 int maxshell = const_num_shells_per_atom[at];
00173 int prim_counter = const_atom_basis[at];
00174
00175 float xdist = (grid_x - const_atompos[3*at ])*ANGS_TO_BOHR;
00176 float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;
00177 float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;
00178
00179 float xdist2 = xdist*xdist;
00180 float ydist2 = ydist*ydist;
00181 float zdist2 = zdist*zdist;
00182
00183 float dist2 = xdist2 + ydist2 + zdist2;
00184
00185
00186 for (shell=0; shell < maxshell; shell++) {
00187 float contracted_gto = 0.0f;
00188
00189
00190
00191 int maxprim = const_num_prim_per_shell[shell_counter];
00192 int shelltype = const_shell_types[shell_counter];
00193 for (prim=0; prim < maxprim; prim++) {
00194 float exponent = const_basis_array[prim_counter ];
00195 float contract_coeff = const_basis_array[prim_counter + 1];
00196
00197
00198
00199
00200 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00201 prim_counter += 2;
00202 }
00203
00204
00205 float tmpshell=0.0f;
00206 switch (shelltype) {
00207 case S_SHELL:
00208 value += const_wave_f[ifunc++] * contracted_gto;
00209 break;
00210
00211 case P_SHELL:
00212 tmpshell += const_wave_f[ifunc++] * xdist;
00213 tmpshell += const_wave_f[ifunc++] * ydist;
00214 tmpshell += const_wave_f[ifunc++] * zdist;
00215 value += tmpshell * contracted_gto;
00216 break;
00217
00218 case D_SHELL:
00219 tmpshell += const_wave_f[ifunc++] * xdist2;
00220 tmpshell += const_wave_f[ifunc++] * xdist * ydist;
00221 tmpshell += const_wave_f[ifunc++] * ydist2;
00222 tmpshell += const_wave_f[ifunc++] * xdist * zdist;
00223 tmpshell += const_wave_f[ifunc++] * ydist * zdist;
00224 tmpshell += const_wave_f[ifunc++] * zdist2;
00225 value += tmpshell * contracted_gto;
00226 break;
00227
00228 case F_SHELL:
00229 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;
00230 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;
00231 tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;
00232 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;
00233 tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;
00234 tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;
00235 tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;
00236 tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;
00237 tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;
00238 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;
00239 value += tmpshell * contracted_gto;
00240 break;
00241
00242 case G_SHELL:
00243 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2;
00244 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;
00245 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2;
00246 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;
00247 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2;
00248 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist;
00249 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist;
00250 tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist;
00251 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist;
00252 tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2;
00253 tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist;
00254 tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2;
00255 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist;
00256 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist;
00257 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2;
00258 value += tmpshell * contracted_gto;
00259 break;
00260 }
00261
00262 shell_counter++;
00263 }
00264 }
00265
00266
00267 if (density) {
00268 orbitalgrid[outaddr] = copysignf(value*value, value);
00269 } else {
00270 orbitalgrid[outaddr] = value;
00271 }
00272 }
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292 #define MAXSHELLCOUNT 15
00293
00294
00295
00296
00297
00298 #define MEMCOAMASK (~15)
00299
00300
00301
00302
00303
00304
00305 #define SHAREDSIZE 256
00306
00307
00308 __global__ static void cuorbitaltiledshared(int numatoms,
00309 const float *wave_f,
00310 const float *basis_array,
00311 const flint *atominfo,
00312 const int *shellinfo,
00313 float voxelsize,
00314 float originx,
00315 float originy,
00316 float grid_z,
00317 int density,
00318 float * orbitalgrid) {
00319 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
00320 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00321 unsigned int outaddr = __umul24(gridDim.x, blockDim.x) * yindex + xindex;
00322 float grid_x = originx + voxelsize * xindex;
00323 float grid_y = originy + voxelsize * yindex;
00324
00325 int sidx = __mul24(threadIdx.y, blockDim.x) + threadIdx.x;
00326
00327 __shared__ float s_wave_f[SHAREDSIZE];
00328 int sblock_wave_f = 0;
00329 s_wave_f[sidx ] = wave_f[sidx ];
00330 s_wave_f[sidx + 64] = wave_f[sidx + 64];
00331 s_wave_f[sidx + 128] = wave_f[sidx + 128];
00332 s_wave_f[sidx + 192] = wave_f[sidx + 192];
00333 __syncthreads();
00334
00335
00336 int at;
00337 int prim, shell;
00338
00339
00340 float value = 0.0f;
00341
00342
00343 int ifunc = 0;
00344 int shell_counter = 0;
00345 int sblock_prim_counter = -1;
00346
00347 for (at = 0; at < numatoms; at++) {
00348 __shared__ flint s_atominfo[5];
00349 __shared__ float s_basis_array[SHAREDSIZE];
00350
00351 __syncthreads();
00352 if (sidx < 5)
00353 s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i;
00354 __syncthreads();
00355
00356 int prim_counter = s_atominfo[3].i;
00357 int maxshell = s_atominfo[4].i;
00358 int new_sblock_prim_counter = prim_counter & MEMCOAMASK;
00359 if (sblock_prim_counter != new_sblock_prim_counter) {
00360 sblock_prim_counter = new_sblock_prim_counter;
00361 s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ];
00362 s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64];
00363 s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00364 s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00365 prim_counter -= sblock_prim_counter;
00366 __syncthreads();
00367 }
00368
00369
00370 float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR;
00371 float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR;
00372 float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR;
00373
00374 float xdist2 = xdist*xdist;
00375 float ydist2 = ydist*ydist;
00376 float zdist2 = zdist*zdist;
00377
00378 float dist2 = xdist2 + ydist2 + zdist2;
00379
00380
00381 for (shell=0; shell < maxshell; shell++) {
00382 float contracted_gto = 0.0f;
00383
00384
00385
00386 __shared__ int s_shellinfo[2];
00387
00388 __syncthreads();
00389 if (sidx < 2)
00390 s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx];
00391 __syncthreads();
00392
00393 int maxprim = s_shellinfo[0];
00394 int shelltype = s_shellinfo[1];
00395
00396 if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) {
00397 prim_counter += sblock_prim_counter;
00398 sblock_prim_counter = prim_counter & MEMCOAMASK;
00399 s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ];
00400 s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64];
00401 s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00402 s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00403 prim_counter -= sblock_prim_counter;
00404 __syncthreads();
00405 }
00406 for (prim=0; prim < maxprim; prim++) {
00407 float exponent = s_basis_array[prim_counter ];
00408 float contract_coeff = s_basis_array[prim_counter + 1];
00409
00410
00411
00412
00413 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00414
00415 prim_counter += 2;
00416 }
00417
00418
00419
00420
00421 if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) {
00422 ifunc += sblock_wave_f;
00423 sblock_wave_f = ifunc & MEMCOAMASK;
00424 __syncthreads();
00425 s_wave_f[sidx ] = wave_f[sblock_wave_f + sidx ];
00426 s_wave_f[sidx + 64] = wave_f[sblock_wave_f + sidx + 64];
00427 s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128];
00428 s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192];
00429 __syncthreads();
00430 ifunc -= sblock_wave_f;
00431 }
00432
00433
00434 float tmpshell=0.0f;
00435 switch (shelltype) {
00436 case S_SHELL:
00437 value += s_wave_f[ifunc++] * contracted_gto;
00438 break;
00439
00440 case P_SHELL:
00441 tmpshell += s_wave_f[ifunc++] * xdist;
00442 tmpshell += s_wave_f[ifunc++] * ydist;
00443 tmpshell += s_wave_f[ifunc++] * zdist;
00444 value += tmpshell * contracted_gto;
00445 break;
00446
00447 case D_SHELL:
00448 tmpshell += s_wave_f[ifunc++] * xdist2;
00449 tmpshell += s_wave_f[ifunc++] * xdist * ydist;
00450 tmpshell += s_wave_f[ifunc++] * ydist2;
00451 tmpshell += s_wave_f[ifunc++] * xdist * zdist;
00452 tmpshell += s_wave_f[ifunc++] * ydist * zdist;
00453 tmpshell += s_wave_f[ifunc++] * zdist2;
00454 value += tmpshell * contracted_gto;
00455 break;
00456
00457 case F_SHELL:
00458 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist;
00459 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist;
00460 tmpshell += s_wave_f[ifunc++] * ydist2 * xdist;
00461 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist;
00462 tmpshell += s_wave_f[ifunc++] * xdist2 * zdist;
00463 tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist;
00464 tmpshell += s_wave_f[ifunc++] * ydist2 * zdist;
00465 tmpshell += s_wave_f[ifunc++] * zdist2 * xdist;
00466 tmpshell += s_wave_f[ifunc++] * zdist2 * ydist;
00467 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist;
00468 value += tmpshell * contracted_gto;
00469 break;
00470
00471 case G_SHELL:
00472 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2;
00473 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist;
00474 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2;
00475 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist;
00476 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2;
00477 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist;
00478 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist;
00479 tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist;
00480 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist;
00481 tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2;
00482 tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist;
00483 tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2;
00484 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist;
00485 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist;
00486 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2;
00487 value += tmpshell * contracted_gto;
00488 break;
00489 }
00490
00491 shell_counter++;
00492 }
00493 }
00494
00495
00496 if (density) {
00497 orbitalgrid[outaddr] = copysignf(value*value, value);
00498 } else {
00499 orbitalgrid[outaddr] = value;
00500 }
00501 }
00502
00503
00504
00505
00506
00507
00508
00509
00510 __global__ static void cuorbitalcachedglobmem(int numatoms,
00511 const float *wave_f,
00512 const float *basis_array,
00513 const flint *atominfo,
00514 const int *shellinfo,
00515 float voxelsize,
00516 float originx,
00517 float originy,
00518 float grid_z,
00519 int density,
00520 float * orbitalgrid) {
00521 unsigned int xindex = __umul24(blockIdx.x, blockDim.x) * UNROLLX
00522 + threadIdx.x;
00523 unsigned int yindex = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
00524 unsigned int outaddr = (__umul24(gridDim.x, blockDim.x) * UNROLLX) * yindex
00525 + xindex;
00526 float grid_x = originx + voxelsize * xindex;
00527 float grid_y = originy + voxelsize * yindex;
00528
00529
00530 int at;
00531 int prim, shell;
00532
00533
00534 float value = 0.0f;
00535
00536
00537 int ifunc = 0;
00538 int shell_counter = 0;
00539
00540
00541 for (at = 0; at < numatoms; at++) {
00542
00543 int atidx = at << 4;
00544 float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00545 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00546 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00547
00548 int prim_counter = atominfo[atidx + 3].i;
00549 int maxshell = atominfo[atidx + 4].i;
00550
00551 float xdist2 = xdist*xdist;
00552 float ydist2 = ydist*ydist;
00553 float zdist2 = zdist*zdist;
00554
00555 float dist2 = xdist2 + ydist2 + zdist2;
00556
00557
00558 for (shell=0; shell < maxshell; shell++) {
00559 float contracted_gto = 0.0f;
00560
00561 int maxprim = shellinfo[(shell_counter<<4) ];
00562 int shell_type = shellinfo[(shell_counter<<4) + 1];
00563 for (prim=0; prim < maxprim; prim++) {
00564 float exponent = basis_array[prim_counter ];
00565 float contract_coeff = basis_array[prim_counter + 1];
00566
00567
00568
00569
00570 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00571 prim_counter += 2;
00572 }
00573
00574
00575 float tmpshell=0;
00576 switch (shell_type) {
00577 case S_SHELL:
00578 value += wave_f[ifunc++] * contracted_gto;
00579 break;
00580
00581 case P_SHELL:
00582 tmpshell += wave_f[ifunc++] * xdist;
00583 tmpshell += wave_f[ifunc++] * ydist;
00584 tmpshell += wave_f[ifunc++] * zdist;
00585 value += tmpshell * contracted_gto;
00586 break;
00587
00588 case D_SHELL:
00589 tmpshell += wave_f[ifunc++] * xdist2;
00590 tmpshell += wave_f[ifunc++] * xdist * ydist;
00591 tmpshell += wave_f[ifunc++] * ydist2;
00592 tmpshell += wave_f[ifunc++] * xdist * zdist;
00593 tmpshell += wave_f[ifunc++] * ydist * zdist;
00594 tmpshell += wave_f[ifunc++] * zdist2;
00595 value += tmpshell * contracted_gto;
00596 break;
00597
00598 case F_SHELL:
00599 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00600 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00601 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00602 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00603 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00604 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00605 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00606 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00607 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00608 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00609 value += tmpshell * contracted_gto;
00610 break;
00611
00612 case G_SHELL:
00613 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
00614 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
00615 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
00616 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
00617 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
00618 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
00619 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
00620 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
00621 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
00622 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
00623 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
00624 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
00625 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
00626 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
00627 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
00628 value += tmpshell * contracted_gto;
00629 break;
00630 }
00631
00632 shell_counter++;
00633 }
00634 }
00635
00636
00637 if (density) {
00638 orbitalgrid[outaddr] = copysignf(value*value, value);
00639 } else {
00640 orbitalgrid[outaddr] = value;
00641 }
00642 }
00643
00644
00645
00646
00647
00648
00649 static int computepaddedsize(int orig, int tilesize) {
00650 int alignmask = tilesize - 1;
00651 int paddedsz = (orig + alignmask) & ~alignmask;
00652
00653 return paddedsz;
00654 }
00655
00656 static void * cudaorbitalthread(void *voidparms) {
00657 dim3 volsize, Gsz, Bsz;
00658 float *d_wave_f = NULL;
00659 float *d_basis_array = NULL;
00660 flint *d_atominfo = NULL;
00661 int *d_shellinfo = NULL;
00662 float *d_origin = NULL;
00663 int *d_numvoxels = NULL;
00664 float *d_orbitalgrid = NULL;
00665 float *h_orbitalgrid = NULL;
00666 float *h_basis_array_exp2f = NULL;
00667 int numvoxels[3];
00668 float origin[3];
00669 orbthrparms *parms = NULL;
00670 int usefastconstkernel=0;
00671 int threadid=0;
00672 #if defined(USE_PINNED_MEMORY)
00673 int h_orbitalgrid_pinnedalloc=0;
00674 int h_orbitalgrid_zerocopy=0;
00675 #endif
00676 int tilesize = 1;
00677
00678 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00679 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00680
00681
00682 tilesize=4;
00683 wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
00684
00685 numvoxels[0] = parms->numvoxels[0];
00686 numvoxels[1] = parms->numvoxels[1];
00687 numvoxels[2] = 1;
00688
00689 origin[0] = parms->origin[0];
00690 origin[1] = parms->origin[1];
00691
00692
00693 volsize.x = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
00694 volsize.y = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
00695 volsize.z = 1;
00696
00697
00698 Bsz.x = BLOCKSIZEX;
00699 Bsz.y = BLOCKSIZEY;
00700 Bsz.z = 1;
00701 Gsz.x = volsize.x / (Bsz.x * UNROLLX);
00702 Gsz.y = volsize.y / (Bsz.y * UNROLLY);
00703 Gsz.z = 1;
00704 int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
00705
00706
00707
00708 if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
00709 (parms->numatoms < MAX_ATOM_SZ) &&
00710 (parms->numatoms < MAX_ATOMSHELL_SZ) &&
00711 (2*parms->num_basis < MAX_BASIS_SZ) &&
00712 (parms->num_shells < MAX_SHELL_SZ)) {
00713 usefastconstkernel=1;
00714 }
00715
00716
00717 if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
00718 usefastconstkernel=0;
00719 }
00720
00721
00722 int padsz;
00723 padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
00724 h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
00725 float log2e = log2(2.718281828);
00726 for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
00727 #if 1
00728
00729 h_basis_array_exp2f[ll ] = parms->basis_array[ll ] * log2e;
00730 #else
00731 h_basis_array_exp2f[ll ] = parms->basis_array[ll ];
00732 #endif
00733 h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
00734 }
00735
00736 if (usefastconstkernel) {
00737 cudaMemcpyToSymbol(const_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), 0);
00738 cudaMemcpyToSymbol(const_atompos, parms->atompos, 3 * parms->numatoms * sizeof(float), 0);
00739 cudaMemcpyToSymbol(const_atom_basis, parms->atom_basis, parms->numatoms * sizeof(int), 0);
00740 cudaMemcpyToSymbol(const_num_shells_per_atom, parms->num_shells_per_atom, parms->numatoms * sizeof(int), 0);
00741 cudaMemcpyToSymbol(const_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), 0);
00742 cudaMemcpyToSymbol(const_num_prim_per_shell, parms->num_prim_per_shell, parms->num_shells * sizeof(int), 0);
00743 cudaMemcpyToSymbol(const_shell_types, parms->shell_types, parms->num_shells * sizeof(int), 0);
00744 } else {
00745 padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
00746 cudaMalloc((void**)&d_wave_f, padsz * sizeof(float));
00747 cudaMemcpy(d_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), cudaMemcpyHostToDevice);
00748
00749
00750 padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
00751 flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
00752 cudaMalloc((void**)&d_atominfo, padsz * sizeof(flint));
00753 for (int ll=0; ll<parms->numatoms; ll++) {
00754 int addr = ll * 16;
00755 h_atominfo[addr ].f = parms->atompos[ll*3 ];
00756 h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
00757 h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
00758 h_atominfo[addr + 3].i = parms->atom_basis[ll];
00759 h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
00760 }
00761 cudaMemcpy(d_atominfo, h_atominfo, padsz * sizeof(flint), cudaMemcpyHostToDevice);
00762 free(h_atominfo);
00763
00764 padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
00765 int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
00766 cudaMalloc((void**)&d_shellinfo, padsz * sizeof(int));
00767 for (int ll=0; ll<parms->num_shells; ll++) {
00768 h_shellinfo[ll*16 ] = parms->num_prim_per_shell[ll];
00769 h_shellinfo[ll*16 + 1] = parms->shell_types[ll];
00770 }
00771 cudaMemcpy(d_shellinfo, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
00772 free(h_shellinfo);
00773
00774 cudaMalloc((void**)&d_basis_array, padsz * sizeof(float));
00775 cudaMemcpy(d_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), cudaMemcpyHostToDevice);
00776
00777 padsz = computepaddedsize(3, MEMCOALESCE);
00778 cudaMalloc((void**)&d_origin, padsz * sizeof(float));
00779 cudaMemcpy(d_origin, origin, 3 * sizeof(float), cudaMemcpyHostToDevice);
00780
00781 cudaMalloc((void**)&d_numvoxels, padsz * sizeof(int));
00782 cudaMemcpy(d_numvoxels, numvoxels, 3 * sizeof(int), cudaMemcpyHostToDevice);
00783 }
00784
00785
00786 #if defined(USE_ZERO_COPY)
00787 if ((getenv("VMDCUDANOZEROCOPY") == NULL) &&
00788 (cudaHostAlloc((void **) &h_orbitalgrid, volmemsz, cudaHostAllocMapped) == cudaSuccess)) {
00789 h_orbitalgrid_zerocopy=1;
00790 cudaHostGetDevicePointer(&d_orbitalgrid, h_orbitalgrid, 0);
00791 CUERR
00792 } else {
00793 printf("WARNING: CUDA zero-copy pinned memory allocation failed!\n");
00794 #else
00795
00796 cudaMalloc((void**)&d_orbitalgrid, volmemsz);
00797 CUERR
00798 #endif
00799 #if defined(USE_PINNED_MEMORY)
00800
00801 if ((getenv("VMDCUDANOPINNEDMEMORY") == NULL) &&
00802 (cudaMallocHost((void **) &h_orbitalgrid, volmemsz) == cudaSuccess)) {
00803 h_orbitalgrid_pinnedalloc=1;
00804 } else {
00805 printf("WARNING: CUDA pinned memory allocation failed!\n");
00806 h_orbitalgrid_pinnedalloc=0;
00807 h_orbitalgrid = (float *) malloc(volmemsz);
00808 }
00809 #else
00810
00811 h_orbitalgrid = (float *) malloc(volmemsz);
00812 #endif
00813 #if defined(USE_ZERO_COPY)
00814 }
00815 #endif
00816
00817 #if 0
00818 if (threadid == 0) {
00819
00820 printf("atoms[%d] ", parms->numatoms);
00821 printf("wavef[%d] ", parms->num_wave_f);
00822 printf("basis[%d] ", parms->num_basis);
00823 printf("shell[%d] ", parms->num_shells);
00824 if (usefastconstkernel) {
00825 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
00826 printf("GPU constant memory (JIT)");
00827 #else
00828 printf("GPU constant memory");
00829 #endif
00830 } else {
00831 printf("GPU tiled shared memory:");
00832 }
00833 printf(" Gsz:%dx%d\n", Gsz.x, Gsz.y);
00834 }
00835 #endif
00836
00837
00838 wkf_tasktile_t tile;
00839 int planesize = numvoxels[0] * numvoxels[1];
00840
00841 while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
00842 int k;
00843 for (k=tile.start; k<tile.end; k++) {
00844 origin[2] = parms->origin[2] + parms->voxelsize * k;
00845
00846
00847
00848 if (usefastconstkernel) {
00849 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
00850 cuorbitalconstmem_jit<<<Gsz, Bsz, 0>>>(parms->numatoms,
00851 parms->voxelsize,
00852 origin[0],
00853 origin[1],
00854 origin[2],
00855 parms->density,
00856 d_orbitalgrid);
00857 #else
00858 cuorbitalconstmem<<<Gsz, Bsz, 0>>>(parms->numatoms,
00859 parms->voxelsize,
00860 origin[0],
00861 origin[1],
00862 origin[2],
00863 parms->density,
00864 d_orbitalgrid);
00865 #endif
00866 } else {
00867 #if defined(USE_FERMI_CACHE)
00868
00869
00870 cuorbitalcachedglobmem<<<Gsz, Bsz, 0>>>(parms->numatoms,
00871 d_wave_f,
00872 d_basis_array,
00873 d_atominfo,
00874 d_shellinfo,
00875 parms->voxelsize,
00876 origin[0],
00877 origin[1],
00878 origin[2],
00879 parms->density,
00880 d_orbitalgrid);
00881 #else
00882 cuorbitaltiledshared<<<Gsz, Bsz, 0>>>(parms->numatoms,
00883 d_wave_f,
00884 d_basis_array,
00885 d_atominfo,
00886 d_shellinfo,
00887 parms->voxelsize,
00888 origin[0],
00889 origin[1],
00890 origin[2],
00891 parms->density,
00892 d_orbitalgrid);
00893 #endif
00894 }
00895 #if defined(USE_ZERO_COPY)
00896 if (h_orbitalgrid_zerocopy) {
00897 cudaThreadSynchronize();
00898 } else {
00899 #endif
00900 CUERR
00901
00902
00903 cudaMemcpy(h_orbitalgrid, d_orbitalgrid, volmemsz, cudaMemcpyDeviceToHost);
00904 CUERR
00905 #if defined(USE_ZERO_COPY)
00906 }
00907 #endif
00908
00909
00910 int y;
00911 for (y=0; y<numvoxels[1]; y++) {
00912 long orbaddr = k*planesize + y*numvoxels[0];
00913 memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize.x], numvoxels[0] * sizeof(float));
00914 }
00915 }
00916 cudaThreadSynchronize();
00917 }
00918
00919 free(h_basis_array_exp2f);
00920
00921 #if defined(USE_ZERO_COPY)
00922 if (h_orbitalgrid_zerocopy) {
00923 cudaFreeHost(h_orbitalgrid);
00924 } else {
00925 #endif
00926 #if defined(USE_PINNED_MEMORY)
00927 if (h_orbitalgrid_pinnedalloc)
00928 cudaFreeHost(h_orbitalgrid);
00929 else
00930 free(h_orbitalgrid);
00931 #else
00932 free(h_orbitalgrid);
00933 #endif
00934 cudaFree(d_orbitalgrid);
00935 #if defined(USE_ZERO_COPY)
00936 }
00937 #endif
00938
00939 if (!usefastconstkernel) {
00940 cudaFree(d_wave_f);
00941 cudaFree(d_basis_array);
00942 cudaFree(d_atominfo);
00943 cudaFree(d_shellinfo);
00944 cudaFree(d_numvoxels);
00945 cudaFree(d_origin);
00946 }
00947
00948 CUERR
00949
00950 return NULL;
00951 }
00952
00953
00954 int vmd_cuda_evaluate_orbital_grid(wkf_threadpool_t *devpool,
00955 int numatoms,
00956 const float *wave_f, int num_wave_f,
00957 const float *basis_array, int num_basis,
00958 const float *atompos,
00959 const int *atom_basis,
00960 const int *num_shells_per_atom,
00961 const int *num_prim_per_shell,
00962 const int *shell_types,
00963 int num_shells,
00964 const int *numvoxels,
00965 float voxelsize,
00966 const float *origin,
00967 int density,
00968 float *orbitalgrid) {
00969 int rc=0;
00970 orbthrparms parms;
00971
00972 if (devpool == NULL) {
00973 return -1;
00974 }
00975
00976 #if 0
00977
00978
00979 if (getenv("VMDCUDADEV")) {
00980 rc=cudaSetDevice(atoi(getenv("VMDCUDADEV")));
00981 } else {
00982 rc=cudaSetDevice(0);
00983 }
00984
00985 if (rc != cudaSuccess) {
00986 #if CUDART_VERSION >= 2010
00987 rc = cudaGetLastError();
00988 if (rc != cudaErrorSetOnActiveProcess)
00989 return NULL;
00990 #else
00991 cudaGetLastError();
00992
00993 #endif
00994 }
00995
00996
00997 cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);
00998
00999 #if CUDART_VERSION >= 3000
01000
01001 cudaFuncSetCacheConfig(cuorbitalcachedglobmem, cudaFuncCachePreferL1);
01002
01003
01004 cudaFuncSetCacheConfig(cuorbitaltiledshared, cudaFuncCachePreferShared);
01005 #endif
01006 #endif
01007
01008 #if defined(VMDMOJITSRC)
01009 if (getenv("VMDMOJIT") != NULL) {
01010
01011 orbital_jit_generate(ORBITAL_JIT_CUDA, getenv("VMDMOJITSRCFILE"),
01012 numatoms, wave_f, basis_array, atom_basis,
01013 num_shells_per_atom, num_prim_per_shell, shell_types);
01014
01015
01016 orbital_jit_generate(ORBITAL_JIT_OPENCL, "/tmp/mojit.cl",
01017 numatoms, wave_f, basis_array, atom_basis,
01018 num_shells_per_atom, num_prim_per_shell, shell_types);
01019 return 0;
01020 }
01021 #endif
01022
01023 parms.numatoms = numatoms;
01024 parms.wave_f = wave_f;
01025 parms.num_wave_f = num_wave_f;
01026 parms.basis_array = basis_array;
01027 parms.num_basis = num_basis;
01028 parms.atompos = atompos;
01029 parms.atom_basis = atom_basis;
01030 parms.num_shells_per_atom = num_shells_per_atom;
01031 parms.num_prim_per_shell = num_prim_per_shell;
01032 parms.shell_types = shell_types;
01033 parms.num_shells = num_shells;
01034 parms.numvoxels = numvoxels;
01035 parms.voxelsize = voxelsize;
01036 parms.origin = origin;
01037 parms.density = density;
01038 parms.orbitalgrid = orbitalgrid;
01039
01040 wkf_tasktile_t tile;
01041 tile.start=0;
01042 tile.end=numvoxels[2];
01043 wkf_threadpool_sched_dynamic(devpool, &tile);
01044 wkf_threadpool_launch(devpool, cudaorbitalthread, &parms, 1);
01045
01046 return rc;
01047 }
01048
01049