00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00057 #include <stdio.h>
00058 #include <stdlib.h>
00059 #include <string.h>
00060 #include <math.h>
00061 #include <cuda.h>
00062
00063 #include "WKFThreads.h"
00064 #include "CUDAKernels.h"
00065 #include "OrbitalJIT.h"
00066
00067
00068 #if CUDART_VERSION >= 2030
00069 #define USE_PINNED_MEMORY 1
00070
00071 #endif
00072
00073 #if 1 && (CUDART_VERSION >= 4000)
00074 #define RESTRICT __restrict__
00075 #else
00076 #define RESTRICT
00077 #endif
00078
00079 #if 1
00080 #define CUERR { cudaError_t err; \
00081 if ((err = cudaGetLastError()) != cudaSuccess) { \
00082 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00083 printf("Thread aborting...\n"); \
00084 return NULL; }}
00085 #else
00086 #define CUERR
00087 #endif
00088
00089 #define ANGS_TO_BOHR 1.8897259877218677f
00090
00091 typedef union flint_t {
00092 float f;
00093 int i;
00094 } flint;
00095
00096 typedef struct {
00097 int numatoms;
00098 const float *wave_f;
00099 int num_wave_f;
00100 const float *basis_array;
00101 int num_basis;
00102 const float *atompos;
00103 const int *atom_basis;
00104 const int *num_shells_per_atom;
00105 const int *num_prim_per_shell;
00106 const int *shell_types;
00107 int num_shells;
00108 const int *numvoxels;
00109 float voxelsize;
00110 const float *origin;
00111 int density;
00112 float *orbitalgrid;
00113 } orbthrparms;
00114
00115
00116 static void * cudaorbitalthread(void *);
00117
00118
00119 #define UNROLLX 1
00120 #define UNROLLY 1
00121 #define BLOCKSIZEX 8
00122 #define BLOCKSIZEY 8
00123 #define BLOCKSIZE BLOCKSIZEX * BLOCKSIZEY
00124
00125
00126 #define TILESIZEX BLOCKSIZEX*UNROLLX
00127 #define TILESIZEY BLOCKSIZEY*UNROLLY
00128 #define GPU_X_ALIGNMASK (TILESIZEX - 1)
00129 #define GPU_Y_ALIGNMASK (TILESIZEY - 1)
00130
00131 #define MEMCOALESCE 384
00132
00133
00134 #define S_SHELL 0
00135 #define P_SHELL 1
00136 #define D_SHELL 2
00137 #define F_SHELL 3
00138 #define G_SHELL 4
00139 #define H_SHELL 5
00140
00141
00142
00143
00144 #define MAX_ATOM_SZ 256
00145
00146 #define MAX_ATOMPOS_SZ (MAX_ATOM_SZ)
00147 __constant__ static float const_atompos[MAX_ATOMPOS_SZ * 3];
00148
00149 #define MAX_ATOM_BASIS_SZ (MAX_ATOM_SZ)
00150 __constant__ static int const_atom_basis[MAX_ATOM_BASIS_SZ];
00151
00152 #define MAX_ATOMSHELL_SZ (MAX_ATOM_SZ)
00153 __constant__ static int const_num_shells_per_atom[MAX_ATOMSHELL_SZ];
00154
00155 #define MAX_BASIS_SZ 6144
00156 __constant__ static float const_basis_array[MAX_BASIS_SZ];
00157
00158 #define MAX_SHELL_SZ 1024
00159 __constant__ static int const_num_prim_per_shell[MAX_SHELL_SZ];
00160 __constant__ static int const_shell_types[MAX_SHELL_SZ];
00161
00162 #define MAX_WAVEF_SZ 6144
00163 __constant__ static float const_wave_f[MAX_WAVEF_SZ];
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176 #if defined(VMDMOJIT) && defined(VMDMOJITSRC)
00177 #include VMDMOJITSRC
00178 #endif
00179
00180
00181
00182
00183
00184
00185 __global__ static void cuorbitalconstmem(int numatoms,
00186 float voxelsize,
00187 float originx,
00188 float originy,
00189 float grid_z,
00190 int density,
00191 float * orbitalgrid) {
00192 unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
00193 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
00194 unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
00195 float grid_x = originx + voxelsize * xindex;
00196 float grid_y = originy + voxelsize * yindex;
00197
00198
00199 int at;
00200 int prim, shell;
00201
00202
00203 float value = 0.0f;
00204
00205
00206 int ifunc = 0;
00207 int shell_counter = 0;
00208
00209
00210 for (at = 0; at < numatoms; at++) {
00211
00212 int maxshell = const_num_shells_per_atom[at];
00213 int prim_counter = const_atom_basis[at];
00214
00215 float xdist = (grid_x - const_atompos[3*at ])*ANGS_TO_BOHR;
00216 float ydist = (grid_y - const_atompos[3*at+1])*ANGS_TO_BOHR;
00217 float zdist = (grid_z - const_atompos[3*at+2])*ANGS_TO_BOHR;
00218
00219 float xdist2 = xdist*xdist;
00220 float ydist2 = ydist*ydist;
00221 float zdist2 = zdist*zdist;
00222
00223 float dist2 = xdist2 + ydist2 + zdist2;
00224
00225
00226 for (shell=0; shell < maxshell; shell++) {
00227 float contracted_gto = 0.0f;
00228
00229
00230
00231 int maxprim = const_num_prim_per_shell[shell_counter];
00232 int shelltype = const_shell_types[shell_counter];
00233 for (prim=0; prim < maxprim; prim++) {
00234 float exponent = const_basis_array[prim_counter ];
00235 float contract_coeff = const_basis_array[prim_counter + 1];
00236
00237
00238
00239
00240 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00241 prim_counter += 2;
00242 }
00243
00244
00245 float tmpshell=0.0f;
00246 switch (shelltype) {
00247 case S_SHELL:
00248 value += const_wave_f[ifunc++] * contracted_gto;
00249 break;
00250
00251 case P_SHELL:
00252 tmpshell += const_wave_f[ifunc++] * xdist;
00253 tmpshell += const_wave_f[ifunc++] * ydist;
00254 tmpshell += const_wave_f[ifunc++] * zdist;
00255 value += tmpshell * contracted_gto;
00256 break;
00257
00258 case D_SHELL:
00259 tmpshell += const_wave_f[ifunc++] * xdist2;
00260 tmpshell += const_wave_f[ifunc++] * xdist * ydist;
00261 tmpshell += const_wave_f[ifunc++] * ydist2;
00262 tmpshell += const_wave_f[ifunc++] * xdist * zdist;
00263 tmpshell += const_wave_f[ifunc++] * ydist * zdist;
00264 tmpshell += const_wave_f[ifunc++] * zdist2;
00265 value += tmpshell * contracted_gto;
00266 break;
00267
00268 case F_SHELL:
00269 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist;
00270 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist;
00271 tmpshell += const_wave_f[ifunc++] * ydist2 * xdist;
00272 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist;
00273 tmpshell += const_wave_f[ifunc++] * xdist2 * zdist;
00274 tmpshell += const_wave_f[ifunc++] * xdist * ydist * zdist;
00275 tmpshell += const_wave_f[ifunc++] * ydist2 * zdist;
00276 tmpshell += const_wave_f[ifunc++] * zdist2 * xdist;
00277 tmpshell += const_wave_f[ifunc++] * zdist2 * ydist;
00278 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist;
00279 value += tmpshell * contracted_gto;
00280 break;
00281
00282 case G_SHELL:
00283 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist2;
00284 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * ydist;
00285 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist2;
00286 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * xdist;
00287 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist2;
00288 tmpshell += const_wave_f[ifunc++] * xdist2 * xdist * zdist;
00289 tmpshell += const_wave_f[ifunc++] * xdist2 * ydist * zdist;
00290 tmpshell += const_wave_f[ifunc++] * ydist2 * xdist * zdist;
00291 tmpshell += const_wave_f[ifunc++] * ydist2 * ydist * zdist;
00292 tmpshell += const_wave_f[ifunc++] * xdist2 * zdist2;
00293 tmpshell += const_wave_f[ifunc++] * zdist2 * xdist * ydist;
00294 tmpshell += const_wave_f[ifunc++] * ydist2 * zdist2;
00295 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * xdist;
00296 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist * ydist;
00297 tmpshell += const_wave_f[ifunc++] * zdist2 * zdist2;
00298 value += tmpshell * contracted_gto;
00299 break;
00300 }
00301
00302 shell_counter++;
00303 }
00304 }
00305
00306
00307 if (density) {
00308 orbitalgrid[outaddr] = copysignf(value*value, value);
00309 } else {
00310 orbitalgrid[outaddr] = value;
00311 }
00312 }
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332 #define MAXSHELLCOUNT 15
00333
00334
00335
00336
00337
00338 #define MEMCOAMASK (~15)
00339
00340
00341
00342
00343
00344
00345 #define SHAREDSIZE 256
00346
00347
00348 __global__ static void cuorbitaltiledshared(int numatoms,
00349 const float *wave_f,
00350 const float *basis_array,
00351 const flint *atominfo,
00352 const int *shellinfo,
00353 float voxelsize,
00354 float originx,
00355 float originy,
00356 float grid_z,
00357 int density,
00358 float * orbitalgrid) {
00359 unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
00360 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
00361 unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
00362 float grid_x = originx + voxelsize * xindex;
00363 float grid_y = originy + voxelsize * yindex;
00364
00365 int sidx = __mul24(threadIdx.y, blockDim.x) + threadIdx.x;
00366
00367 __shared__ float s_wave_f[SHAREDSIZE];
00368 int sblock_wave_f = 0;
00369 s_wave_f[sidx ] = wave_f[sidx ];
00370 s_wave_f[sidx + 64] = wave_f[sidx + 64];
00371 s_wave_f[sidx + 128] = wave_f[sidx + 128];
00372 s_wave_f[sidx + 192] = wave_f[sidx + 192];
00373 __syncthreads();
00374
00375
00376 int at;
00377 int prim, shell;
00378
00379
00380 float value = 0.0f;
00381
00382
00383 int ifunc = 0;
00384 int shell_counter = 0;
00385 int sblock_prim_counter = -1;
00386
00387 for (at = 0; at < numatoms; at++) {
00388 __shared__ flint s_atominfo[5];
00389 __shared__ float s_basis_array[SHAREDSIZE];
00390
00391 __syncthreads();
00392 if (sidx < 5)
00393 s_atominfo[sidx].i = atominfo[(at<<4) + sidx].i;
00394 __syncthreads();
00395
00396 int prim_counter = s_atominfo[3].i;
00397 int maxshell = s_atominfo[4].i;
00398 int new_sblock_prim_counter = prim_counter & MEMCOAMASK;
00399 if (sblock_prim_counter != new_sblock_prim_counter) {
00400 sblock_prim_counter = new_sblock_prim_counter;
00401 s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ];
00402 s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64];
00403 s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00404 s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00405 prim_counter -= sblock_prim_counter;
00406 __syncthreads();
00407 }
00408
00409
00410 float xdist = (grid_x - s_atominfo[0].f)*ANGS_TO_BOHR;
00411 float ydist = (grid_y - s_atominfo[1].f)*ANGS_TO_BOHR;
00412 float zdist = (grid_z - s_atominfo[2].f)*ANGS_TO_BOHR;
00413
00414 float xdist2 = xdist*xdist;
00415 float ydist2 = ydist*ydist;
00416 float zdist2 = zdist*zdist;
00417
00418 float dist2 = xdist2 + ydist2 + zdist2;
00419
00420
00421 for (shell=0; shell < maxshell; shell++) {
00422 float contracted_gto = 0.0f;
00423
00424
00425
00426 __shared__ int s_shellinfo[2];
00427
00428 __syncthreads();
00429 if (sidx < 2)
00430 s_shellinfo[sidx] = shellinfo[(shell_counter<<4) + sidx];
00431 __syncthreads();
00432
00433 int maxprim = s_shellinfo[0];
00434 int shelltype = s_shellinfo[1];
00435
00436 if ((prim_counter + (maxprim<<1)) >= SHAREDSIZE) {
00437 prim_counter += sblock_prim_counter;
00438 sblock_prim_counter = prim_counter & MEMCOAMASK;
00439 s_basis_array[sidx ] = basis_array[sblock_prim_counter + sidx ];
00440 s_basis_array[sidx + 64] = basis_array[sblock_prim_counter + sidx + 64];
00441 s_basis_array[sidx + 128] = basis_array[sblock_prim_counter + sidx + 128];
00442 s_basis_array[sidx + 192] = basis_array[sblock_prim_counter + sidx + 192];
00443 prim_counter -= sblock_prim_counter;
00444 __syncthreads();
00445 }
00446 for (prim=0; prim < maxprim; prim++) {
00447 float exponent = s_basis_array[prim_counter ];
00448 float contract_coeff = s_basis_array[prim_counter + 1];
00449
00450
00451
00452
00453 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00454
00455 prim_counter += 2;
00456 }
00457
00458
00459
00460
00461 if ((ifunc + MAXSHELLCOUNT) >= SHAREDSIZE) {
00462 ifunc += sblock_wave_f;
00463 sblock_wave_f = ifunc & MEMCOAMASK;
00464 __syncthreads();
00465 s_wave_f[sidx ] = wave_f[sblock_wave_f + sidx ];
00466 s_wave_f[sidx + 64] = wave_f[sblock_wave_f + sidx + 64];
00467 s_wave_f[sidx + 128] = wave_f[sblock_wave_f + sidx + 128];
00468 s_wave_f[sidx + 192] = wave_f[sblock_wave_f + sidx + 192];
00469 __syncthreads();
00470 ifunc -= sblock_wave_f;
00471 }
00472
00473
00474 float tmpshell=0.0f;
00475 switch (shelltype) {
00476 case S_SHELL:
00477 value += s_wave_f[ifunc++] * contracted_gto;
00478 break;
00479
00480 case P_SHELL:
00481 tmpshell += s_wave_f[ifunc++] * xdist;
00482 tmpshell += s_wave_f[ifunc++] * ydist;
00483 tmpshell += s_wave_f[ifunc++] * zdist;
00484 value += tmpshell * contracted_gto;
00485 break;
00486
00487 case D_SHELL:
00488 tmpshell += s_wave_f[ifunc++] * xdist2;
00489 tmpshell += s_wave_f[ifunc++] * xdist * ydist;
00490 tmpshell += s_wave_f[ifunc++] * ydist2;
00491 tmpshell += s_wave_f[ifunc++] * xdist * zdist;
00492 tmpshell += s_wave_f[ifunc++] * ydist * zdist;
00493 tmpshell += s_wave_f[ifunc++] * zdist2;
00494 value += tmpshell * contracted_gto;
00495 break;
00496
00497 case F_SHELL:
00498 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist;
00499 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist;
00500 tmpshell += s_wave_f[ifunc++] * ydist2 * xdist;
00501 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist;
00502 tmpshell += s_wave_f[ifunc++] * xdist2 * zdist;
00503 tmpshell += s_wave_f[ifunc++] * xdist * ydist * zdist;
00504 tmpshell += s_wave_f[ifunc++] * ydist2 * zdist;
00505 tmpshell += s_wave_f[ifunc++] * zdist2 * xdist;
00506 tmpshell += s_wave_f[ifunc++] * zdist2 * ydist;
00507 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist;
00508 value += tmpshell * contracted_gto;
00509 break;
00510
00511 case G_SHELL:
00512 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist2;
00513 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * ydist;
00514 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist2;
00515 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * xdist;
00516 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist2;
00517 tmpshell += s_wave_f[ifunc++] * xdist2 * xdist * zdist;
00518 tmpshell += s_wave_f[ifunc++] * xdist2 * ydist * zdist;
00519 tmpshell += s_wave_f[ifunc++] * ydist2 * xdist * zdist;
00520 tmpshell += s_wave_f[ifunc++] * ydist2 * ydist * zdist;
00521 tmpshell += s_wave_f[ifunc++] * xdist2 * zdist2;
00522 tmpshell += s_wave_f[ifunc++] * zdist2 * xdist * ydist;
00523 tmpshell += s_wave_f[ifunc++] * ydist2 * zdist2;
00524 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * xdist;
00525 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist * ydist;
00526 tmpshell += s_wave_f[ifunc++] * zdist2 * zdist2;
00527 value += tmpshell * contracted_gto;
00528 break;
00529 }
00530
00531 shell_counter++;
00532 }
00533 }
00534
00535
00536 if (density) {
00537 orbitalgrid[outaddr] = copysignf(value*value, value);
00538 } else {
00539 orbitalgrid[outaddr] = value;
00540 }
00541 }
00542
00543
00544
00545
00546
00547
00548
00549
00550 __global__ static void cuorbitalcachedglobmem(int numatoms,
00551 const float * RESTRICT wave_f,
00552 const float * RESTRICT basis_array,
00553 const flint * RESTRICT atominfo,
00554 const int * RESTRICT shellinfo,
00555 float voxelsize,
00556 float originx,
00557 float originy,
00558 float grid_z,
00559 int density,
00560 float * RESTRICT orbitalgrid) {
00561 unsigned int xindex = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
00562 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
00563 unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;
00564 float grid_x = originx + voxelsize * xindex;
00565 float grid_y = originy + voxelsize * yindex;
00566
00567
00568 int at;
00569 int prim, shell;
00570
00571
00572 float value = 0.0f;
00573
00574
00575 int ifunc = 0;
00576 int shell_counter = 0;
00577
00578
00579 for (at = 0; at < numatoms; at++) {
00580
00581 int atidx = at << 4;
00582 float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00583 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00584 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00585
00586 int prim_counter = atominfo[atidx + 3].i;
00587 int maxshell = atominfo[atidx + 4].i;
00588
00589 float xdist2 = xdist*xdist;
00590 float ydist2 = ydist*ydist;
00591 float zdist2 = zdist*zdist;
00592
00593 float dist2 = xdist2 + ydist2 + zdist2;
00594
00595
00596 for (shell=0; shell < maxshell; shell++) {
00597 float contracted_gto = 0.0f;
00598
00599 int maxprim = shellinfo[(shell_counter<<4) ];
00600 int shell_type = shellinfo[(shell_counter<<4) + 1];
00601 for (prim=0; prim < maxprim; prim++) {
00602 float exponent = basis_array[prim_counter ];
00603 float contract_coeff = basis_array[prim_counter + 1];
00604
00605
00606
00607
00608 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00609 prim_counter += 2;
00610 }
00611
00612
00613 float tmpshell=0;
00614 switch (shell_type) {
00615 case S_SHELL:
00616 value += wave_f[ifunc++] * contracted_gto;
00617 break;
00618
00619 case P_SHELL:
00620 tmpshell += wave_f[ifunc++] * xdist;
00621 tmpshell += wave_f[ifunc++] * ydist;
00622 tmpshell += wave_f[ifunc++] * zdist;
00623 value += tmpshell * contracted_gto;
00624 break;
00625
00626 case D_SHELL:
00627 tmpshell += wave_f[ifunc++] * xdist2;
00628 tmpshell += wave_f[ifunc++] * xdist * ydist;
00629 tmpshell += wave_f[ifunc++] * ydist2;
00630 tmpshell += wave_f[ifunc++] * xdist * zdist;
00631 tmpshell += wave_f[ifunc++] * ydist * zdist;
00632 tmpshell += wave_f[ifunc++] * zdist2;
00633 value += tmpshell * contracted_gto;
00634 break;
00635
00636 case F_SHELL:
00637 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00638 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00639 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00640 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00641 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00642 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00643 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00644 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00645 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00646 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00647 value += tmpshell * contracted_gto;
00648 break;
00649
00650 case G_SHELL:
00651 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
00652 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
00653 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
00654 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
00655 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
00656 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
00657 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
00658 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
00659 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
00660 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
00661 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
00662 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
00663 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
00664 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
00665 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
00666 value += tmpshell * contracted_gto;
00667 break;
00668 }
00669
00670 shell_counter++;
00671 }
00672 }
00673
00674
00675 if (density) {
00676 orbitalgrid[outaddr] = copysignf(value*value, value);
00677 } else {
00678 orbitalgrid[outaddr] = value;
00679 }
00680 }
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691 __global__ static void cuorbitalcachedglobmem_packed(int numatoms,
00692 const float * RESTRICT wave_f,
00693 const float2 * RESTRICT basis_array,
00694 const flint * RESTRICT atominfo,
00695 const int2 * RESTRICT shellinfo,
00696 float voxelsize,
00697 float originx,
00698 float originy,
00699 float grid_z,
00700 int density,
00701 float * RESTRICT orbitalgrid) {
00702 unsigned int xindex = blockIdx.x * blockDim.x * UNROLLX + threadIdx.x;
00703 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
00704 unsigned int outaddr = gridDim.x * blockDim.x * UNROLLX * yindex + xindex;
00705 float grid_x = originx + voxelsize * xindex;
00706 float grid_y = originy + voxelsize * yindex;
00707
00708
00709 int at;
00710 int prim, shell;
00711
00712
00713 float value = 0.0f;
00714
00715
00716 int ifunc = 0;
00717 int shell_counter = 0;
00718
00719
00720 for (at = 0; at < numatoms; at++) {
00721
00722 int atidx = at << 4;
00723 float xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00724 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00725 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00726
00727 int prim_counter = atominfo[atidx + 3].i >> 1;
00728 int maxshell = atominfo[atidx + 4].i;
00729
00730 float xdist2 = xdist*xdist;
00731 float ydist2 = ydist*ydist;
00732 float zdist2 = zdist*zdist;
00733
00734 float dist2 = xdist2 + ydist2 + zdist2;
00735
00736
00737 for (shell=0; shell < maxshell; shell++) {
00738 int2 tmpshellinfo = shellinfo[shell_counter];
00739 int maxprim = tmpshellinfo.x;
00740 int shell_type = tmpshellinfo.y;
00741
00742 float contracted_gto = 0.0f;
00743 for (prim=prim_counter; prim < maxprim; prim++) {
00744 float2 tmpbasis = basis_array[prim];
00745 float exponent = tmpbasis.x;
00746 float contract_coeff = tmpbasis.y;
00747
00748
00749
00750
00751 contracted_gto += contract_coeff * exp2f(-exponent*dist2);
00752 }
00753
00754
00755 float tmpshell=0;
00756 switch (shell_type) {
00757 case S_SHELL:
00758 value += wave_f[ifunc++] * contracted_gto;
00759 break;
00760
00761 case P_SHELL:
00762 tmpshell += wave_f[ifunc++] * xdist;
00763 tmpshell += wave_f[ifunc++] * ydist;
00764 tmpshell += wave_f[ifunc++] * zdist;
00765 value += tmpshell * contracted_gto;
00766 break;
00767
00768 case D_SHELL:
00769 tmpshell += wave_f[ifunc++] * xdist2;
00770 tmpshell += wave_f[ifunc++] * xdist * ydist;
00771 tmpshell += wave_f[ifunc++] * ydist2;
00772 tmpshell += wave_f[ifunc++] * xdist * zdist;
00773 tmpshell += wave_f[ifunc++] * ydist * zdist;
00774 tmpshell += wave_f[ifunc++] * zdist2;
00775 value += tmpshell * contracted_gto;
00776 break;
00777
00778 case F_SHELL:
00779 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00780 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00781 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00782 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00783 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00784 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00785 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00786 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00787 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00788 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00789 value += tmpshell * contracted_gto;
00790 break;
00791
00792 case G_SHELL:
00793 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
00794 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
00795 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
00796 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
00797 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
00798 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
00799 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
00800 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
00801 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
00802 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
00803 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
00804 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
00805 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
00806 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
00807 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
00808 value += tmpshell * contracted_gto;
00809 break;
00810 }
00811
00812 shell_counter++;
00813 }
00814 }
00815
00816
00817 if (density) {
00818 orbitalgrid[outaddr] = copysignf(value*value, value);
00819 } else {
00820 orbitalgrid[outaddr] = value;
00821 }
00822 }
00823
00824
00825
00826
00827
00828
00829 inline __host__ __device__ float2 make_float2(const float s) {
00830 return make_float2(s, s);
00831 }
00832
00833 inline __host__ __device__ float2 operator+(const float2& a, const float2& b) {
00834 return make_float2(a.x + b.x, a.y + b.y);
00835 }
00836
00837 inline __host__ __device__ void operator+=(float2& a, const float& b) {
00838 a.x += b; a.y += b;
00839 }
00840
00841 inline __host__ __device__ void operator+=(float2& a, const float2& b) {
00842 a.x += b.x; a.y += b.y;
00843 }
00844
00845 inline __host__ __device__ float2 operator-(const float2& a, const float s) {
00846 return make_float2(a.x - s, a.y - s);
00847 }
00848
00849 inline __host__ __device__ float2 operator*(const float2& a, const float2& b) {
00850 return make_float2(a.x * b.x, a.y * b.y);
00851 }
00852
00853 inline __host__ __device__ float2 operator*(const float s, const float2& a) {
00854 return make_float2(a.x * s, a.y * s);
00855 }
00856
00857 inline __host__ __device__ float2 operator*(const float2& a, const float s) {
00858 return make_float2(a.x * s, a.y * s);
00859 }
00860
00861
00862
00863
00864
00865
00866
00867
00868 __global__ static void cuorbitalcachedglobmem_packed_2x(int numatoms,
00869 const float * RESTRICT wave_f,
00870 const float2 * RESTRICT basis_array,
00871 const flint * RESTRICT atominfo,
00872 const int2 * RESTRICT shellinfo,
00873 float voxelsize,
00874 float originx,
00875 float originy,
00876 float grid_z,
00877 int density,
00878 float2 * RESTRICT orbitalgrid) {
00879 unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
00880 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
00881
00882
00883 unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
00884 float2 grid_x = make_float2(originx) +
00885 voxelsize * make_float2(UNROLLX * xindex,
00886 UNROLLX * xindex + 1);
00887 float grid_y = originy + voxelsize * yindex;
00888
00889
00890 int at;
00891 int prim, shell;
00892
00893
00894 float2 value = make_float2(0.0f);
00895
00896
00897 int ifunc = 0;
00898 int shell_counter = 0;
00899
00900
00901 for (at = 0; at < numatoms; at++) {
00902
00903 int atidx = at << 4;
00904 float2 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
00905 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
00906 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
00907
00908 int prim_counter = atominfo[atidx + 3].i >> 1;
00909 int maxshell = atominfo[atidx + 4].i;
00910
00911 float2 xdist2 = xdist*xdist;
00912 float ydist2 = ydist*ydist;
00913 float zdist2 = zdist*zdist;
00914
00915 float2 dist2 = xdist2 + make_float2(ydist2 + zdist2);
00916
00917
00918 for (shell=0; shell < maxshell; shell++) {
00919 int2 tmpshellinfo = shellinfo[shell_counter];
00920 int maxprim = tmpshellinfo.x;
00921 int shell_type = tmpshellinfo.y;
00922
00923 float2 contracted_gto = make_float2(0.0f);
00924 for (prim=prim_counter; prim < maxprim; prim++) {
00925 float2 tmpbasis = basis_array[prim];
00926 float exponent = tmpbasis.x;
00927 float contract_coeff = tmpbasis.y;
00928
00929
00930
00931
00932 float2 negexpdist2 = -exponent*dist2;
00933 float2 expval;
00934 expval.x = exp2f(negexpdist2.x);
00935 expval.y = exp2f(negexpdist2.y);
00936 contracted_gto += contract_coeff * expval;
00937 }
00938
00939
00940 float2 tmpshell = make_float2(0.0f);
00941 switch (shell_type) {
00942 case S_SHELL:
00943 value += wave_f[ifunc++] * contracted_gto;
00944 break;
00945
00946 case P_SHELL:
00947 tmpshell += wave_f[ifunc++] * xdist;
00948 tmpshell += wave_f[ifunc++] * ydist;
00949 tmpshell += wave_f[ifunc++] * zdist;
00950 value += tmpshell * contracted_gto;
00951 break;
00952
00953 case D_SHELL:
00954 tmpshell += wave_f[ifunc++] * xdist2;
00955 tmpshell += wave_f[ifunc++] * xdist * ydist;
00956 tmpshell += wave_f[ifunc++] * ydist2;
00957 tmpshell += wave_f[ifunc++] * xdist * zdist;
00958 tmpshell += wave_f[ifunc++] * ydist * zdist;
00959 tmpshell += wave_f[ifunc++] * zdist2;
00960 value += tmpshell * contracted_gto;
00961 break;
00962
00963 case F_SHELL:
00964 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
00965 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
00966 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
00967 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
00968 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
00969 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
00970 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
00971 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
00972 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
00973 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
00974 value += tmpshell * contracted_gto;
00975 break;
00976
00977 case G_SHELL:
00978 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
00979 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
00980 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
00981 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
00982 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
00983 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
00984 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
00985 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
00986 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
00987 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
00988 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
00989 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
00990 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
00991 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
00992 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
00993 value += tmpshell * contracted_gto;
00994 break;
00995 }
00996
00997 shell_counter++;
00998 }
00999 }
01000
01001
01002 if (density) {
01003 float2 tmpout;
01004 tmpout.x = copysignf(value.x*value.x, value.x);
01005 tmpout.y = copysignf(value.y*value.y, value.y);
01006 orbitalgrid[outaddr] = tmpout;
01007 } else {
01008 orbitalgrid[outaddr] = value;
01009 }
01010 }
01011
01012
01013
01014
01015
01016
01017 inline __host__ __device__ float4 make_float4(const float s) {
01018 return make_float4(s, s, s, s);
01019 }
01020
01021 inline __host__ __device__ float4 operator+(const float4& a, const float4& b) {
01022 return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
01023 }
01024
01025 inline __host__ __device__ float4 make_float4(const float3 &a, const float &b) {
01026 return make_float4(a.x, a.y, a.z, b);
01027 }
01028
01029 inline __host__ __device__ void operator+=(float4& a, const float& b) {
01030 a.x += b; a.y += b; a.z += b; a.w += b;
01031 }
01032
01033 inline __host__ __device__ void operator+=(float4& a, const float4& b) {
01034 a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
01035 }
01036
01037 inline __host__ __device__ float4 operator-(const float4& a, const float s) {
01038 return make_float4(a.x - s, a.y - s, a.z - s, a.w - s);
01039 }
01040
01041 inline __host__ __device__ void operator*=(float4& a, const float &b) {
01042 a.x *= b; a.y *= b; a.z *= b; a.w *= b;
01043 }
01044
01045 inline __host__ __device__ float4 operator*(const float4& a, const float4& b) {
01046 return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
01047 }
01048
01049 inline __host__ __device__ float4 operator*(const float4& a, const float s) {
01050 return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
01051 }
01052
01053 inline __host__ __device__ float4 operator*(const float s, const float4& a) {
01054 return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
01055 }
01056
01057
01058
01059
01060
01061
01062
01063 __global__ static void cuorbitalcachedglobmem_packed_4x_3D(int numatoms,
01064 const float * RESTRICT wave_f,
01065 const float2 * RESTRICT basis_array,
01066 const flint * RESTRICT atominfo,
01067 const int2 * RESTRICT shellinfo,
01068 float voxelsize,
01069 float originx,
01070 float originy,
01071 float grid_z,
01072 int density,
01073 float4 * RESTRICT orbitalgrid) {
01074 unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
01075 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
01076 unsigned int zindex = blockIdx.z * blockDim.z + threadIdx.z;
01077
01078
01079 unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
01080 const int unrollxidx = UNROLLX * xindex;
01081 float4 grid_x = make_float4(originx) +
01082 voxelsize * make_float4(unrollxidx,
01083 unrollxidx + 1,
01084 unrollxidx + 2,
01085 unrollxidx + 3);
01086 float grid_y = originy + voxelsize * yindex;
01087
01088
01089 int at;
01090 int prim, shell;
01091
01092
01093 float4 value = make_float4(0.0f);
01094
01095
01096 int ifunc = 0;
01097 int shell_counter = 0;
01098
01099
01100 for (at = 0; at < numatoms; at++) {
01101
01102 int atidx = at << 4;
01103 float4 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
01104 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
01105 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
01106
01107 int prim_counter = atominfo[atidx + 3].i >> 1;
01108 int maxshell = atominfo[atidx + 4].i;
01109
01110 float4 xdist2 = xdist*xdist;
01111 float ydist2 = ydist*ydist;
01112 float zdist2 = zdist*zdist;
01113
01114 float4 dist2 = xdist2 + make_float4(ydist2 + zdist2);
01115
01116
01117 for (shell=0; shell < maxshell; shell++) {
01118 int2 tmpshellinfo = shellinfo[shell_counter];
01119 int maxprim = tmpshellinfo.x;
01120 int shell_type = tmpshellinfo.y;
01121
01122 float4 contracted_gto = make_float4(0.0f);
01123 for (prim=prim_counter; prim < maxprim; prim++) {
01124 float2 tmpbasis = basis_array[prim];
01125 float exponent = tmpbasis.x;
01126 float contract_coeff = tmpbasis.y;
01127
01128
01129
01130
01131 float4 negexpdist2 = -exponent*dist2;
01132 float4 expval;
01133 expval.x = exp2f(negexpdist2.x);
01134 expval.y = exp2f(negexpdist2.y);
01135 expval.z = exp2f(negexpdist2.z);
01136 expval.w = exp2f(negexpdist2.w);
01137 contracted_gto += contract_coeff * expval;
01138 }
01139
01140
01141 float4 tmpshell = make_float4(0.0f);
01142 switch (shell_type) {
01143 case S_SHELL:
01144 value += wave_f[ifunc++] * contracted_gto;
01145 break;
01146
01147 case P_SHELL:
01148 tmpshell += wave_f[ifunc++] * xdist;
01149 tmpshell += wave_f[ifunc++] * ydist;
01150 tmpshell += wave_f[ifunc++] * zdist;
01151 value += tmpshell * contracted_gto;
01152 break;
01153
01154 case D_SHELL:
01155 tmpshell += wave_f[ifunc++] * xdist2;
01156 tmpshell += wave_f[ifunc++] * xdist * ydist;
01157 tmpshell += wave_f[ifunc++] * ydist2;
01158 tmpshell += wave_f[ifunc++] * xdist * zdist;
01159 tmpshell += wave_f[ifunc++] * ydist * zdist;
01160 tmpshell += wave_f[ifunc++] * zdist2;
01161 value += tmpshell * contracted_gto;
01162 break;
01163
01164 case F_SHELL:
01165 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
01166 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
01167 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
01168 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
01169 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
01170 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
01171 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
01172 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
01173 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
01174 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
01175 value += tmpshell * contracted_gto;
01176 break;
01177
01178 case G_SHELL:
01179 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
01180 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
01181 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
01182 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
01183 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
01184 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
01185 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
01186 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
01187 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
01188 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
01189 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
01190 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
01191 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
01192 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
01193 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
01194 value += tmpshell * contracted_gto;
01195 break;
01196 }
01197
01198 shell_counter++;
01199 }
01200 }
01201
01202
01203 if (density) {
01204 float4 tmpout;
01205 tmpout.x = copysignf(value.x*value.x, value.x);
01206 tmpout.y = copysignf(value.y*value.y, value.y);
01207 tmpout.z = copysignf(value.z*value.z, value.z);
01208 tmpout.w = copysignf(value.w*value.w, value.w);
01209 orbitalgrid[outaddr] = tmpout;
01210 } else {
01211 orbitalgrid[outaddr] = value;
01212 }
01213 }
01214
01215
01216
01217
01218
01219
01220
01221
01222 __global__ static void cuorbitalcachedglobmem_packed_4x(int numatoms,
01223 const float * RESTRICT wave_f,
01224 const float2 * RESTRICT basis_array,
01225 const flint * RESTRICT atominfo,
01226 const int2 * RESTRICT shellinfo,
01227 float voxelsize,
01228 float originx,
01229 float originy,
01230 float grid_z,
01231 int density,
01232 float4 * RESTRICT orbitalgrid) {
01233 unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
01234 unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
01235
01236
01237 unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;
01238 float4 grid_x = make_float4(originx) +
01239 voxelsize * make_float4(UNROLLX * xindex,
01240 UNROLLX * xindex + 1,
01241 UNROLLX * xindex + 2,
01242 UNROLLX * xindex + 3);
01243 float grid_y = originy + voxelsize * yindex;
01244
01245
01246 int at;
01247 int prim, shell;
01248
01249
01250 float4 value = make_float4(0.0f);
01251
01252
01253 int ifunc = 0;
01254 int shell_counter = 0;
01255
01256
01257 for (at = 0; at < numatoms; at++) {
01258
01259 int atidx = at << 4;
01260 float4 xdist = (grid_x - atominfo[atidx + 0].f)*ANGS_TO_BOHR;
01261 float ydist = (grid_y - atominfo[atidx + 1].f)*ANGS_TO_BOHR;
01262 float zdist = (grid_z - atominfo[atidx + 2].f)*ANGS_TO_BOHR;
01263
01264 int prim_counter = atominfo[atidx + 3].i >> 1;
01265 int maxshell = atominfo[atidx + 4].i;
01266
01267 float4 xdist2 = xdist*xdist;
01268 float ydist2 = ydist*ydist;
01269 float zdist2 = zdist*zdist;
01270
01271 float4 dist2 = xdist2 + make_float4(ydist2 + zdist2);
01272
01273
01274 for (shell=0; shell < maxshell; shell++) {
01275 int2 tmpshellinfo = shellinfo[shell_counter];
01276 int maxprim = tmpshellinfo.x;
01277 int shell_type = tmpshellinfo.y;
01278
01279 float4 contracted_gto = make_float4(0.0f);
01280 for (prim=prim_counter; prim < maxprim; prim++) {
01281 float2 tmpbasis = basis_array[prim];
01282 float exponent = tmpbasis.x;
01283 float contract_coeff = tmpbasis.y;
01284
01285
01286
01287
01288 float4 negexpdist2 = -exponent*dist2;
01289 float4 expval;
01290 expval.x = exp2f(negexpdist2.x);
01291 expval.y = exp2f(negexpdist2.y);
01292 expval.z = exp2f(negexpdist2.z);
01293 expval.w = exp2f(negexpdist2.w);
01294 contracted_gto += contract_coeff * expval;
01295 }
01296
01297
01298 float4 tmpshell = make_float4(0.0f);
01299 switch (shell_type) {
01300 case S_SHELL:
01301 value += wave_f[ifunc++] * contracted_gto;
01302 break;
01303
01304 case P_SHELL:
01305 tmpshell += wave_f[ifunc++] * xdist;
01306 tmpshell += wave_f[ifunc++] * ydist;
01307 tmpshell += wave_f[ifunc++] * zdist;
01308 value += tmpshell * contracted_gto;
01309 break;
01310
01311 case D_SHELL:
01312 tmpshell += wave_f[ifunc++] * xdist2;
01313 tmpshell += wave_f[ifunc++] * xdist * ydist;
01314 tmpshell += wave_f[ifunc++] * ydist2;
01315 tmpshell += wave_f[ifunc++] * xdist * zdist;
01316 tmpshell += wave_f[ifunc++] * ydist * zdist;
01317 tmpshell += wave_f[ifunc++] * zdist2;
01318 value += tmpshell * contracted_gto;
01319 break;
01320
01321 case F_SHELL:
01322 tmpshell += wave_f[ifunc++] * xdist2 * xdist;
01323 tmpshell += wave_f[ifunc++] * xdist2 * ydist;
01324 tmpshell += wave_f[ifunc++] * ydist2 * xdist;
01325 tmpshell += wave_f[ifunc++] * ydist2 * ydist;
01326 tmpshell += wave_f[ifunc++] * xdist2 * zdist;
01327 tmpshell += wave_f[ifunc++] * xdist * ydist * zdist;
01328 tmpshell += wave_f[ifunc++] * ydist2 * zdist;
01329 tmpshell += wave_f[ifunc++] * zdist2 * xdist;
01330 tmpshell += wave_f[ifunc++] * zdist2 * ydist;
01331 tmpshell += wave_f[ifunc++] * zdist2 * zdist;
01332 value += tmpshell * contracted_gto;
01333 break;
01334
01335 case G_SHELL:
01336 tmpshell += wave_f[ifunc++] * xdist2 * xdist2;
01337 tmpshell += wave_f[ifunc++] * xdist2 * xdist * ydist;
01338 tmpshell += wave_f[ifunc++] * xdist2 * ydist2;
01339 tmpshell += wave_f[ifunc++] * ydist2 * ydist * xdist;
01340 tmpshell += wave_f[ifunc++] * ydist2 * ydist2;
01341 tmpshell += wave_f[ifunc++] * xdist2 * xdist * zdist;
01342 tmpshell += wave_f[ifunc++] * xdist2 * ydist * zdist;
01343 tmpshell += wave_f[ifunc++] * ydist2 * xdist * zdist;
01344 tmpshell += wave_f[ifunc++] * ydist2 * ydist * zdist;
01345 tmpshell += wave_f[ifunc++] * xdist2 * zdist2;
01346 tmpshell += wave_f[ifunc++] * zdist2 * xdist * ydist;
01347 tmpshell += wave_f[ifunc++] * ydist2 * zdist2;
01348 tmpshell += wave_f[ifunc++] * zdist2 * zdist * xdist;
01349 tmpshell += wave_f[ifunc++] * zdist2 * zdist * ydist;
01350 tmpshell += wave_f[ifunc++] * zdist2 * zdist2;
01351 value += tmpshell * contracted_gto;
01352 break;
01353 }
01354
01355 shell_counter++;
01356 }
01357 }
01358
01359
01360 if (density) {
01361 float4 tmpout;
01362 tmpout.x = copysignf(value.x*value.x, value.x);
01363 tmpout.y = copysignf(value.y*value.y, value.y);
01364 tmpout.z = copysignf(value.z*value.z, value.z);
01365 tmpout.w = copysignf(value.w*value.w, value.w);
01366 orbitalgrid[outaddr] = tmpout;
01367 } else {
01368 orbitalgrid[outaddr] = value;
01369 }
01370 }
01371
01372
01373
01374
01375 static int computepaddedsize(int orig, int tilesize) {
01376 int alignmask = tilesize - 1;
01377 int paddedsz = (orig + alignmask) & ~alignmask;
01378
01379 return paddedsz;
01380 }
01381
01382 static void * cudaorbitalthread(void *voidparms) {
01383 dim3 volsize, Gsz, Bsz;
01384 float *d_wave_f = NULL;
01385 float *d_basis_array = NULL;
01386 flint *d_atominfo = NULL;
01387 int *d_shellinfo = NULL;
01388 int *d_shellinfo_packed = NULL;
01389 float *d_origin = NULL;
01390 int *d_numvoxels = NULL;
01391 float *d_orbitalgrid = NULL;
01392 float *h_orbitalgrid = NULL;
01393 float *h_basis_array_exp2f = NULL;
01394 int numvoxels[3];
01395 float origin[3];
01396 orbthrparms *parms = NULL;
01397 int hwpreferscachekernel=0;
01398 int usefastconstkernel=0;
01399 int usecachekernel=0;
01400 int threadid=0;
01401 #if defined(USE_PINNED_MEMORY)
01402 int h_orbitalgrid_pinnedalloc=0;
01403 #endif
01404 #if defined(USE_ZERO_COPY)
01405 int h_orbitalgrid_zerocopy=0;
01406 #endif
01407
01408 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
01409 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
01410
01411
01412 cudaDeviceProp deviceProp;
01413 memset(&deviceProp, 0, sizeof(cudaDeviceProp));
01414
01415 int dev=0;
01416 if ((cudaGetDevice(&dev) == cudaSuccess) &&
01417 (cudaGetDeviceProperties(&deviceProp, dev) == cudaSuccess)) {
01418 cudaError_t err = cudaGetLastError();
01419
01420 if (deviceProp.major >= 7)
01421 hwpreferscachekernel=7;
01422
01423
01424 if (getenv("VMDFORCEMOPACKED") != NULL) {
01425 if (deviceProp.major >= 8)
01426 hwpreferscachekernel=8;
01427 }
01428 }
01429
01430
01431
01432 int maxtilesize=4;
01433
01434
01435 wkf_threadpool_worker_devscaletile(voidparms, &maxtilesize);
01436
01437 numvoxels[0] = parms->numvoxels[0];
01438 numvoxels[1] = parms->numvoxels[1];
01439 numvoxels[2] = 1;
01440
01441 origin[0] = parms->origin[0];
01442 origin[1] = parms->origin[1];
01443
01444
01445 volsize.x = (parms->numvoxels[0] + GPU_X_ALIGNMASK) & ~(GPU_X_ALIGNMASK);
01446 volsize.y = (parms->numvoxels[1] + GPU_Y_ALIGNMASK) & ~(GPU_Y_ALIGNMASK);
01447 volsize.z = 1;
01448
01449
01450 Bsz.x = BLOCKSIZEX;
01451 Bsz.y = BLOCKSIZEY;
01452 Bsz.z = 1;
01453 Gsz.x = volsize.x / (Bsz.x * UNROLLX);
01454 Gsz.y = volsize.y / (Bsz.y * UNROLLY);
01455 Gsz.z = 1;
01456 int volmemsz = sizeof(float) * volsize.x * volsize.y * volsize.z;
01457
01458
01459
01460 if (hwpreferscachekernel) {
01461 usecachekernel=1;
01462 } else if ((parms->num_wave_f < MAX_WAVEF_SZ) &&
01463 (parms->numatoms < MAX_ATOM_SZ) &&
01464 (parms->numatoms < MAX_ATOMSHELL_SZ) &&
01465 (2*parms->num_basis < MAX_BASIS_SZ) &&
01466 (parms->num_shells < MAX_SHELL_SZ)) {
01467 usefastconstkernel=1;
01468 }
01469
01470
01471 if (getenv("VMDFORCEMOCONSTMEM") != NULL) {
01472 usefastconstkernel=1;
01473 usecachekernel=0;
01474 }
01475 if (getenv("VMDFORCEMOTILEDSHARED") != NULL) {
01476 usefastconstkernel=0;
01477 usecachekernel=0;
01478 }
01479 if (getenv("VMDFORCEMOL1CACHE") != NULL) {
01480 usefastconstkernel=0;
01481 usecachekernel=1;
01482 }
01483
01484
01485 int padsz;
01486 padsz = computepaddedsize(2 * parms->num_basis, MEMCOALESCE);
01487 h_basis_array_exp2f = (float *) malloc(padsz * sizeof(float));
01488 float log2e = log2f(2.718281828f);
01489 for (int ll=0; ll<(2*parms->num_basis); ll+=2) {
01490 #if 1
01491
01492 h_basis_array_exp2f[ll ] = parms->basis_array[ll ] * log2e;
01493 #else
01494 h_basis_array_exp2f[ll ] = parms->basis_array[ll ];
01495 #endif
01496 h_basis_array_exp2f[ll+1] = parms->basis_array[ll+1];
01497 }
01498
01499 if (usefastconstkernel) {
01500 cudaMemcpyToSymbol(const_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), 0);
01501 cudaMemcpyToSymbol(const_atompos, parms->atompos, 3 * parms->numatoms * sizeof(float), 0);
01502 cudaMemcpyToSymbol(const_atom_basis, parms->atom_basis, parms->numatoms * sizeof(int), 0);
01503 cudaMemcpyToSymbol(const_num_shells_per_atom, parms->num_shells_per_atom, parms->numatoms * sizeof(int), 0);
01504 cudaMemcpyToSymbol(const_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), 0);
01505 cudaMemcpyToSymbol(const_num_prim_per_shell, parms->num_prim_per_shell, parms->num_shells * sizeof(int), 0);
01506 cudaMemcpyToSymbol(const_shell_types, parms->shell_types, parms->num_shells * sizeof(int), 0);
01507 } else {
01508 padsz = computepaddedsize(parms->num_wave_f, MEMCOALESCE);
01509 cudaMalloc((void**)&d_wave_f, padsz * sizeof(float));
01510 cudaMemcpy(d_wave_f, parms->wave_f, parms->num_wave_f * sizeof(float), cudaMemcpyHostToDevice);
01511
01512
01513 padsz = computepaddedsize(16 * parms->numatoms, MEMCOALESCE);
01514 flint * h_atominfo = (flint *) calloc(1, padsz * sizeof(flint));
01515 cudaMalloc((void**)&d_atominfo, padsz * sizeof(flint));
01516 for (int ll=0; ll<parms->numatoms; ll++) {
01517 int addr = ll * 16;
01518 h_atominfo[addr ].f = parms->atompos[ll*3 ];
01519 h_atominfo[addr + 1].f = parms->atompos[ll*3 + 1];
01520 h_atominfo[addr + 2].f = parms->atompos[ll*3 + 2];
01521 h_atominfo[addr + 3].i = parms->atom_basis[ll];
01522 h_atominfo[addr + 4].i = parms->num_shells_per_atom[ll];
01523 }
01524 cudaMemcpy(d_atominfo, h_atominfo, padsz * sizeof(flint), cudaMemcpyHostToDevice);
01525 free(h_atominfo);
01526
01527 padsz = computepaddedsize(16 * parms->num_shells, MEMCOALESCE);
01528 int * h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
01529 cudaMalloc((void**)&d_shellinfo, padsz * sizeof(int));
01530 for (int ll=0; ll<parms->num_shells; ll++) {
01531 h_shellinfo[ll*16 ] = parms->num_prim_per_shell[ll];
01532 h_shellinfo[ll*16 + 1] = parms->shell_types[ll];
01533 }
01534 cudaMemcpy(d_shellinfo, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
01535 free(h_shellinfo);
01536
01537
01538 #if 1
01539
01540
01541
01542 padsz = computepaddedsize(2 * parms->num_shells, MEMCOALESCE);
01543 h_shellinfo = (int *) calloc(1, padsz * sizeof(int));
01544 cudaMalloc((void**)&d_shellinfo_packed, padsz * sizeof(int));
01545 for (int ll=0; ll<parms->num_shells; ll++) {
01546 h_shellinfo[ll*2 ] = parms->num_prim_per_shell[ll];
01547 h_shellinfo[ll*2 + 1] = parms->shell_types[ll];
01548 }
01549 cudaMemcpy(d_shellinfo_packed, h_shellinfo, padsz * sizeof(int), cudaMemcpyHostToDevice);
01550 free(h_shellinfo);
01551 #endif
01552
01553
01554 cudaMalloc((void**)&d_basis_array, padsz * sizeof(float));
01555 cudaMemcpy(d_basis_array, h_basis_array_exp2f, 2 * parms->num_basis * sizeof(float), cudaMemcpyHostToDevice);
01556
01557 padsz = computepaddedsize(3, MEMCOALESCE);
01558 cudaMalloc((void**)&d_origin, padsz * sizeof(float));
01559 cudaMemcpy(d_origin, origin, 3 * sizeof(float), cudaMemcpyHostToDevice);
01560
01561 cudaMalloc((void**)&d_numvoxels, padsz * sizeof(int));
01562 cudaMemcpy(d_numvoxels, numvoxels, 3 * sizeof(int), cudaMemcpyHostToDevice);
01563 }
01564
01565
01566 #if defined(USE_ZERO_COPY)
01567 if ((getenv("VMDCUDANOZEROCOPY") == NULL) &&
01568 (cudaHostAlloc((void **) &h_orbitalgrid, volmemsz, cudaHostAllocMapped) == cudaSuccess)) {
01569 h_orbitalgrid_zerocopy=1;
01570 cudaHostGetDevicePointer(&d_orbitalgrid, h_orbitalgrid, 0);
01571 CUERR
01572 } else {
01573 printf("WARNING: CUDA zero-copy pinned memory allocation failed!\n");
01574 #else
01575
01576 cudaMalloc((void**)&d_orbitalgrid, volmemsz);
01577 CUERR
01578 #endif
01579 #if defined(USE_PINNED_MEMORY)
01580
01581 if ((getenv("VMDCUDANOPINNEDMEMORY") == NULL) &&
01582 (cudaMallocHost((void **) &h_orbitalgrid, volmemsz) == cudaSuccess)) {
01583 h_orbitalgrid_pinnedalloc=1;
01584 } else {
01585 printf("WARNING: CUDA pinned memory allocation failed!\n");
01586 h_orbitalgrid_pinnedalloc=0;
01587 h_orbitalgrid = (float *) malloc(volmemsz);
01588 }
01589 #else
01590
01591 h_orbitalgrid = (float *) malloc(volmemsz);
01592 #endif
01593 #if defined(USE_ZERO_COPY)
01594 }
01595 #endif
01596
01597 #if 0
01598 if (threadid == 0) {
01599
01600 printf("atoms[%d] ", parms->numatoms);
01601 printf("wavef[%d] ", parms->num_wave_f);
01602 printf("basis[%d] ", parms->num_basis);
01603 printf("shell[%d] ", parms->num_shells);
01604 if (usefastconstkernel) {
01605 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
01606 printf("GPU constant memory (JIT)");
01607 #else
01608 printf("GPU constant memory");
01609 #endif
01610 } else {
01611 printf("GPU tiled shared memory:");
01612 }
01613 printf(" Gsz:%dx%d\n", Gsz.x, Gsz.y);
01614 }
01615 #endif
01616
01617 #if 0
01618 if (usefastconstkernel) {
01619 printf("Using constant memory\n");
01620 } else if (!usefastconstkernel && usecachekernel) {
01621 printf("Using Fermi/Kepler L1 cache\n");
01622 } else if (!usefastconstkernel) {
01623 printf("Using tiled shared memory kernel\n");
01624 }
01625 #endif
01626
01627
01628 wkf_tasktile_t tile;
01629 while (wkf_threadpool_next_tile(voidparms, maxtilesize, &tile) != WKF_SCHED_DONE) {
01630 for (int k=tile.start; k<tile.end; k++) {
01631 origin[2] = parms->origin[2] + parms->voxelsize * k;
01632
01633
01634 if (usefastconstkernel) {
01635 #if defined(VMDMOJITSRC) && defined(VMDMOJIT)
01636 cuorbitalconstmem_jit<<<Gsz, Bsz, 0>>>(parms->numatoms,
01637 parms->voxelsize,
01638 origin[0], origin[1], origin[2],
01639 parms->density, d_orbitalgrid);
01640 #else
01641 cuorbitalconstmem<<<Gsz, Bsz, 0>>>(parms->numatoms,
01642 parms->voxelsize,
01643 origin[0], origin[1], origin[2],
01644 parms->density, d_orbitalgrid);
01645 #endif
01646 } else {
01647 if (usecachekernel) {
01648
01649 if (hwpreferscachekernel >= 8) {
01650 #if 0
01651
01652 int ktilesize = tile.end - tile.start;
01653 Gsz.z = ktilesize;
01654 cuorbitalcachedglobmem_packed_4x_3D<<<Gsz, Bsz, 0>>>(parms->numatoms,
01655 d_wave_f, (float2 *) d_basis_array,
01656 d_atominfo, (int2 *) d_shellinfo_packed,
01657 parms->voxelsize,
01658 origin[0], origin[1], origin[2],
01659 parms->density, (float4 *) d_orbitalgrid);
01660 k = tile.end;
01661 #elif 1
01662 cuorbitalcachedglobmem_packed_4x<<<Gsz, Bsz, 0>>>(parms->numatoms,
01663 d_wave_f, (float2 *) d_basis_array,
01664 d_atominfo, (int2 *) d_shellinfo_packed,
01665 parms->voxelsize,
01666 origin[0], origin[1], origin[2],
01667 parms->density, (float4 *) d_orbitalgrid);
01668 #elif 1
01669 cuorbitalcachedglobmem_packed_2x<<<Gsz, Bsz, 0>>>(parms->numatoms,
01670 d_wave_f, (float2 *) d_basis_array,
01671 d_atominfo, (int2 *) d_shellinfo_packed,
01672 parms->voxelsize,
01673 origin[0], origin[1], origin[2],
01674 parms->density, (float2 *) d_orbitalgrid);
01675 #else
01676 cuorbitalcachedglobmem_packed<<<Gsz, Bsz, 0>>>(parms->numatoms,
01677 d_wave_f, (float2 *) d_basis_array,
01678 d_atominfo, (int2 *) d_shellinfo_packed,
01679 parms->voxelsize,
01680 origin[0], origin[1], origin[2],
01681 parms->density, d_orbitalgrid);
01682 #endif
01683 } else {
01684
01685
01686 cuorbitalcachedglobmem<<<Gsz, Bsz, 0>>>(parms->numatoms,
01687 d_wave_f, d_basis_array,
01688 d_atominfo, d_shellinfo,
01689 parms->voxelsize,
01690 origin[0], origin[1], origin[2],
01691 parms->density, d_orbitalgrid);
01692 }
01693 } else {
01694
01695
01696 cuorbitaltiledshared<<<Gsz, Bsz, 0>>>(parms->numatoms,
01697 d_wave_f, d_basis_array,
01698 d_atominfo, d_shellinfo,
01699 parms->voxelsize,
01700 origin[0], origin[1], origin[2],
01701 parms->density, d_orbitalgrid);
01702 }
01703 }
01704
01705 #if defined(USE_ZERO_COPY)
01706 if (h_orbitalgrid_zerocopy) {
01707 cudaDeviceSynchronize();
01708 } else {
01709 #endif
01710 CUERR
01711
01712
01713 cudaMemcpy(h_orbitalgrid, d_orbitalgrid, volmemsz, cudaMemcpyDeviceToHost);
01714 CUERR
01715 #if defined(USE_ZERO_COPY)
01716 }
01717 #endif
01718
01719
01720 int planesz = numvoxels[0] * numvoxels[1];
01721 for (int y=0; y<numvoxels[1]; y++) {
01722 long orbaddr = k*planesz + y*numvoxels[0];
01723 memcpy(&parms->orbitalgrid[orbaddr], &h_orbitalgrid[y*volsize.x], numvoxels[0] * sizeof(float));
01724 }
01725 }
01726 cudaDeviceSynchronize();
01727 }
01728
01729 free(h_basis_array_exp2f);
01730
01731 #if defined(USE_ZERO_COPY)
01732 if (h_orbitalgrid_zerocopy) {
01733 cudaFreeHost(h_orbitalgrid);
01734 } else {
01735 #endif
01736 #if defined(USE_PINNED_MEMORY)
01737 if (h_orbitalgrid_pinnedalloc)
01738 cudaFreeHost(h_orbitalgrid);
01739 else
01740 free(h_orbitalgrid);
01741 #else
01742 free(h_orbitalgrid);
01743 #endif
01744 cudaFree(d_orbitalgrid);
01745 #if defined(USE_ZERO_COPY)
01746 }
01747 #endif
01748
01749 if (!usefastconstkernel) {
01750 cudaFree(d_wave_f);
01751 cudaFree(d_basis_array);
01752 cudaFree(d_atominfo);
01753 cudaFree(d_shellinfo);
01754 cudaFree(d_numvoxels);
01755 cudaFree(d_origin);
01756 }
01757
01758 CUERR
01759
01760 return NULL;
01761 }
01762
01763
01764 int vmd_cuda_evaluate_orbital_grid(wkf_threadpool_t *devpool,
01765 int numatoms,
01766 const float *wave_f, int num_wave_f,
01767 const float *basis_array, int num_basis,
01768 const float *atompos,
01769 const int *atom_basis,
01770 const int *num_shells_per_atom,
01771 const int *num_prim_per_shell,
01772 const int *shell_types,
01773 int num_shells,
01774 const int *numvoxels,
01775 float voxelsize,
01776 const float *origin,
01777 int density,
01778 float *orbitalgrid) {
01779 int rc=0;
01780 orbthrparms parms;
01781
01782 if (devpool == NULL) {
01783 return -1;
01784 }
01785
01786 #if 0
01787
01788
01789 if (getenv("VMDCUDADEV")) {
01790 rc=cudaSetDevice(atoi(getenv("VMDCUDADEV")));
01791 } else {
01792 rc=cudaSetDevice(0);
01793 }
01794
01795 if (rc != cudaSuccess) {
01796 #if CUDART_VERSION >= 2010
01797 rc = cudaGetLastError();
01798 if (rc != cudaErrorSetOnActiveProcess)
01799 return NULL;
01800 #else
01801 cudaGetLastError();
01802
01803 #endif
01804 }
01805
01806
01807 cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost);
01808
01809 #if CUDART_VERSION >= 3000
01810
01811 cudaFuncSetCacheConfig(cuorbitalcachedglobmem, cudaFuncCachePreferL1);
01812
01813
01814 cudaFuncSetCacheConfig(cuorbitaltiledshared, cudaFuncCachePreferShared);
01815 #endif
01816 #endif
01817
01818 #if defined(VMDMOJITSRC)
01819 if (getenv("VMDMOJIT") != NULL) {
01820
01821 orbital_jit_generate(ORBITAL_JIT_CUDA, getenv("VMDMOJITSRCFILE"),
01822 numatoms, wave_f, basis_array, atom_basis,
01823 num_shells_per_atom, num_prim_per_shell, shell_types);
01824
01825
01826 orbital_jit_generate(ORBITAL_JIT_OPENCL, "/tmp/mojit.cl",
01827 numatoms, wave_f, basis_array, atom_basis,
01828 num_shells_per_atom, num_prim_per_shell, shell_types);
01829 return 0;
01830 }
01831 #endif
01832
01833 parms.numatoms = numatoms;
01834 parms.wave_f = wave_f;
01835 parms.num_wave_f = num_wave_f;
01836 parms.basis_array = basis_array;
01837 parms.num_basis = num_basis;
01838 parms.atompos = atompos;
01839 parms.atom_basis = atom_basis;
01840 parms.num_shells_per_atom = num_shells_per_atom;
01841 parms.num_prim_per_shell = num_prim_per_shell;
01842 parms.shell_types = shell_types;
01843 parms.num_shells = num_shells;
01844 parms.numvoxels = numvoxels;
01845 parms.voxelsize = voxelsize;
01846 parms.origin = origin;
01847 parms.density = density;
01848 parms.orbitalgrid = orbitalgrid;
01849
01850 wkf_tasktile_t tile;
01851 tile.start=0;
01852 tile.end=numvoxels[2];
01853 wkf_threadpool_sched_dynamic(devpool, &tile);
01854 wkf_threadpool_launch(devpool, cudaorbitalthread, &parms, 1);
01855
01856 return rc;
01857 }
01858
01859