/*
 * QuickSurf gaussian density map kernel, AVX2-accelerated CPU path.
 */
#if (VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#include <immintrin.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "QuickSurf.h"
#include "Inform.h"
#include "utilities.h"
#include "WKFUtils.h"
#include "VolumetricData.h"

#include "VMDDisplayList.h"
#include "Displayable.h"
#include "DispCmds.h"

#define MIN(X,Y) (((X)<(Y))? (X) : (Y))
#define MAX(X,Y) (((X)>(Y))? (X) : (Y))
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
#define __align(X) __attribute__((aligned(X) ))
#if (__GNUC__ < 4)
#define MISSING_mm_cvtsd_f64
#endif
#else
#define __align(X) __declspec(align(X) )
#endif

/* MLOG2EF is -log2(e); it folds the conversion from a natural exponential */
/* to a base-2 exponential into the precomputed per-atom coefficient.      */
#define MLOG2EF    -1.44269504088896f
/* Polynomial coefficients for approximating 2^x on the interval [-1, 0], */
/* used for the fractional part of the exponent.                          */
#define SCEXP0     1.0000000000000000f
#define SCEXP1     0.6987082824680118f
#define SCEXP2     0.2633174272827404f
#define SCEXP3     0.0923611991471395f
#define SCEXP4     0.0277520543324108f

/* IEEE-754 single-precision exponent bias and mantissa width, used to */
/* construct 2^N directly from its bit pattern.                        */
#define EXPOBIAS   127
#define EXPOSHIFT   23

/* Exponent cutoff; contributions smaller than 2^ACUTOFF are treated as negligible. */
#define ACUTOFF    -10

/* Union for reinterpreting the bits of a float as an int, used to */
/* assemble the 2^N scale factor without a conversion instruction.  */
typedef union flint_t {
  float f;
  int n;
} flint;
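/*
 * Illustrative scalar reference for the exponential approximation used by
 * the kernels below (this helper is an editorial sketch, not part of the
 * original kernel, and the name aexpfnx_ref is hypothetical).  For x <= 0,
 * exp(x) = 2^(x*log2(e)) is split into an integer exponent built directly
 * in the IEEE-754 exponent field, and a fractional part approximated by
 * the SCEXP polynomial on (-1, 0].
 */
static inline float aexpfnx_ref(float x) {
  float mb = x * MLOG2EF;    /* mb = -x*log2(e), so mb >= 0 for x <= 0 */
  int mbflr = (int) mb;      /* integer part (floor, since mb >= 0)    */
  float d = mbflr - mb;      /* fractional remainder, -1 < d <= 0      */

  /* polynomial approximation of 2^d on (-1, 0] */
  float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));

  /* construct 2^(-mbflr) exactly by writing the IEEE-754 exponent field */
  flint scalfac;
  scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;

  /* 2^d * 2^(-mbflr) = 2^(-mb) = exp(x) */
  return sy * scalfac.f;
}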
#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
/* Union providing both SIMD and per-lane scalar views of an AVX register. */
typedef union AVXreg_t {
  __m256  f;  // 8x float (AVX)
  __m256i i;  // 8x 32-bit int (AVX2)
  struct {
    float r0, r1, r2, r3, r4, r5, r6, r7;  // individual lanes
  } floatreg;
} AVXreg;
#endif
/*
 * Compute a Gaussian density map for the given atoms.  When voltexmap is
 * non-NULL, a 3-component color (texture) map is accumulated alongside the
 * density map.
 */
void vmd_gaussdensity_avx2(int verbose,
                           int natoms, const float *xyzr,
                           const float *atomicnum,
                           const float *colors,
                           float *densitymap, float *voltexmap,
                           const int *numvoxels,
                           float radscale, float gridspacing,
                           float isovalue, float gausslim) {
  int i, x, y, z;
  int maxvoxel[3];
  maxvoxel[0] = numvoxels[0]-1;
  maxvoxel[1] = numvoxels[1]-1;
  maxvoxel[2] = numvoxels[2]-1;
  const float invgridspacing = 1.0f / gridspacing;

#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
  // Variables for the AVX2 optimized inner loop
  __m256 gridspacing8_8 = _mm256_set1_ps(gridspacing * 8.0f);
  __attribute__((aligned(32))) float sxdelta8[8]; // 32-byte aligned for _mm256_load_ps()
  for (x=0; x<8; x++)
    sxdelta8[x] = ((float) x) * gridspacing;
#endif
  // compute both the density map and the floating point color texture map
  if (voltexmap != NULL) {
    float invisovalue = 1.0f / isovalue;
    for (i=0; i<natoms; i++) {
      if (verbose && ((i & 0x3fff) == 0)) {
        printf(".");
        fflush(stdout);
      }

      ptrdiff_t ind = i*4L;
      float scaledrad = xyzr[ind + 3] * radscale;

      // scale the Gaussian amplitude by atomic number when provided
      float atomicnumfactor = 1.0f;
      if (atomicnum != NULL) {
        atomicnumfactor = atomicnum[i];
      }

      // precompute the Gaussian exponent coefficient, folding in the
      // negation and base-2 conversion (MLOG2EF)
      float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
      float radlim = gausslim * scaledrad;
      float radlim2 = radlim * radlim;
      radlim *= invgridspacing;
#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
      __m256 atomicnumfactor_8 = _mm256_set1_ps(atomicnumfactor);
#if VMDUSESVMLEXP
      // the Intel SVML exp2() path uses an adjusted exponent pre-scaling factor
      __m256 arinv_8 = _mm256_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF);
#else
      __m256 arinv_8 = _mm256_set1_ps(arinv);
#endif
#endif

      // compute the range of voxels affected by this atom
      float tmp;
      tmp = xyzr[ind  ] * invgridspacing;
      int xmin = MAX((int) (tmp - radlim), 0);
      int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
      tmp = xyzr[ind+1] * invgridspacing;
      int ymin = MAX((int) (tmp - radlim), 0);
      int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
      tmp = xyzr[ind+2] * invgridspacing;
      int zmin = MAX((int) (tmp - radlim), 0);
      int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
      float dz = zmin*gridspacing - xyzr[ind+2];
      for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
        float dy = ymin*gridspacing - xyzr[ind+1];
        for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
          float dy2dz2 = dy*dy + dz*dz;

          // early-exit when the y/z distance alone exceeds the cutoff radius
          if (dy2dz2 >= radlim2)
            continue;

          ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) +
                           ptrdiff_t(y * numvoxels[0]);
          float dx = xmin*gridspacing - xyzr[ind];
          x=xmin;

#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
          // Process eight voxels along x per iteration using AVX2
          {
            __align(16) AVXreg n;
            __align(16) AVXreg y;
            __m256 dy2dz2_8 = _mm256_set1_ps(dy2dz2);
            __m256 dx_8 = _mm256_add_ps(_mm256_set1_ps(dx), _mm256_load_ps(&sxdelta8[0]));
            for (; (x+7)<=xmax; x+=8,dx_8=_mm256_add_ps(dx_8, gridspacing8_8)) {
              __m256 r2 = _mm256_fmadd_ps(dx_8, dx_8, dy2dz2_8);
              __m256 d;
#if VMDUSESVMLEXP
              // use Intel's SVML exp2() routine
              y.f = _mm256_exp2_ps(_mm256_mul_ps(r2, arinv_8));
#else
              // fully inlined exponential approximation (no library call)
              y.f = _mm256_mul_ps(r2, arinv_8);   /* already negated and in base 2 */
              n.i = _mm256_cvttps_epi32(y.f);
              d = _mm256_cvtepi32_ps(n.i);
              d = _mm256_sub_ps(d, y.f);

              // Approximate 2^d for the fractional part d in (-1, 0]
              // with the SCEXP polynomial, evaluated by Horner's rule.
#if defined(__FMA__)
              // use FMA3 instructions when available
              y.f = _mm256_fmadd_ps(d, _mm256_set1_ps(SCEXP4), _mm256_set1_ps(SCEXP3));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP2));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP1));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP0));
#else
              y.f = _mm256_mul_ps(d, _mm256_set1_ps(SCEXP4));
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP3));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP2));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP1));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP0));
#endif

              // Construct 2^(-N) exactly by writing the IEEE-754 exponent
              // bits directly, then scale the polynomial result by it and
              // by the atomic number factor.
              n.i = _mm256_sub_epi32(_mm256_set1_epi32(EXPOBIAS), n.i);
              n.i = _mm256_slli_epi32(n.i, EXPOSHIFT);
              y.f = _mm256_mul_ps(y.f, n.f);
              y.f = _mm256_mul_ps(y.f, atomicnumfactor_8);
#endif
              // The density map is not guaranteed to be 32-byte aligned,
              // so unaligned loads/stores are used to accumulate density.
              float *ufptr = &densitymap[addr + x];
              d = _mm256_loadu_ps(ufptr);
              _mm256_storeu_ps(ufptr, _mm256_add_ps(d, y.f));

              // Accumulate density-scaled color contributions into the
              // texture map; density is pre-scaled by 1/isovalue.
              d = _mm256_mul_ps(y.f, _mm256_set1_ps(invisovalue));
              ptrdiff_t caddr = (addr + x) * 3L;
#if 0 && VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
              // Disabled: gather/scatter the RGB texture elements with
              // AVX shuffles (3-element AoS <-> 8-lane SoA transpose).
              float *txptr = &voltexmap[caddr];
              __m128 *m = (__m128 *) txptr;

              // load 8 RGB triples (24 floats) as six 128-bit rows
              __m256 m03 = _mm256_castps128_ps256(m[0]);
              __m256 m14 = _mm256_castps128_ps256(m[1]);
              __m256 m25 = _mm256_castps128_ps256(m[2]);
              m03 = _mm256_insertf128_ps(m03, m[3], 1);
              m14 = _mm256_insertf128_ps(m14, m[4], 1);
              m25 = _mm256_insertf128_ps(m25, m[5], 1);

              // deinterleave into separate r, g, b vectors
              __m256 rg = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2,1,3,2));
              __m256 gb = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1,0,2,1));
              __m256 r  = _mm256_shuffle_ps(m03, rg , _MM_SHUFFLE(2,0,3,0));
              __m256 g  = _mm256_shuffle_ps(gb , rg , _MM_SHUFFLE(3,1,2,0));
              __m256 b  = _mm256_shuffle_ps(gb , m25, _MM_SHUFFLE(3,0,3,1));

              // accumulate density-weighted atom color
              r = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind    ]), r);
              g = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind + 1]), g);
              b = _mm256_fmadd_ps(d, _mm256_set1_ps(colors[ind + 2]), b);

              // reinterleave r, g, b back into AoS layout
              __m256 rrg = _mm256_shuffle_ps(r, g, _MM_SHUFFLE(2,0,2,0));
              __m256 rgb = _mm256_shuffle_ps(g, b, _MM_SHUFFLE(3,1,3,1));
              __m256 rbr = _mm256_shuffle_ps(b, r, _MM_SHUFFLE(3,1,2,0));
              __m256 r03 = _mm256_shuffle_ps(rrg, rbr, _MM_SHUFFLE(2,0,2,0));
              __m256 r14 = _mm256_shuffle_ps(rgb, rrg, _MM_SHUFFLE(3,1,2,0));
              __m256 r25 = _mm256_shuffle_ps(rbr, rgb, _MM_SHUFFLE(3,1,3,1));

              // store the six 128-bit rows back to memory
              m[0] = _mm256_castps256_ps128( r03 );
              m[1] = _mm256_castps256_ps128( r14 );
              m[2] = _mm256_castps256_ps128( r25 );
              m[3] = _mm256_extractf128_ps( r03 ,1);
              m[4] = _mm256_extractf128_ps( r14 ,1);
              m[5] = _mm256_extractf128_ps( r25 ,1);
#else
              // Scalar fallback: accumulate density-weighted colors for the
              // eight lanes into the RGB texture map
              float r, g, b;
              r = colors[ind    ];
              g = colors[ind + 1];
              b = colors[ind + 2];

              AVXreg tmp;
              tmp.f = d;
              float density;
              density = tmp.floatreg.r0;
              voltexmap[caddr     ] += density * r;
              voltexmap[caddr +  1] += density * g;
              voltexmap[caddr +  2] += density * b;

              density = tmp.floatreg.r1;
              voltexmap[caddr +  3] += density * r;
              voltexmap[caddr +  4] += density * g;
              voltexmap[caddr +  5] += density * b;

              density = tmp.floatreg.r2;
              voltexmap[caddr +  6] += density * r;
              voltexmap[caddr +  7] += density * g;
              voltexmap[caddr +  8] += density * b;

              density = tmp.floatreg.r3;
              voltexmap[caddr +  9] += density * r;
              voltexmap[caddr + 10] += density * g;
              voltexmap[caddr + 11] += density * b;

              density = tmp.floatreg.r4;
              voltexmap[caddr + 12] += density * r;
              voltexmap[caddr + 13] += density * g;
              voltexmap[caddr + 14] += density * b;

              density = tmp.floatreg.r5;
              voltexmap[caddr + 15] += density * r;
              voltexmap[caddr + 16] += density * g;
              voltexmap[caddr + 17] += density * b;

              density = tmp.floatreg.r6;
              voltexmap[caddr + 18] += density * r;
              voltexmap[caddr + 19] += density * g;
              voltexmap[caddr + 20] += density * b;

              density = tmp.floatreg.r7;
              voltexmap[caddr + 21] += density * r;
              voltexmap[caddr + 22] += density * g;
              voltexmap[caddr + 23] += density * b;
#endif
            }
          }
#endif
          // Scalar tail loop for voxels not handled by the vectorized loop
          for (; x<=xmax; x++,dx+=gridspacing) {
            float r2 = dx*dx + dy2dz2;

            // fully inlined exponential approximation (no library call)
            float mb = r2 * arinv;         /* already negated and in base 2   */
            int mbflr = (int) mb;          /* integer part (floor, mb >= 0)   */
            float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */

            // Approximate 2^d with the SCEXP polynomial (Horner form)
            float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));

            // Build 2^(-mbflr) exactly by writing the IEEE-754 exponent bits
            flint scalfac;
            scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;

            // density = 2^d * 2^(-mbflr) = 2^(-mb) = exp(-r^2 / (2*scaledrad^2))
            float density = (sy * scalfac.f);

            density *= atomicnumfactor;    // scale density by atomic number

            // accumulate density into the density map
            densitymap[addr + x] += density;

            // Accumulate density-scaled color contributions into the
            // texture map; density is pre-scaled by 1/isovalue.
            density *= invisovalue;
            ptrdiff_t caddr = (addr + x) * 3L;

            voltexmap[caddr    ] += density * colors[ind    ];
            voltexmap[caddr + 1] += density * colors[ind + 1];
            voltexmap[caddr + 2] += density * colors[ind + 2];
          }
        }
      }
    }
  } else {
    // compute the density map only
    for (i=0; i<natoms; i++) {
      if (verbose && ((i & 0x3fff) == 0)) {
        printf(".");
        fflush(stdout);
      }

      ptrdiff_t ind = i*4L;
      float scaledrad = xyzr[ind+3] * radscale;

      // scale the Gaussian amplitude by atomic number when provided
      float atomicnumfactor = 1.0f;
      if (atomicnum != NULL) {
        atomicnumfactor = atomicnum[i];
      }

      // precompute the Gaussian exponent coefficient, folding in the
      // negation and base-2 conversion (MLOG2EF)
      float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
      float radlim = gausslim * scaledrad;
      float radlim2 = radlim * radlim;
      radlim *= invgridspacing;
#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
      __m256 atomicnumfactor_8 = _mm256_set1_ps(atomicnumfactor);
#if VMDUSESVMLEXP
      // the Intel SVML exp2() path uses an adjusted exponent pre-scaling factor
      __m256 arinv_8 = _mm256_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF);
#else
      __m256 arinv_8 = _mm256_set1_ps(arinv);
#endif
#endif

      // compute the range of voxels affected by this atom
      float tmp;
      tmp = xyzr[ind  ] * invgridspacing;
      int xmin = MAX((int) (tmp - radlim), 0);
      int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
      tmp = xyzr[ind+1] * invgridspacing;
      int ymin = MAX((int) (tmp - radlim), 0);
      int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
      tmp = xyzr[ind+2] * invgridspacing;
      int zmin = MAX((int) (tmp - radlim), 0);
      int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
      float dz = zmin*gridspacing - xyzr[ind+2];
      for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
        float dy = ymin*gridspacing - xyzr[ind+1];
        for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
          float dy2dz2 = dy*dy + dz*dz;

          // early-exit when the y/z distance alone exceeds the cutoff radius
          if (dy2dz2 >= radlim2)
            continue;

          ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) +
                           ptrdiff_t(y * numvoxels[0]);
          float dx = xmin*gridspacing - xyzr[ind];
          x=xmin;

#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
          // Process eight voxels along x per iteration using AVX2
          {
            __align(16) AVXreg n;
            __align(16) AVXreg y;
            __m256 dy2dz2_8 = _mm256_set1_ps(dy2dz2);
            __m256 dx_8 = _mm256_add_ps(_mm256_set1_ps(dx), _mm256_load_ps(&sxdelta8[0]));
            for (; (x+7)<=xmax; x+=8,dx_8=_mm256_add_ps(dx_8, gridspacing8_8)) {
              __m256 r2 = _mm256_fmadd_ps(dx_8, dx_8, dy2dz2_8);
              __m256 d;
#if VMDUSESVMLEXP
              // use Intel's SVML exp2() routine
              y.f = _mm256_exp2_ps(_mm256_mul_ps(r2, arinv_8));
#else
              // fully inlined exponential approximation (no library call)
              y.f = _mm256_mul_ps(r2, arinv_8);   /* already negated and in base 2 */
              n.i = _mm256_cvttps_epi32(y.f);
              d = _mm256_cvtepi32_ps(n.i);
              d = _mm256_sub_ps(d, y.f);

              // Approximate 2^d for the fractional part d in (-1, 0]
              // with the SCEXP polynomial, evaluated by Horner's rule.
#if defined(__FMA__)
              // use FMA3 instructions when available
              y.f = _mm256_fmadd_ps(d, _mm256_set1_ps(SCEXP4), _mm256_set1_ps(SCEXP3));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP2));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP1));
              y.f = _mm256_fmadd_ps(y.f, d, _mm256_set1_ps(SCEXP0));
#else
              y.f = _mm256_mul_ps(d, _mm256_set1_ps(SCEXP4));
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP3));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP2));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP1));
              y.f = _mm256_mul_ps(y.f, d);
              y.f = _mm256_add_ps(y.f, _mm256_set1_ps(SCEXP0));
#endif

              // Construct 2^(-N) exactly by writing the IEEE-754 exponent
              // bits directly, then scale the polynomial result by it and
              // by the atomic number factor.
              n.i = _mm256_sub_epi32(_mm256_set1_epi32(EXPOBIAS), n.i);
              n.i = _mm256_slli_epi32(n.i, EXPOSHIFT);
              y.f = _mm256_mul_ps(y.f, n.f);
              y.f = _mm256_mul_ps(y.f, atomicnumfactor_8);
#endif

              // The density map is not guaranteed to be 32-byte aligned,
              // so unaligned loads/stores are used to accumulate density.
              float *ufptr = &densitymap[addr + x];
              d = _mm256_loadu_ps(ufptr);
              _mm256_storeu_ps(ufptr, _mm256_add_ps(d, y.f));
            }
          }
#endif
          // Scalar tail loop for voxels not handled by the vectorized loop
          for (; x<=xmax; x++,dx+=gridspacing) {
            float r2 = dx*dx + dy2dz2;

            // fully inlined exponential approximation (no library call)
            float mb = r2 * arinv;         /* already negated and in base 2   */
            int mbflr = (int) mb;          /* integer part (floor, mb >= 0)   */
            float d = mbflr - mb;          /* remaining exponent, -1 < d <= 0 */

            // Approximate 2^d with the SCEXP polynomial (Horner form)
            float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));

            // Build 2^(-mbflr) exactly by writing the IEEE-754 exponent bits
            flint scalfac;
            scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;

            // density = 2^d * 2^(-mbflr) = 2^(-mb) = exp(-r^2 / (2*scaledrad^2))
            float density = (sy * scalfac.f);

            density *= atomicnumfactor;    // scale density by atomic number

            densitymap[addr + x] += density;
          }
        }
      }
    }
  }
  // prevent x86 AVX-SSE transition penalties in subsequent SSE code
#if VMDUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
  _mm256_zeroupper();
#endif
}
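#if 0
/*
 * Minimal usage sketch (illustrative only and compiled out; the example
 * function name and all parameter values are hypothetical, not part of
 * VMD's API surface).  Computes a density-only map by passing NULL for
 * the atomicnum, colors, and voltexmap parameters.
 */
static void example_density_only(void) {
  const int numvoxels[3] = { 64, 64, 64 };
  const int natoms = 2;
  // x, y, z, radius quadruples, in the same length units as gridspacing
  float xyzr[2*4] = { 4.0f, 4.0f, 4.0f, 1.5f,
                      8.0f, 8.0f, 8.0f, 1.5f };
  float *densitymap = (float *) calloc((size_t) numvoxels[0] * numvoxels[1] *
                                       numvoxels[2], sizeof(float));

  vmd_gaussdensity_avx2(0,          // verbose
                        natoms, xyzr,
                        NULL,       // atomicnum (optional scaling factors)
                        NULL,       // colors (unused without voltexmap)
                        densitymap,
                        NULL,       // voltexmap: density-only path
                        numvoxels,
                        1.0f,       // radscale
                        0.25f,      // gridspacing
                        0.5f,       // isovalue
                        2.0f);      // gausslim
  free(densitymap);
}
#endif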