00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 #if !defined(__PGIC__)
00023 
00024 
00025 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00026 #if !defined(__SSE2__) && defined(_WIN64)
00027 #define __SSE2__ 1      
00028 #endif
00029 #endif
00030 
00031 #define VMDQSURFUSESSE 1
00032 #endif
00033 
00034 
00035 
00036 
00037 
00038 
00039 
00040 
00041 #if !defined(__GNUC__) && defined(__VEC__)
00042 #define VMDQSURFUSEVSX 1
00043 #endif
00044 
00045 #include <stdio.h>
00046 #include <stdlib.h>
00047 #if VMDQSURFUSESSE && defined(__SSE2__) 
00048 #include <emmintrin.h>
00049 #endif
00050 
00051 #if (defined(VMDQSURFUSEVSX) && defined(__VSX__))
00052 #if defined(__GNUC__) && defined(__VEC__)
00053 #include <altivec.h>
00054 #endif
00055 #endif
00056 
00057 #include <string.h>
00058 #include <math.h>
00059 #include "QuickSurf.h"
00060 #if defined(VMDCUDA)
00061 #include "CUDAQuickSurf.h"
00062 #endif
00063 #include "Measure.h"
00064 #include "Inform.h"
00065 #include "utilities.h"
00066 #include "WKFUtils.h"
00067 #include "VolumetricData.h"
00068 
00069 #include "VMDDisplayList.h"
00070 #include "Displayable.h"
00071 #include "DispCmds.h"
00072 #include "ProfileHooks.h"
00073 #include "VMDApp.h" 
00074 
00075 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00076 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00077 
00078 
00079 void vmd_gaussdensity_avx2(int verbose,
00080                            int natoms, const float *xyzr,
00081                            const float *atomicnum,
00082                            const float *colors,
00083                            float *densitymap, float *voltexmap,
00084                            const int *numvoxels,
00085                            float radscale, float gridspacing,
00086                            float isovalue, float gausslim);
00087 
00088 void vmd_gaussdensity_neon(int verbose,
00089                            int natoms, const float *xyzr,
00090                            const float *atomicnum,
00091                            const float *colors,
00092                            float *densitymap, float *voltexmap,
00093                            const int *numvoxels,
00094                            float radscale, float gridspacing,
00095                            float isovalue, float gausslim);
00096 
00097 
00098 
00099 
00100 
00101 
00102 
00103 
00104 
00105 
00106 
00107 
00108 
00109 
00110 
00111 
00112 
00113 
00114 
00115 
00116 
00117 
00118 
00119 
00120 
00121 
00122 
00123 
00124 
00125 
00126 
00127 
00128 
00129 
00130 
00131 
00132 
00133 
00134 
00135 
00136 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00137 #define __align(X)  __attribute__((aligned(X) ))
00138 #if (__GNUC__ < 4)
00139 #define MISSING_mm_cvtsd_f64
00140 #endif
00141 #else
00142 #define __align(X) __declspec(align(X) )
00143 #endif
00144 
00145 #define MLOG2EF    -1.44269504088896f
00146 
00147 
00148 
00149 
00150 
00151 #define SCEXP0     1.0000000000000000f
00152 #define SCEXP1     0.6987082824680118f
00153 #define SCEXP2     0.2633174272827404f
00154 #define SCEXP3     0.0923611991471395f
00155 #define SCEXP4     0.0277520543324108f
00156 
00157 
00158 #define EXPOBIAS   127
00159 #define EXPOSHIFT   23
00160 
00161 
00162 #define ACUTOFF    -10
00163 
00164 typedef union flint_t {
00165   float f;
00166   int n;
00167 } flint;
00168 
00169 #if VMDQSURFUSESSE && defined(__SSE2__)
00170 
00171 typedef union SSEreg_t {
00172   __m128  f;  
00173   __m128i i;  
00174 } SSEreg;
00175 #endif
00176 
00177 
00178 #if 0
00179 static float aexpfnx(float x) {
00180   
00181   float mb;
00182   int mbflr;
00183   float d;
00184   float sy;
00185   flint scalfac;
00186 
00187   if (x < ACUTOFF) return 0.f;
00188 
00189   mb = x * MLOG2EF;    
00190   mbflr = (int) mb;    
00191   d = mbflr - mb;      
00192   sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00193                        
00194   scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  
00195   return (sy * scalfac.f);  
00196 }
00197 
00198 
00199 static void vmd_gaussdensity(int verbose, 
00200                              int natoms, const float *xyzr,
00201                              const float *atomicnum,
00202                              const float *colors,
00203                              float *densitymap, float *voltexmap, 
00204                              const int *numvoxels, 
00205                              float radscale, float gridspacing, 
00206                              float isovalue, float gausslim) {
00207   int i, x, y, z;
00208   int maxvoxel[3];
00209   maxvoxel[0] = numvoxels[0]-1; 
00210   maxvoxel[1] = numvoxels[1]-1; 
00211   maxvoxel[2] = numvoxels[2]-1; 
00212   const float invgridspacing = 1.0f / gridspacing;
00213 
00214   
00215   if (voltexmap != NULL) {
00216     float invisovalue = 1.0f / isovalue;
00217     
00218     for (i=0; i<natoms; i++) {
00219       if (verbose && ((i & 0x3fff) == 0)) {
00220         printf("."); 
00221         fflush(stdout);
00222       }
00223 
00224       ptrdiff_t ind = i*4L;
00225       float scaledrad = xyzr[ind + 3L] * radscale;
00226 
00227       
00228       float atomicnumfactor = 1.0f;
00229       if (atomicnum != NULL) {
00230         atomicnumfactor = atomicnum[i];
00231       }
00232 
00233       float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00234       float radlim = gausslim * scaledrad;
00235       float radlim2 = radlim * radlim; 
00236       radlim *= invgridspacing;
00237 
00238       float tmp;
00239       tmp = xyzr[ind  ] * invgridspacing;
00240       int xmin = MAX((int) (tmp - radlim), 0);
00241       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00242       tmp = xyzr[ind+1] * invgridspacing;
00243       int ymin = MAX((int) (tmp - radlim), 0);
00244       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00245       tmp = xyzr[ind+2] * invgridspacing;
00246       int zmin = MAX((int) (tmp - radlim), 0);
00247       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00248 
00249       float dz = zmin*gridspacing - xyzr[ind+2];
00250       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00251         float dy = ymin*gridspacing - xyzr[ind+1];
00252         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00253           float dy2dz2 = dy*dy + dz*dz;
00254 
00255           
00256           if (dy2dz2 >= radlim2) 
00257             continue;
00258 
00259           int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0];
00260           float dx = xmin*gridspacing - xyzr[ind];
00261           for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00262             float r2 = dx*dx + dy2dz2;
00263             float expval = -r2 * arinv;
00264 #if VMDUSEFULLEXP
00265             
00266             float density = exp(expval);
00267 #else
00268             
00269             float density = aexpfnx(expval);
00270 #endif
00271 
00272             density *= atomicnumfactor; 
00273 
00274             
00275             densitymap[addr + x] += density;
00276 
00277             
00278             
00279             
00280             density *= invisovalue;
00281             ptrdiff_t caddr = (addr + x) * 3L;
00282 
00283             
00284             voltexmap[caddr    ] += density * colors[ind    ];
00285             voltexmap[caddr + 1] += density * colors[ind + 1];
00286             voltexmap[caddr + 2] += density * colors[ind + 2];
00287           }
00288         }
00289       }
00290     }
00291   } else {
00292     
00293     for (i=0; i<natoms; i++) {
00294       if (verbose && ((i & 0x3fff) == 0)) {
00295         printf("."); 
00296         fflush(stdout);
00297       }
00298 
00299       ptrdiff_t ind = i*4L;
00300       float scaledrad = xyzr[ind + 3] * radscale;
00301 
00302       
00303       float atomicnumfactor = 1.0f;
00304       if (atomicnum != NULL) {
00305         atomicnumfactor = atomicnum[i];
00306       }
00307 
00308       float arinv = 1.0f/(2.0f*scaledrad*scaledrad);
00309       float radlim = gausslim * scaledrad;
00310       float radlim2 = radlim * radlim; 
00311       radlim *= invgridspacing;
00312 
00313       float tmp;
00314       tmp = xyzr[ind  ] * invgridspacing;
00315       int xmin = MAX((int) (tmp - radlim), 0);
00316       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00317       tmp = xyzr[ind+1] * invgridspacing;
00318       int ymin = MAX((int) (tmp - radlim), 0);
00319       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00320       tmp = xyzr[ind+2] * invgridspacing;
00321       int zmin = MAX((int) (tmp - radlim), 0);
00322       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00323 
00324       float dz = zmin*gridspacing - xyzr[ind+2];
00325       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00326         float dy = ymin*gridspacing - xyzr[ind+1];
00327         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00328           float dy2dz2 = dy*dy + dz*dz;
00329 
00330           
00331           if (dy2dz2 >= radlim2) 
00332             continue;
00333 
00334           int addr = z * numvoxels[0] * numvoxels[1] + y * numvoxels[0];
00335           float dx = xmin*gridspacing - xyzr[ind];
00336           for (x=xmin; x<=xmax; x++,dx+=gridspacing) {
00337             float r2 = dx*dx + dy2dz2;
00338             float expval = -r2 * arinv;
00339 #if VMDUSEFULLEXP
00340             
00341             float density = exp(expval);
00342 #else
00343             
00344             float density = aexpfnx(expval);
00345 #endif
00346 
00347             density *= atomicnumfactor; 
00348 
00349             
00350             densitymap[addr + x] += density;
00351           }
00352         }
00353       }
00354     }
00355   }
00356 }
00357 #endif
00358 
00359 
00360 
00361 static void vmd_gaussdensity_opt(wkf_cpu_caps_t *cpucaps, int verbose,
00362                                  int natoms, const float *xyzr,
00363                                  const float *atomicnum,
00364                                  const float *colors,
00365                                  float *densitymap, float *voltexmap, 
00366                                  const int *numvoxels, 
00367                                  float radscale, float gridspacing, 
00368                                  float isovalue, float gausslim) {
00369   int i, x, y, z;
00370   int maxvoxel[3];
00371   maxvoxel[0] = numvoxels[0]-1; 
00372   maxvoxel[1] = numvoxels[1]-1; 
00373   maxvoxel[2] = numvoxels[2]-1; 
00374   const float invgridspacing = 1.0f / gridspacing;
00375 
00376   
00377   
00378   
00379   
00380   
00381 #if defined(VMDCPUDISPATCH)
00382 
00383 
00384 #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
00385 
00386   if ((cpucaps != NULL) && 
00387       (cpucaps->flags & CPU_FMA) && (cpucaps->flags & CPU_AVX2) && 
00388       (getenv("VMDNOAVX2")==NULL)) {
00389     if (verbose)
00390       printf("vmd_gaussdensity_avx2()\n");
00391 
00392     vmd_gaussdensity_avx2(verbose, natoms, xyzr, atomicnum, colors, 
00393                           densitymap, voltexmap, numvoxels, radscale, 
00394                           gridspacing, isovalue, gausslim);
00395     return;
00396   }
00397 #endif
00398 
00399 #if defined(VMDUSENEON)
00400 
00401   if (1 && (getenv("VMDNONEON") == NULL)) {
00402     if (verbose)
00403       printf("vmd_gaussdensity_neon()\n");
00404 
00405     vmd_gaussdensity_neon(verbose, natoms, xyzr, atomicnum, colors, 
00406                           densitymap, voltexmap, numvoxels, radscale, 
00407                           gridspacing, isovalue, gausslim);
00408     return;
00409   }
00410 #endif
00411 
00412 
00413   if (verbose)
00414     printf("vmd_gaussdensity_opt()\n");
00415 #endif
00416 
00417 #if VMDQSURFUSESSE && defined(__SSE2__)
00418   int usesse=1;
00419   if (getenv("VMDNOSSE")) {
00420     usesse=0;
00421   }
00422 #endif
00423 
00424 #if VMDQSURFUSEVSX && defined(__VEC__)
00425   int usevsx=1;
00426   if (getenv("VMDNOVSX")) {
00427     usevsx=0;
00428   }
00429 #endif
00430 
00431 #if VMDQSURFUSESSE && defined(__SSE2__)
00432   
00433   __m128 gridspacing4_4;
00434   __align(16) float sxdelta4[4]; 
00435 
00436   if (usesse) {
00437     gridspacing4_4 = _mm_set1_ps(gridspacing * 4.0f);
00438     for (x=0; x<4; x++)
00439       sxdelta4[x] = ((float) x) * gridspacing;
00440   }
00441 #endif
00442 
00443 #if VMDQSURFUSEVSX && defined(__VEC__)
00444   
00445   vector float gridspacing4_4;
00446   __attribute__((aligned(16))) float sxdelta4[4]; 
00447 
00448   if (usevsx) {
00449     gridspacing4_4 = vec_splats(gridspacing * 4.0f);
00450     for (x=0; x<4; x++)
00451       sxdelta4[x] = ((float) x) * gridspacing;
00452   }
00453 #endif
00454 
00455   
00456   if (voltexmap != NULL) {
00457     float invisovalue = 1.0f / isovalue;
00458     
00459     for (i=0; i<natoms; i++) {
00460       if (verbose && ((i & 0x3fff) == 0)) {
00461         printf("."); 
00462         fflush(stdout);
00463       }
00464 
00465       ptrdiff_t ind = i*4L;
00466       float scaledrad = xyzr[ind + 3] * radscale;
00467 
00468       
00469       float atomicnumfactor = 1.0f;
00470       if (atomicnum != NULL) {
00471         atomicnumfactor = atomicnum[i];
00472       }
00473 
00474       
00475       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00476       float radlim = gausslim * scaledrad;
00477       float radlim2 = radlim * radlim; 
00478       radlim *= invgridspacing;
00479 
00480 #if VMDQSURFUSESSE && defined(__SSE2__)
00481       __m128 atomicnumfactor_4 = { 0 };
00482       __m128 arinv_4;
00483       if (usesse) {
00484         atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00485 #if VMDUSESVMLEXP
00486         
00487         arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00488 #else
00489         
00490         arinv_4 = _mm_set1_ps(arinv);
00491 #endif
00492       }
00493 #endif
00494 
00495       float tmp;
00496       tmp = xyzr[ind  ] * invgridspacing;
00497       int xmin = MAX((int) (tmp - radlim), 0);
00498       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00499       tmp = xyzr[ind+1] * invgridspacing;
00500       int ymin = MAX((int) (tmp - radlim), 0);
00501       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00502       tmp = xyzr[ind+2] * invgridspacing;
00503       int zmin = MAX((int) (tmp - radlim), 0);
00504       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00505 
00506       float dz = zmin*gridspacing - xyzr[ind+2];
00507       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00508         float dy = ymin*gridspacing - xyzr[ind+1];
00509         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00510           float dy2dz2 = dy*dy + dz*dz;
00511 
00512           
00513           if (dy2dz2 >= radlim2) 
00514             continue;
00515 
00516           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00517           float dx = xmin*gridspacing - xyzr[ind];
00518           x=xmin;
00519 
00520 #if VMDQSURFUSESSE && defined(__SSE2__)
00521           
00522           
00523           if (usesse) {
00524             __align(16) SSEreg n;
00525             __align(16) SSEreg y;
00526             __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00527             __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00528 
00529             for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00530               __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00531               __m128 d;
00532 #if VMDUSESVMLEXP
00533               
00534               y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00535 #else
00536               
00537               y.f = _mm_mul_ps(r2, arinv_4);         
00538               n.i = _mm_cvttps_epi32(y.f);
00539               d = _mm_cvtepi32_ps(n.i);
00540               d = _mm_sub_ps(d, y.f);
00541 
00542               
00543               
00544 #if 0
00545               
00546               y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00547               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00548               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00549               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00550 #else
00551               y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4));      
00552               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    
00553               y.f = _mm_mul_ps(y.f, d);
00554               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    
00555               y.f = _mm_mul_ps(y.f, d);
00556               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    
00557               y.f = _mm_mul_ps(y.f, d);
00558               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    
00559 #endif
00560 
00561               
00562               
00563               n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00564               n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00565               y.f = _mm_mul_ps(y.f, n.f);
00566 #endif
00567 
00568               
00569               
00570               float *ufptr = &densitymap[addr + x];
00571               d = _mm_loadu_ps(ufptr);
00572               y.f = _mm_mul_ps(y.f, atomicnumfactor_4); 
00573               _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00574 
00575               
00576               
00577               
00578               d = _mm_mul_ps(y.f, _mm_set_ps1(invisovalue));
00579               ptrdiff_t caddr = (addr + x) * 3L;
00580 
00581 #if 1
00582               float *txptr = &voltexmap[caddr];
00583               
00584               __m128 r0g0b0r1 = _mm_loadu_ps(txptr+0);
00585               __m128 g1b1r2g2 = _mm_loadu_ps(txptr+4); 
00586               __m128 b2r3g3b3 = _mm_loadu_ps(txptr+8);
00587 
00588               
00589               __m128 r2g2r3g3 = _mm_shuffle_ps(g1b1r2g2, b2r3g3b3, _MM_SHUFFLE(2, 1, 3, 2)); 
00590               __m128 g0b0g1b1 = _mm_shuffle_ps(r0g0b0r1, g1b1r2g2, _MM_SHUFFLE(1, 0, 2, 1));
00591               __m128 r        = _mm_shuffle_ps(r0g0b0r1, r2g2r3g3, _MM_SHUFFLE(2, 0, 3, 0)); 
00592               __m128 g        = _mm_shuffle_ps(g0b0g1b1, r2g2r3g3, _MM_SHUFFLE(3, 1, 2, 0)); 
00593               __m128 b        = _mm_shuffle_ps(g0b0g1b1, b2r3g3b3, _MM_SHUFFLE(3, 0, 3, 1)); 
00594 
00595               
00596               r = _mm_add_ps(r, _mm_mul_ps(d, _mm_set_ps1(colors[ind    ])));
00597               g = _mm_add_ps(g, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 1])));
00598               b = _mm_add_ps(b, _mm_mul_ps(d, _mm_set_ps1(colors[ind + 2])));
00599 
00600               
00601               __m128 r0r2g0g2  = _mm_shuffle_ps(r, g, _MM_SHUFFLE(2, 0, 2, 0));
00602               __m128 g1g3b1b3  = _mm_shuffle_ps(g, b, _MM_SHUFFLE(3, 1, 3, 1));
00603               __m128 b0b2r1r3  = _mm_shuffle_ps(b, r, _MM_SHUFFLE(3, 1, 2, 0));
00604  
00605               __m128 rr0g0b0r1 = _mm_shuffle_ps(r0r2g0g2, b0b2r1r3, _MM_SHUFFLE(2, 0, 2, 0)); 
00606               __m128 rg1b1r2g2 = _mm_shuffle_ps(g1g3b1b3, r0r2g0g2, _MM_SHUFFLE(3, 1, 2, 0)); 
00607               __m128 rb2r3g3b3 = _mm_shuffle_ps(b0b2r1r3, g1g3b1b3, _MM_SHUFFLE(3, 1, 3, 1)); 
00608  
00609               
00610               _mm_storeu_ps(txptr+0, rr0g0b0r1);
00611               _mm_storeu_ps(txptr+4, rg1b1r2g2);
00612               _mm_storeu_ps(txptr+8, rb2r3g3b3);
00613 
00614 #else
00615 
00616               
00617               float r, g, b;
00618               r = colors[ind    ];
00619               g = colors[ind + 1];
00620               b = colors[ind + 2];
00621 
00622               SSEreg tmp; 
00623               tmp.f = d;
00624               float density;
00625               density = tmp.floatreg.r0; 
00626               voltexmap[caddr     ] += density * r;
00627               voltexmap[caddr +  1] += density * g;
00628               voltexmap[caddr +  2] += density * b;
00629 
00630               density = tmp.floatreg.r1; 
00631               voltexmap[caddr +  3] += density * r;
00632               voltexmap[caddr +  4] += density * g;
00633               voltexmap[caddr +  5] += density * b;
00634 
00635               density = tmp.floatreg.r2; 
00636               voltexmap[caddr +  6] += density * r;
00637               voltexmap[caddr +  7] += density * g;
00638               voltexmap[caddr +  8] += density * b;
00639 
00640               density = tmp.floatreg.r3; 
00641               voltexmap[caddr +  9] += density * r;
00642               voltexmap[caddr + 10] += density * g;
00643               voltexmap[caddr + 11] += density * b;
00644 #endif
00645             }
00646           }
00647 #endif
00648 
00649           
00650           for (; x<=xmax; x++,dx+=gridspacing) {
00651             float r2 = dx*dx + dy2dz2;
00652 
00653             
00654             float mb = r2 * arinv;         
00655             int mbflr = (int) mb;          
00656             float d = mbflr - mb;          
00657 
00658             
00659             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00660 
00661             
00662             flint scalfac;
00663             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  
00664 
00665             
00666             float density = (sy * scalfac.f);
00667 
00668             density *= atomicnumfactor; 
00669 
00670             
00671             densitymap[addr + x] += density;
00672 
00673             
00674             
00675             
00676             density *= invisovalue;
00677             ptrdiff_t caddr = (addr + x) * 3L;
00678 
00679             
00680             voltexmap[caddr    ] += density * colors[ind    ];
00681             voltexmap[caddr + 1] += density * colors[ind + 1];
00682             voltexmap[caddr + 2] += density * colors[ind + 2];
00683           }
00684         }
00685       }
00686     }
00687   } else {
00688     
00689     for (i=0; i<natoms; i++) {
00690       if (verbose && ((i & 0x3fff) == 0)) {
00691         printf("."); 
00692         fflush(stdout);
00693       }
00694 
00695       ptrdiff_t ind = i*4L;
00696       float scaledrad = xyzr[ind+3] * radscale;
00697 
00698       
00699       float atomicnumfactor = 1.0f;
00700       if (atomicnum != NULL) {
00701         atomicnumfactor = atomicnum[i];
00702       }
00703 
00704       
00705       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00706       float radlim = gausslim * scaledrad;
00707       float radlim2 = radlim * radlim; 
00708       radlim *= invgridspacing;
00709 
00710 #if VMDQSURFUSESSE && defined(__SSE2__)
00711       __m128 atomicnumfactor_4 = { 0 };
00712       __m128 arinv_4;
00713       if (usesse) {
00714         atomicnumfactor_4 = _mm_set1_ps(atomicnumfactor);
00715 #if VMDUSESVMLEXP
00716         
00717         arinv_4 = _mm_set1_ps(arinv * (2.718281828f/2.0f) / MLOG2EF); 
00718 #else
00719         
00720         arinv_4 = _mm_set1_ps(arinv);
00721 #endif
00722       }
00723 #endif
00724 
00725 #if VMDQSURFUSEVSX && defined(__VEC__)
00726       vector float atomicnumfactor_4;
00727       vector float arinv_4;
00728       if (usevsx) {
00729         atomicnumfactor_4 = vec_splats(atomicnumfactor);
00730 
00731         
00732         arinv_4 = vec_splats(arinv);
00733       }
00734 #endif
00735 
00736       float tmp;
00737       tmp = xyzr[ind  ] * invgridspacing;
00738       int xmin = MAX((int) (tmp - radlim), 0);
00739       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00740       tmp = xyzr[ind+1] * invgridspacing;
00741       int ymin = MAX((int) (tmp - radlim), 0);
00742       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00743       tmp = xyzr[ind+2] * invgridspacing;
00744       int zmin = MAX((int) (tmp - radlim), 0);
00745       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00746 
00747       float dz = zmin*gridspacing - xyzr[ind+2];
00748       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00749         float dy = ymin*gridspacing - xyzr[ind+1];
00750         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00751           float dy2dz2 = dy*dy + dz*dz;
00752 
00753           
00754           if (dy2dz2 >= radlim2) 
00755             continue;
00756 
00757           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00758           float dx = xmin*gridspacing - xyzr[ind];
00759           x=xmin;
00760 
00761 #if VMDQSURFUSESSE && defined(__SSE2__)
00762           
00763           
00764           if (usesse) {
00765             __align(16) SSEreg n;
00766             __align(16) SSEreg y;
00767             __m128 dy2dz2_4 = _mm_set1_ps(dy2dz2);
00768             __m128 dx_4 = _mm_add_ps(_mm_set1_ps(dx), _mm_load_ps(&sxdelta4[0]));
00769 
00770             for (; (x+3)<=xmax; x+=4,dx_4=_mm_add_ps(dx_4, gridspacing4_4)) {
00771               __m128 r2 = _mm_add_ps(_mm_mul_ps(dx_4, dx_4), dy2dz2_4);
00772               __m128 d;
00773 #if VMDUSESVMLEXP
00774               
00775               y.f = _mm_exp2_ps(_mm_mul_ps(r2, arinv_4));
00776 #else
00777               
00778               y.f = _mm_mul_ps(r2, arinv_4);         
00779               n.i = _mm_cvttps_epi32(y.f);
00780               d = _mm_cvtepi32_ps(n.i);
00781               d = _mm_sub_ps(d, y.f);
00782 
00783               
00784               
00785 #if 0
00786               
00787               y.f = _mm_fmadd_ps(d, _mm_set1_ps(SCEXP4), _mm_set1_ps(SCEXP3)); 
00788               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP2));
00789               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP1));
00790               y.f = _mm_fmadd_ps(y.f, d, _mm_set1_ps(SCEXP0));
00791 #else
00792               y.f = _mm_mul_ps(d, _mm_set_ps1(SCEXP4));      
00793               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP3));    
00794               y.f = _mm_mul_ps(y.f, d);
00795               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP2));    
00796               y.f = _mm_mul_ps(y.f, d);
00797               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP1));    
00798               y.f = _mm_mul_ps(y.f, d);
00799               y.f = _mm_add_ps(y.f, _mm_set_ps1(SCEXP0));    
00800 #endif
00801 
00802               
00803               
00804               n.i = _mm_sub_epi32(_mm_set1_epi32(EXPOBIAS), n.i);
00805               n.i = _mm_slli_epi32(n.i, EXPOSHIFT);
00806               y.f = _mm_mul_ps(y.f, n.f);
00807               y.f = _mm_mul_ps(y.f, atomicnumfactor_4); 
00808 #endif
00809 
00810               
00811               
00812               float *ufptr = &densitymap[addr + x];
00813               d = _mm_loadu_ps(ufptr); 
00814               _mm_storeu_ps(ufptr, _mm_add_ps(d, y.f)); 
00815             }
00816           }
00817 #endif
00818 
00819 
00820 #if VMDQSURFUSEVSX && defined(__VEC__)
00821           
00822           
00823           
00824           
00825           
00826           
00827           
00828           if (usevsx) {
00829             vector float dy2dz2_4 = vec_splats(dy2dz2);
00830             vector float tmpvsxdelta4 = *((__vector float *) &sxdelta4[0]);
00831             vector float dx_4 = vec_add(vec_splats(dx), tmpvsxdelta4);
00832 
00833             for (; (x+3)<=xmax; x+=4,dx_4=vec_add(dx_4, gridspacing4_4)) {
00834               vector float r2 = vec_add(vec_mul(dx_4, dx_4), dy2dz2_4);
00835 
00836               
00837               vector float mb = vec_mul(r2, arinv_4);   
00838               vector float mbflr = vec_floor(mb);
00839               vector float d = vec_sub(mbflr, mb);
00840               vector float y;
00841 
00842               
00843               
00844               y = vec_madd(d, vec_splats(SCEXP4), vec_splats(SCEXP3)); 
00845               y = vec_madd(y, d, vec_splats(SCEXP2)); 
00846               y = vec_madd(y, d, vec_splats(SCEXP1)); 
00847               y = vec_madd(y, d, vec_splats(SCEXP0)); 
00848 
00849               
00850               
00851               y = vec_mul(y, vec_expte(-mbflr));
00852               y = vec_mul(y, atomicnumfactor_4); 
00853 
00854               
00855               
00856               float *ufptr = &densitymap[addr + x];
00857               d = *((__vector float *) &ufptr[0]);
00858               
00859               
00860               
00861               d = vec_add(d, y);
00862 
00863               ufptr[0] = d[0];
00864               ufptr[1] = d[1];
00865               ufptr[2] = d[2];
00866               ufptr[3] = d[3];
00867             }
00868           }
00869 #endif
00870 
00871           
00872           for (; x<=xmax; x++,dx+=gridspacing) {
00873             float r2 = dx*dx + dy2dz2;
00874 
00875             
00876             float mb = r2 * arinv;         
00877             int mbflr = (int) mb;          
00878             float d = mbflr - mb;          
00879 
00880             
00881             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00882 
00883             
00884             flint scalfac;
00885             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;  
00886 
00887             
00888             float density = (sy * scalfac.f);
00889 
00890             density *= atomicnumfactor; 
00891 
00892             densitymap[addr + x] += density;
00893           }
00894         }
00895       }
00896     }
00897   }
00898 }
00899 
00900 
00901 typedef struct {
00902   wkf_cpu_caps_t *cpucaps;
00903   int verbose;
00904   int natoms;
00905   float radscale;
00906   float gridspacing;
00907   float isovalue;
00908   float gausslim;
00909   const int *numvoxels;
00910   const float *xyzr; 
00911   const float *atomicnum;
00912   const float *colors;
00913   float **thrdensitymaps;
00914   float **thrvoltexmaps;
00915 } densitythrparms;
00916 
00917 
00918 static void * densitythread(void *voidparms) {
00919   wkf_tasktile_t tile;
00920   densitythrparms *parms = NULL;
00921   int threadid;
00922 
00923   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00924   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00925 
00926   while (wkf_threadlaunch_next_tile(voidparms, 16384, &tile) != WKF_SCHED_DONE) {
00927     int natoms = tile.end-tile.start;
00928     const float *atomicnum = (parms->atomicnum == NULL) ? NULL : &parms->atomicnum[tile.start]; 
00929     vmd_gaussdensity_opt(parms->cpucaps,
00930                          parms->verbose, natoms, 
00931                          &parms->xyzr[4L*tile.start],
00932                          atomicnum,
00933                          (parms->thrvoltexmaps[0]!=NULL) ? &parms->colors[4L*tile.start] : NULL,
00934                          parms->thrdensitymaps[threadid], 
00935                          parms->thrvoltexmaps[threadid], 
00936                          parms->numvoxels, 
00937                          parms->radscale, 
00938                          parms->gridspacing, 
00939                          parms->isovalue, 
00940                          parms->gausslim);
00941   }
00942 
00943   return NULL;
00944 }
00945 
00946 
00947 static void * reductionthread(void *voidparms) {
00948   wkf_tasktile_t tile;
00949   densitythrparms *parms = NULL;
00950   int threadid, numthreads;
00951 
00952   wkf_threadlaunch_getid(voidparms, &threadid, &numthreads);
00953   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00954 
00955   while (wkf_threadlaunch_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00956     
00957     ptrdiff_t planesz = ptrdiff_t(parms->numvoxels[0]) * ptrdiff_t(parms->numvoxels[1]);
00958     ptrdiff_t start = ptrdiff_t(tile.start) * planesz;
00959     ptrdiff_t end   = ptrdiff_t(tile.end)   * planesz;
00960 
00961     ptrdiff_t i, x;
00962     for (x=start; x<end; x++) {
00963       float tmp = 0.0f;
00964       for (i=1; i<numthreads; i++) {
00965         tmp += parms->thrdensitymaps[i][x];
00966       }
00967       parms->thrdensitymaps[0][x] += tmp;
00968     }
00969 
00970     
00971     if (parms->thrvoltexmaps[0] != NULL) {
00972       for (x=start*3L; x<end*3L; x++) {
00973         float tmp = 0.0f;
00974         for (i=1; i<numthreads; i++) {
00975           tmp += parms->thrvoltexmaps[i][x];
00976         }
00977         parms->thrvoltexmaps[0][x] += tmp;
00978       }
00979     }
00980   }
00981 
00982   return NULL;
00983 }
00984 
00985 
00986 static int vmd_gaussdensity_threaded(wkf_cpu_caps_t *cpucaps, int verbose, 
00987                                      int natoms, const float *xyzr,
00988                                      const float *atomicnum,
00989                                      const float *colors,
00990                                      float *densitymap, float *voltexmap, 
00991                                      const int *numvoxels, 
00992                                      float radscale, float gridspacing, 
00993                                      float isovalue, float gausslim) {
00994   densitythrparms parms;
00995   memset(&parms, 0, sizeof(parms));
00996 
00997   parms.cpucaps = cpucaps;
00998   parms.verbose = verbose;
00999   parms.natoms = natoms;
01000   parms.radscale = radscale;
01001   parms.gridspacing = gridspacing;
01002   parms.isovalue = isovalue;
01003   parms.gausslim = gausslim;
01004   parms.numvoxels = numvoxels;
01005   parms.xyzr = xyzr;
01006   parms.atomicnum = atomicnum;
01007   parms.colors = colors;
01008  
01009   int physprocs = wkf_thread_numprocessors();
01010   int maxprocs = physprocs;
01011 
01012   
01013   
01014   
01015   
01016   
01017   
01018   
01019   while (maxprocs > 8) 
01020     maxprocs /= 2;
01021 
01022   
01023   
01024   
01025   
01026   
01027   ptrdiff_t volsz = ptrdiff_t(numvoxels[0]) * 
01028                     ptrdiff_t(numvoxels[1]) * ptrdiff_t(numvoxels[2]);
01029   ptrdiff_t volmemsz = sizeof(float) * volsz;
01030   ptrdiff_t volmemszkb = volmemsz / 1024;
01031   ptrdiff_t volmemtexszkb = volmemszkb + ((voltexmap != NULL) ? 3L*volmemszkb : 0);
01032 
01033   
01034   
01035   
01036   ptrdiff_t vmdcorefree = -1;
01037 
01038 #if defined(ARCH_BLUEWATERS) || defined(ARCH_CRAY_XC) || defined(ARCH_CRAY_XK) || defined(ARCH_LINUXAMD64) || defined(ARCH_SOLARIS2_64) || defined(ARCH_SOLARISX86_64) || defined(ARCH_AIX6_64) || defined(MACOSXARM64) || defined(ARCH_MACOSXX86_64) 
01039   
01040   
01041   
01042   
01043   
01044   
01045   vmdcorefree = vmd_get_avail_physmem_mb();
01046 #endif
01047 
01048   if (vmdcorefree >= 0) {
01049     
01050     
01051     
01052     
01053     while ((volmemtexszkb * maxprocs) > (1024L*vmdcorefree/4)) {
01054       maxprocs /= 2;
01055     }
01056   } else {
01057     
01058     while ((volmemtexszkb * maxprocs) > (2L * 1024L * 1024L))
01059       maxprocs /= 2;
01060   }
01061 
01062   if (maxprocs < 1) 
01063     maxprocs = 1;
01064 
01065   
01066   
01067   parms.thrdensitymaps = (float **) calloc(1,maxprocs * sizeof(float *));
01068   parms.thrvoltexmaps = (float **) calloc(1, maxprocs * sizeof(float *));
01069 
01070   
01071   parms.thrdensitymaps[0] = densitymap;
01072   parms.thrvoltexmaps[0] = voltexmap;
01073 
01074   int i;
01075   int numprocs = maxprocs; 
01076   for (i=1; i<maxprocs; i++) {
01077     parms.thrdensitymaps[i] = (float *) calloc(1, volmemsz);
01078     if (parms.thrdensitymaps[i] == NULL) {
01079       numprocs = i;
01080       break;
01081     }
01082     if (voltexmap != NULL) {
01083       parms.thrvoltexmaps[i] = (float *) calloc(1, 3L * volmemsz);
01084       if (parms.thrvoltexmaps[i] == NULL) {
01085         free(parms.thrdensitymaps[i]);
01086         parms.thrdensitymaps[i] = NULL;
01087         numprocs = i;
01088         break;
01089       }
01090     }
01091   }
01092 
01093   
01094   wkf_tasktile_t tile;
01095   tile.start = 0;
01096   tile.end = natoms;
01097   wkf_threadlaunch(numprocs, &parms, densitythread, &tile);
01098 
01099   
01100   tile.start = 0;
01101   tile.end = numvoxels[2];
01102   wkf_threadlaunch(numprocs, &parms, reductionthread, &tile);
01103 
01104   
01105   for (i=1; i<maxprocs; i++) {
01106     if (parms.thrdensitymaps[i] != NULL)
01107       free(parms.thrdensitymaps[i]);
01108 
01109     if (parms.thrvoltexmaps[i] != NULL)
01110       free(parms.thrvoltexmaps[i]);
01111   }
01112   free(parms.thrdensitymaps);
01113   free(parms.thrvoltexmaps);
01114 
01115   return 0;
01116 }
01117 
01118 QuickSurf::QuickSurf(int forcecpuonly) {
01119   volmap = NULL;
01120   voltexmap = NULL;
01121   s.clear();
01122   isovalue = 0.5f;
01123 
01124   numvoxels[0] = 128;
01125   numvoxels[1] = 128;
01126   numvoxels[2] = 128;
01127 
01128   origin[0] = 0.0f;
01129   origin[1] = 0.0f;
01130   origin[2] = 0.0f;
01131 
01132   xaxis[0] = 1.0f;
01133   xaxis[1] = 0.0f;
01134   xaxis[2] = 0.0f;
01135 
01136   yaxis[0] = 0.0f;
01137   yaxis[1] = 1.0f;
01138   yaxis[2] = 0.0f;
01139 
01140   zaxis[0] = 0.0f;
01141   zaxis[1] = 0.0f;
01142   zaxis[2] = 1.0f;
01143    
01144   cudaqs = NULL;
01145   force_cpuonly = forcecpuonly;
01146 #if defined(VMDCUDA)
01147   if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01148     cudaqs = new CUDAQuickSurf();
01149   }
01150 #endif
01151 
01152   timer = wkf_timer_create();
01153 }
01154 
01155 
01156 void QuickSurf::free_gpu_memory(void) {
01157   if (cudaqs) {
01158 #if defined(VMDCUDA)
01159     delete cudaqs; 
01160 #endif
01161     cudaqs = NULL; 
01162   }
01163 }
01164 
01165 
01166 int QuickSurf::calc_surf(AtomSel *atomSel, DrawMolecule *mol,
01167                          const float *atompos, const float *atomradii,
01168                          int quality, float radscale, float gridspacing,
01169                          float isoval, const int *colidx, const float *cmap,
01170                          VMDDisplayList *cmdList) {
01171   PROFILE_PUSH_RANGE("QuickSurf", 3);
01172 
01173   wkf_timer_start(timer);
01174   int colorperatom = (colidx != NULL && cmap != NULL);
01175   int usebeads=0;
01176 
01177   int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01178 
01179   
01180   
01181   
01182   const float *atomicnum = NULL;
01183 
01184   
01185   if (voltexmap != NULL)
01186     free(voltexmap);
01187   voltexmap = NULL;
01188 
01189   ResizeArray<float> beadpos(64 + (3L * atomSel->selected) / 20);
01190   ResizeArray<float> beadradii(64 + (3L * atomSel->selected) / 20);
01191   ResizeArray<float> beadcolors(64 + (3L * atomSel->selected) / 20);
01192 
01193   if (getenv("VMDQUICKSURFBEADS")) {
01194     usebeads=1;
01195     if (verbose)
01196       printf("QuickSurf using residue beads representation...\n");
01197   }
01198 
01199   int numbeads = 0;
01200   if (usebeads) {
01201     int i, resid, numres;
01202 
01203     
01204     numres = mol->residueList.num();
01205     for (resid=0; resid<numres; resid++) {
01206       float com[3] = {0.0, 0.0, 0.0};
01207       const ResizeArray<int> &atoms = mol->residueList[resid]->atoms;
01208       int numatoms = atoms.num();
01209       int oncount = 0;
01210    
01211       
01212       for (i=0; i<numatoms; i++) {
01213         int idx = atoms[i];
01214         if (atomSel->on[idx]) {
01215           oncount++;
01216           vec_add(com, com, atompos + 3L*idx);
01217         }
01218       }
01219 
01220       if (oncount < 1)
01221         continue; 
01222 
01223       vec_scale(com, 1.0f / (float) oncount, com);
01224 
01225       
01226       int atomcolorindex=0; 
01227       float boundradsq = 0.0f;
01228       for (i=0; i<numatoms; i++) {
01229         int idx = atoms[i];
01230         if (atomSel->on[idx]) {
01231           float tmpdist[3];
01232           atomcolorindex = idx;
01233           vec_sub(tmpdist, com, atompos + 3L*idx);
01234           float distsq = dot_prod(tmpdist, tmpdist);
01235           if (distsq > boundradsq) {
01236             boundradsq = distsq;
01237           }
01238         }
01239       }
01240       beadpos.append3(&com[0]);
01241       beadradii.append(sqrtf(boundradsq) + 1.0f);
01242 
01243       if (colorperatom) {
01244         const float *cp = &cmap[colidx[atomcolorindex] * 3L];
01245         beadcolors.append3(&cp[0]);
01246       }
01247 
01248       
01249     }
01250 
01251     numbeads = beadpos.num() / 3;
01252   }
01253 
01254   
01255   isovalue=isoval;
01256 
01257   
01258   
01259   
01260   
01261   if (cmap != NULL)
01262     vec_copy(solidcolor, cmap);
01263 
01264   
01265   
01266   float minx, miny, minz, maxx, maxy, maxz;
01267   float minrad, maxrad;
01268   int i;
01269   if (usebeads) {
01270     minx = maxx = beadpos[0];
01271     miny = maxy = beadpos[1];
01272     minz = maxz = beadpos[2];
01273     minrad = maxrad = beadradii[0];
01274     for (i=0; i<numbeads; i++) {
01275       ptrdiff_t ind = i * 3L;
01276       float tmpx = beadpos[ind  ];
01277       float tmpy = beadpos[ind+1];
01278       float tmpz = beadpos[ind+2];
01279 
01280       minx = (tmpx < minx) ? tmpx : minx;
01281       maxx = (tmpx > maxx) ? tmpx : maxx;
01282 
01283       miny = (tmpy < miny) ? tmpy : miny;
01284       maxy = (tmpy > maxy) ? tmpy : maxy;
01285 
01286       minz = (tmpz < minz) ? tmpz : minz;
01287       maxz = (tmpz > maxz) ? tmpz : maxz;
01288  
01289       
01290       
01291       float r = beadradii[i];
01292       minrad = (r < minrad) ? r : minrad;
01293       maxrad = (r > maxrad) ? r : maxrad;
01294     }
01295   } else {
01296     minx = maxx = atompos[atomSel->firstsel*3L  ];
01297     miny = maxy = atompos[atomSel->firstsel*3L+1];
01298     minz = maxz = atompos[atomSel->firstsel*3L+2];
01299 
01300     
01301     mol->get_radii_minmax(minrad, maxrad);
01302 
01303     
01304     
01305     
01306     if (minrad <= 0.001 || maxrad/minrad > 2.5) {
01307       minrad = maxrad = atomradii[atomSel->firstsel];
01308       for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01309         if (atomSel->on[i]) {
01310           ptrdiff_t ind = i * 3L;
01311           float tmpx = atompos[ind  ];
01312           float tmpy = atompos[ind+1];
01313           float tmpz = atompos[ind+2];
01314 
01315           minx = (tmpx < minx) ? tmpx : minx;
01316           maxx = (tmpx > maxx) ? tmpx : maxx;
01317 
01318           miny = (tmpy < miny) ? tmpy : miny;
01319           maxy = (tmpy > maxy) ? tmpy : maxy;
01320 
01321           minz = (tmpz < minz) ? tmpz : minz;
01322           maxz = (tmpz > maxz) ? tmpz : maxz;
01323   
01324           float r = atomradii[i];
01325           minrad = (r < minrad) ? r : minrad;
01326           maxrad = (r > maxrad) ? r : maxrad;
01327         }
01328       }
01329     } else {
01330 #if 1
01331       float fmin[3], fmax[3];
01332       minmax_selected_3fv_aligned(atompos, atomSel->on, atomSel->num_atoms,
01333                                   atomSel->firstsel, atomSel->lastsel,
01334                                   fmin, fmax);
01335       minx = fmin[0];
01336       miny = fmin[1];
01337       minz = fmin[2];
01338 
01339       maxx = fmax[0]; 
01340       maxy = fmax[1]; 
01341       maxz = fmax[2]; 
01342 #else
01343       for (i=atomSel->firstsel; i<=atomSel->lastsel; i++) {
01344         if (atomSel->on[i]) {
01345           ptrdiff_t ind = i * 3L;
01346           float tmpx = atompos[ind  ];
01347           float tmpy = atompos[ind+1];
01348           float tmpz = atompos[ind+2];
01349 
01350           minx = (tmpx < minx) ? tmpx : minx;
01351           maxx = (tmpx > maxx) ? tmpx : maxx;
01352 
01353           miny = (tmpy < miny) ? tmpy : miny;
01354           maxy = (tmpy > maxy) ? tmpy : maxy;
01355 
01356           minz = (tmpz < minz) ? tmpz : minz;
01357           maxz = (tmpz > maxz) ? tmpz : maxz;
01358         }
01359       }
01360 #endif
01361     }
01362   }
01363 
01364   float mincoord[3], maxcoord[3];
01365   mincoord[0] = minx;
01366   mincoord[1] = miny;
01367   mincoord[2] = minz;
01368   maxcoord[0] = maxx;
01369   maxcoord[1] = maxy;
01370   maxcoord[2] = maxz;
01371 
01372   
01373   
01374   float gridpadding = radscale * maxrad * 1.70f;
01375   float padrad = gridpadding;
01376   padrad = 0.65f * sqrtf(4.0f/3.0f*((float) VMD_PI)*padrad*padrad*padrad);
01377   gridpadding = MAX(gridpadding, padrad);
01378 
01379   
01380   
01381   
01382   
01383   
01384   
01385   
01386   
01387   
01388   if (minrad > 4.0f) {
01389     gridspacing *= minrad;
01390   }
01391 
01392   if (verbose) {
01393     printf("QuickSurf: R*%.1f, I=%.1f, H=%.1f Pad: %.1f minR: %.1f maxR: %.1f)\n",
01394            radscale, isovalue, gridspacing, gridpadding, minrad, maxrad);
01395   }
01396 
01397   mincoord[0] -= gridpadding;
01398   mincoord[1] -= gridpadding;
01399   mincoord[2] -= gridpadding;
01400   maxcoord[0] += gridpadding;
01401   maxcoord[1] += gridpadding;
01402   maxcoord[2] += gridpadding;
01403 
01404   
01405   numvoxels[0] = (int) ceil((maxcoord[0]-mincoord[0]) / gridspacing);
01406   numvoxels[1] = (int) ceil((maxcoord[1]-mincoord[1]) / gridspacing);
01407   numvoxels[2] = (int) ceil((maxcoord[2]-mincoord[2]) / gridspacing);
01408 
01409   
01410   xaxis[0] = (numvoxels[0]-1) * gridspacing;
01411   yaxis[1] = (numvoxels[1]-1) * gridspacing;
01412   zaxis[2] = (numvoxels[2]-1) * gridspacing;
01413   maxcoord[0] = mincoord[0] + xaxis[0];
01414   maxcoord[1] = mincoord[1] + yaxis[1];
01415   maxcoord[2] = mincoord[2] + zaxis[2];
01416 
01417   int boundserr=0;
01418   for (i=0; i<3; i++) {
01419     if (isnan(mincoord[i]) || isnan(maxcoord[i]) || (numvoxels[i] < 1))
01420       boundserr = 1;
01421   }
01422 
01423   if (boundserr)
01424     msgErr << "QuickSurf) NaN or illegal bounding box, grid size" << sendmsg;
01425 
01426   if (verbose || boundserr) {
01427     printf("  GridSZ: (%4d %4d %4d)  BBox: (%.1f %.1f %.1f)->(%.1f %.1f %.1f)\n",
01428            numvoxels[0], numvoxels[1], numvoxels[2],
01429            mincoord[0], mincoord[1], mincoord[2],
01430            maxcoord[0], maxcoord[1], maxcoord[2]);
01431   }
01432 
01433   if (boundserr)
01434     return -1; 
01435 
01436   vec_copy(origin, mincoord);
01437 
01438   
01439   float *xyzr = NULL;
01440   float *colors = NULL;
01441   if (usebeads) { 
01442     int ind =0;
01443     int ind4=0; 
01444     xyzr = (float *) malloc(numbeads * sizeof(float) * 4L);
01445     if (colorperatom) {
01446       colors = (float *) malloc(numbeads * sizeof(float) * 4L);
01447 
01448       
01449       for (i=0; i<numbeads; i++) {
01450         const float *fp = &beadpos[0] + ind;
01451         xyzr[ind4    ] = fp[0]-origin[0];
01452         xyzr[ind4 + 1] = fp[1]-origin[1];
01453         xyzr[ind4 + 2] = fp[2]-origin[2];
01454         xyzr[ind4 + 3] = beadradii[i];
01455  
01456         const float *cp = &beadcolors[0] + ind;
01457         colors[ind4    ] = cp[0];
01458         colors[ind4 + 1] = cp[1];
01459         colors[ind4 + 2] = cp[2];
01460         colors[ind4 + 3] = 1.0f;
01461         ind4 += 4;
01462         ind += 3;
01463       }
01464     } else {
01465       
01466       for (i=0; i<numbeads; i++) {
01467         const float *fp = &beadpos[0] + ind;
01468         xyzr[ind4    ] = fp[0]-origin[0];
01469         xyzr[ind4 + 1] = fp[1]-origin[1];
01470         xyzr[ind4 + 2] = fp[2]-origin[2];
01471         xyzr[ind4 + 3] = beadradii[i];
01472         ind4 += 4;
01473         ind += 3;
01474       }
01475     }
01476   } else {
01477     ptrdiff_t ind = atomSel->firstsel * 3L;
01478     ptrdiff_t ind4=0; 
01479     xyzr = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01480     if (colorperatom) {
01481       colors = (float *) malloc(atomSel->selected * sizeof(float) * 4L);
01482 
01483       
01484       for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01485         if (atomSel->on[i]) {
01486           const float *fp = atompos + ind;
01487           xyzr[ind4    ] = fp[0]-origin[0];
01488           xyzr[ind4 + 1] = fp[1]-origin[1];
01489           xyzr[ind4 + 2] = fp[2]-origin[2];
01490           xyzr[ind4 + 3] = atomradii[i];
01491  
01492           const float *cp = &cmap[colidx[i] * 3L];
01493           colors[ind4    ] = cp[0];
01494           colors[ind4 + 1] = cp[1];
01495           colors[ind4 + 2] = cp[2];
01496           colors[ind4 + 3] = 1.0f;
01497           ind4 += 4;
01498         }
01499         ind += 3;
01500       }
01501     } else {
01502       
01503       for (i=atomSel->firstsel; i <= atomSel->lastsel; i++) {
01504         if (atomSel->on[i]) {
01505           const float *fp = atompos + ind;
01506           xyzr[ind4    ] = fp[0]-origin[0];
01507           xyzr[ind4 + 1] = fp[1]-origin[1];
01508           xyzr[ind4 + 2] = fp[2]-origin[2];
01509           xyzr[ind4 + 3] = atomradii[i];
01510           ind4 += 4;
01511         }
01512         ind += 3;
01513       }
01514     }
01515   }
01516 
01517   
01518   float gausslim = 2.0f;
01519   switch (quality) {
01520     case 3: gausslim = 4.0f; break; 
01521 
01522     case 2: gausslim = 3.0f; break; 
01523 
01524     case 1: gausslim = 2.5f; break; 
01525 
01526     case 0: 
01527     default: gausslim = 2.0f; 
01528       break;
01529   }
01530 
01531   pretime = wkf_timer_timenow(timer);
01532 
01533 #if defined(VMDCUDA)
01534   if (!force_cpuonly && !getenv("VMDNOCUDA")) {
01535     
01536     if (cudaqs == NULL)
01537       cudaqs = new CUDAQuickSurf();
01538 
01539     
01540     CUDAQuickSurf::VolTexFormat voltexformat = CUDAQuickSurf::RGB3F;
01541     switch (quality) {
01542       case 3: voltexformat = CUDAQuickSurf::RGB3F; break; 
01543 
01544       case 2: voltexformat = CUDAQuickSurf::RGB3F; break; 
01545 
01546       case 1: voltexformat = CUDAQuickSurf::RGB3F; break; 
01547 
01548       case 0: 
01549       default: voltexformat = CUDAQuickSurf::RGB4U; 
01550         break;
01551     }
01552 
01553     
01554     int pcount = (usebeads) ? numbeads : atomSel->selected; 
01555     int rc = cudaqs->calc_surf(pcount, &xyzr[0],
01556                                (colorperatom) ? &colors[0] : &cmap[0],
01557                                colorperatom, voltexformat, 
01558                                origin, numvoxels, maxrad,
01559                                radscale, gridspacing, isovalue, gausslim,
01560                                cmdList);
01561 
01562     
01563     
01564     
01565     
01566     if (getenv("VMDQUICKSURFMINMEM")) {
01567       free_gpu_memory();
01568     }
01569 
01570     if (rc == 0) {
01571       free(xyzr);
01572       if (colors)
01573         free(colors);
01574   
01575       voltime = wkf_timer_timenow(timer);
01576 
01577       PROFILE_POP_RANGE(); 
01578 
01579       return 0;
01580     }
01581   }
01582 #endif
01583 
01584   if (verbose) {
01585     printf("  Computing density map grid on CPUs ");
01586   }
01587 
01588   ptrdiff_t volsz = numvoxels[0] * numvoxels[1] * numvoxels[2];
01589   volmap = new float[volsz];
01590   if (colidx != NULL && cmap != NULL) {
01591     voltexmap = (float*) calloc(1, 3L * sizeof(float) * numvoxels[0] * numvoxels[1] * numvoxels[2]);
01592   }
01593 
01594   fflush(stdout);
01595   memset(volmap, 0, sizeof(float) * volsz);
01596   if ((volsz * atomSel->selected) > 20000000) {
01597     vmd_gaussdensity_threaded(mol->app->cpucaps, verbose, 
01598                               atomSel->selected, &xyzr[0], atomicnum,
01599                               (voltexmap!=NULL) ? &colors[0] : NULL,
01600                               volmap, voltexmap, numvoxels, radscale, 
01601                               gridspacing, isovalue, gausslim);
01602   } else {
01603     vmd_gaussdensity_opt(mol->app->cpucaps, verbose, 
01604                          atomSel->selected, &xyzr[0], atomicnum,
01605                          (voltexmap!=NULL) ? &colors[0] : NULL,
01606                          volmap, voltexmap, numvoxels, radscale, 
01607                          gridspacing, isovalue, gausslim);
01608   }
01609 
01610   free(xyzr);
01611   if (colors)
01612     free(colors);
01613 
01614   voltime = wkf_timer_timenow(timer);
01615 
01616   
01617   if (cmdList != NULL) {
01618     draw_trimesh(cmdList);
01619   }
01620 
01621   if (verbose) {
01622     printf(" Done.\n");
01623   }
01624 
01625   PROFILE_POP_RANGE(); 
01626 
01627   return 0;
01628 }
01629 
01630 
01631 
01632 VolumetricData * QuickSurf::calc_density_map(AtomSel * atomSel, 
01633                                              DrawMolecule *mymol,  
01634                                              const float *atompos, 
01635                                              const float *atomradii,
01636                                              int quality, float radscale, 
01637                                              float gridspacing) {
01638   if (!calc_surf(atomSel, mymol, atompos, atomradii, quality, 
01639                  radscale, gridspacing, 1.0f, NULL, NULL, NULL)) {
01640     VolumetricData *surfvol;
01641     surfvol = new VolumetricData("density map", origin, xaxis, yaxis, zaxis,
01642                                  numvoxels[0], numvoxels[1], numvoxels[2],
01643                                  volmap);
01644     return surfvol;
01645   }
01646 
01647   return NULL;
01648 }
01649 
01650 
01651 
01652 int QuickSurf::get_trimesh(int &numverts, 
01653                            float *&v3fv, float *&n3fv, float *&c3fv, 
01654                            int &numfacets, int *&fiv) {
01655 
01656   int verbose = (getenv("VMDQUICKSURFVERBOSE") != NULL);
01657 
01658   if (verbose)
01659     printf("Running marching cubes on CPU...\n");
01660 
01661   VolumetricData *surfvol; 
01662   surfvol = new VolumetricData("molecular surface",
01663                                origin, xaxis, yaxis, zaxis,
01664                                numvoxels[0], numvoxels[1], numvoxels[2],
01665                                volmap);
01666 
01667   
01668   
01669   
01670   surfvol->compute_volume_gradient(); 
01671   gradtime = wkf_timer_timenow(timer);
01672 
01673   
01674   const int stepsize = 1;
01675   s.clear();                              
01676   s.compute(surfvol, isovalue, stepsize); 
01677 
01678   mctime = wkf_timer_timenow(timer);
01679 
01680   s.vertexfusion(9, 9);                   
01681   s.normalize();                          
01682 
01683   if (s.numtriangles > 0) {
01684     if (voltexmap != NULL) {
01685       
01686       s.set_color_voltex_rgb3fv(voltexmap);
01687     } else {
01688       
01689       s.set_color_rgb3fv(solidcolor);
01690     }
01691   }
01692 
01693   numverts = s.v.num() / 3;
01694   v3fv=&s.v[0];
01695   n3fv=&s.n[0];
01696   c3fv=&s.c[0];
01697 
01698   numfacets = s.numtriangles;
01699   fiv=&s.f[0];
01700 
01701   delete surfvol;
01702 
01703   mcverttime = wkf_timer_timenow(timer);
01704   reptime = mcverttime;
01705 
01706   if (verbose) {
01707     char strmsg[1024];
01708     sprintf(strmsg, "QuickSurf: %.3f [pre:%.3f vol:%.3f gr:%.3f mc:%.2f mcv:%.3f]",
01709             reptime, pretime, voltime-pretime, gradtime-voltime, 
01710             mctime-gradtime, mcverttime-mctime);
01711 
01712     msgInfo << strmsg << sendmsg;
01713   }
01714  
01715   return 0;
01716 }
01717 
01718 
01719 int QuickSurf::draw_trimesh(VMDDisplayList *cmdList) {
01720   DispCmdTriMesh cmdTriMesh;
01721 
01722   int numverts=0;
01723   float *v=NULL, *n=NULL, *c=NULL;
01724   int numfacets=0;
01725   int *f=NULL;
01726 
01727   get_trimesh(numverts, v, n, c, numfacets, f);
01728 
01729   
01730   if (numfacets > 0) {
01731     cmdTriMesh.putdata(v, n, c, numverts, f, numfacets, 0, cmdList);
01732   }
01733  
01734   return 0;
01735 }
01736 
01737 
01738 QuickSurf::~QuickSurf() {
01739 #if defined(VMDCUDA)
01740   free_gpu_memory();
01741 #endif
01742 
01743   if (voltexmap != NULL)
01744     free(voltexmap);
01745   voltexmap = NULL;
01746 
01747   wkf_timer_destroy(timer);
01748 }
01749 
01750