00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00022 
00023 
00024 
00025 #if defined(VMDCPUDISPATCH) && defined(VMDUSENEON)
00026 #include <arm_neon.h>
00027 
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 #include <math.h>
00032 #include "QuickSurf.h"
00033 #include "Inform.h"
00034 #include "utilities.h"
00035 #include "WKFUtils.h"
00036 #include "VolumetricData.h"
00037 
00038 #include "VMDDisplayList.h"
00039 #include "Displayable.h"
00040 #include "DispCmds.h"
00041 
00042 #define MIN(X,Y) (((X)<(Y))? (X) : (Y))
00043 #define MAX(X,Y) (((X)>(Y))? (X) : (Y))
00044 
00045 
00046 
00047 
00048 
00049 
00050 
00051 
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
00067 
00068 
00069 
00070 
00071 
00072 
00073 
00074 
00075 
00076 
00077 
00078 
00079 
00080 
00081 
00082 
00083 
00084 
00085 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER)
00086 #define __align(X)  __attribute__((aligned(X) ))
00087 #else
00088 #define __align(X) __declspec(align(X) )
00089 #endif
00090 
00091 #define MLOG2EF    -1.44269504088896f
00092 
00093 
00094 
00095 
00096 
00097 #define SCEXP0     1.0000000000000000f
00098 #define SCEXP1     0.6987082824680118f
00099 #define SCEXP2     0.2633174272827404f
00100 #define SCEXP3     0.0923611991471395f
00101 #define SCEXP4     0.0277520543324108f
00102 
00103 
00104 #define EXPOBIAS   127
00105 #define EXPOSHIFT   23
00106 
00107 
00108 
00109 #define ACUTOFF    -10
00110 
00111 typedef union flint_t {
00112   float f;
00113   int n;
00114 } flint;
00115 
00116 
00117 typedef union NEONreg_t {
00118   float32x4_t f;  
00119   int32x4_t i;    
00120   struct {
00121     float r0, r1, r2, r3;  
00122   } floatreg;
00123 } NEONreg;
00124 
00125 void vmd_gaussdensity_neon(int verbose,
00126                            int natoms, const float *xyzr,
00127                            const float *atomicnum,
00128                            const float *colors,
00129                            float *densitymap, float *voltexmap,
00130                            const int *numvoxels,
00131                            float radscale, float gridspacing,
00132                            float isovalue, float gausslim) {
00133   int i, x, y, z;
00134   int maxvoxel[3];
00135   maxvoxel[0] = numvoxels[0]-1;
00136   maxvoxel[1] = numvoxels[1]-1;
00137   maxvoxel[2] = numvoxels[2]-1;
00138   const float invgridspacing = 1.0f / gridspacing;
00139 
00140   
00141   float32x4_t gridspacing4_4;
00142   __attribute__((aligned(16))) float sxdelta4[4]; 
00143 
00144   gridspacing4_4 = vdupq_n_f32(gridspacing * 4.0f);
00145   for (x=0; x<4; x++)
00146     sxdelta4[x] = ((float) x) * gridspacing;
00147 
00148   
00149   if (voltexmap != NULL) {
00150     float invisovalue = 1.0f / isovalue;
00151     
00152     for (i=0; i<natoms; i++) {
00153       if (verbose && ((i & 0x3fff) == 0)) {
00154         printf(".");
00155         fflush(stdout);
00156       }
00157 
00158       ptrdiff_t ind = i*4L;
00159       float scaledrad = xyzr[ind + 3] * radscale;
00160 
00161       
00162       float atomicnumfactor = 1.0f;
00163       if (atomicnum != NULL) {
00164         atomicnumfactor = atomicnum[i];
00165       }
00166 
00167       
00168       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00169       float radlim = gausslim * scaledrad;
00170       float radlim2 = radlim * radlim; 
00171       radlim *= invgridspacing;
00172 
00173       float32x4_t atomicnumfactor_4;
00174       float32x4_t arinv_4;
00175       atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00176 
00177       
00178       arinv_4 = vdupq_n_f32(arinv);
00179 
00180       float tmp;
00181       tmp = xyzr[ind  ] * invgridspacing;
00182       int xmin = MAX((int) (tmp - radlim), 0);
00183       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00184       tmp = xyzr[ind+1] * invgridspacing;
00185       int ymin = MAX((int) (tmp - radlim), 0);
00186       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00187       tmp = xyzr[ind+2] * invgridspacing;
00188       int zmin = MAX((int) (tmp - radlim), 0);
00189       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00190 
00191       float dz = zmin*gridspacing - xyzr[ind+2];
00192       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00193         float dy = ymin*gridspacing - xyzr[ind+1];
00194         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00195           float dy2dz2 = dy*dy + dz*dz;
00196 
00197           
00198           if (dy2dz2 >= radlim2)
00199             continue;
00200 
00201           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00202           float dx = xmin*gridspacing - xyzr[ind];
00203           x=xmin;
00204 
00205           
00206           
00207           {
00208             __align(16) NEONreg n;
00209             __align(16) NEONreg y;
00210             float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00211             float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00212 
00213             for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00214               float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00215               float32x4_t d;
00216 
00217               
00218               y.f = vmulq_f32(r2, arinv_4);  
00219               n.i = vcvtq_s32_f32(y.f);
00220               d = vcvtq_f32_s32(n.i);
00221               d = vsubq_f32(d, y.f);
00222 
00223               
00224               
00225               y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00226               y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00227               y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00228               y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00229 
00230               
00231               
00232               n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00233               n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00234               y.f = vmulq_f32(y.f, n.f);
00235               y.f = vmulq_f32(y.f, atomicnumfactor_4); 
00236 
00237               float *ufptr = &densitymap[addr + x];
00238               d = vld1q_f32(ufptr); 
00239               vst1q_f32(ufptr, vaddq_f32(d, y.f));
00240 
00241               
00242               
00243               
00244               d = vmulq_f32(y.f, vdupq_n_f32(invisovalue));
00245               ptrdiff_t caddr = (addr + x) * 3L;
00246 
00247 #if 0
00248               
00249 #else
00250               
00251               float r, g, b;
00252               r = colors[ind    ];
00253               g = colors[ind + 1];
00254               b = colors[ind + 2];
00255 
00256               NEONreg tmp;
00257               tmp.f = d;
00258               float density;
00259               density = tmp.floatreg.r0;
00260               voltexmap[caddr     ] += density * r;
00261               voltexmap[caddr +  1] += density * g;
00262               voltexmap[caddr +  2] += density * b;
00263 
00264               density = tmp.floatreg.r1;
00265               voltexmap[caddr +  3] += density * r;
00266               voltexmap[caddr +  4] += density * g;
00267               voltexmap[caddr +  5] += density * b;
00268 
00269               density = tmp.floatreg.r2;
00270               voltexmap[caddr +  6] += density * r;
00271               voltexmap[caddr +  7] += density * g;
00272               voltexmap[caddr +  8] += density * b;
00273 
00274               density = tmp.floatreg.r3;
00275               voltexmap[caddr +  9] += density * r;
00276               voltexmap[caddr + 10] += density * g;
00277               voltexmap[caddr + 11] += density * b;
00278 #endif
00279             }
00280           }
00281 
00282           
00283           for (; x<=xmax; x++,dx+=gridspacing) {
00284             float r2 = dx*dx + dy2dz2;
00285 
00286             
00287             float mb = r2 * arinv;         
00288             int mbflr = (int) mb;          
00289             float d = mbflr - mb;          
00290 
00291             
00292             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00293 
00294             
00295             flint scalfac;
00296             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00297 
00298             
00299             float density = (sy * scalfac.f);
00300 
00301             density *= atomicnumfactor; 
00302 
00303             
00304             densitymap[addr + x] += density;
00305 
00306             
00307             
00308             
00309             density *= invisovalue;
00310             ptrdiff_t caddr = (addr + x) * 3L;
00311 
00312             
00313             voltexmap[caddr    ] += density * colors[ind    ];
00314             voltexmap[caddr + 1] += density * colors[ind + 1];
00315             voltexmap[caddr + 2] += density * colors[ind + 2];
00316           }
00317         }
00318       }
00319     }
00320   } else {
00321     
00322     for (i=0; i<natoms; i++) {
00323       if (verbose && ((i & 0x3fff) == 0)) {
00324         printf(".");
00325         fflush(stdout);
00326       }
00327 
00328       ptrdiff_t ind = i*4L;
00329       float scaledrad = xyzr[ind+3] * radscale;
00330 
00331       
00332       float atomicnumfactor = 1.0f;
00333       if (atomicnum != NULL) {
00334         atomicnumfactor = atomicnum[i];
00335       }
00336 
00337       
00338       float arinv = -(1.0f/(2.0f*scaledrad*scaledrad)) * MLOG2EF;
00339       float radlim = gausslim * scaledrad;
00340       float radlim2 = radlim * radlim; 
00341       radlim *= invgridspacing;
00342 
00343       float32x4_t atomicnumfactor_4 = vdupq_n_f32(atomicnumfactor);
00344       float32x4_t arinv_4= vdupq_n_f32(arinv); 
00345 
00346       float tmp;
00347       tmp = xyzr[ind  ] * invgridspacing;
00348       int xmin = MAX((int) (tmp - radlim), 0);
00349       int xmax = MIN((int) (tmp + radlim), maxvoxel[0]);
00350       tmp = xyzr[ind+1] * invgridspacing;
00351       int ymin = MAX((int) (tmp - radlim), 0);
00352       int ymax = MIN((int) (tmp + radlim), maxvoxel[1]);
00353       tmp = xyzr[ind+2] * invgridspacing;
00354       int zmin = MAX((int) (tmp - radlim), 0);
00355       int zmax = MIN((int) (tmp + radlim), maxvoxel[2]);
00356 
00357       float dz = zmin*gridspacing - xyzr[ind+2];
00358       for (z=zmin; z<=zmax; z++,dz+=gridspacing) {
00359         float dy = ymin*gridspacing - xyzr[ind+1];
00360         for (y=ymin; y<=ymax; y++,dy+=gridspacing) {
00361           float dy2dz2 = dy*dy + dz*dz;
00362 
00363           
00364           if (dy2dz2 >= radlim2)
00365             continue;
00366 
00367           ptrdiff_t addr = ptrdiff_t(z * numvoxels[0]) * ptrdiff_t(numvoxels[1]) + ptrdiff_t(y * numvoxels[0]);
00368           float dx = xmin*gridspacing - xyzr[ind];
00369           x=xmin;
00370 
00371           
00372           
00373           {
00374             __align(16) NEONreg n;
00375             __align(16) NEONreg y;
00376             float32x4_t dy2dz2_4 = vdupq_n_f32(dy2dz2);
00377             float32x4_t dx_4 = vaddq_f32(vdupq_n_f32(dx), vld1q_f32(&sxdelta4[0]));
00378 
00379             for (; (x+3)<=xmax; x+=4,dx_4=vaddq_f32(dx_4, gridspacing4_4)) {
00380               float32x4_t r2 = vfmaq_f32(dy2dz2_4, dx_4, dx_4);
00381               float32x4_t d;
00382 
00383               
00384               y.f = vmulq_f32(r2, arinv_4);         
00385               n.i = vcvtq_s32_f32(y.f);
00386               d = vcvtq_f32_s32(n.i);
00387               d = vsubq_f32(d, y.f);
00388 
00389               
00390               
00391 #if __ARM_FEATURE_FMA
00392               
00393               y.f = vfmaq_f32(vdupq_n_f32(SCEXP3), vdupq_n_f32(SCEXP4), d);
00394               y.f = vfmaq_f32(vdupq_n_f32(SCEXP2), d, y.f);
00395               y.f = vfmaq_f32(vdupq_n_f32(SCEXP1), d, y.f);
00396               y.f = vfmaq_f32(vdupq_n_f32(SCEXP0), d, y.f);
00397 #else
00398               y.f = vmulq_f32(d, vdupq_n_f32(SCEXP4));      
00399               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP3));    
00400               y.f = vmulq_f32(y.f, d);
00401               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP2));    
00402               y.f = vmulq_f32(y.f, d);
00403               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP1));    
00404               y.f = vmulq_f32(y.f, d);
00405               y.f = vaddq_f32(y.f, vdupq_n_f32(SCEXP0));    
00406 #endif
00407 
00408               
00409               
00410               n.i = vsubq_s32(vdupq_n_s32(EXPOBIAS), n.i);
00411               n.i = vshlq_s32(n.i, vdupq_n_s32(EXPOSHIFT));
00412               y.f = vmulq_f32(y.f, n.f);
00413               y.f = vmulq_f32(y.f, atomicnumfactor_4); 
00414 
00415               float *ufptr = &densitymap[addr + x];
00416               d = vld1q_f32(ufptr); 
00417               vst1q_f32(ufptr, vaddq_f32(d, y.f));
00418             }
00419           }
00420 
00421           
00422           for (; x<=xmax; x++,dx+=gridspacing) {
00423             float r2 = dx*dx + dy2dz2;
00424 
00425             
00426             float mb = r2 * arinv;         
00427             int mbflr = (int) mb;          
00428             float d = mbflr - mb;          
00429 
00430             
00431             float sy = SCEXP0 + d*(SCEXP1 + d*(SCEXP2 + d*(SCEXP3 + d*SCEXP4)));
00432 
00433             
00434             flint scalfac;
00435             scalfac.n = (EXPOBIAS - mbflr) << EXPOSHIFT;
00436 
00437             
00438             float density = (sy * scalfac.f);
00439 
00440             density *= atomicnumfactor; 
00441 
00442             densitymap[addr + x] += density;
00443           }
00444         }
00445       }
00446     }
00447   }
00448 }
00449  
00450 
00451 #endif
00452