00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #if defined(VMDCPUDISPATCH) && defined(__ARM_FEATURE_SVE)
00042 #include <arm_sve.h>
00043
00044 #include <math.h>
00045 #include <stdio.h>
00046 #include "Orbital.h"
00047 #include "DrawMolecule.h"
00048 #include "utilities.h"
00049 #include "Inform.h"
00050 #include "WKFThreads.h"
00051 #include "WKFUtils.h"
00052 #include "ProfileHooks.h"
00053
00054 #define ANGS_TO_BOHR 1.88972612478289694072f
00055
00056 #if defined(__GNUC__)
00057 #define __align(X) __attribute__((aligned(X) ))
00058 #else
00059 #define __align(X) __declspec(align(X) )
00060 #endif
00061
00062 #define MLOG2EF -1.44269504088896f
00063
00064 #if 0
00065 static void print_svfloat32_t(svfloat32_t v) {
00066 float tmp[svcntw()];
00067
00068 svbool_t pg = svptrue_b32();
00069 svst1_f32(pg, &tmp[0], v);
00070
00071 printf("print_svfloat32_t: ");
00072 for (int i=0; i<svcntw(); i++)
00073 printf("%g ", tmp[i]);
00074 printf("\n");
00075 }
00076
00077 static void print_svint32_t(svint32_t v) {
00078 int tmp[svcntw()];
00079
00080 svbool_t pg = svptrue_b32();
00081 svst1_s32(pg, &tmp[0], v);
00082
00083 printf("print_svint32_t: ");
00084 for (int i=0; i<svcntw(); i++)
00085 printf("%d ", tmp[i]);
00086 printf("\n");
00087 }
00088
00089 static void print_svhex32_t(svint32_t v) {
00090 int tmp[4];
00091
00092 svbool_t pg = svptrue_b32();
00093 svst1_s32(pg, &tmp[0], v);
00094
00095 printf("print_svhex32_t: ");
00096 for (int i=0; i<svcntw(); i++)
00097 printf("%08x ", tmp[i]);
00098 printf("\n");
00099 }
00100 #endif
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113 #define SCEXP0 1.0000000000000000f
00114 #define SCEXP1 0.6987082824680118f
00115 #define SCEXP2 0.2633174272827404f
00116 #define SCEXP3 0.0923611991471395f
00117 #define SCEXP4 0.0277520543324108f
00118
00119
00120 #define EXPOBIAS 127
00121 #define EXPOSHIFT 23
00122
00123
00124 #define ACUTOFF -10
00125
00126 svfloat32_t aexpfnxarmsve(svfloat32_t x) {
00127 svbool_t pg = svptrue_b32();
00128
00129
00130 float fmv = svmaxv(pg, x);
00131 if (fmv < ACUTOFF) {
00132 return svdup_f32(0.0f);
00133 }
00134
00135
00136 pg = svcmpge_f32(pg, x, svdup_f32(ACUTOFF));
00137
00138
00139
00140
00141
00142
00143
00144 svfloat32_t mb = svmul_f32_x(pg, x, svdup_f32(MLOG2EF));
00145 svint32_t intflr = svcvt_s32_f32_x(pg, mb);
00146 svfloat32_t mbflr = svcvt_f32_s32_x(pg, intflr);
00147 svfloat32_t d = svsub_f32_x(pg, mbflr, mb);
00148
00149
00150
00151 svfloat32_t y;
00152 y = svmad_f32_x(pg, d, svdup_f32(SCEXP4), svdup_f32(SCEXP3));
00153 y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP2));
00154 y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP1));
00155 y = svmad_f32_x(pg, y, d, svdup_f32(SCEXP0));
00156
00157
00158
00159 svint32_t flint = svsub_s32_x(pg, svdup_s32(EXPOBIAS), intflr);
00160 flint = svlsl_s32_x(pg, flint, svdup_u32(EXPOSHIFT));
00161 y = svmul_f32_z(pg, y, svreinterpret_f32_s32(flint));
00162 return y;
00163 }
00164
00165
00166
00167
00168
00169 int evaluate_grid_sve(int numatoms,
00170 const float *wave_f, const float *basis_array,
00171 const float *atompos,
00172 const int *atom_basis,
00173 const int *num_shells_per_atom,
00174 const int *num_prim_per_shell,
00175 const int *shell_types,
00176 const int *numvoxels,
00177 float voxelsize,
00178 const float *origin,
00179 int density,
00180 float * orbitalgrid) {
00181 if (!orbitalgrid)
00182 return -1;
00183
00184 svbool_t pg = svptrue_b32();
00185
00186 int nx, ny, nz;
00187 int vecsize = svcntw();
00188 float sxdelta[vecsize];
00189 for (nx=0; nx<vecsize; nx++)
00190 sxdelta[nx] = ((float) nx) * voxelsize * ANGS_TO_BOHR;
00191
00192
00193
00194 int numgridxy = numvoxels[0]*numvoxels[1];
00195 for (nz=0; nz<numvoxels[2]; nz++) {
00196 float grid_x, grid_y, grid_z;
00197 grid_z = origin[2] + nz * voxelsize;
00198 for (ny=0; ny<numvoxels[1]; ny++) {
00199 grid_y = origin[1] + ny * voxelsize;
00200 int gaddrzy = ny*numvoxels[0] + nz*numgridxy;
00201 for (nx=0; nx<numvoxels[0]; nx+=16) {
00202 grid_x = origin[0] + nx * voxelsize;
00203
00204
00205
00206 int at;
00207 int prim, shell;
00208
00209
00210 svfloat32_t value = svdup_f32(0.0f);
00211
00212
00213 int ifunc = 0;
00214 int shell_counter = 0;
00215
00216
00217 for (at=0; at<numatoms; at++) {
00218 int maxshell = num_shells_per_atom[at];
00219 int prim_counter = atom_basis[at];
00220
00221
00222 float sxdist = (grid_x - atompos[3*at ])*ANGS_TO_BOHR;
00223 float sydist = (grid_y - atompos[3*at+1])*ANGS_TO_BOHR;
00224 float szdist = (grid_z - atompos[3*at+2])*ANGS_TO_BOHR;
00225
00226 float sydist2 = sydist*sydist;
00227 float szdist2 = szdist*szdist;
00228 float yzdist2 = sydist2 + szdist2;
00229
00230 svfloat32_t xdelta = svld1_f32(pg, &sxdelta[0]);
00231 svfloat32_t xdist = svdup_f32(sxdist);
00232 xdist = svadd_f32_z(pg, xdist, xdelta);
00233 svfloat32_t ydist = svdup_f32(sydist);
00234 svfloat32_t zdist = svdup_f32(szdist);
00235 svfloat32_t xdist2 = svmul_f32_z(pg, xdist, xdist);
00236 svfloat32_t ydist2 = svmul_f32_z(pg, ydist, ydist);
00237 svfloat32_t zdist2 = svmul_f32_z(pg, zdist, zdist);
00238 svfloat32_t dist2 = svdup_f32(yzdist2);
00239 dist2 = svadd_f32_z(pg, dist2, xdist2);
00240
00241
00242
00243
00244
00245
00246 for (shell=0; shell < maxshell; shell++) {
00247 svfloat32_t contracted_gto = svdup_f32(0.0f);
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257 int maxprim = num_prim_per_shell[shell_counter];
00258 int shelltype = shell_types[shell_counter];
00259 for (prim=0; prim<maxprim; prim++) {
00260
00261 float exponent = -basis_array[prim_counter ];
00262 float contract_coeff = basis_array[prim_counter + 1];
00263
00264
00265 svfloat32_t expval = svmul_f32_z(pg, svdup_f32(exponent), dist2);
00266
00267 svfloat32_t retval = aexpfnxarmsve(expval);
00268 contracted_gto = svmad_f32_z(pg, svdup_f32(contract_coeff), retval, contracted_gto);
00269
00270 prim_counter += 2;
00271 }
00272
00273
00274 svfloat32_t tmpshell = svdup_f32(0.0f);
00275 switch (shelltype) {
00276
00277 case S_SHELL:
00278 value = svmad_f32_z(pg, svdup_f32(wave_f[ifunc++]), contracted_gto, value);
00279 break;
00280
00281 case P_SHELL:
00282 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), xdist, tmpshell);
00283 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), ydist, tmpshell);
00284 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), zdist, tmpshell);
00285 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00286 break;
00287
00288 case D_SHELL:
00289 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), xdist2, tmpshell);
00290 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist, ydist), tmpshell);
00291 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), ydist2, tmpshell);
00292 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist, zdist), tmpshell);
00293 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist, zdist), tmpshell);
00294 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), zdist2, tmpshell);
00295 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00296 break;
00297
00298 case F_SHELL:
00299 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, xdist), tmpshell);
00300 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, ydist), tmpshell);
00301 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, xdist), tmpshell);
00302 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, ydist), tmpshell);
00303 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, xdist2, zdist), tmpshell);
00304 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, svmul_f32_x(pg, xdist, ydist), zdist), tmpshell);
00305 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, ydist2, zdist), tmpshell);
00306 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, xdist), tmpshell);
00307 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, ydist), tmpshell);
00308 tmpshell = svmad_f32_x(pg, svdup_f32(wave_f[ifunc++]), svmul_f32_x(pg, zdist2, zdist), tmpshell);
00309 value = svmad_f32_z(pg, tmpshell, contracted_gto, value);
00310 break;
00311
00312 #if 0
00313 default:
00314
00315 int i, j;
00316 float xdp, ydp, zdp;
00317 float xdiv = 1.0f / xdist;
00318 for (j=0, zdp=1.0f; j<=shelltype; j++, zdp*=zdist) {
00319 int imax = shelltype - j;
00320 for (i=0, ydp=1.0f, xdp=pow(xdist, imax); i<=imax; i++, ydp*=ydist, xdp*=xdiv) {
00321 tmpshell += wave_f[ifunc++] * xdp * ydp * zdp;
00322 }
00323 }
00324 value += tmpshell * contracted_gto;
00325 #endif
00326 }
00327
00328 shell_counter++;
00329 }
00330 }
00331
00332
00333 if (density) {
00334 pg = svcmplt_f32(pg, value, svdup_f32(0.0f));
00335 svfloat32_t sqdensity = svmul_f32_z(pg, value, value);
00336 svfloat32_t orbdensity = svsel_f32(pg, sqdensity, svmul_f32_z(pg, sqdensity, svdup_f32(-1.0f)));
00337 svst1_f32(pg, &orbitalgrid[gaddrzy + nx], orbdensity);
00338 } else {
00339 svst1_f32(pg, &orbitalgrid[gaddrzy + nx], value);
00340 }
00341 }
00342 }
00343 }
00344
00345 return 0;
00346 }
00347
00348 #else
00349
00350 int evaluate_grid_sve(int numatoms,
00351 const float *wave_f, const float *basis_array,
00352 const float *atompos,
00353 const int *atom_basis,
00354 const int *num_shells_per_atom,
00355 const int *num_prim_per_shell,
00356 const int *shell_types,
00357 const int *numvoxels,
00358 float voxelsize,
00359 const float *origin,
00360 int density,
00361 float * orbitalgrid) {
00362 return -1;
00363 }
00364
00365 #endif