00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #define VMDUSESSE 1
00024
00025
00026
00027 #if defined(VMDUSESSE) && defined(__SSE2__)
00028 #include <emmintrin.h>
00029 #endif
00030 #if defined(VMDUSEAVX) && defined(__AVX__)
00031 #include <immintrin.h>
00032 #endif
00033 #if defined(VMDUSENEON) && defined(__ARM_NEON__)
00034 #include <arm_neon.h>
00035 #endif
00036 #include <string.h>
00037 #include <ctype.h>
00038 #include <math.h>
00039 #include <stdio.h>
00040 #include <stdlib.h>
00041
00042 #if defined(_MSC_VER)
00043 #include <windows.h>
00044 #include <conio.h>
00045 #else
00046 #include <unistd.h>
00047 #include <sys/time.h>
00048 #include <errno.h>
00049
00050 #if defined(ARCH_AIX4)
00051 #include <strings.h>
00052 #endif
00053
00054 #if defined(__irix)
00055 #include <bstring.h>
00056 #endif
00057
00058 #if defined(__hpux)
00059 #include <time.h>
00060 #endif // HPUX
00061 #endif // _MSC_VER
00062
00063 #if defined(AIXUSEPERFSTAT)
00064 #include <libperfstat.h>
00065 #endif
00066
00067 #if defined(__APPLE__)
00068 #include <sys/sysctl.h>
00069 #endif
00070
00071 #include "utilities.h"
00072
00073
00074
00075
// Join argv[n] .. argv[argc-1] into a single newly allocated string, with
// exactly one space between consecutive arguments.
//
// Returns NULL when the index range is empty/invalid or when the selected
// arguments are all empty strings.  The caller owns the returned buffer and
// must release it with delete [].
char *combine_arguments(int argc, const char **argv, int n) {
  if (argc <= 0 || n < 0 || n >= argc)
    return NULL;

  // Total number of payload characters; size_t so long command lines
  // cannot overflow the accumulator.
  size_t total = 0;
  for (int i = n; i < argc; i++)
    total += strlen(argv[i]);

  if (total == 0)
    return NULL;

  // Same generous sizing as before: text + one separator per argument + slack.
  char *newstr = new char[total + 8 + (size_t) (argc - n)];

  // Append through a moving cursor instead of strcat(), which would rescan
  // the growing result for every argument.
  char *out = newstr;
  for (int i = n; i < argc; i++) {
    if (i != n)
      *out++ = ' ';
    const size_t len = strlen(argv[i]);
    memcpy(out, argv[i], len);
    out += len;
  }
  *out = '\0';

  return newstr;
}
00100
00101
00102
// Return a heap-allocated copy of the NUL-terminated string s, or NULL when
// s itself is NULL.  The copy is allocated with new [] and must be released
// with delete [].
char *stringdup(const char *s) {
  if (s == NULL)
    return NULL;

  // one extra byte so the terminator is copied along with the text
  const size_t nbytes = strlen(s) + 1;
  char *copy = new char[nbytes];
  memcpy(copy, s, nbytes);
  return copy;
}
00114
00115
00116
// Convert the string s to upper case in place and return s.
// A NULL argument is passed through untouched.
char *stringtoupper(char *s) {
  if (s != NULL) {
    for (char *p = s; *p != '\0'; p++) {
      // toupper() is only defined for EOF and values representable as
      // unsigned char; a plain char with the high bit set may be negative.
      *p = (char) toupper((unsigned char) *p);
    }
  }

  return s;
}
00127
// Remove every trailing '/' character from str, in place.
// A NULL pointer is ignored.
void stripslashes(char *str) {
  if (str == NULL)
    return;

  // Measure once and walk backwards, rather than calling strlen() several
  // times per removed character.
  size_t len = strlen(str);
  while (len > 0 && str[len - 1] == '/') {
    str[--len] = '\0';
  }
}
00133
00134
// Case-insensitive string comparison.
//
// Returns zero when a and b are equal ignoring case, a negative value when
// a sorts before b, and a positive value when a sorts after b (comparison is
// done on upper-cased characters).  A NULL pointer compares equal to another
// NULL and sorts before any non-NULL string.
//
// The comparison is done character by character, so no temporary upper-cased
// copies of the inputs are allocated.
int strupcmp(const char *a, const char *b) {
  if (a == NULL || b == NULL)
    return (a != NULL) - (b != NULL);

  for (;; a++, b++) {
    // cast first: toupper() is undefined for negative char values
    const int ca = toupper((unsigned char) *a);
    const int cb = toupper((unsigned char) *b);
    if (ca != cb)
      return ca - cb;
    if (ca == 0)
      return 0;
  }
}
00149
00150
00151
// Case-insensitive comparison of at most n characters of a and b.
//
// Returns zero when the compared prefixes match ignoring case, otherwise a
// negative/positive value when a sorts before/after b.  A count of zero or
// less compares nothing and yields 0 on every platform.
int strupncmp(const char *a, const char *b, int n) {
  if (n <= 0)
    return 0;

#if defined(ARCH_AIX3) || defined(ARCH_AIX4) || defined(_MSC_VER)
  // Hand-rolled fallback for platforms without strncasecmp().  The result
  // is computed as (a - b) so its sign agrees with strncasecmp() below.
  for (; n > 0; n--, a++, b++) {
    // cast first: toupper() is undefined for negative char values
    const int ca = toupper((unsigned char) *a);
    const int cb = toupper((unsigned char) *b);
    if (ca != cb)
      return ca - cb;
    if (ca == 0)
      break;
  }
  return 0;
#else
  return strncasecmp(a, b, n);
#endif
}
00166
00167
00168
00169
00170
00171 void breakup_filename(const char *full, char **path, char **name) {
00172 const char *namestrt;
00173 int pathlen;
00174
00175 if(full == NULL) {
00176 *path = *name = NULL;
00177 return;
00178 } else if (strlen(full) == 0) {
00179 *path = new char[1];
00180 *name = new char[1];
00181 (*path)[0] = (*name)[0] = '\0';
00182 return;
00183 }
00184
00185
00186 if((namestrt = strrchr(full,'/')) != NULL && strlen(namestrt) > 0) {
00187 namestrt++;
00188 } else {
00189 namestrt = full;
00190 }
00191
00192
00193 *name = stringdup(namestrt);
00194
00195
00196 pathlen = strlen(full) - strlen(*name);
00197 *path = new char[pathlen + 1];
00198 strncpy(*path,full,pathlen);
00199 (*path)[pathlen] = '\0';
00200 }
00201
00202
// Break a command string into tokens separated by spaces, commas,
// semicolons, tabs or newlines.
//
//   newcmd : text to tokenize (not modified); may be NULL
//   argc   : receives the number of tokens found
//   argv   : receives pointers to the tokens; the caller must provide room
//            for every token plus one extra slot
//
// Tokenizing stops at the first token that begins with '#'.
//
// Returns a pointer to the private copy of the string that the argv entries
// point into (this is also argv[0]); the caller releases it with delete []
// once the tokens are no longer needed.  Returns NULL, with *argc == 0 and
// argv[0] == NULL, when newcmd is NULL or contains no tokens.
//
// Uses strtok() and is therefore not reentrant.
char *str_tokenize(const char *newcmd, int *argc, char *argv[]) {
  static const char delims[] = " ,;\t\n";

  *argc = 0;

  // Without this guard a NULL command would reach strtok(NULL, ...) and
  // resume whatever string was tokenized last.
  if (newcmd == NULL) {
    argv[0] = NULL;
    return NULL;
  }

  // Skip leading separators so that the first token starts at the very
  // beginning of the copy; the returned pointer is then also the pointer
  // that must be handed to delete [].
  const char *cmdstart = newcmd;
  while (*cmdstart != '\0' && strchr(delims, *cmdstart) != NULL)
    cmdstart++;

  // strtok() writes into its argument, so work on a private copy.
  const size_t nbytes = strlen(cmdstart) + 1;
  char *cmd = new char[nbytes];
  memcpy(cmd, cmdstart, nbytes);

  argv[0] = strtok(cmd, delims);
  while (argv[*argc] != NULL) {
    if (argv[*argc][0] == '#')
      break;                       // rest of the line is a comment

    (*argc)++;
    argv[*argc] = strtok(NULL, delims);
  }

  if (*argc == 0) {
    // Nothing usable was found: the caller gets NULL back and so has no way
    // to free the copy, release it here.
    delete [] cmd;
    argv[0] = NULL;
    return NULL;
  }

  return argv[0];
}
00241
00242
00243
// Return a time stamp in seconds, as a double.
//
// With MSVC this is the GetTickCount() millisecond counter divided by 1000;
// elsewhere it is the gettimeofday() wall-clock time with microsecond
// resolution.
double time_of_day(void) {
#if defined(_MSC_VER)
  return ((double) GetTickCount()) / 1000.0;
#else
  struct timeval tm;

  // Defined result even if the call should fail.
  tm.tv_sec = 0;
  tm.tv_usec = 0;

  // The time zone argument is obsolete; POSIX leaves the behavior
  // unspecified when it is not a null pointer.
  gettimeofday(&tm, NULL);
  return ((double) (tm.tv_sec) + (double) (tm.tv_usec) / 1000000.0);
#endif
}
00260
00261
00262 int vmd_check_stdin(void) {
00263 #if defined(_MSC_VER)
00264 if (_kbhit() != 0)
00265 return TRUE;
00266 else
00267 return FALSE;
00268 #else
00269 fd_set readvec;
00270 struct timeval timeout;
00271 int ret, stdin_fd;
00272
00273 timeout.tv_sec = 0;
00274 timeout.tv_usec = 0;
00275 stdin_fd = 0;
00276 FD_ZERO(&readvec);
00277 FD_SET(stdin_fd, &readvec);
00278
00279 #if !defined(ARCH_AIX3)
00280 ret = select(16, &readvec, NULL, NULL, &timeout);
00281 #else
00282 ret = select(16, (int *)(&readvec), NULL, NULL, &timeout);
00283 #endif
00284
00285 if (ret == -1) {
00286 if (errno != EINTR)
00287 printf("select() error while attempting to read text input.\n");
00288 return FALSE;
00289 } else if (ret == 0) {
00290 return FALSE;
00291 }
00292 return TRUE;
00293 #endif
00294 }
00295
00296
00297
00298 char *vmd_username(void) {
00299 #if defined(_MSC_VER)
00300 char username[1024];
00301 unsigned long size = 1023;
00302
00303 if (GetUserName((char *) &username, &size)) {
00304 return stringdup(username);
00305 }
00306 else {
00307 return stringdup("Windows User");
00308 }
00309 #else
00310 #if defined(ARCH_FREEBSD) || defined(ARCH_FREEBSDAMD64) || defined(__APPLE__) || defined(__linux)
00311 return stringdup(getlogin());
00312 #else
00313 return stringdup(cuserid(NULL));
00314 #endif
00315 #endif
00316 }
00317
// Return the numeric user id of the calling process.
// MSVC builds have no such id and always report 0.
int vmd_getuid(void) {
#if !defined(_MSC_VER)
  return getuid();
#else
  return 0;
#endif
}
00325
00326
00327 #if 0
00328
00329
00330
00331
00332
// NOTE(review): unfinished stub.  The surrounding "#if 0" keeps this function
// out of the build, and it would not compile if enabled: the SSE2 section
// below contains an incomplete declaration and an empty loop body.  The
// signature suggests it was meant to fill n ints with val, but iv is declared
// const -- TODO confirm the intent before reviving it.
void set_1fv_aligned(const int *iv, int n, const int val) {
  int i=0;

#if defined(VMDUSESSE) && defined(__SSE2__)
  // incomplete statement: no variable name, truncated initializer, no ';'
  __m128i = _mm_set_p

  // loop skeleton stepping four ints at a time; body was never written
  for (; i<(n-3); i+=4) {
  }
#endif
}
00343 #endif
00344
00345
00346 #if defined(VMDUSESSE) || defined(VMDUSEAVX) || defined(VMDUSENEON)
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
// Integer type that pointers are cast to by the alignment tests below.
// Only the low address bits are examined there.  The first branch is the one
// hard-wired to be active; the other two are kept as ready-made alternatives.
#if 1

// active choice
#define myintptrtype unsigned long
#elif 1

// alternative (inactive): size_t
#define myintptrtype size_t
#else

// alternative (inactive): uintptr_t
#define myintptrtype uintptr_t
#endif
00367
00368 #if 0
00369
00370 static int is_Nbyte_aligned(const void *ptr, int N) {
00371 return ((((myintptrtype) ptr) % N) == 0);
00372 }
00373 #endif
00374
00375
00376 static int is_16byte_aligned(const void *ptr) {
00377 return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0xf)));
00378 }
00379
00380 #if defined(VMDUSEAVX)
00381
00382 static int is_32byte_aligned(const void *ptr) {
00383 return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x1f)));
00384 }
00385 #endif
00386
00387 #if 0
00388
00389 static int is_64byte_aligned(const void *ptr) {
00390 return (((myintptrtype) ptr) == (((myintptrtype) ptr) & (~0x3f)));
00391 }
00392 #endif
00393 #endif
00394
00395
00396
00397
00398
00399 #if defined(VMDUSESSE) && defined(__SSE2__)
00400
// Debugging aid: print the four 32-bit lanes of mask4 in hexadecimal,
// lowest lane first.
static void print_m128i(__m128i mask4) {
  // Copy the lanes out with a store instead of reading the vector through
  // an int pointer alias; unsigned matches what the %x conversion expects.
  unsigned int lanes[4];
  _mm_storeu_si128((__m128i *) lanes, mask4);
  printf("vec: %08x %08x %08x %08x\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
00405
00406
// Horizontal AND: return the bitwise AND of the four 32-bit lanes of mask4.
static int hand_m128i(__m128i mask4) {
  // AND each lane with its neighbour within the lower/upper pair ...
  const __m128i neighbours = _mm_shuffle_epi32(mask4, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i pairs = _mm_and_si128(mask4, neighbours);

  // ... then AND the two pair results with each other.
  const __m128i swapped = _mm_shuffle_epi32(pairs, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128i all = _mm_and_si128(pairs, swapped);

  // every lane now holds the full result; hand back the lowest one
  return _mm_cvtsi128_si32(all);
}
00419
00420
// Horizontal OR: return the bitwise OR of the four 32-bit lanes of mask4.
static int hor_m128i(__m128i mask4) {
  // OR each lane with its neighbour within the lower/upper pair ...
  const __m128i neighbours = _mm_shuffle_epi32(mask4, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i pairs = _mm_or_si128(mask4, neighbours);

  // ... then OR the two pair results with each other.
  const __m128i swapped = _mm_shuffle_epi32(pairs, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128i any = _mm_or_si128(pairs, swapped);

  // every lane now holds the full result; hand back the lowest one
  return _mm_cvtsi128_si32(any);
}
00433
00434
// Horizontal add: return the sum of the four 32-bit integer lanes of sum4.
static int hadd_m128i(__m128i sum4) {
  // add each lane to its neighbour within the lower/upper pair ...
  const __m128i neighbours = _mm_shuffle_epi32(sum4, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i pairs = _mm_add_epi32(sum4, neighbours);

  // ... then add the two pair sums together.
  const __m128i swapped = _mm_shuffle_epi32(pairs, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128i total = _mm_add_epi32(pairs, swapped);

  // every lane now holds the full sum; hand back the lowest one
  return _mm_cvtsi128_si32(total);
}
00447
00448
// Bitwise select on integer vectors: each result bit comes from b where the
// corresponding mask bit is set and from a where it is clear.
static __m128i _mm_sel_m128i(const __m128i &a, const __m128i &b, const __m128i &mask) {
  // (a ^ b) marks the bits in which the inputs differ; keeping only the
  // masked ones and flipping them in a yields b's bits in those positions.
  const __m128i differing = _mm_xor_si128(b, a);
  const __m128i toflip = _mm_and_si128(mask, differing);
  return _mm_xor_si128(a, toflip);
}
00453
00454
// Bitwise select on float vectors: each result bit comes from b where the
// corresponding mask bit is set and from a where it is clear.
static __m128 _mm_sel_ps(const __m128 &a, const __m128 &b, const __m128 &mask) {
  // (a ^ b) marks the bits in which the inputs differ; keeping only the
  // masked ones and flipping them in a yields b's bits in those positions.
  const __m128 differing = _mm_xor_ps(b, a);
  const __m128 toflip = _mm_and_ps(mask, differing);
  return _mm_xor_ps(a, toflip);
}
00459
00460
00461
// Horizontal minimum: return the smallest of the four float lanes of min4.
static float fmin_m128(__m128 min4) {
  // minimum of each lane and its neighbour within the lower/upper pair ...
  const __m128 neighbours = _mm_shuffle_ps(min4, min4, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128 pairs = _mm_min_ps(min4, neighbours);

  // ... then the minimum across the two pairs.
  const __m128 swapped = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128 lowest = _mm_min_ps(pairs, swapped);

  // every lane now holds the result; extract the lowest one
  float result;
  _mm_store_ss(&result, lowest);
  return result;
}
00476
00477
00478
// Horizontal maximum: return the largest of the four float lanes of max4.
static float fmax_m128(__m128 max4) {
  // maximum of each lane and its neighbour within the lower/upper pair ...
  const __m128 neighbours = _mm_shuffle_ps(max4, max4, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128 pairs = _mm_max_ps(max4, neighbours);

  // ... then the maximum across the two pairs.
  const __m128 swapped = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 0, 3, 2));
  const __m128 highest = _mm_max_ps(pairs, swapped);

  // every lane now holds the result; extract the lowest one
  float result;
  _mm_store_ss(&result, highest);
  return result;
}
00492 #endif
00493
00494
00495
00496
00497
00498 #if defined(VMDUSENEON) && defined(__ARM_NEON__)
00499
00500
// Horizontal minimum: return the smallest of the four float lanes of min4.
static float fmin_f32x4(float32x4_t min4) {
  // view the vector as four consecutive floats and scan them
  const float *lane = (const float *) &min4;
  float lowest = lane[0];
  for (int k = 1; k < 4; k++) {
    if (lane[k] < lowest)
      lowest = lane[k];
  }
  return lowest;
}
00509
// Horizontal maximum: return the largest of the four float lanes of max4.
static float fmax_f32x4(float32x4_t max4) {
  // view the vector as four consecutive floats and scan them
  const float *lane = (const float *) &max4;
  float highest = lane[0];
  for (int k = 1; k < 4; k++) {
    if (lane[k] > highest)
      highest = lane[k];
  }
  return highest;
}
00518
00519 #endif
00520
00521
00522
// Find the index of the first nonzero entry in the flag array on[0..n-1].
//
// On success stores the index in *firstsel and returns 0.  When no entry is
// set (or n <= 0) stores 0 in *firstsel and returns -1.
//
// The AVX and SSE2 variants skip over runs of zero flags a whole vector at a
// time; the exact index is always pinned down by a scalar scan.
int find_first_selection_aligned(int n, const int *on, int *firstsel) {
  int idx = 0;
  *firstsel = 0;

#if defined(VMDUSEAVX) && defined(__AVX__)
  // scalar scan up to the first 32-byte aligned element
  while ((idx < n) && !is_32byte_aligned(&on[idx])) {
    if (on[idx]) {
      *firstsel = idx;
      return 0;
    }
    idx++;
  }

  // skip groups of eight flags that are all zero; stop at the first group
  // containing a set flag and leave idx at its start
  while (idx < (n-7)) {
    __m256i on8 = _mm256_load_si256((__m256i*) &on[idx]);
    if (!_mm256_testz_si256(on8, on8))
      break;
    idx += 8;
  }
#elif defined(VMDUSESSE) && defined(__SSE2__)
  // scalar scan up to the first 16-byte aligned element
  while ((idx < n) && !is_16byte_aligned(&on[idx])) {
    if (on[idx]) {
      *firstsel = idx;
      return 0;
    }
    idx++;
  }

  // skip groups of four flags that are all zero; stop at the first group
  // containing a set flag and leave idx at its start
  while (idx < (n-3)) {
    __m128i on4 = _mm_load_si128((__m128i*) &on[idx]);
    if (hor_m128i(on4))
      break;
    idx += 4;
  }
#endif

  // scalar scan of whatever has not been ruled out yet (the whole array
  // when no vector path is compiled in)
  for (; idx < n; idx++) {
    if (on[idx]) {
      *firstsel = idx;
      return 0;
    }
  }

  // nothing is selected
  *firstsel = 0;
  return -1;
}
00588
00589
00590
// Find the index of the last nonzero entry in the flag array on[0..n-1].
//
// On success stores the index in *lastsel and returns 0.  When no entry is
// set (or n <= 0) stores -1 in *lastsel and returns -1.
//
// The AVX and SSE2 variants walk backwards over runs of zero flags a whole
// vector at a time; the exact index is always pinned down by a scalar scan.
int find_last_selection_aligned(int n, const int *on, int *lastsel) {
  int i;
  *lastsel = -1;

#if defined(VMDUSEAVX) && defined(__AVX__)
  // scalar scan backwards from the end until a 32-byte aligned element has
  // been examined
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }

    if (is_32byte_aligned(&on[i]))
      break;
  }

  // step backwards over groups of eight flags that are all zero
  for (i-=8; i>=0; i-=8) {
    __m256i on8 = _mm256_load_si256((__m256i*) &on[i]);
    if (!_mm256_testz_si256(on8, on8))
      break;
  }

  // i is now either the start of the group holding the last set flag, or
  // negative because the vector loop ran off the front of the array.  In
  // the latter case only the leading elements that no aligned group covers
  // remain to be checked; the scan must never go below index 0.
  for (int j=i+7; (j>=i) && (j>=0); j--) {
    if (on[j]) {
      *lastsel = j;
      return 0;
    }
  }
#elif defined(VMDUSESSE) && defined(__SSE2__)
  // scalar scan backwards from the end until a 16-byte aligned element has
  // been examined
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }

    if (is_16byte_aligned(&on[i]))
      break;
  }

  // step backwards over groups of four flags that are all zero
  for (i-=4; i>=0; i-=4) {
    __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
    if (hor_m128i(on4))
      break;
  }

  // i is now either the start of the group holding the last set flag, or
  // negative because the vector loop ran off the front of the array.  In
  // the latter case only the leading elements that no aligned group covers
  // remain to be checked; the scan must never go below index 0.
  for (int j=i+3; (j>=i) && (j>=0); j--) {
    if (on[j]) {
      *lastsel = j;
      return 0;
    }
  }
#else
  // plain scalar scan from the end
  for (i=n-1; i>=0; i--) {
    if (on[i]) {
      *lastsel = i;
      return 0;
    }
  }
#endif

  // nothing is selected
  *lastsel = -1;
  return -1;
}
00666
00667
00668
00669
00670 int analyze_selection_aligned(int n, const int *on,
00671 int *firstsel, int *lastsel, int *selected) {
00672 int sel = *selected = 0;
00673 int first = *firstsel = 0;
00674 int last = *lastsel = -1;
00675 int i;
00676
00677
00678 if (find_first_selection_aligned(n, on, &first)) {
00679 return -1;
00680 }
00681
00682
00683 if (find_last_selection_aligned(n, on, &last)) {
00684 return -1;
00685 }
00686
00687
00688
00689
00690
00691
00692
00693 #if !defined(__INTEL_COMPILER) && defined(VMDUSESSE) && defined(__SSE2__)
00694
00695
00696 for (i=first; ((i<=last) && (!is_16byte_aligned(&on[i]))); i++) {
00697 sel += on[i];
00698 }
00699
00700
00701 for (; i<=(last-3); i+=4) {
00702
00703 __m128i on4 = _mm_load_si128((__m128i*) &on[i]);
00704
00705
00706 sel += hadd_m128i(on4);
00707 }
00708
00709
00710 for (; i<=last; i++) {
00711 sel += on[i];
00712 }
00713 #else
00714
00715 for (i=first; i<=last; i++) {
00716 sel += on[i];
00717 }
00718 #endif
00719
00720 *selected = sel;
00721 *firstsel = first;
00722 *lastsel = last;
00723
00724 return 0;
00725 }
00726
00727
00728
// Compute the minimum and maximum of the n floats in f.
//
// The results are written to *fmin and *fmax.  When n < 1 the function
// returns without touching either output.
//
// SSE2 and NEON variants process the bulk of the array four floats at a
// time (in batches of 32), after a scalar lead-in up to the first 16-byte
// aligned element.
void minmax_1fv_aligned(const float *f, int n, float *fmin, float *fmax) {
  if (n < 1)
    return;

#if defined(VMDUSESSE) && defined(__SSE2__)
  int idx = 0;
  float lo = f[0];
  float hi = f[0];

  // scalar lead-in until f[idx] is 16-byte aligned
  for (; (idx < n) && !is_16byte_aligned(&f[idx]); idx++) {
    if (f[idx] < lo) lo = f[idx];
    if (f[idx] > hi) hi = f[idx];
  }

  // running per-lane extremes, seeded with the scalar results
  __m128 lo4 = _mm_set_ps1(lo);
  __m128 hi4 = _mm_set_ps1(hi);

  // batches of 32 floats, consumed as eight consecutive aligned loads
  for (; idx < (n-31); idx += 32) {
    for (int k = 0; k < 32; k += 4) {
      const __m128 v4 = _mm_load_ps(&f[idx+k]);
      lo4 = _mm_min_ps(lo4, v4);
      hi4 = _mm_max_ps(hi4, v4);
    }
  }

  // remaining whole groups of four
  for (; idx < (n-3); idx += 4) {
    const __m128 v4 = _mm_load_ps(&f[idx]);
    lo4 = _mm_min_ps(lo4, v4);
    hi4 = _mm_max_ps(hi4, v4);
  }

  // leftover elements, broadcast into all lanes
  for (; idx < n; idx++) {
    const __m128 v4 = _mm_set_ps1(f[idx]);
    lo4 = _mm_min_ps(lo4, v4);
    hi4 = _mm_max_ps(hi4, v4);
  }

  // collapse the four lanes into the final scalars
  *fmin = fmin_m128(lo4);
  *fmax = fmax_m128(hi4);
#elif defined(VMDUSENEON) && defined(__ARM_NEON__)
  int idx = 0;
  float lo = f[0];
  float hi = f[0];

  // scalar lead-in until f[idx] is 16-byte aligned
  for (; (idx < n) && !is_16byte_aligned(&f[idx]); idx++) {
    if (f[idx] < lo) lo = f[idx];
    if (f[idx] > hi) hi = f[idx];
  }

  // running per-lane extremes, seeded with the scalar results
  float32x4_t lo4 = vdupq_n_f32(lo);
  float32x4_t hi4 = vdupq_n_f32(hi);

  // batches of 32 floats, consumed as eight consecutive loads
  for (; idx < (n-31); idx += 32) {
    for (int k = 0; k < 32; k += 4) {
      const float32x4_t v4 = vld1q_f32(&f[idx+k]);
      lo4 = vminq_f32(lo4, v4);
      hi4 = vmaxq_f32(hi4, v4);
    }
  }

  // remaining whole groups of four
  for (; idx < (n-3); idx += 4) {
    const float32x4_t v4 = vld1q_f32(&f[idx]);
    lo4 = vminq_f32(lo4, v4);
    hi4 = vmaxq_f32(hi4, v4);
  }

  // leftover elements, broadcast into all lanes
  for (; idx < n; idx++) {
    const float32x4_t v4 = vdupq_n_f32(f[idx]);
    lo4 = vminq_f32(lo4, v4);
    hi4 = vmaxq_f32(hi4, v4);
  }

  // collapse the four lanes into the final scalars
  *fmin = fmin_f32x4(lo4);
  *fmax = fmax_f32x4(hi4);
#else
  // portable scalar version
  float lo = f[0];
  float hi = f[0];
  for (int idx = 1; idx < n; idx++) {
    const float v = f[idx];
    if (v < lo) lo = v;
    if (v > hi) hi = v;
  }
  *fmin = lo;
  *fmax = hi;
#endif
}
00870
00871
00872
00873
// Compute the per-component minimum and maximum of n3 packed xyz triples.
//
//   f    : 3*n3 floats laid out as x0 y0 z0 x1 y1 z1 ...
//   fmin : receives the three minima (x, y, z)
//   fmax : receives the three maxima (x, y, z)
//
// When n3 < 1 the function returns without touching the outputs.
//
// The SSE2 variant handles four triples (12 floats) per iteration once a
// triple starting at a 16-byte aligned address has been reached.
void minmax_3fv_aligned(const float *f, const int n3, float *fmin, float *fmax) {
  if (n3 < 1)
    return;

  const int end = n3*3;
  int i = 0;

  // seed the running extremes with the first triple
  float lo[3] = { f[0], f[1], f[2] };
  float hi[3] = { f[0], f[1], f[2] };

#if defined(VMDUSESSE) && defined(__SSE2__)
  // scalar lead-in, one triple at a time, until a triple starts on a
  // 16-byte boundary
  for (; (i < end) && !is_16byte_aligned(&f[i]); i += 3) {
    for (int c = 0; c < 3; c++) {
      const float v = f[i+c];
      if (v < lo[c]) lo[c] = v;
      if (v > hi[c]) hi[c] = v;
    }
  }

  // running per-lane extremes, seeded with the scalar results
  __m128 xlo4 = _mm_set_ps1(lo[0]);
  __m128 xhi4 = _mm_set_ps1(hi[0]);
  __m128 ylo4 = _mm_set_ps1(lo[1]);
  __m128 yhi4 = _mm_set_ps1(hi[1]);
  __m128 zlo4 = _mm_set_ps1(lo[2]);
  __m128 zhi4 = _mm_set_ps1(hi[2]);

  for (; i < (end-11); i += 12) {
    // three aligned loads pick up four consecutive triples
    const __m128 x0y0z0x1 = _mm_load_ps(&f[i  ]);
    const __m128 y1z1x2y2 = _mm_load_ps(&f[i+4]);
    const __m128 z2x3y3z3 = _mm_load_ps(&f[i+8]);

    // de-interleave into one vector per coordinate
    const __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
    const __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
    const __m128 xs = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
    const __m128 ys = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
    const __m128 zs = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));

    xlo4 = _mm_min_ps(xlo4, xs);
    xhi4 = _mm_max_ps(xhi4, xs);
    ylo4 = _mm_min_ps(ylo4, ys);
    yhi4 = _mm_max_ps(yhi4, ys);
    zlo4 = _mm_min_ps(zlo4, zs);
    zhi4 = _mm_max_ps(zhi4, zs);
  }

  // collapse the lanes back into scalars for the tail loop below
  lo[0] = fmin_m128(xlo4);
  lo[1] = fmin_m128(ylo4);
  lo[2] = fmin_m128(zlo4);

  hi[0] = fmax_m128(xhi4);
  hi[1] = fmax_m128(yhi4);
  hi[2] = fmax_m128(zhi4);
#endif

  // scalar loop over the remaining triples (all of them when the vector
  // path is not compiled in)
  for (; i < end; i += 3) {
    for (int c = 0; c < 3; c++) {
      const float v = f[i+c];
      if (v < lo[c]) lo[c] = v;
      if (v > hi[c]) hi[c] = v;
    }
  }

  fmin[0] = lo[0];
  fmax[0] = hi[0];
  fmin[1] = lo[1];
  fmax[1] = hi[1];
  fmin[2] = lo[2];
  fmax[2] = hi[2];
}
00970
00971
00972
00973
// Compute the per-component minimum and maximum of the selected xyz triples.
//
//   f        : packed coordinates, x0 y0 z0 x1 y1 z1 ...
//   on       : one selection flag per triple
//   n3       : number of triples
//   firstsel : index of the first triple to consider
//   lastsel  : index of the last triple to consider (inclusive)
//   fmin     : receives the three minima (x, y, z)
//   fmax     : receives the three maxima (x, y, z)
//
// Returns 0 on success, or -1 (outputs untouched) when n3 < 1 or the index
// range is not a valid, non-empty sub-range of [0, n3).
//
// The running extremes are seeded from triple firstsel without looking at
// its flag, so callers are expected to pass the index of a selected triple.
//
// NOTE: the scalar loops treat any nonzero flag as selected, whereas the
// SSE2 loop compares flags against the value 1; the two only agree when the
// flags are strictly 0 or 1.
int minmax_selected_3fv_aligned(const float *f, const int *on, const int n3,
                                const int firstsel, const int lastsel,
                                float *fmin, float *fmax) {
  float minx, maxx, miny, maxy, minz, maxz;

  if ((n3 < 1) || (firstsel < 0) || (lastsel < firstsel) || (lastsel >= n3))
    return -1;

  // start at the first selected triple
  int i=firstsel;
  minx=maxx=f[i*3  ];
  miny=maxy=f[i*3+1];
  minz=maxz=f[i*3+2];

  const int end=lastsel+1;

#if defined(VMDUSESSE) && defined(__SSE2__)
  // scalar lead-in until the coordinates of triple i start on a 16-byte
  // boundary; only the coordinate array has to be aligned because the flags
  // are fetched with an unaligned load below
  for (; i<end; i++) {
    const int ind3 = i * 3;

    if (is_16byte_aligned(&f[ind3])) {
      break;
    }

    if (on[i]) {
      float tmpx = f[ind3  ];
      if (tmpx < minx) minx = tmpx;
      if (tmpx > maxx) maxx = tmpx;

      float tmpy = f[ind3+1];
      if (tmpy < miny) miny = tmpy;
      if (tmpy > maxy) maxy = tmpy;

      float tmpz = f[ind3+2];
      if (tmpz < minz) minz = tmpz;
      if (tmpz > maxz) maxz = tmpz;
    }
  }

  // running per-lane extremes, seeded with the scalar results
  __m128 xmin4 = _mm_set_ps1(minx);
  __m128 xmax4 = _mm_set_ps1(maxx);
  __m128 ymin4 = _mm_set_ps1(miny);
  __m128 ymax4 = _mm_set_ps1(maxy);
  __m128 zmin4 = _mm_set_ps1(minz);
  __m128 zmax4 = _mm_set_ps1(maxz);

  const __m128i one4 = _mm_set1_epi32(1);

  for (; i<(end-3); i+=4) {
    // unaligned load: the flag array need not share the alignment of f
    const __m128i on4 = _mm_loadu_si128((const __m128i*) &on[i]);

    // all-ones lanes where the flag equals 1; skip the coordinate work
    // entirely when none of the four triples is selected
    const __m128i imask = _mm_cmpeq_epi32(one4, on4);
    if (!hor_m128i(imask))
      continue;

    // Reinterpret the integer mask as a float vector with the dedicated
    // cast intrinsic; a C-style cast between vector types is a compiler
    // extension that not every supported compiler accepts.
    const __m128 mask = _mm_castsi128_ps(imask);

    // three aligned loads pick up four consecutive triples (advancing i by
    // four moves the address by 48 bytes, so alignment is preserved)
    const int ind3 = i * 3;
    const __m128 x0y0z0x1 = _mm_load_ps(&f[ind3+0]);
    const __m128 y1z1x2y2 = _mm_load_ps(&f[ind3+4]);
    const __m128 z2x3y3z3 = _mm_load_ps(&f[ind3+8]);

    // de-interleave into one vector per coordinate
    const __m128 x2y2x3y3 = _mm_shuffle_ps(y1z1x2y2, z2x3y3z3, _MM_SHUFFLE(2, 1, 3, 2));
    const __m128 y0z0y1z1 = _mm_shuffle_ps(x0y0z0x1, y1z1x2y2, _MM_SHUFFLE(1, 0, 2, 1));
    const __m128 x = _mm_shuffle_ps(x0y0z0x1, x2y2x3y3, _MM_SHUFFLE(2, 0, 3, 0));
    const __m128 y = _mm_shuffle_ps(y0z0y1z1, x2y2x3y3, _MM_SHUFFLE(3, 1, 2, 0));
    const __m128 z = _mm_shuffle_ps(y0z0y1z1, z2x3y3z3, _MM_SHUFFLE(3, 0, 3, 1));

    // update only the lanes that belong to selected triples
    xmin4 = _mm_sel_ps(xmin4, _mm_min_ps(xmin4, x), mask);
    xmax4 = _mm_sel_ps(xmax4, _mm_max_ps(xmax4, x), mask);
    ymin4 = _mm_sel_ps(ymin4, _mm_min_ps(ymin4, y), mask);
    ymax4 = _mm_sel_ps(ymax4, _mm_max_ps(ymax4, y), mask);
    zmin4 = _mm_sel_ps(zmin4, _mm_min_ps(zmin4, z), mask);
    zmax4 = _mm_sel_ps(zmax4, _mm_max_ps(zmax4, z), mask);
  }

  // collapse the lanes back into scalars for the tail loop below
  minx = fmin_m128(xmin4);
  miny = fmin_m128(ymin4);
  minz = fmin_m128(zmin4);

  maxx = fmax_m128(xmax4);
  maxy = fmax_m128(ymax4);
  maxz = fmax_m128(zmax4);
#endif

  // scalar loop over the remaining triples (all of them when the vector
  // path is not compiled in)
  for (; i<end; i++) {
    if (on[i]) {
      const int ind3 = i * 3;
      float tmpx = f[ind3  ];
      if (tmpx < minx) minx = tmpx;
      if (tmpx > maxx) maxx = tmpx;

      float tmpy = f[ind3+1];
      if (tmpy < miny) miny = tmpy;
      if (tmpy > maxy) maxy = tmpy;

      float tmpz = f[ind3+2];
      if (tmpz < minz) minz = tmpz;
      if (tmpz > maxz) maxz = tmpz;
    }
  }

  fmin[0] = minx;
  fmax[0] = maxx;
  fmin[1] = miny;
  fmax[1] = maxy;
  fmin[2] = minz;
  fmax[2] = maxz;

  return 0;
}
01113
01114
01115
01116
01117
// Store the cross product x2 x x3 in x1 and return x1.
// The components are written one after another in x, y, z order, each
// computed from the inputs as they are at that moment.
float * cross_prod(float *x1, const float *x2, const float *x3) {
  float *result = x1;

  result[0] =  x2[1]*x3[2] - x3[1]*x2[2];
  result[1] = -x2[0]*x3[2] + x3[0]*x2[2];
  result[2] =  x2[0]*x3[1] - x3[0]*x2[1];

  return result;
}
01125
01126
01127
// Scale the 3-vector vect to unit length, in place, and return vect.
// A vector whose squared length is not greater than zero is left unchanged.
float * vec_normalize(float *vect) {
  const float len2 = vect[0]*vect[0] + vect[1]*vect[1] + vect[2]*vect[2];

  // nothing sensible can be done with a zero-length vector
  if (!(len2 > 0))
    return vect;

  const float rescale = 1.0f / sqrtf(len2);
  for (int k = 0; k < 3; k++)
    vect[k] *= rescale;

  return vect;
}
01141
01142
01143
// Return the Euclidean length of the 3-vector vect.
float norm(const float *vect) {
  const float len2 = vect[0]*vect[0] + vect[1]*vect[1] + vect[2]*vect[2];
  return sqrtf(len2);
}
01147
01148
01149
// Test whether the triangle (v0, v1, v2) is degenerate.
//
// Returns 1 when either side leaving v0 has zero length, or when the cosine
// of the angle between those two sides is >= 1 or <= -1 (the three points
// lie on one line); returns 0 otherwise.
int tri_degenerate(const float * v0, const float * v1, const float * v2) {
  float s1[3], s2[3];

  // the two sides that meet at v0
  for (int k = 0; k < 3; k++) {
    s1[k] = v0[k] - v1[k];
    s2[k] = v0[k] - v2[k];
  }

  const float s1_length = sqrtf(s1[0]*s1[0] + s1[1]*s1[1] + s1[2]*s1[2]);
  const float s2_length = sqrtf(s2[0]*s2[0] + s2[1]*s2[1] + s2[2]*s2[2]);

  // A zero-length side means two vertices coincide.  Dividing by the zero
  // product below would give inf * 0 = NaN, and since every comparison with
  // NaN is false the triangle would be reported as well-formed.  Written as
  // a negated test so that a NaN product is rejected here as well.
  const float denom = s1_length * s2_length;
  if (!(denom > 0.0f))
    return 1;

  // cosine of the angle between the two sides
  const float cosang = (1.0f / denom) * (s1[0]*s2[0] + s1[1]*s2[1] + s1[2]*s2[2]);

  if ((cosang >= 1.0) || (cosang <= -1.0))
    return 1;

  return 0;
}
01193
01194
01195
01196 float angle(const float *a, const float *b) {
01197 float ab[3];
01198 cross_prod(ab, a, b);
01199 float psin = sqrtf(dot_prod(ab, ab));
01200 float pcos = dot_prod(a, b);
01201 return 57.2958f * (float) atan2(psin, pcos);
01202 }
01203
01204
01205
01206
01207
01208 float dihedral(const float *a1,const float *a2,const float *a3,const float *a4)
01209 {
01210 float r1[3], r2[3], r3[3], n1[3], n2[3];
01211 vec_sub(r1, a2, a1);
01212 vec_sub(r2, a3, a2);
01213 vec_sub(r3, a4, a3);
01214
01215 cross_prod(n1, r1, r2);
01216 cross_prod(n2, r2, r3);
01217
01218 float psin = dot_prod(n1, r3) * sqrtf(dot_prod(r2, r2));
01219 float pcos = dot_prod(n1, n2);
01220
01221
01222
01223 return 57.2958f * (float) atan2(psin, pcos);
01224 }
01225
01226
01227 float distance(const float *a, const float *b) {
01228 return sqrtf(distance2(a,b));
01229 }
01230
// Build the full path of a temporary file named s.
//
// The directory is taken from the VMDTMPDIR environment variable when set;
// otherwise from TMP, then TEMP, then "c:\\\\" with MSVC, and "/tmp"
// everywhere else.  Trailing '/' characters are removed from the directory
// before a single separator and the file name are appended.
//
// Returns a new [] buffer of 1024 bytes; a result that would not fit is
// truncated to 1023 characters.  The caller releases it with delete [].
// A NULL file name is treated as an empty one.
char *vmd_tempfile(const char *s) {
  enum { TMPBUFSZ = 1024 };

  const char *dir = getenv("VMDTMPDIR");
#if defined(_MSC_VER)
  const char sep = '\\';
  if (dir == NULL) dir = getenv("TMP");
  if (dir == NULL) dir = getenv("TEMP");
  if (dir == NULL) dir = "c:\\\\";
#else
  const char sep = '/';
  if (dir == NULL) dir = "/tmp";
#endif

  // length of the directory with trailing slashes ignored
  size_t dirlen = strlen(dir);
  while (dirlen > 0 && dir[dirlen - 1] == '/')
    dirlen--;

  // The environment is outside our control, so every piece is clamped to
  // the room that is left instead of being copied blindly.
  const size_t room = TMPBUFSZ - 1;      // one byte is kept for the NUL
  if (dirlen > room)
    dirlen = room;

  char *tmpfilebuf = new char[TMPBUFSZ];
  size_t used = 0;

  memcpy(tmpfilebuf, dir, dirlen);
  used = dirlen;

  if (used < room)
    tmpfilebuf[used++] = sep;

  if (s != NULL) {
    size_t slen = strlen(s);
    if (slen > room - used)
      slen = room - used;
    memcpy(tmpfilebuf + used, s, slen);
    used += slen;
  }

  tmpfilebuf[used] = '\0';

  return tmpfilebuf;
}
01273
01274
// Delete the file at path.
// With MSVC returns 0 on success and -1 on failure; elsewhere the result of
// unlink() is returned as is.
int vmd_delete_file(const char * path) {
#if defined(_MSC_VER)
  // DeleteFile() reports failure with a zero return value
  return (DeleteFile(path) != 0) ? 0 : -1;
#else
  return unlink(path);
#endif
}
01285
// Suspend the caller for secs seconds.
void vmd_sleep(int secs) {
#if !defined(_MSC_VER)
  sleep(secs);
#else
  // the Windows call takes milliseconds
  Sleep(secs * 1000);
#endif
}
01293
// Suspend the caller for msecs milliseconds.
// Outside of MSVC builds, a value of zero or less returns immediately.
void vmd_msleep(int msecs) {
#if defined(_MSC_VER)
  Sleep(msecs);
#else
  if (msecs <= 0)
    return;

  // Split into whole seconds and the microsecond remainder: tv_usec has to
  // stay below one million, otherwise the timeout is invalid and select()
  // may fail with EINVAL instead of sleeping.
  struct timeval timeout;
  timeout.tv_sec = msecs / 1000;
  timeout.tv_usec = (msecs % 1000) * 1000;

  // select() with no descriptors is used purely as a sub-second timer
  select(0, NULL, NULL, NULL, &timeout);
#endif // _MSC_VER
}
01304
// Hand cmd to the C library's system() and return its result unchanged.
int vmd_system(const char* cmd) {
  const int status = system(cmd);
  return status;
}
01308
01309
// Return the next value from the platform's pseudo-random generator:
// random() in general, rand() with MSVC.
long vmd_random(void) {
#ifndef _MSC_VER
  return random();
#else
  return rand();
#endif
}
01320
// Seed the generator used by vmd_random(): srandom() in general,
// srand() with MSVC.
void vmd_srandom(unsigned int seed) {
#ifndef _MSC_VER
  srandom(seed);
#else
  srand(seed);
#endif
}
01328
01331 float vmd_random_gaussian() {
01332 static bool cache = false;
01333 static float cached_value;
01334 const float RAND_FACTOR = 2.f/VMD_RAND_MAX;
01335 float r, s, w;
01336
01337 if (cache) {
01338 cache = false;
01339 return cached_value;
01340 }
01341 do {
01342 r = RAND_FACTOR*vmd_random()-1.f;
01343 s = RAND_FACTOR*vmd_random()-1.f;
01344 w = r*r+s*s;
01345 } while (w >= 1.f);
01346 w = sqrtf(-2.f*logf(w)/w);
01347 cached_value = s * w;
01348 cache = true;
01349 return (r*w);
01350 }
01351
01352
// Return the amount of physical memory installed, in megabytes, or -1 when
// it cannot be determined on this platform.
long vmd_get_total_physmem_mb(void) {
#if defined(_MSC_VER)
  MEMORYSTATUS memstat;
  GlobalMemoryStatus(&memstat);
  if (memstat.dwLength != sizeof(memstat))
    return -1; /* memstat result is wrong size! */
  return memstat.dwTotalPhys/(1024 * 1024);
#elif defined(__linux)
  // The value is parsed from the "MemTotal:" line of /proc/meminfo, which
  // is expressed in kB.
  FILE *fp = fopen("/proc/meminfo", "r");
  if (fp == NULL)
    return -1;

  // Read at most size-1 bytes and terminate right after the data actually
  // read, so that strstr() never scans uninitialized buffer contents.
  char meminfobuf[1024];
  const size_t len = fread(meminfobuf, 1, sizeof(meminfobuf) - 1, fp);
  fclose(fp);
  meminfobuf[len] = '\0';

  if (len == 0)
    return -1;

  const char *pos = strstr(meminfobuf, "MemTotal:");
  if (pos == NULL)
    return -1;

  pos += 9; /* skip the "MemTotal:" tag */
  return strtol(pos, (char **)NULL, 10)/1024L;
#elif defined(AIXUSEPERFSTAT) && defined(_AIX)
  perfstat_memory_total_t minfo;
  perfstat_memory_total(NULL, &minfo, sizeof(perfstat_memory_total_t), 1);
  return minfo.real_total*(4096/1024)/1024;
#elif defined(_AIX)
  return (sysconf(_SC_AIX_REALMEM) / 1024);
#elif defined(_SC_PAGESIZE) && defined(_SC_PHYS_PAGES)
  // page size is converted to kB before multiplying by the page count
  long pgsz = sysconf(_SC_PAGESIZE);
  long physpgs = sysconf(_SC_PHYS_PAGES);
  return ((pgsz / 1024) * physpgs) / 1024;
#elif defined(__APPLE__)
  // "hw.memsize" reports the memory size in bytes as a 64-bit quantity
  uint64_t membytes;
  size_t len = sizeof(membytes);
  if (sysctlbyname("hw.memsize", &membytes, &len, NULL, 0))
    return -1;
  return (membytes / (1024*1024));
#else
  return -1; /* unrecognized system, no method to get this info */
#endif
}
01405
01406
01407
// Return the amount of physical memory currently available, in megabytes,
// or -1 when it cannot be determined on this platform.
long vmd_get_avail_physmem_mb(void) {
#if defined(_MSC_VER)
  MEMORYSTATUS memstat;
  GlobalMemoryStatus(&memstat);
  if (memstat.dwLength != sizeof(memstat))
    return -1; /* memstat result is wrong size! */
  return memstat.dwAvailPhys / (1024 * 1024);
#elif defined(__linux)
  // Available memory is taken to be the sum of the "MemFree:", "Buffers:"
  // and "Cached:" lines of /proc/meminfo, which are expressed in kB.
  // Fields that cannot be found simply contribute nothing.
  static const char *const fields[] = { "MemFree:", "Buffers:", "Cached:" };

  FILE *fp = fopen("/proc/meminfo", "r");
  if (fp == NULL)
    return -1;

  // Read at most size-1 bytes and terminate right after the data actually
  // read, so that strstr() never scans uninitialized buffer contents.
  char meminfobuf[1024];
  const size_t len = fread(meminfobuf, 1, sizeof(meminfobuf) - 1, fp);
  fclose(fp);
  meminfobuf[len] = '\0';

  if (len == 0)
    return -1;

  long val = 0L;
  for (size_t k = 0; k < sizeof(fields) / sizeof(fields[0]); k++) {
    const char *pos = strstr(meminfobuf, fields[k]);
    if (pos != NULL) {
      // skip exactly the tag; strtol() takes care of the blanks after it
      val += strtol(pos + strlen(fields[k]), (char **)NULL, 10);
    }
  }
  return val/1024L;
#elif defined(AIXUSEPERFSTAT) && defined(_AIX)
  perfstat_memory_total_t minfo;
  perfstat_memory_total(NULL, &minfo, sizeof(perfstat_memory_total_t), 1);
  return minfo.real_free*(4096/1024)/1024;
#elif defined(_SC_PAGESIZE) && defined(_SC_AVPHYS_PAGES)
  // page size is converted to kB before multiplying by the page count
  long pgsz = sysconf(_SC_PAGESIZE);
  long avphyspgs = sysconf(_SC_AVPHYS_PAGES);
  return ((pgsz / 1024) * avphyspgs) / 1024;
#elif defined(__APPLE__)
#if 0
  // disabled: query of "hw.usermem" through a plain int
  int rc;
  int membytes;
  size_t len = sizeof(membytes);
  if (sysctlbyname("hw.usermem", &membytes, &len, NULL, 0))
    return -1;
  return (membytes / (1024*1024));
#else
  return -1;
#endif
#else
  return -1; /* unrecognized system, no method to get this info */
#endif
}
01479
01480
01482 long vmd_get_avail_physmem_percent(void) {
01483 double total, avail;
01484 total = (double) vmd_get_total_physmem_mb();
01485 avail = (double) vmd_get_avail_physmem_mb();
01486 if (total > 0.0 && avail >= 0.0)
01487 return (long) (avail / (total / 100.0));
01488
01489 return -1;
01490 }
01491
01492