
MeasureQCP.C

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2019 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: MeasureQCP.C,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.35 $       $Date: 2020/10/15 16:07:31 $
00015  *
00016  ***************************************************************************
00017  * DESCRIPTION:
00018  *   Code to compute RMSD values for unaligned structures without 
00019  *   actually performing the alignment, particularly useful for 
00020  *   computing large dissimilarity matrices required for 
00021  *   trajectory clustering analysis
00022  *
00023  ***************************************************************************/
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 
00028 #define VMDQCPUSESSE  1
00029 // #define VMDQCPUSEAVX2 1
00030 #if defined(VMDUSEAVX512)
00031 #define VMDQCPUSEAVX512 1
00032 #endif
00033 
00034 #define VMDQCPUSETHRPOOL 1
00035 
00036 #if VMDQCPUSESSE && defined(__SSE2__)
00037 #include <emmintrin.h>
00038 #endif
00039 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00040 #include <immintrin.h>
00041 #endif
00042 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00043 #include <immintrin.h>
00044 #endif
00045 #if (defined(VMDQCPUSEVSX) && defined(__VSX__))
00046 #if defined(__GNUC__) && defined(__VEC__)
00047 #include <altivec.h>
00048 #endif
00049 #endif
00050 
00051 #include <math.h>
00052 #include "Measure.h"
00053 #include "AtomSel.h"
00054 #include "utilities.h"
00055 #include "ResizeArray.h"
00056 #include "MoleculeList.h"
00057 #include "Inform.h"
00058 #include "Timestep.h"
00059 #include "CUDAAccel.h"
00060 #include "CUDAMeasureQCP.h"
00061 #include "VMDApp.h"
00062 #include "WKFThreads.h"
00063 #include "WKFUtils.h"
00064 
00065 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
00066 
00067 // horizontal add: return the scalar sum of the eight doubles in a __m512d
00068 static double hadd8_m512d(__m512d sum8) {
00069   // AVX-512F provides a horizontal-sum reduction intrinsic, so use it
00070   // directly instead of an explicit permute/shuffle cascade
00071   return _mm512_reduce_add_pd(sum8);
00072 }
00075 
00076 
00077 // AVX-512 + FMA + 32-byte-aligned SOA-format memory buffers
00078 static double InnerProductSOA_avx512(double *A,
00079                                      float *crdx1, float *crdy1, float *crdz1,
00080                                      float *crdx2, float *crdy2, float *crdz2,
00081                                      const int cnt, const float *weight) {
00082   __m512d va0 = _mm512_set1_pd(0.0);
00083   __m512d va1 = _mm512_set1_pd(0.0);
00084   __m512d va2 = _mm512_set1_pd(0.0);
00085   __m512d va3 = _mm512_set1_pd(0.0);
00086   __m512d va4 = _mm512_set1_pd(0.0);
00087   __m512d va5 = _mm512_set1_pd(0.0);
00088   __m512d va6 = _mm512_set1_pd(0.0);
00089   __m512d va7 = _mm512_set1_pd(0.0);
00090   __m512d va8 = _mm512_set1_pd(0.0);
00091   __m512d vG1 = _mm512_set1_pd(0.0);
00092   __m512d vG2 = _mm512_set1_pd(0.0);
00093 
00094   if (weight != NULL) {
00095     for (int i=0; i<cnt; i+=8) {
00096       __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
00097       __m256 ya8f = _mm256_load_ps(crdy1 + i);
00098       __m256 za8f = _mm256_load_ps(crdz1 + i);
00099 
00100       __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from float to doubles
00101       __m512d ya8 = _mm512_cvtps_pd(ya8f);
00102       __m512d za8 = _mm512_cvtps_pd(za8f);
00103 
00104       __m512d gatmp = _mm512_mul_pd(xa8, xa8);
00105       gatmp = _mm512_fmadd_pd(ya8, ya8, gatmp);
00106       gatmp = _mm512_fmadd_pd(za8, za8, gatmp);
00107 
00108       __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
00109       __m256 yb8f = _mm256_load_ps(crdy2 + i);
00110       __m256 zb8f = _mm256_load_ps(crdz2 + i);
00111 
00112       __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from float to doubles
00113       __m512d yb8 = _mm512_cvtps_pd(yb8f);
00114       __m512d zb8 = _mm512_cvtps_pd(zb8f);
00115 
00116       __m512d gbtmp = _mm512_mul_pd(xb8, xb8);
00117       gbtmp = _mm512_fmadd_pd(yb8, yb8, gbtmp);
00118       gbtmp = _mm512_fmadd_pd(zb8, zb8, gbtmp);
00119 
00120       __m256 w8f = _mm256_load_ps(weight + i); // load 8-float vector
00121       __m512d w8 = _mm512_cvtps_pd(w8f); // convert from float to double
00122 
00123       vG1 = _mm512_fmadd_pd(w8, gatmp, vG1);
00124       vG2 = _mm512_fmadd_pd(w8, gbtmp, vG2);
00125 
00126       va0 = _mm512_fmadd_pd(xa8, xb8, va0);
00127       va1 = _mm512_fmadd_pd(xa8, yb8, va1);
00128       va2 = _mm512_fmadd_pd(xa8, zb8, va2);
00129 
00130       va3 = _mm512_fmadd_pd(ya8, xb8, va3);
00131       va4 = _mm512_fmadd_pd(ya8, yb8, va4);
00132       va5 = _mm512_fmadd_pd(ya8, zb8, va5);
00133 
00134       va6 = _mm512_fmadd_pd(za8, xb8, va6);
00135       va7 = _mm512_fmadd_pd(za8, yb8, va7);
00136       va8 = _mm512_fmadd_pd(za8, zb8, va8);
00137     }
00138   } else {
00139     for (int i=0; i<cnt; i+=8) {
00140       __m256 xa8f = _mm256_load_ps(crdx1 + i); // load 8-float vectors
00141       __m256 ya8f = _mm256_load_ps(crdy1 + i);
00142       __m256 za8f = _mm256_load_ps(crdz1 + i);
00143 
00144       __m512d xa8 = _mm512_cvtps_pd(xa8f); // convert from float to doubles
00145       __m512d ya8 = _mm512_cvtps_pd(ya8f);
00146       __m512d za8 = _mm512_cvtps_pd(za8f);
00147 
00148       vG1 = _mm512_fmadd_pd(xa8, xa8, vG1);
00149       vG1 = _mm512_fmadd_pd(ya8, ya8, vG1);
00150       vG1 = _mm512_fmadd_pd(za8, za8, vG1);
00151 
00152       __m256 xb8f = _mm256_load_ps(crdx2 + i); // load 8-float vectors
00153       __m256 yb8f = _mm256_load_ps(crdy2 + i);
00154       __m256 zb8f = _mm256_load_ps(crdz2 + i);
00155 
00156       __m512d xb8 = _mm512_cvtps_pd(xb8f); // convert from float to doubles
00157       __m512d yb8 = _mm512_cvtps_pd(yb8f);
00158       __m512d zb8 = _mm512_cvtps_pd(zb8f);
00159 
00160       vG2 = _mm512_fmadd_pd(xb8, xb8, vG2);
00161       vG2 = _mm512_fmadd_pd(yb8, yb8, vG2);
00162       vG2 = _mm512_fmadd_pd(zb8, zb8, vG2);
00163 
00164       va0 = _mm512_fmadd_pd(xa8, xb8, va0);
00165       va1 = _mm512_fmadd_pd(xa8, yb8, va1);
00166       va2 = _mm512_fmadd_pd(xa8, zb8, va2);
00167 
00168       va3 = _mm512_fmadd_pd(ya8, xb8, va3);
00169       va4 = _mm512_fmadd_pd(ya8, yb8, va4);
00170       va5 = _mm512_fmadd_pd(ya8, zb8, va5);
00171 
00172       va6 = _mm512_fmadd_pd(za8, xb8, va6);
00173       va7 = _mm512_fmadd_pd(za8, yb8, va7);
00174       va8 = _mm512_fmadd_pd(za8, zb8, va8);
00175     }
00176   }
00177 
00178   A[0] = hadd8_m512d(va0);
00179   A[1] = hadd8_m512d(va1);
00180   A[2] = hadd8_m512d(va2);
00181   A[3] = hadd8_m512d(va3);
00182   A[4] = hadd8_m512d(va4);
00183   A[5] = hadd8_m512d(va5);
00184   A[6] = hadd8_m512d(va6);
00185   A[7] = hadd8_m512d(va7);
00186   A[8] = hadd8_m512d(va8);
00187 
00188   double G1 = hadd8_m512d(vG1);
00189   double G2 = hadd8_m512d(vG2);
00190 
00191   return (G1 + G2) * 0.5;
00192 }
00193 
00194 #endif
00195 
00196 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00197 
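// horizontal add: return the scalar sum of the four doubles packed in a
// __m256d (fold the upper and lower 128-bit lanes together, then hadd)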
00198 static double hadd4_m256d(__m256d sum4) {
00199   __m256d tmp = sum4;
00200   __m256d hsum4 = _mm256_add_pd(tmp, _mm256_permute2f128_pd(tmp, tmp, 0x1));
00201   __m128d hsum2 = _mm256_castpd256_pd128(hsum4);
00202   __m128d sum2 = _mm_hadd_pd(hsum2, hsum2);
00203   return _mm_cvtsd_f64(sum2);
00204 }
00205 
00206 
00207 // AVX2 + FMA + 32-byte-aligned SOA-format memory buffers
00208 static double InnerProductSOA_avx2(double *A,
00209                                    float *crdx1, float *crdy1, float *crdz1,
00210                                    float *crdx2, float *crdy2, float *crdz2,
00211                                    const int cnt, const float *weight) {
00212   __m256d va0 = _mm256_set1_pd(0.0);
00213   __m256d va1 = _mm256_set1_pd(0.0);
00214   __m256d va2 = _mm256_set1_pd(0.0);
00215   __m256d va3 = _mm256_set1_pd(0.0);
00216   __m256d va4 = _mm256_set1_pd(0.0);
00217   __m256d va5 = _mm256_set1_pd(0.0);
00218   __m256d va6 = _mm256_set1_pd(0.0);
00219   __m256d va7 = _mm256_set1_pd(0.0);
00220   __m256d va8 = _mm256_set1_pd(0.0);
00221   __m256d vG1 = _mm256_set1_pd(0.0);
00222   __m256d vG2 = _mm256_set1_pd(0.0);
00223 
00224   if (weight != NULL) {
00225     for (int i=0; i<cnt; i+=4) {
00226       __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
00227       __m128 ya4f = _mm_load_ps(crdy1 + i);
00228       __m128 za4f = _mm_load_ps(crdz1 + i);
00229 
00230       __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from float to doubles
00231       __m256d ya4 = _mm256_cvtps_pd(ya4f);
00232       __m256d za4 = _mm256_cvtps_pd(za4f);
00233 
00234       __m256d gatmp = _mm256_mul_pd(xa4, xa4);
00235       gatmp = _mm256_fmadd_pd(ya4, ya4, gatmp);
00236       gatmp = _mm256_fmadd_pd(za4, za4, gatmp);
00237 
00238       __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
00239       __m128 yb4f = _mm_load_ps(crdy2 + i);
00240       __m128 zb4f = _mm_load_ps(crdz2 + i);
00241 
00242       __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from float to doubles
00243       __m256d yb4 = _mm256_cvtps_pd(yb4f);
00244       __m256d zb4 = _mm256_cvtps_pd(zb4f);
00245 
00246       __m256d gbtmp = _mm256_mul_pd(xb4, xb4);
00247       gbtmp = _mm256_fmadd_pd(yb4, yb4, gbtmp);
00248       gbtmp = _mm256_fmadd_pd(zb4, zb4, gbtmp);
00249 
00250       __m128 w4f = _mm_load_ps(weight + i); // load 4-float vector
00251       __m256d w4 = _mm256_cvtps_pd(w4f); // convert from float to double
00252 
00253       vG1 = _mm256_fmadd_pd(w4, gatmp, vG1);
00254       vG2 = _mm256_fmadd_pd(w4, gbtmp, vG2);
00255 
00256       va0 = _mm256_fmadd_pd(xa4, xb4, va0);
00257       va1 = _mm256_fmadd_pd(xa4, yb4, va1);
00258       va2 = _mm256_fmadd_pd(xa4, zb4, va2);
00259 
00260       va3 = _mm256_fmadd_pd(ya4, xb4, va3);
00261       va4 = _mm256_fmadd_pd(ya4, yb4, va4);
00262       va5 = _mm256_fmadd_pd(ya4, zb4, va5);
00263 
00264       va6 = _mm256_fmadd_pd(za4, xb4, va6);
00265       va7 = _mm256_fmadd_pd(za4, yb4, va7);
00266       va8 = _mm256_fmadd_pd(za4, zb4, va8);
00267     }
00268   } else {
00269     for (int i=0; i<cnt; i+=4) {
00270       __m128 xa4f = _mm_load_ps(crdx1 + i); // load 4-float vectors
00271       __m128 ya4f = _mm_load_ps(crdy1 + i);
00272       __m128 za4f = _mm_load_ps(crdz1 + i);
00273 
00274       __m256d xa4 = _mm256_cvtps_pd(xa4f); // convert from float to doubles
00275       __m256d ya4 = _mm256_cvtps_pd(ya4f);
00276       __m256d za4 = _mm256_cvtps_pd(za4f);
00277 
00278       vG1 = _mm256_fmadd_pd(xa4, xa4, vG1);
00279       vG1 = _mm256_fmadd_pd(ya4, ya4, vG1);
00280       vG1 = _mm256_fmadd_pd(za4, za4, vG1);
00281 
00282       __m128 xb4f = _mm_load_ps(crdx2 + i); // load 4-float vectors
00283       __m128 yb4f = _mm_load_ps(crdy2 + i);
00284       __m128 zb4f = _mm_load_ps(crdz2 + i);
00285 
00286       __m256d xb4 = _mm256_cvtps_pd(xb4f); // convert from float to doubles
00287       __m256d yb4 = _mm256_cvtps_pd(yb4f);
00288       __m256d zb4 = _mm256_cvtps_pd(zb4f);
00289 
00290       vG2 = _mm256_fmadd_pd(xb4, xb4, vG2);
00291       vG2 = _mm256_fmadd_pd(yb4, yb4, vG2);
00292       vG2 = _mm256_fmadd_pd(zb4, zb4, vG2);
00293 
00294       va0 = _mm256_fmadd_pd(xa4, xb4, va0);
00295       va1 = _mm256_fmadd_pd(xa4, yb4, va1);
00296       va2 = _mm256_fmadd_pd(xa4, zb4, va2);
00297 
00298       va3 = _mm256_fmadd_pd(ya4, xb4, va3);
00299       va4 = _mm256_fmadd_pd(ya4, yb4, va4);
00300       va5 = _mm256_fmadd_pd(ya4, zb4, va5);
00301 
00302       va6 = _mm256_fmadd_pd(za4, xb4, va6);
00303       va7 = _mm256_fmadd_pd(za4, yb4, va7);
00304       va8 = _mm256_fmadd_pd(za4, zb4, va8);
00305     }
00306   }
00307 
00308   A[0] = hadd4_m256d(va0);
00309   A[1] = hadd4_m256d(va1);
00310   A[2] = hadd4_m256d(va2);
00311   A[3] = hadd4_m256d(va3);
00312   A[4] = hadd4_m256d(va4);
00313   A[5] = hadd4_m256d(va5);
00314   A[6] = hadd4_m256d(va6);
00315   A[7] = hadd4_m256d(va7);
00316   A[8] = hadd4_m256d(va8);
00317 
00318   double G1 = hadd4_m256d(vG1);
00319   double G2 = hadd4_m256d(vG2);
00320 
00321   return (G1 + G2) * 0.5;
00322 }
00323 
00324 #endif
00325 
00326 
00327 // plain C++ version of inner product for SOA coordinate storage
00328 static double InnerProductSOA(double *A,
00329                               float *crdx1, float *crdy1, float *crdz1,
00330                               float *crdx2, float *crdy2, float *crdz2,
00331                               const int cnt, const float *weight) {
00332   double G1=0.0, G2 = 0.0;
00333   memset(A, 0, sizeof(double) * 9);
00334 
00335   double x1, x2, y1, y2, z1, z2;
00336   double a0, a1, a2, a3, a4, a5, a6, a7, a8;
00337   a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00338   if (weight != NULL) {
00339     for (int i=0; i<cnt; i++) {
00340       double w = weight[i];
00341       x1 = crdx1[i];
00342       y1 = crdy1[i];
00343       z1 = crdz1[i];
00344 
00345       G1 += w * (x1*x1 + y1*y1 + z1*z1);
00346 
00347       x2 = crdx2[i];
00348       y2 = crdy2[i];
00349       z2 = crdz2[i];
00350 
00351       G2 += w * (x2*x2 + y2*y2 + z2*z2);
00352 
00353       a0 += x1 * x2;
00354       a1 += x1 * y2;
00355       a2 += x1 * z2;
00356 
00357       a3 += y1 * x2;
00358       a4 += y1 * y2;
00359       a5 += y1 * z2;
00360 
00361       a6 += z1 * x2;
00362       a7 += z1 * y2;
00363       a8 += z1 * z2;
00364     }
00365   } else {
00366     for (int i=0; i<cnt; i++) {
00367       x1 = crdx1[i];
00368       y1 = crdy1[i];
00369       z1 = crdz1[i];
00370 
00371       G1 += x1*x1 + y1*y1 + z1*z1;
00372 
00373       x2 = crdx2[i];
00374       y2 = crdy2[i];
00375       z2 = crdz2[i];
00376 
00377       G2 += x2*x2 + y2*y2 + z2*z2;
00378 
00379       a0 += x1 * x2;
00380       a1 += x1 * y2;
00381       a2 += x1 * z2;
00382 
00383       a3 += y1 * x2;
00384       a4 += y1 * y2;
00385       a5 += y1 * z2;
00386 
00387       a6 += z1 * x2;
00388       a7 += z1 * y2;
00389       a8 += z1 * z2;
00390     }
00391   }
00392 
00393   A[0] = a0;
00394   A[1] = a1;
00395   A[2] = a2;
00396 
00397   A[3] = a3;
00398   A[4] = a4;
00399   A[5] = a5;
00400 
00401   A[6] = a6;
00402   A[7] = a7;
00403   A[8] = a8;
00404 
00405   return (G1 + G2) * 0.5;
00406 }
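
// Illustrative usage sketch (hypothetical, not part of the original file):
// two structures that have already been translated to their centers of mass
// are handed to InnerProductSOA(), which fills the 3x3 correlation matrix A
// and returns E0 = (G1+G2)/2; FastCalcRMSDAndRotation() then solves for the
// RMSD, exactly as measure_rmsd_qcp() does further below.  Guarded by #if 0
// so it is never compiled; the coordinate values are made up for the example.
#if 0
static void example_qcp_rmsd_soa(void) {
  // two hypothetical 4-atom structures, both already centered at the origin
  float x1[4] = {-1.0f,  1.0f, -1.0f,  1.0f};
  float y1[4] = {-1.0f, -1.0f,  1.0f,  1.0f};
  float z1[4] = { 0.0f,  0.0f,  0.0f,  0.0f};
  float x2[4] = {-1.1f,  1.1f, -0.9f,  0.9f};
  float y2[4] = {-1.0f, -1.0f,  1.0f,  1.0f};
  float z2[4] = { 0.1f, -0.1f,  0.1f, -0.1f};

  double A[9];
  float rmsd = 0.0f;
  double E0 = InnerProductSOA(A, x1, y1, z1, x2, y2, z2, 4, NULL);
  FastCalcRMSDAndRotation(NULL, A, &rmsd, E0, 4, -1);
  printf("example QCP RMSD: %g\n", rmsd);
}
#endif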
00407 
00408 //
00409 // OpenACC version of inner product for SOA coordinate storage
00410 //
00411 // use pgc++ -m64 -Minfo=accel -ta=nvidia -O -acc
00412 #if defined(__PGIC__) && defined(_OPENACC)
00413 
00414 #if 0
00415 static void vecadd_acc(void) {
00416   printf("****** OpenACC test vecadd_acc()...\n");
00417 
00418   // Size of vectors
00419   int n = 10000;
00420  
00421   // Input vectors
00422   double *restrict a;
00423   double *restrict b;
00424 
00425   // Output vector
00426   double *restrict c;
00427  
00428   // Size, in bytes, of each vector
00429   size_t bytes = n*sizeof(double);
00430  
00431   // Allocate memory for each vector
00432   a = (double*)malloc(bytes);
00433   b = (double*)malloc(bytes);
00434   c = (double*)malloc(bytes);
00435  
00436   // Initialize the input vectors: a[i] = sin(i)^2, b[i] = cos(i)^2
00437   int i;
00438   for (i=0; i<n; i++) {
00439     a[i] = sin(i)*sin(i);
00440     b[i] = cos(i)*cos(i);
00441   }   
00442  
00443   // sum component wise and save result into vector c
00444 #pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
00445   for (i=0; i<n; i++) {
00446     c[i] = a[i] + b[i];
00447   }
00448  
00449   // Sum up vector c and print the result divided by n; this should equal 1 to within rounding error
00450   double sum = 0.0;
00451   for(i=0; i<n; i++) {
00452     sum += c[i];
00453   }
00454   sum = sum/n;
00455   printf("****** final result: %f *******\n", sum);
00456  
00457   // Release memory
00458   free(a);
00459   free(b);
00460   free(c);
00461 
00462   printf("****** OpenACC test vecadd_acc() done.\n");
00463 }
00464 #endif
00465 
00466 //
00467 // Use a 1-D loop rather than a 2-D loop so that PGI OpenACC does not
00468 // complain about loop-carried dependencies; this appears to be the only
00469 // approach that achieves good-quality parallelization.
00470 //
00471 #define LOOP1D 1
00472 
00473 #if defined(LOOP1D)
00474 #if defined(__PGIC__) && defined(_OPENACC)
00475 #pragma acc routine seq
00476 #endif
00477 static void acc_idx2sub_tril(long N, long ind, long *J, long *I) {
00478   long i, j;
00479   i = long(floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2));
00480   j = ind - N*i + i*(i-1)/2 + i;
00481  
00482   *I = i;
00483   *J = j;
00484 }
00485 #endif
00486 
00487 static void rmsdmat_qcp_acc(int cnt, int padcnt, int framecrdsz, 
00488                             int framecount, 
00489 #if defined(LOOP1D)
00490                             const float * restrict crds, 
00491 #else
00492                             const float * crds, 
00493 #endif
00494                             // const float *weight,
00495                             float * rmsdmat) {
00496   printf("OpenACC rmsdmat_qcp_acc()...\n");
00497   printf("ACC cnt: %d padcnt: %d\n", cnt, padcnt);
00498 
00499   printf("Copying input arrays to accelerators...\n");
00500   long totalsz = 3L * framecrdsz * framecount;
00501   printf("ACC copysz: %ld  (3 * %d * %d)\n", totalsz, framecrdsz, framecount);
00502 
00503   long matcnt = framecount * framecount;
00504   printf("ACC matcnt: %ld\n", matcnt);
00505 
00506   printf("Running OpenACC kernels...\n");
00507 #if defined(LOOP1D)
00508   long i, j, k;
00509 #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
00510   for (k=0; k<(framecount*(framecount-1))/2; k++) {
00511     acc_idx2sub_tril(long(framecount-1), k, &i, &j);
00512     long x1addr = j * 3L * framecrdsz;
00513     {
00514 #else
00515   long i, j;
00516 #pragma acc kernels copyin(crds[0:totalsz]), copy(rmsdmat[0:matcnt])
00517   for (j=0; j<framecount; j++) {
00518     long x1addr = j * 3L * framecrdsz;
00519 
00520     for (i=0; i<j; i++) {
00521 #endif
00522       // calculate the (weighted) inner product of two structures
00523       long x2addr = i * 3L * framecrdsz;
00524 
00525       double G1=0.0, G2=0.0;
00526 
00527       double a0, a1, a2, a3, a4, a5, a6, a7, a8;
00528       a0=a1=a2=a3=a4=a5=a6=a7=a8=0.0;
00529 #if 0
00530       if (weight != NULL) {
00531         double x1, x2, y1, y2, z1, z2;
00532 #pragma acc loop
00533         for (long l=0; l<cnt; l++) {
00534           double w = weight[l];
00535           x1 = crds[l + x1addr];
00536           y1 = crds[l + x1addr + framecrdsz];
00537           z1 = crds[l + x1addr + framecrdsz*2];
00538 
00539           G1 += w * (x1*x1 + y1*y1 + z1*z1);
00540 
00541           x2 = crds[l + x2addr];
00542           y2 = crds[l + x2addr + framecrdsz];
00543           z2 = crds[l + x2addr + framecrdsz*2];
00544 
00545           G2 += w * (x2*x2 + y2*y2 + z2*z2);
00546 
00547           a0 += x1 * x2;
00548           a1 += x1 * y2;
00549           a2 += x1 * z2;
00550 
00551           a3 += y1 * x2;
00552           a4 += y1 * y2;
00553           a5 += y1 * z2;
00554 
00555           a6 += z1 * x2;
00556           a7 += z1 * y2;
00557           a8 += z1 * z2;
00558         }
00559       } else {
00560 #endif
00561         double x1, x2, y1, y2, z1, z2;
00562 #pragma acc loop vector(256)
00563 //#pragma acc loop vector(256) reduction(+:a0),reduction(+:a1),reduction(+:a2),reduction(+:a3),reduction(+:a4),reduction(+:a5),reduction(+:a6),reduction(+:a7),reduction(+:a8),reduction(+:G1),reduction(+:G2)
00564         for (long l=0; l<cnt; l++) {
00565           x1 = crds[l + x1addr];
00566           y1 = crds[l + x1addr + framecrdsz];
00567           z1 = crds[l + x1addr + framecrdsz*2];
00568 
00569           G1 += x1*x1 + y1*y1 + z1*z1;
00570 
00571           x2 = crds[l + x2addr];
00572           y2 = crds[l + x2addr + framecrdsz];
00573           z2 = crds[l + x2addr + framecrdsz*2];
00574 
00575           G2 += x2*x2 + y2*y2 + z2*z2;
00576 
00577           a0 += x1 * x2;
00578           a1 += x1 * y2;
00579           a2 += x1 * z2;
00580 
00581           a3 += y1 * x2;
00582           a4 += y1 * y2;
00583           a5 += y1 * z2;
00584 
00585           a6 += z1 * x2;
00586           a7 += z1 * y2;
00587           a8 += z1 * z2;
00588         }
00589 #if 0
00590       }
00591 #endif
00592 
00593       double A[9];
00594       A[0] = a0;
00595       A[1] = a1;
00596       A[2] = a2;
00597 
00598       A[3] = a3;
00599       A[4] = a4;
00600       A[5] = a5;
00601 
00602       A[6] = a6;
00603       A[7] = a7;
00604       A[8] = a8;
00605 
00606       double E0 = (G1 + G2) * 0.5;
00607 
00608       // calculate the RMSD & rotational matrix
00609       float rmsd;
00610       FastCalcRMSDAndRotation(NULL, A, &rmsd, E0, cnt, -1);
00611 #if defined(LOOP1D)
00612       rmsdmat[k]=rmsd; // store linearized triangle
00613 #else
00614       rmsdmat[j*framecount + i]=rmsd;
00615 #endif
00616     }
00617   }
00618 
00619   printf("ACC done.\n");
00620 }
00621 
00622 #endif
00623 
00624 
00625 #if 0
00626 static double InnerProductAOS(double *A, double *coords1, double *coords2,
00627                               const int cnt, const double *weight) {
00628   double G1=0.0, G2=0.0;
00629   memset(A, 0, sizeof(double) * 9);
00630 
00631   long i;
00632   double x1, x2, y1, y2, z1, z2;
00633   if (weight != NULL) {
00634     for (i=0; i<cnt; i++) {
00635       double w = weight[i];
00636       long idx = i*3;
00637       x1 = coords1[idx  ];
00638       y1 = coords1[idx+1];
00639       z1 = coords1[idx+2];
00640 
00641       G1 += w * (x1*x1 + y1*y1 + z1*z1);
00642 
00643       x2 = coords2[idx  ];
00644       y2 = coords2[idx+1];
00645       z2 = coords2[idx+2];
00646 
00647       G2 += w * (x2*x2 + y2*y2 + z2*z2);
00648 
00649       A[0] +=  (x1 * x2);
00650       A[1] +=  (x1 * y2);
00651       A[2] +=  (x1 * z2);
00652 
00653       A[3] +=  (y1 * x2);
00654       A[4] +=  (y1 * y2);
00655       A[5] +=  (y1 * z2);
00656 
00657       A[6] +=  (z1 * x2);
00658       A[7] +=  (z1 * y2);
00659       A[8] +=  (z1 * z2);
00660     }
00661   } else {
00662     for (i=0; i<cnt; i++) {
00663       long idx = i*3;
00664       x1 = coords1[idx  ];
00665       y1 = coords1[idx+1];
00666       z1 = coords1[idx+2];
00667 
00668       G1 += x1*x1 + y1*y1 + z1*z1;
00669 
00670       x2 = coords2[idx  ];
00671       y2 = coords2[idx+1];
00672       z2 = coords2[idx+2];
00673 
00674       G2 += x2*x2 + y2*y2 + z2*z2;
00675 
00676       A[0] +=  (x1 * x2);
00677       A[1] +=  (x1 * y2);
00678       A[2] +=  (x1 * z2);
00679 
00680       A[3] +=  (y1 * x2);
00681       A[4] +=  (y1 * y2);
00682       A[5] +=  (y1 * z2);
00683 
00684       A[6] +=  (z1 * x2);
00685       A[7] +=  (z1 * y2);
00686       A[8] +=  (z1 * z2);
00687     }
00688   }
00689 
00690   return (G1 + G2) * 0.5;
00691 }
00692 #endif
00693 
00694 
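// Compute the (optionally weighted) center of mass of a set of SOA-format
// coordinates.  With a non-NULL weight array this evaluates
//   com = sum_i(w_i * r_i) / sum_i(w_i)
// and with weight == NULL it reduces to the plain arithmetic mean.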
00695 void com_soa(int cnt, 
00696              float *&soax, float *&soay, float *&soaz,
00697              double &comx, double &comy, double &comz,
00698              const float *weight) {
00699   comx=comy=comz=0.0;
00700 
00701   if (weight != NULL) {
00702     double wsum = 0.0;
00703 
00704     // compute weighted center of mass
00705     int i;
00706     for (i=0; i<cnt; i++) {
00707       double w = weight[i];
00708       wsum += w;
00709 
00710       comx += soax[i] * w;
00711       comy += soay[i] * w;
00712       comz += soaz[i] * w;
00713     }
00714     double wsumnorm = 1.0 / wsum;
00715     comx *= wsumnorm; 
00716     comy *= wsumnorm;
00717     comz *= wsumnorm;
00718   } else {
00719     // compute unweighted center of mass
00720     int i;
00721     for (i=0; i<cnt; i++) {
00722       comx += soax[i];
00723       comy += soay[i];
00724       comz += soaz[i];
00725     }
00726     double avenorm = 1.0 / ((double) cnt);
00727     comx *= avenorm; 
00728     comy *= avenorm;
00729     comz *= avenorm;
00730   }
00731 }
00732 
00733 
00734 
00735 int center_convert_soa(const AtomSel *sel, int num, const float *framepos,
00736                        const float *weight, 
00737                        float *&soax, float *&soay, float *&soaz) {
00738   // allocate temporary working arrays, plus required SIMD padding
00739   int cnt  = sel->selected;
00740   soax = (float *) calloc(1, (cnt + 16)*sizeof(float));
00741   soay = (float *) calloc(1, (cnt + 16)*sizeof(float));
00742   soaz = (float *) calloc(1, (cnt + 16)*sizeof(float));
00743 
00744   int selind = sel->firstsel; // start from the first selected atom
00745   double comx=0.0, comy=0.0, comz=0.0;
00746 
00747   int i;
00748   for (i=0; i<cnt; i++) {
00749     // find next 'on' atom in selection
00750     // loop is safe since we have already verified that cnt > 0 above
00751     while (!sel->on[selind])
00752       selind++;
00753 
00754     // compact selection and convert AOS to SOA storage on-the-fly
00755     long addr = 3*selind;
00756     float tx = framepos[addr    ];
00757     float ty = framepos[addr + 1];
00758     float tz = framepos[addr + 2];
00759 
00760     comx += tx;
00761     comy += ty;
00762     comz += tz;
00763 
00764     soax[i] = tx;
00765     soay[i] = ty; 
00766     soaz[i] = tz;
00767 
00768     selind++; // advance to next atom
00769   }
00770 
00771   double avenorm = 1.0 / ((double) cnt);
00772   comx *= avenorm; // compute unweighted center of mass
00773   comy *= avenorm;
00774   comz *= avenorm;
00775 
00776 #if 0
00777   printf("center_convert_soa(): structure com: %g %g %g\n", comx, comy, comz);
00778 #endif
00779 
00780   // translate center of mass to the origin
00781   for (i=0; i<cnt; i++) {
00782     soax[i] -= float(comx);
00783     soay[i] -= float(comy);
00784     soaz[i] -= float(comz);
00785   }
00786 
00787 #if 0
00788   // check post-translation com 
00789   com_soa(cnt, soax, soay, soaz, comx, comy, comz, weight); 
00790   printf("center_convert_soa():  centered com: %lg %lg %lg\n", comx, comy, comz);
00791 #endif   
00792 
00793   return 0;
00794 }
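// Note: the three SOA buffers allocated above include 16 floats of padding so
// the SIMD kernels can read full vector widths; the caller owns the buffers
// and frees them, as measure_rmsd_qcp() does below.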
00795 
00796 
00797 int center_convert_single_soa(const AtomSel *sel, int num, 
00798                               const float *framepos,
00799                               const float *weight, 
00800                               float *soax, float *soay, float *soaz) {
00801   // allocate temporary working arrays, plus required SIMD padding
00802   int cnt = sel->selected;
00803   int selind = sel->firstsel; // start from the first selected atom
00804   double comx=0.0, comy=0.0, comz=0.0;
00805 
00806   int i;
00807   for (i=0; i<cnt; i++) {
00808     // find next 'on' atom in selection
00809     // loop is safe since we have already verified that cnt > 0 above
00810     while (!sel->on[selind])
00811       selind++;
00812 
00813     // compact selection and convert AOS to SOA storage on-the-fly
00814     long addr = 3*selind;
00815     float tx = framepos[addr    ];
00816     float ty = framepos[addr + 1];
00817     float tz = framepos[addr + 2];
00818 
00819     comx += tx;
00820     comy += ty;
00821     comz += tz;
00822 
00823     soax[i] = tx;
00824     soay[i] = ty; 
00825     soaz[i] = tz;
00826 
00827     selind++; // advance to next atom
00828   }
00829 
00830   double avenorm = 1.0 / ((double) cnt);
00831   comx *= avenorm; // compute unweighted center of mass
00832   comy *= avenorm;
00833   comz *= avenorm;
00834 
00835   // translate center of mass to the origin
00836   for (i=0; i<cnt; i++) {
00837     soax[i] -= float(comx);
00838     soay[i] -= float(comy);
00839     soaz[i] -= float(comz);
00840   }
00841 
00842   return 0;
00843 }
00844 
00845 
00846 int measure_rmsd_qcp(VMDApp *app,
00847                      const AtomSel *sel1, const AtomSel *sel2,
00848                      int num, const float *framepos1, const float *framepos2,
00849                      float *weight, float *rmsd) {
00850   if (!sel1 || !sel2)   return MEASURE_ERR_NOSEL;
00851   if (sel1->selected < 1 || sel2->selected < 1) return MEASURE_ERR_NOSEL;
00852   if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
00853 
00854   // the number of selected atoms must be the same
00855   if (sel1->selected != sel2->selected) return MEASURE_ERR_MISMATCHEDCNT;
00856 
00857 #if 0
00858   // need to know how to traverse the list of weights
00859   // there could be 1 weight per atom (sel_flg == 1) or
00860   // 1 weight per selected atom (sel_flg == 0)
00861   int sel_flg;
00862   if (num == sel1->num_atoms) {
00863     sel_flg = 1; // using all elements
00864   } else {
00865     sel_flg = 0; // using elements from selection
00866   }
00867 #endif
00868 
00869   //
00870   // compute CoM for each selection while copying them into target bufs 
00871   //
00872   float *sel1x, *sel1y, *sel1z, *sel2x, *sel2y, *sel2z;
00873   center_convert_soa(sel1, num, framepos1, weight, sel1x, sel1y, sel1z);
00874   center_convert_soa(sel2, num, framepos2, weight, sel2x, sel2y, sel2z);
00875 
00876   // calculate the (weighted) inner product of two structures
00877   double E0 = 0;
00878   double A[9];
00879   E0 = InnerProductSOA(A, 
00880                        sel1x, sel1y, sel1z,
00881                        sel2x, sel2y, sel2z,
00882                        sel1->selected, NULL /* weight */);
00883 
00884 #if 0
00885   printf("QCP inner product results:\n");
00886   printf("  E0: %g\n", E0);
00887   int i;
00888   for (i=0; i<9; i+=3) 
00889     printf("A[%d-%d]: %g %g %g\n", i, i+2, A[i], A[i+1], A[i+2]);
00890   printf("\n");
00891 #endif
00892 
00893   // calculate the RMSD & rotational matrix
00894   FastCalcRMSDAndRotation(NULL, A, rmsd, E0, sel1->selected, -1);
00895 
00896   free(sel1x);
00897   free(sel1y);
00898   free(sel1z);
00899 
00900   free(sel2x);
00901   free(sel2y);
00902   free(sel2z);
00903 
00904   return MEASURE_NOERR; // and say rmsd is OK
00905 }
00906 
00907 
00908 #if 0
00909 // compute linear array index from lower-triangular indices i,j 
00910 static int sub2idx_tril(long N, long i, long j, long *ind) {
00911 //  *ind = i + (j-1)*N - j*(j-1)/2;
00912   *ind = j + N*i - i*(i-1)/2;
00913   return 0;
00914 }
00915 #endif
00916 
00917 // compute lower-triangular indices i,j from linear array index
00918 static int idx2sub_tril(long N, long ind, long *J, long *I) {
00919   long i, j;
00920 
00921   if (ind > (N*(N+1)/2)) {
00922     return -1; // out of bounds
00923   }
00924 
00925   // XXX deal with ambiguous types for sqrt() on Solaris
00926   double tmp2np1 = 2*N+1;
00927   i = long(floor((tmp2np1 - sqrt(tmp2np1*tmp2np1 - 8.0*ind)) / 2));
00928   // i = long(floor((2*N+1 - sqrt((2*N+1)*(2*N+1) - 8*ind)) / 2));
00929   j = ind - N*i + i*(i-1)/2 + i;
00930   
00931   *I = i;
00932   *J = j+1;
00933 
00934   return 0;
00935 }
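// Worked example (illustrative): for framecount = 4 the callers below pass
// N = framecount-1 = 3, and the 4*3/2 = 6 off-diagonal work indices map to
// frame pairs (I,J) as
//   ind 0 -> (0,1)   ind 1 -> (0,2)   ind 2 -> (0,3)
//   ind 3 -> (1,2)   ind 4 -> (1,3)   ind 5 -> (2,3)
// i.e. the linearized lower triangle is traversed with the smaller frame
// index I increasing slowest.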
00936 
00937 
00938 typedef struct {
00939   const AtomSel *sel;
00940   int first;
00941   int last;
00942   int step;
00943   float *rmsdmat;
00944   int padcnt;
00945   int framecrdsz;
00946   float *crds;
00947 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00948   int useavx512;
00949 #endif
00950 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
00951   int useavx2;
00952 #endif
00953 #if (VMDQCPUSEVSX && defined(__VEC__))
00954   int usevsx;
00955 #endif
00956 } qcprmsdthreadparms;
00957 
00958 
00959 static void * measure_rmsdmat_qcp_thread(void *voidparms) {
00960   int threadid;
00961   qcprmsdthreadparms *parms = NULL;
00962 #if defined(VMDQCPUSETHRPOOL)
00963   wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00964   wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00965 #else
00966   wkf_threadlaunch_getdata(voidparms, (void **) &parms);
00967   wkf_threadlaunch_getid(voidparms, &threadid, NULL);
00968 #endif
00969 
00970   //
00971   // copy in per-thread parameters
00972   //
00973   const AtomSel *sel = parms->sel;
00974   float *rmsdmat = parms->rmsdmat;
00975 
00976   // XXX array padding not universally honored yet...
00977   // int padcnt = parms->padcnt;
00978 
00979   int framecrdsz = parms->framecrdsz;
00980   float *crds = parms->crds;
00981   int first  = parms->first;
00982   int last   = parms->last;
00983   int step   = parms->step;
00984 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
00985   int useavx2 = parms->useavx2;
00986 #endif
00987 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00988   int useavx512 = parms->useavx512;
00989 #endif
00990 #if (VMDQCPUSEVSX && defined(__VEC__))
00991   int usevsx = parms->usevsx;
00992 #endif
00993 
00994 #if 0
00995 printf("qcpthread[%d] running... %s\n", threadid, 
00996 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
00997          (useavx512) ? "(AVX512)" : "(C++)");
00998 #elif (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
00999          (useavx2) ? "(AVX2)" : "(C++)");
01000 #elif (VMDQCPUSEVSX && defined(__VEC__))
01001          (usevsx) ? "(VSX)" : "(C++)");
01002 #else  
01003        "(C++)");
01004 #endif
01005 #endif
01006 
01007   int framecount = (last - first + 1) / step;
01008 
01009   wkf_tasktile_t tile;
01010 #if defined(VMDQCPUSETHRPOOL)
01011   while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
01012 #else
01013   while (wkf_threadlaunch_next_tile(voidparms, 8, &tile) != WKF_SCHED_DONE) {
01014 #endif
01015     long idx;
01016 
01017     for (idx=tile.start; idx<tile.end; idx++) {
01018       long i, j;
01019 
01020       // compute i,j from idx...
01021       // only compute off-diagonal elements, so we use (framecount-1)
01022       if (idx2sub_tril(framecount-1, idx, &i, &j)) {
01023         printf("qcpthread[%d]: work idx %ld out of triangle!\n", threadid, idx);
01024         break;
01025       }
01026 
01027       // calculate the (weighted) inner product of two structures
01028       double A[9];
01029       double E0 = 0;
01030 
01031       float *xj = crds + (j * 3 * framecrdsz);
01032       float *yj = xj + framecrdsz;
01033       float *zj = xj + framecrdsz*2;
01034 
01035       float *xi = crds + (i * 3 * framecrdsz);
01036       float *yi = xi + framecrdsz;
01037       float *zi = xi + framecrdsz*2;
01038 
01039 #if VMDQCPUSEAVX512 && defined(__AVX512F__)
01040       if (useavx512) {
01041         E0 = InnerProductSOA_avx512(A, xj, yj, zj, xi, yi, zi,
01042                                     sel->selected, NULL /* weight */);
01043       } else 
01044 #endif
01045 #if VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__)
01046       if (useavx2) {
01047         E0 = InnerProductSOA_avx2(A, xj, yj, zj, xi, yi, zi,
01048                                   sel->selected, NULL /* weight */);
01049       } else 
01050 #endif
01051         E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi, 
01052                              sel->selected, NULL /* weight */);
01053 
01054       // calculate the RMSD & rotational matrix
01055       FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i], 
01056                               E0, sel->selected, -1);
01057 
01058       // reflect the outcome of the lower triangle into the upper triangle
01059       rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
01060     } 
01061   }
01062 
01063   return NULL;
01064 }
01065 
01066 
01067 
01068 int measure_rmsdmat_qcp(VMDApp *app,
01069                         const AtomSel *sel, MoleculeList *mlist,
01070                         int num, float *weight, 
01071                         int first, int last, int step,
01072                         float *rmsdmat) {
01073   if (!sel) return MEASURE_ERR_NOSEL;
01074   if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01075 //  if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
01076 
01077   Molecule *mymol = mlist->mol_from_id(sel->molid());
01078   int maxframes = mymol->numframes();
01079 
01080   // accept value of -1 meaning "all" frames
01081   if (last == -1)
01082     last = maxframes-1;
01083 
01084   if (maxframes == 0 || first < 0 || first > last ||
01085       last >= maxframes || step <= 0)
01086     return MEASURE_ERR_BADFRAMERANGE;
01087 
01088   // XXX replace with calls to centralized control system
01089 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01090 // XXX there's no SSE-specific code path 
01091 //  int usesse=1;
01092 //  if (getenv("VMDNOSSE")) {
01093 //    usesse=0;
01094 //  }
01095 #endif
01096 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01097   int useavx2=1;
01098   if (getenv("VMDNOAVX2")) {
01099     useavx2=0;
01100   }
01101 #endif
01102 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01103   int useavx512=1;
01104   if (getenv("VMDNOAVX512")) {
01105     useavx512=0;
01106   }
01107 #endif
01108 #if (VMDQCPUSEVSX && defined(__VEC__))
01109   int usevsx=1;
01110   if (getenv("VMDNOVSX")) {
01111     usevsx=0;
01112   }
01113 #endif
01114 
01115 #if 0
01116   // need to know how to traverse the list of weights
01117   // there could be 1 weight per atom (sel_flg == 1) or
01118   // 1 weight per selected atom (sel_flg == 0)
01119   int sel_flg;
01120   if (num == sel->num_atoms) {
01121     sel_flg = 1; // using all elements
01122   } else {
01123     sel_flg = 0; // using elements from selection
01124   }
01125 #endif
01126 
01127   // start timers
01128   wkf_timerhandle timer;
01129   timer=wkf_timer_create();
01130   wkf_timer_start(timer);
01131 
01132 
01133   //
01134   // compute CoM for frame/selection while copying them into SOA target bufs 
01135   //
01136   int framecount = (last - first + 1) / step;
01137 
01138   int padcnt = (num + 255) & ~255;
01139   int framecrdsz = padcnt + 256;
01140   float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01141     
01142   int frame;
01143   for (frame=first; frame<=last; frame+=step) {
01144     const float *framepos = (mymol->get_frame(frame))->pos;
01145     float *xc = crds + (frame * 3L * framecrdsz);
01146     float *yc = xc + framecrdsz;
01147     float *zc = xc + framecrdsz*2;
01148 
01149     center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01150   }
01151 
01152   double converttime = wkf_timer_timenow(timer);
01153 
01154 #if !(defined(__PGIC__) && defined(_OPENACC))
01155 #if defined(VMDTHREADS)
01156   int numprocs = wkf_thread_numprocessors();
01157 #else
01158   int numprocs = 1;
01159 #endif
01160 
01161   //
01162   // copy in per-thread parameters
01163   //
01164   qcprmsdthreadparms parms;
01165   memset(&parms, 0, sizeof(parms));
01166   parms.sel = sel;
01167   parms.rmsdmat = rmsdmat;
01168   parms.padcnt = padcnt;
01169   parms.framecrdsz = framecrdsz;
01170   parms.crds = crds;
01171   parms.first = first;
01172   parms.last = last;
01173   parms.step = step;
01174 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01175   parms.useavx2 = useavx2;
01176 #endif
01177 #if (VMDQCPUSEAVX512 && defined(__AVX512F__))
01178   parms.useavx512 = useavx512;
01179 #endif
01180 #if (VMDQCPUSEVSX && defined(__VEC__))
01181   parms.usevsx = usevsx;
01182 #endif
01183   
01184   // spawn child threads to do the work
01185   wkf_tasktile_t tile;
01186   tile.start=0;
01187   tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
01188 
01189 #if defined(VMDORBUSETHRPOOL)
01190   wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01191   rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01192 #else
01193   wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01194 #endif
01195 #elif defined(__PGIC__) && defined(_OPENACC)
01196   // OpenACC variant
01197   rmsdmat_qcp_acc(sel->selected, padcnt, framecrdsz, framecount, crds, 
01198 //                  NULL /* weight */, 
01199                   rmsdmat);
01200 #else
01201   int i, j;
01202   for (j=0; j<framecount; j++) {
01203     float *xj = crds + (j * 3 * framecrdsz);
01204     float *yj = xj + framecrdsz;
01205     float *zj = xj + framecrdsz*2;
01206     for (i=0; i<j; i++) {
01207       // calculate the (weighted) inner product of two structures
01208       double A[9];
01209 
01210       float *xi = crds + (i * 3 * framecrdsz);
01211       float *yi = xi + framecrdsz;
01212       float *zi = xi + framecrdsz*2;
01213 
01214       double E0 = InnerProductSOA(A, xj, yj, zj, xi, yi, zi, 
01215                                   sel->selected, NULL /* weight */);
01216 
01217       // calculate the RMSD & rotational matrix
01218       FastCalcRMSDAndRotation(NULL, A, &rmsdmat[j*framecount + i], 
01219                               E0, sel->selected, -1);
01220 
01221       // reflect the outcome of the lower triangle into the upper triangle
01222       rmsdmat[i*framecount + j] = rmsdmat[j*framecount + i];
01223     }
01224   }
01225 #endif
01226 
01227   // mark all self-RMSDs with a value of 0.0
01228   for (long l=0; l<framecount; l++) {
01229     rmsdmat[l*framecount + l] = 0.0;
01230   }
01231 
01232   double rmsdtime = wkf_timer_timenow(timer) - converttime;
01233 
01234   // free all temporary buffers
01235   free(crds);
01236 
01237 #if 1
01238   double totaltime = wkf_timer_timenow(timer);
01239   printf("QCP RMSD Matrix calculation time: SOA selection: %.3f  RMSD solve: %.3f  total: %.3f\n", converttime, rmsdtime, totaltime); 
01240 #endif
01241 
01242   wkf_timer_destroy(timer);
01243 
01244   return MEASURE_NOERR; // and say rmsd is OK
01245 }
01246 
01247 
01248 
01249 int measure_rmsdmat_qcp_ooc(VMDApp *app,
01250                         const AtomSel *sel, MoleculeList *mlist,
01251                         int nfiles, const char **trjfileset,
01252                         int num, float *weight, 
01253                         int first, int last, int step,
01254                         int &framecount, float *&rmsdmat) {
01255   if (!sel) return MEASURE_ERR_NOSEL;
01256   if (sel->selected < 1) return MEASURE_ERR_NOSEL;
01257 //  if (!weight || !rmsd) return MEASURE_ERR_NOWEIGHT;
01258 
01259 
01260   //
01261   // XXX really need to compute per-file frame counts etc
01262   //
01263   framecount = (last - first + 1) / step;
01264 
01265   printf("** measure_rmsdmat_qcp_ooc(): \n");
01266   printf("**   first: %d  last: %d  step: %d  nfiles: %d count: %d\n",
01267          first, last, step, nfiles, framecount);
01268 
01269   rmsdmat = (float *) calloc(1, framecount * framecount * sizeof(float));
01270 
01271   // XXX this needs to be properly implemented with fallback functionality
01272 #if defined(VMDCUDA)
01273   qcp_soa_gpu_ooc(app->cuda->get_cuda_devpool(), 
01274                   nfiles, trjfileset, sel, first, last, step, rmsdmat);
01275 #endif
01276 
01277 #if 0
01278   Molecule *mymol = mlist->mol_from_id(sel->molid());
01279 
01280   int maxframes = mymol->numframes();
01281 
01282   // accept value of -1 meaning "all" frames
01283   if (last == -1)
01284     last = maxframes-1;
01285 
01286   if (maxframes == 0 || first < 0 || first > last ||
01287       last >= maxframes || step <= 0)
01288     return MEASURE_ERR_BADFRAMERANGE;
01289 
01290   // XXX hacked out lots of stuff we don't need yet...
01291 
01292 
01293 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01294   int useavx2=1;
01295   if (getenv("VMDNOAVX2")) {
01296     useavx2=0;
01297   }
01298 #endif
01299 
01300   // start timers
01301   wkf_timerhandle timer;
01302   timer=wkf_timer_create();
01303   wkf_timer_start(timer);
01304 
01305   //
01306   // compute CoM for frame/selection while copying them into SOA target bufs 
01307   //
01308   int framecount = (last - first + 1) / step;
01309 
01310   int padcnt = (num + 255) & ~255;
01311   int framecrdsz = padcnt + 256;
01312   float *crds = (float *) calloc(1, (framecount * 3L * framecrdsz + 256) * sizeof(float));
01313     
01314   int frame;
01315   for (frame=first; frame<=last; frame+=step) {
01316     const float *framepos = (mymol->get_frame(frame))->pos;
01317     float *xc = crds + (frame * 3L * framecrdsz);
01318     float *yc = xc + framecrdsz;
01319     float *zc = xc + framecrdsz*2;
01320 
01321     center_convert_single_soa(sel, num, framepos, weight, xc, yc, zc);
01322   }
01323 
01324   double converttime = wkf_timer_timenow(timer);
01325 
01326 #if defined(VMDTHREADS)
01327   int numprocs = wkf_thread_numprocessors();
01328 #else
01329   int numprocs = 1;
01330 #endif
01331 
01332   //
01333   // copy in per-thread parameters
01334   //
01335   qcprmsdthreadparms parms;
01336   memset(&parms, 0, sizeof(parms));
01337   parms.sel = sel;
01338   parms.rmsdmat = rmsdmat;
01339   parms.padcnt = padcnt;
01340   parms.framecrdsz = framecrdsz;
01341   parms.crds = crds;
01342   parms.first = first;
01343   parms.last = last;
01344   parms.step = step;
01345 #if (VMDQCPUSESSE && defined(__SSE2__)) || (VMDQCPUSEAVX2 && defined(__AVX__) && defined(__AVX2__))
01346   parms.useavx2 = useavx2;
01347 #endif
01348   
01349   // spawn child threads to do the work
01350   wkf_tasktile_t tile;
01351   tile.start=0;
01352   tile.end=(framecount-1)*(framecount-1)/2; // only compute off-diag elements
01353 
01354 #if defined(VMDORBUSETHRPOOL)
01355   wkf_threadpool_sched_dynamic(app->thrpool, &tile);
01356   rc = wkf_threadpool_launch(app->thrpool, measure_rmsdmat_qcp_thread, &parms, 1);
01357 #else
01358   wkf_threadlaunch(numprocs, &parms, measure_rmsdmat_qcp_thread, &tile);
01359 #endif
01360 
01361   // mark all self-RMSDs with a value of 1.0
01362   for (long l=0; l<framecount; l++) {
01363     rmsdmat[l*framecount + l] = 1.0;
01364   }
01365 
01366   double rmsdtime = wkf_timer_timenow(timer) - converttime;
01367 
01368   // free all temporary buffers
01369   free(crds);
01370 
01371 #if 1
01372   double totaltime = wkf_timer_timenow(timer);
01373   printf("QCP RMSD Matrix calculation time: SOA selection: %.3f  RMSD solve: %.3f  total: %.3f\n", converttime, rmsdtime, totaltime); 
01374 #endif
01375 
01376   wkf_timer_destroy(timer);
01377 #endif
01378 
01379   return MEASURE_NOERR; // and say rmsd is OK
01380 }
01381 
01382 
01383 
01384 
01385 
01386 
01387 //
01388 // Copyright notice for original QCP FastCalcRMSDAndRotation() routine
01389 //
01390 // If you use this QCP rotation calculation method in a publication, please
01391 // reference:
01392 //   Douglas L. Theobald (2005)
01393 //   "Rapid calculation of RMSD using a quaternion-based characteristic
01394 //   polynomial."
01395 //   Acta Crystallographica A 61(4):478-480.
01396 //
01397 //   Pu Liu, Dmitris K. Agrafiotis, and Douglas L. Theobald (2009)
01398 //   "Fast determination of the optimal rotational matrix for macromolecular
01399 //   superpositions."
01400 //   Journal of Computational Chemistry 31(7):1561-1563.
01401 //
01402 //  Copyright (c) 2009-2013 Pu Liu and Douglas L. Theobald
01403 //  All rights reserved.
01404 //
01405 //  Redistribution and use in source and binary forms, with or without modification, are permitted
01406 //  provided that the following conditions are met:
01407 //
01408 //  * Redistributions of source code must retain the above copyright notice, 
01409 //    this list of conditions and the following disclaimer.
01410 //  * Redistributions in binary form must reproduce the above copyright notice,
01411 //    this list of conditions and the following disclaimer in the documentation
01412 //    and/or other materials provided with the distribution.
01413 //  * Neither the name of the <ORGANIZATION> nor the names of its 
01414 //    contributors may be used to endorse or promote products derived from 
01415 //    this software without specific prior written permission.
01416 //
01417 //  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
01418 //  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
01419 //  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
01420 //  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
01421 //  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
01422 //  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
01423 //  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
01424 //  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
01425 //  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
01426 //  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
01427 //  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
01428 //
01429 #if defined(__PGIC__)
01430 #pragma acc routine seq
01431 #endif
01432 int FastCalcRMSDAndRotation(double *rot, double *A, float *rmsd, 
01433                             double E0, int len, double minScore) {
01434   double Sxx, Sxy, Sxz, Syx, Syy, Syz, Szx, Szy, Szz;
01435   double Szz2, Syy2, Sxx2, Sxy2, Syz2, Sxz2, Syx2, Szy2, Szx2,
01436          SyzSzymSyySzz2, Sxx2Syy2Szz2Syz2Szy2, Sxy2Sxz2Syx2Szx2,
01437          SxzpSzx, SyzpSzy, SxypSyx, SyzmSzy,
01438          SxzmSzx, SxymSyx, SxxpSyy, SxxmSyy;
01439   double C[4];
01440   int i;
01441   double mxEigenV; 
01442   double oldg = 0.0;
01443   double b, a, delta, rms, qsqr;
01444   double q1, q2, q3, q4, normq;
01445   double a11, a12, a13, a14, a21, a22, a23, a24;
01446   double a31, a32, a33, a34, a41, a42, a43, a44;
01447   double a2, x2, y2, z2; 
01448   double xy, az, zx, ay, yz, ax; 
01449   double a3344_4334, a3244_4234, a3243_4233, a3143_4133,a3144_4134, a3142_4132; 
01450   double evecprec = 1e-6;
01451   double evalprec = 1e-11;
01452 
01453   Sxx = A[0]; Sxy = A[1]; Sxz = A[2];
01454   Syx = A[3]; Syy = A[4]; Syz = A[5];
01455   Szx = A[6]; Szy = A[7]; Szz = A[8];
01456 
01457   Sxx2 = Sxx * Sxx;
01458   Syy2 = Syy * Syy;
01459   Szz2 = Szz * Szz;
01460 
01461   Sxy2 = Sxy * Sxy;
01462   Syz2 = Syz * Syz;
01463   Sxz2 = Sxz * Sxz;
01464 
01465   Syx2 = Syx * Syx;
01466   Szy2 = Szy * Szy;
01467   Szx2 = Szx * Szx;
01468 
01469   SyzSzymSyySzz2 = 2.0*(Syz*Szy - Syy*Szz);
01470   Sxx2Syy2Szz2Syz2Szy2 = Syy2 + Szz2 - Sxx2 + Syz2 + Szy2;
01471 
01472   C[2] = -2.0 * (Sxx2 + Syy2 + Szz2 + Sxy2 + Syx2 + Sxz2 + Szx2 + Syz2 + Szy2);
01473   C[1] = 8.0 * (Sxx*Syz*Szy + Syy*Szx*Sxz + Szz*Sxy*Syx - Sxx*Syy*Szz - Syz*Szx*Sxy - Szy*Syx*Sxz);
01474 
01475   SxzpSzx = Sxz + Szx;
01476   SyzpSzy = Syz + Szy;
01477   SxypSyx = Sxy + Syx;
01478   SyzmSzy = Syz - Szy;
01479   SxzmSzx = Sxz - Szx;
01480   SxymSyx = Sxy - Syx;
01481   SxxpSyy = Sxx + Syy;
01482   SxxmSyy = Sxx - Syy;
01483   Sxy2Sxz2Syx2Szx2 = Sxy2 + Sxz2 - Syx2 - Szx2;
01484 
01485   C[0] = Sxy2Sxz2Syx2Szx2 * Sxy2Sxz2Syx2Szx2
01486        + (Sxx2Syy2Szz2Syz2Szy2 + SyzSzymSyySzz2) * (Sxx2Syy2Szz2Syz2Szy2 - SyzSzymSyySzz2)
01487        + (-(SxzpSzx)*(SyzmSzy)+(SxymSyx)*(SxxmSyy-Szz)) * (-(SxzmSzx)*(SyzpSzy)+(SxymSyx)*(SxxmSyy+Szz))
01488        + (-(SxzpSzx)*(SyzpSzy)-(SxypSyx)*(SxxpSyy-Szz)) * (-(SxzmSzx)*(SyzmSzy)-(SxypSyx)*(SxxpSyy+Szz))
01489        + (+(SxypSyx)*(SyzpSzy)+(SxzpSzx)*(SxxmSyy+Szz)) * (-(SxymSyx)*(SyzmSzy)+(SxzpSzx)*(SxxpSyy+Szz))
01490        + (+(SxypSyx)*(SyzmSzy)+(SxzmSzx)*(SxxmSyy-Szz)) * (-(SxymSyx)*(SyzpSzy)+(SxzmSzx)*(SxxpSyy-Szz));
01491 
01492   /* Newton-Raphson */
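  /* The coefficients computed above define the quartic characteristic
   * polynomial of the QCP key matrix,
   *     P(x) = x^4 + C[2]*x^2 + C[1]*x + C[0],
   * whose largest root mxEigenV yields the RMSD as
   *     rmsd = sqrt(fabs(2.0*(E0 - mxEigenV)/len)).
   * E0 is an upper bound on that root, so the iteration below starts
   * from E0 and walks down to the largest eigenvalue. */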
01493   mxEigenV = E0;
01494   for (i = 0; i < 50; ++i) {
01495     oldg = mxEigenV;
01496     x2 = mxEigenV*mxEigenV;
01497     b = (x2 + C[2])*mxEigenV;
01498     a = b + C[1];
01499     delta = ((a*mxEigenV + C[0])/(2.0*x2*mxEigenV + b + a));
01500     mxEigenV -= delta;
01501 #if 0
01502     printf("QCP diff[%3d]: %16g %16g %16g\n", i, mxEigenV - oldg, evalprec*mxEigenV, mxEigenV);
01503 #endif
01504     if (fabs(mxEigenV - oldg) < fabs(evalprec*mxEigenV))
01505       break;
01506   }
01507 
01508 #if !defined(__PGIC__)
01509   if (i == 50) 
01510     printf("MeasureQCP: More than %d iterations needed!\n", i);
01511 #endif
01512 
01513   // the fabs() is to guard against extremely small, 
01514   // but *negative* numbers due to floating point error 
01515   rms = sqrt(fabs(2.0 * (E0 - mxEigenV)/len));
01516   (*rmsd) = float(rms);
01517   /* printf("\n\n %16g %16g %16g \n", rms, E0, 2.0 * (E0 - mxEigenV)/len); */
01518 
01519   if (minScore > 0) 
01520     if (rms < minScore)
01521       return (-1); // Don't bother with rotation. 
01522 
01523   // only perform rotation related calculations if we have a non-NULL
01524   // pointer for the output rotation matrix
01525   if (rot != NULL) {
01526     a11 = SxxpSyy + Szz-mxEigenV; a12 = SyzmSzy; a13 = - SxzmSzx; a14 = SxymSyx;
01527     a21 = SyzmSzy; a22 = SxxmSyy - Szz-mxEigenV; a23 = SxypSyx; a24= SxzpSzx;
01528     a31 = a13; a32 = a23; a33 = Syy-Sxx-Szz - mxEigenV; a34 = SyzpSzy;
01529     a41 = a14; a42 = a24; a43 = a34; a44 = Szz - SxxpSyy - mxEigenV;
01530     a3344_4334 = a33 * a44 - a43 * a34; a3244_4234 = a32 * a44-a42*a34;
01531     a3243_4233 = a32 * a43 - a42 * a33; a3143_4133 = a31 * a43-a41*a33;
01532     a3144_4134 = a31 * a44 - a41 * a34; a3142_4132 = a31 * a42-a41*a32;
01533     q1 =  a22*a3344_4334-a23*a3244_4234+a24*a3243_4233;
01534     q2 = -a21*a3344_4334+a23*a3144_4134-a24*a3143_4133;
01535     q3 =  a21*a3244_4234-a22*a3144_4134+a24*a3142_4132;
01536     q4 = -a21*a3243_4233+a22*a3143_4133-a23*a3142_4132;
01537 
01538     qsqr = q1 * q1 + q2 * q2 + q3 * q3 + q4 * q4;
01539 
01540     // The following code tries to calculate another column in the 
01541     // adjoint matrix when the norm of the current column is too small.
01542     // Usually this block will never be activated.  
01543     // To be absolutely safe this should be
01544     // uncommented, but it is most likely unnecessary.
01545     if (qsqr < evecprec) {
01546       q1 =  a12*a3344_4334 - a13*a3244_4234 + a14*a3243_4233;
01547       q2 = -a11*a3344_4334 + a13*a3144_4134 - a14*a3143_4133;
01548       q3 =  a11*a3244_4234 - a12*a3144_4134 + a14*a3142_4132;
01549       q4 = -a11*a3243_4233 + a12*a3143_4133 - a13*a3142_4132;
01550       qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;
01551 
01552       if (qsqr < evecprec) {
01553         double a1324_1423 = a13*a24 - a14*a23, a1224_1422 = a12*a24 - a14*a22;
01554         double a1223_1322 = a12*a23 - a13*a22, a1124_1421 = a11*a24 - a14*a21;
01555         double a1123_1321 = a11*a23 - a13*a21, a1122_1221 = a11*a22 - a12*a21;
01556 
01557         q1 =  a42 * a1324_1423 - a43 * a1224_1422 + a44 * a1223_1322;
01558         q2 = -a41 * a1324_1423 + a43 * a1124_1421 - a44 * a1123_1321;
01559         q3 =  a41 * a1224_1422 - a42 * a1124_1421 + a44 * a1122_1221;
01560         q4 = -a41 * a1223_1322 + a42 * a1123_1321 - a43 * a1122_1221;
01561         qsqr = q1*q1 + q2 *q2 + q3*q3+q4*q4;
01562 
01563         if (qsqr < evecprec) {
01564           q1 =  a32 * a1324_1423 - a33 * a1224_1422 + a34 * a1223_1322;
01565           q2 = -a31 * a1324_1423 + a33 * a1124_1421 - a34 * a1123_1321;
01566           q3 =  a31 * a1224_1422 - a32 * a1124_1421 + a34 * a1122_1221;
01567           q4 = -a31 * a1223_1322 + a32 * a1123_1321 - a33 * a1122_1221;
01568           qsqr = q1*q1 + q2 *q2 + q3*q3 + q4*q4;
01569                 
01570           if (qsqr < evecprec) {
01571             // if qsqr is still too small, return the identity matrix.
01572             rot[0] = rot[4] = rot[8] = 1.0;
01573             rot[1] = rot[2] = rot[3] = rot[5] = rot[6] = rot[7] = 0.0;
01574 
01575             return(0);
01576           }
01577         }
01578       }
01579     }
01580 
01581     normq = sqrt(qsqr);
01582     q1 /= normq;
01583     q2 /= normq;
01584     q3 /= normq;
01585     q4 /= normq;
01586 
01587     a2 = q1 * q1;
01588     x2 = q2 * q2;
01589     y2 = q3 * q3;
01590     z2 = q4 * q4;
01591 
01592     xy = q2 * q3;
01593     az = q1 * q4;
01594     zx = q4 * q2;
01595     ay = q1 * q3;
01596     yz = q3 * q4;
01597     ax = q1 * q2;
01598 
01599     rot[0] = a2 + x2 - y2 - z2;
01600     rot[1] = 2 * (xy + az);
01601     rot[2] = 2 * (zx - ay);
01602     rot[3] = 2 * (xy - az);
01603     rot[4] = a2 - x2 + y2 - z2;
01604     rot[5] = 2 * (yz + ax);
01605     rot[6] = 2 * (zx + ay);
01606     rot[7] = 2 * (yz - ax);
01607     rot[8] = a2 - x2 - y2 + z2;
01608   }
01609 
01610   return 1;
01611 }
01612 
