#include "aaTools.h"


// Mean CA-CA separation between the positions globally numbered 'start'
//   and 'end': for every sequence of the Alignment the Euclidean distance
//   between caCoordinates[start] and caCoordinates[end] is computed, and
//   the sum of those distances divided by msa.nSequences is returned.
float getAvgCaDist(Alignment msa, int start, int end)
{
  float total=0;
  int seq=0;
  while(seq<msa.nSequences) {
    // sqrt( dx^2 + dy^2 + dz^2 ) for this sequence
    total+=sqrt(
        pow( msa.sequences[seq].structure.caCoordinates[start][0] -
             msa.sequences[seq].structure.caCoordinates[end][0] , 2) +
        pow( msa.sequences[seq].structure.caCoordinates[start][1] -
             msa.sequences[seq].structure.caCoordinates[end][1] , 2) +
        pow( msa.sequences[seq].structure.caCoordinates[start][2] -
             msa.sequences[seq].structure.caCoordinates[end][2] , 2) );
    seq++;
  }
  total = total/msa.nSequences;
  return total;
}


// Counts the number and length of ungapped blocks
//   in the original protein Alignment.
//   A column is ungapped when no sequence has '-' in it; a block is a
//   maximal run of ungapped columns.  Printed to stdout:
//     1. for every block length 1..longest: "<length> <number of blocks>"
//     2. for every block: "<length> <avg CA dist between the block's
//        first and last column>" (via getAvgCaDist)
//   NOTE: a block is recognised by comparing a column with the one
//   before it, so a block that starts at column 0 is not reported.
void getBlockStatsMsa(Alignment msa) {
  int i, k;
  int length = msa.maximumSequenceLength;
  // value-initialised: gapped columns must read as 0
  int *block = new int[length]();

  // Mark every ungapped column with 1
  for(i=0; i<length; i++) {
    int gaps=0;
    for(k=0; k<msa.nSequences; k++) {
      if(msa.sequences[k].residues[i]=='-')
        gaps+=1;
    }
    if(gaps==0)
      block[i]=1;
  }

  // Forward pass: number the columns of each block 1,2,3,...
  int count=0;
  for(i=0; i<length; i++) {
    if(block[i]) {
      count++;
      block[i]=count;
    }
    else
      count=0;
  }

  // Backward pass: copy the last number of each block (= its length)
  // onto all of the block's columns
  int maxBlockLen=0;
  for(i=length-1; i>0; i--) {
    if(block[i-1]!=0 && block[i]!=0)
      block[i-1] = block[i];
    if(block[i]>maxBlockLen) maxBlockLen = block[i];
  }

  // blockCount[n] = number of blocks of length n; counters start at zero
  int *blockCount = new int[maxBlockLen+1]();

  for(i=1; i<length; i++)
    if(block[i]>0 && block[i]!=block[i-1])
      blockCount[block[i]]++;

  for(i=1; i<maxBlockLen+1; i++)
    printf("%d %d\n", i,blockCount[i]);

  for(i=1; i<length; i++) {
    if(block[i]>0 && block[i]!=block[i-1]) {
      int start = i;
      i++;
      // bounded: a block may run up to the last column
      while(i<length && block[i]==block[i-1]) i++;
      int end = i-1;
      float dist = getAvgCaDist(msa,start,end);
      printf("%d %f\n", block[i-1], dist);
    }
  }
  delete[] block;
  delete[] blockCount;
  return;
}


// Ungapped-block statistics for a set of pairwise alignments.
//   pw.sequences is read as consecutive pairs: entries 2n and 2n+1 form
//   pair n.  For every pair, maximal runs of columns in which neither
//   sequence has '-' are located.  For each run whose first and last
//   column are at least two apart, the CA distance between those two
//   columns is averaged over the two sequences and counted in a
//   histogram with bins of width 0.5.
//   Prints the 60 bins covering [0,30) to stdout as
//   "<bin lower bound> <count>".
//   NOTE: a run is recognised by comparing a column with the one before
//   it, so a run that starts at column 0 is not counted.
void getBlockStatsPwa(Alignment pw) {
  int i, k, n;

  if(DB) printf("ENTERED getBlockStatsPwa\n");
  int length = pw.maximumSequenceLength;
  const int nDistBins = 60;   // 0-30 in .5 incr

  // all counters value-initialised to zero
  int* blockCount = new int[length]();
  int* globBlockCount = new int[length]();   // summed over all pairs (currently not printed)
  int* globBDCount = new int[nDistBins]();

  int *block = new int[length];
  for(n=0; n<pw.nSequences/2; n++) {
    for(i=0;i<length;i++) block[i]=0;

    // Mark columns in which neither member of the pair has a gap
    for(i=0; i<length && pw.sequences[2*n].residues[i]!='\0'; i++) {
      int gaps=0;
      for(k=0; k<2; k++) {
        if(pw.sequences[2*n+k].residues[i]=='-')
          gaps+=1;
      }
      if(gaps==0)
        block[i]=1;
    }

    // Forward pass: number the columns of each run 1,2,3,...
    int count=0;
    for(i=0; i<length; i++) {
      if(block[i]) {
        count++;
        block[i]=count;
      }
      else
        count=0;
    }
    // Backward pass: copy each run's length onto all of its columns
    for(i=length-1; i>0; i--) {
      if(block[i-1]!=0 && block[i]!=0)
        block[i-1] = block[i];
    }

    //  now determine frequency of blocks of each length
    for(i=0; i<length; i++) blockCount[i]=0;
    for(i=1; i<length; i++)
      if(block[i]>0 && block[i]!=block[i-1])
        blockCount[block[i]]++;

    for(i=0; i<length; i++) globBlockCount[i]+=blockCount[i];

    for(i=1; i<length; i++) {
      if(block[i]>0 && block[i]!=block[i-1]) {
        int start = i;
        i++;
        // bounded: a run may extend to the last column
        while(i<length && block[i]==block[i-1]) i++;
        int end = i-1;
        if((start-end)*(start-end)>1) {
          // mean of the two sequences' start-to-end CA distances
          float dist = 0;
          for(k=2*n; k<=2*n+1; k++)
            dist+=.5*sqrt(
                pow( pw.sequences[k].structure.caCoordinates[start][0] -
                  pw.sequences[k].structure.caCoordinates[end][0] , 2) +
                pow( pw.sequences[k].structure.caCoordinates[start][1] -
                  pw.sequences[k].structure.caCoordinates[end][1] , 2) +
                pow( pw.sequences[k].structure.caCoordinates[start][2] -
                  pw.sequences[k].structure.caCoordinates[end][2] , 2) );
          // dist<30 keeps the bin index below nDistBins
          if(dist<30) globBDCount[(int)floor(dist*2)]++;
        }
      }
    }

  }

  for(i=0; i<nDistBins; i++)
    printf("%d.%d %d\n", i/2,5*(i%2),globBDCount[i]);

  delete[] block;
  delete[] blockCount;
  delete[] globBlockCount;
  delete[] globBDCount;
  return;
}


// Euclidean distance between the CA coordinates stored at positions
//   'start' and 'end' of a single sequence of the Alignment, namely
//   aln.sequences[protIndex].
float getCaDist(Alignment aln, int start, int end, int protIndex)
{
  // sqrt( dx^2 + dy^2 + dz^2 )
  return sqrt(
      pow( aln.sequences[protIndex].structure.caCoordinates[start][0] -
        aln.sequences[protIndex].structure.caCoordinates[end][0] , 2) +
      pow( aln.sequences[protIndex].structure.caCoordinates[start][1] -
        aln.sequences[protIndex].structure.caCoordinates[end][1] , 2) +
      pow( aln.sequences[protIndex].structure.caCoordinates[start][2] -
        aln.sequences[protIndex].structure.caCoordinates[end][2] , 2) );
}


// Counts the number and length of gaps
//   in the original protein Alignment.
//   A gap is a maximal run of '-' within one sequence.  Printed to stdout:
//     1. for every gap length 1..longest: "<length> <number of gaps>"
//     2. for every gap that ends before column length-2:
//        "<length> <CA dist>", the distance being getCaDist() between the
//        column just before and the column just after the gap
//     3. a histogram of those distances, 200 bins of width 0.1 covering
//        [0,20), one "<bin lower bound> <count>" line per bin; distances
//        outside that range are not counted
//   NOTE: a gap is recognised by comparing a column with the one before
//   it, so a gap that starts at column 0 is not reported.
void getGapStats(Alignment msa) {
  int i, j;
  int length = msa.maximumSequenceLength;
  const int nDistBins = 20*10;   // 0-20 in .1 incr

  // gap[i][j] != 0 <=> sequence i has a gap at column j (rows start zeroed)
  int **gap = new int*[msa.nSequences];
  for(i=0; i<msa.nSequences; i++)
    gap[i] = new int[length]();

  for(i=0; i<msa.nSequences; i++)
    for(j=0; j<length; j++)
      if(msa.sequences[i].residues[j]=='-')
        gap[i][j]=1;

  // Forward pass: number the columns of each gap 1,2,3,...
  for(i=0; i<msa.nSequences; i++) {
    int count=0;
    for(j=0; j<length; j++) {
      if(gap[i][j]) {
        count++;
        gap[i][j]=count;
      }
      else
        count=0;
    }
  }
  // Backward pass: copy each gap's length onto all of its columns
  int maxGapLen=0;
  for(i=0; i<msa.nSequences; i++)
    for(j=length-1; j>0; j--) {
      if(gap[i][j-1]!=0 && gap[i][j]!=0)
        gap[i][j-1] = gap[i][j];
      if(gap[i][j]>maxGapLen) maxGapLen = gap[i][j];
    }

  // gapCount[n] = number of gaps of length n; counters start at zero
  int* gapCount = new int[maxGapLen+1]();

  for(i=0; i<msa.nSequences; i++)
    for(j=1; j<length; j++)
      if(gap[i][j]>0 && gap[i][j]!=gap[i][j-1])
        gapCount[gap[i][j]]++;

  for(i=1; i<maxGapLen+1; i++)
    printf("%d %d\n", i,gapCount[i]);

  int *distStats = new int[nDistBins]();
  for(i=0; i<msa.nSequences; i++) {
    for(j=1; j<length; j++) {
      if(gap[i][j]>0 && gap[i][j]!=gap[i][j-1]) {
        int start = j;
        j++;
        // bounded: a gap may run up to the last column
        while(j<length && gap[i][j]==gap[i][j-1]) j++;
        int end = j-1;
        if(end<length-2) {
          float dist = getCaDist(msa,start-1,end+1,i);
          // only count distances that fall inside the histogram
          float scaled = 10*dist;
          if(scaled>=0 && scaled<nDistBins)
            distStats[(int)scaled]++;
          printf("%d %f\n", gap[i][j-1], dist);
        }
      }
    }
  }

  for(i=0; i<nDistBins; i++)
    printf("%d.%d %d\n", i/10, i%10, distStats[i]);

  // free the rows as well as the row table
  for(i=0; i<msa.nSequences; i++)
    delete[] gap[i];
  delete[] gap;
  delete[] gapCount;
  delete[] distStats;

  return;
}



 









// Converts a string (temp) to a float.
//   Only '0'-'9', '.' and '-' are interpreted:
//     - a '-' anywhere in the string makes the result negative
//     - the last '.' is the decimal point; without a '.' the decimal
//       point is taken to sit right after the last digit, so "42" is 42
//     - any other character contributes nothing, but when it stands
//       between digits it still occupies a decimal place
//   A string without digits yields 0.
float charToFloat(char *temp) {
  int len = (int)strlen(temp);
  int negative = 0;
  float ans = 0;
  int i;
  int dec = -1;        // index of the decimal point
  int lastDigit = -1;  // index of the last digit seen

  for(i=0;i<len;i++) {
    if(temp[i]=='.')
      dec = i;
    if(temp[i]>='0' && temp[i]<='9')
      lastDigit = i;
  }
  // no explicit decimal point: treat the number as an integer
  if(dec<0) dec = lastDigit+1;

  for(i=0;i<len;i++) {
    if(temp[i]=='-') negative=1;
    if(temp[i]>='0' && temp[i]<='9')
      // left of the point a digit weighs 10^(dec-i-1), right of it 10^(dec-i)
      ans += pow((float)10,dec-i-1*(i<dec))*(temp[i]-'0');
  }
  if(negative) ans*=-1;

  return ans;
}

// Converts a string (temp) to an int.
//   Leading and trailing blanks are stripped from temp IN PLACE (the
//   caller's buffer is modified).  Of the remaining characters only
//   '0'-'9' and '-' are interpreted: a '-' anywhere makes the result
//   negative, and every character, digit or not, occupies one decimal
//   place.  An empty or all-blank string yields 0.
int charToInt(char *temp) {
  int i, negative=0, ans=0;

  // strip leading blanks; memmove because source and destination overlap
  int lead = 0;
  while(temp[lead]==' ') lead++;
  if(lead>0)
    memmove(temp, temp+lead, strlen(temp+lead)+1);

  // strip trailing blanks without ever indexing before the buffer
  int len = (int)strlen(temp);
  while(len>0 && temp[len-1]==' ')
    temp[--len] = '\0';

  // Horner evaluation: equals the sum of digit * 10^(len-1-i), done in
  // exact integer arithmetic
  for(i=0;i<len;i++) {
    ans *= 10;
    if(temp[i]=='-') negative=1;
    if(temp[i]>='0' && temp[i]<='9')
      ans += temp[i]-'0';
  }
  if(negative) ans*=-1;

  return ans;
}


// Converts an integer num to a string of exactly len characters.
//   The number is right-aligned and padded on the left with blanks; a
//   negative number gets a '-' directly in front of its first digit.
//   If num has more than len digits only the lowest len digits are kept,
//   and if no blank is left in front of the digits the '-' is dropped.
//   A negative len is treated as 0.
//   The result is allocated with new[]; the caller must delete[] it.
char* intToString(int num, int len) {
  if(len<0) len=0;

  char *temp = new char[len+1];
  temp[len] = '\0';   // terminate first, so the scans below stay inside the buffer

  int negative = (num<0);
  // unsigned magnitude: negating INT_MIN as an int would overflow
  unsigned int mag = negative ? 0u-(unsigned int)num : (unsigned int)num;

  // fill right to left with decimal digits (zero padded)
  for(int i=len-1; i>=0; i--) {
    temp[i] = (char)('0' + mag%10);
    mag /= 10;
  }

  // turn leading zeros into blanks, always keeping the last digit
  int first = 0;
  while(first<len-1 && temp[first]=='0')
    temp[first++] = ' ';

  // the sign goes into the blank just before the first digit, if there is one
  if(negative && first>0)
    temp[first-1] = '-';

  return temp;
}


// Lower-cases the letters 'A'-'Z' of s in place and returns s.
//   All other characters are left untouched.
char* lower(char *s) {
  char *p = s;
  while(*p != '\0') {
    if(*p>='A' && *p<='Z')
      *p = *p + ('a'-'A');
    p++;
  }
  return s;
}



// pairToMult(Alignment pw, Alignment* msa)
//    - parameters : pw   - the pairwise Alignment structure to be converted
//                   *msa - pointer to the msa structure which we will write to
//    - pw is read as consecutive pairs: sequences[2i-2] is the master as
//        aligned to slave i, sequences[2i-1] is slave i (i = 1..nProteins-1).
//    - Gets data from pw.  Uses the first sequence as the "master", and uses
//        the pairwise Alignments of the "master" with each "slave" to
//        construct a multiple sequence Alignment: in front of each of its
//        residues the master receives the longest gap run found there in
//        any sequence carrying the master's name, and every slave is
//        padded with '-' so that it lines up with this gapped master.
//    - msa->name, msa->nSequences, msa->maximumSequenceLength and
//        msa->sequences are overwritten.
//    - NOTE(review): setResidues() is assumed to copy its argument; the
//        buffers handed to it are released before returning.
//
void pairToMult(Alignment pw, Alignment* msa)
{
  int i,j,k;

  // Construct master sequence w/o gaps ('-') and '?'
  const char *master0 = pw.sequences[0].residues;
  int rawLen = (int)strlen(master0);
  char *masterSeqNoGap = new char[rawLen+1];
  int noGapLen = 0;
  for(i=0; i<rawLen; i++)
    if(master0[i] != '-' && master0[i] != '?')
      masterSeqNoGap[noGapLen++] = master0[i];
  masterSeqNoGap[noGapLen] = '\0';

  // nGaps[i] = number of gaps the global master gets in front of residue i
  //   (entry noGapLen holds the trailing gaps); counters start at zero
  int *nGaps = new int[noGapLen+1]();
  int *tempnGaps = new int[noGapLen+1];

  // Figure out how many gaps the master seq should have
  for(k=0; k<pw.nSequences; k++) {
    // Only sequences carrying the master's name contribute
    if(strcmp(pw.sequences[k].name,pw.sequences[0].name)!=0) continue;

    // Go res by res and find how many gaps precede each
    const char *res = pw.sequences[k].residues;
    int pos = 0;
    for(i=0; i<=noGapLen; i++) {
      int run = 0;
      while(res[pos]=='-') { run++; pos++; }
      if(run>nGaps[i]) nGaps[i] = run;
      if(res[pos]=='\0') break;   // never walk past the terminator
      pos++;                      // step over residue i
    }
  }

  // Now add all these gaps into the master sequence; the buffer is sized
  // exactly, from the gap counts
  int len = noGapLen;
  for(i=0; i<=noGapLen; i++) len += nGaps[i];
  char *masterSeq = new char[len+1];
  int out = 0;
  for(i=0; i<=noGapLen; i++) {
    for(j=0; j<nGaps[i]; j++)
      masterSeq[out++] = '-';
    if(i<noGapLen)
      masterSeq[out++] = masterSeqNoGap[i];
  }
  masterSeq[out] = '\0';

  // Copy the name; the new buffer is filled before the old one is released
  char *newName = new char[strlen(pw.name)+1];
  strcpy(newName, pw.name);
  if(msa->name!=NULL) delete[] msa->name;
  msa->name = newName;

  // Initialize msa's dynamic variables
  msa->nSequences = pw.nProteins;
  msa->maximumSequenceLength = len;
  msa->sequences = new Sequence[msa->nSequences];
  msa->sequences[0].setResidues(masterSeq);

  // Set name of 'master' seq here for msa
  msa->sequences[0].setName(pw.sequences[0].name);

  // Set name of other unique seqs here for msa
  for(i=1; i<pw.nProteins; i++)
    msa->sequences[i].setName(pw.sequences[2*i-1].name);

  // Read through each pairwise Alignment
  // Go through and align each slave sequence to the overall master
  // using its local master
  for(i=1; i<pw.nProteins; i++) {
    const char *localMaster = pw.sequences[2*i-2].residues;
    const char *slave = pw.sequences[2*i-1].residues;

    // First count number of gaps in local master in front of each residue
    for(j=0; j<=noGapLen; j++)
      tempnGaps[j]=0;
    for(j=0,k=0; j<pw.sequences[2*i-2].length && k<=noGapLen; j++)
      if(localMaster[j] == '-') tempnGaps[k]++;
      else k++;

    // Now compare this to the number in the global (gapped) master
    // in order to determine where additional gaps must be added to
    // the local slave sequence so as to align it to the global master.
    // The result is assembled in a local buffer: at most len pad
    // characters plus every character of the slave.
    int slaveLen = (int)strlen(slave);
    char *aligned = new char[len+slaveLen+1];
    int w = 0;     // write position in aligned
    int cFS = 0;   // read position in the slave
    for(j=0; j<=noGapLen; j++) {
      for(k=0; k<nGaps[j] - tempnGaps[j]; k++)
        aligned[w++] = '-';
      // the slave columns facing the local gaps, plus the one facing residue j
      for(k=0; k<tempnGaps[j]+1 && cFS<slaveLen; k++)
        aligned[w++] = slave[cFS++];
    }
    aligned[w] = '\0';
    msa->sequences[i].setResidues(aligned);
    delete[] aligned;
  }

  delete[] masterSeq;
  delete[] masterSeqNoGap;
  delete[] nGaps;
  delete[] tempnGaps;

  return;
}


// multToPair(Alignment msa, Alignment *pw)
//    - parameters : msa - the msa structure to be converted
//                   *pw - pointer to the PW structure which we will write to
//    - Gets data from msa.  Every pair (i<j) of msa sequences is written to
//        pw as two consecutive sequences, n*(n-1) sequences in total for
//        n = msa.nSequences.  Columns in which both members of a pair have
//        a gap are dropped from that pair.
//    - pw->name, pw->nProteins, pw->nSequences, pw->maximumSequenceLength
//        and pw->sequences are overwritten.
//    - NOTE(review): setResidues() is assumed to copy its argument.
//
void multToPair(Alignment msa, Alignment *pw)
{
  if(DB) printf("entering multToPair\n");
  pw->nProteins = msa.nSequences;
  pw->nSequences = msa.nSequences*(msa.nSequences-1);
  pw->maximumSequenceLength = msa.maximumSequenceLength;

  if(pw->name!=NULL) delete[] pw->name;
  pw->name = new char[strlen(msa.name)+1];
  strcpy(pw->name, msa.name);

  if(pw->sequences!=NULL)
    delete[] pw->sequences;
  pw->sequences = new Sequence[pw->nSequences];

  // scratch rows for one pair; +1 for the terminator
  char *rowA = new char[msa.maximumSequenceLength+1];
  char *rowB = new char[msa.maximumSequenceLength+1];

  int count=0;
  for(int i=0; i<msa.nSequences; i++)
    for(int j=i+1; j<msa.nSequences; j++, count+=2) {
      const char *seqA = msa.sequences[i].residues;
      const char *seqB = msa.sequences[j].residues;
      int r=0;
      for(int k=0; k<msa.maximumSequenceLength; k++) {
        // stop at the end of the shorter sequence
        if(seqA[k]=='\0' || seqB[k]=='\0') break;
        // keep the column unless both sequences have a gap there
        if(seqA[k]!='-' || seqB[k]!='-') {
          rowA[r]=seqA[k];
          rowB[r]=seqB[k];
          r++;
        }
      }
      rowA[r] = '\0';
      rowB[r] = '\0';

      pw->sequences[count].setName(msa.sequences[i].name);
      pw->sequences[count].setResidues(rowA);
      pw->sequences[count+1].setName(msa.sequences[j].name);
      pw->sequences[count+1].setResidues(rowB);
    }

  delete[] rowA;
  delete[] rowB;

  if(DB) printf("leaving multToPair\n");
}


// Wraps a single character in a freshly allocated (new[]) C string:
// the character itself followed by the terminating NUL character.
char * charToString(char i) {
  char *str = new char[2];
  str[1] = '\0';
  *str = i;
  return str;
}
  


// Pearson correlation coefficient between the lower triangles (diagonal
// included, i.e. cells [i][j] with j<=i) of two matrices.
//   The dimension N is taken from m1; m2 is indexed over the same range.
//   Returns num/sqrt(den1*den2).  The result is not finite when the
//   denominator is 0, e.g. when one triangle is constant or N is 0.
float corr2(Matrix m1, Matrix m2) {
  int N=m1.N;
  // number of cells with j<=i; exact for odd and even N alike
  int nCells = N*(N+1)/2;
  float A=0,B=0;

  for(int i=0; i<N; i++)
    for(int j=0; j<=i; j++) {
      A+=m1.cell[i][j];
      B+=m2.cell[i][j];
    }
  A = A/nCells;  // avg m1 value
  B = B/nCells;  // avg m2 value

  // covariance and the two variances (all left unnormalised)
  float num=0, den1=0, den2=0;
  for(int i=0; i<N; i++)
    for(int j=0; j<=i; j++) {
      num+=(m1.cell[i][j]-A)*(m2.cell[i][j]-B);
      den1+=(m1.cell[i][j]-A)*(m1.cell[i][j]-A);
      den2+=(m2.cell[i][j]-B)*(m2.cell[i][j]-B);
    }
  float r = num/sqrt(den1*den2);
  return r;
}


// Convenience overload without q-scores: forwards msaFileName to the
// two-argument jackKnife with NULL as the q-score file.
void jackKnife(char *msaFileName) {
  char *noQFile = NULL;
  jackKnife(msaFileName, noQFile);
}

// hacky verison
//   reads in an msa; cumulatively removes sequences
//   one at a time from the end, and computes a correlation
//   coefficient for each new set with the original one.
//   also prints highest remaining pair-wise q-score
void jackKnife(char *msaFileName, char *qFile) {
  Alignment temp("temp");
  temp.readFromFile(msaFileName);
  Matrix *jMat = new Matrix[temp.nSequences];
  float *jScore = new float[temp.nSequences];
  float *qScore = new float[temp.nSequences];
  int num = temp.nSequences;

  // now read the Q-score file (Matrix format)
  FILE *fp = fopen(qFile,"r");
  char *t = new char[10000];
  char *t2 = new char[10000];
  float **qMat = new float*[num];
  for(int i = 0; i < num; i++)
    qMat[i] = new float[num];
  int count=0;
  while(fgets(t,10000,fp)!=NULL) {
    for(int i=0; i<num; i++) {
      strcpy(t2,t);
      t2[11+7*i+6]='\0';
      qMat[count][i]=charToFloat(t2+11+7*i);
    }
    count++;
  }
  for(int i=0;i<num;i++) for(int j=0;j<num;j++) printf("%d %d %f\n",i,j,qMat[i][j]);


  for(int i=0; i<num-2; i++) {
    jMat[i].computeFromAlignment(temp);
    if(i==0) jScore[i]=1;
    else jScore[i] = corr2(jMat[i],jMat[0]);
    qScore[i]=0;
    for(int j=i+1; j<num; j++) for(int k=i+1; k<num; k++) 
      if(qMat[j][k]>qScore[i]) qScore[i] = qMat[j][k];
    printf("%d %f %f\n",i,jScore[i], qScore[i]);
    temp.nSequences--;
  }
}

