/*****************************************************************************
*
*            (C) Copyright 2005 The Board of Trustees of the
*                        University of Illinois
*                         All Rights Reserved
*
******************************************************************************/

/*****************************************************************************
* RCS INFORMATION:
*
*       $RCSfile: fastaReader.cpp,v $
*       $Author: erobert3 $        $Locker:  $             $State: Exp $
*       $Revision: 1.1.2.2 $           $Date: 2005/01/03 06:16:43 $
*
******************************************************************************/

// $Id: fastaReader.cpp,v 1.1.2.2 2005/01/03 06:16:43 erobert3 Exp $

#include "fastaReader.h"

#include <stdio.h>
#include <string.h>

#include <string>


/**
 * Creates a reader bound to the given alphabet with no file selected yet.
 *
 * The filename, path and full name start out as null pointers and the current
 * sequence index starts at 0.
 *
 * @param   alpha  The alphabet stored by the reader.
 */
FASTAReader::FASTAReader(Alphabet* alpha)
  : alphabet(alpha),
    filename(0),
    path(0),
    fullName(0),
    currentSequence(0) {
}


/**
 * Releases the string buffers held by the reader.
 *
 * filename, path and fullName are all allocated with new[] elsewhere in this
 * file, so they must be released with delete[]. The alphabet pointer was
 * supplied by the caller of the constructor and is left alone.
 */
FASTAReader::~FASTAReader() {

  delete[] filename;
  delete[] path;
  delete[] fullName;
}


/**
 * This method sets the filename of the FASTA file that should be read.
 *
 * @param   fn  The name of the file to be read by this reader.
 * @return  1 if the file is valid otherwise 0.
 *
 * TODO XXX: CHECK FOR VALID FILENAMES WITH NO '/' OR '\' MAYBE WITH STRTRUNC()
 */
int FASTAReader::setFilename(char* fn) {

  int len = strlen(fn);
  if (filename != 0) delete filename;
  filename = new char[len+1];
  strncpy(filename,fn,len);
  filename[len] = '\0';

  if (checkFullName() == 1) {
    return 1;
  }
  else {
    delete filename;
    filename = 0;
  }

  return 0;
}


/**
 * Sets the path prefix that checkFullName() puts in front of the filename.
 *
 * The prefix is concatenated with the filename as-is (no separator is added)
 * the next time checkFullName() builds the full name. A full name that has
 * already been built is not modified by this call.
 *
 * @param   p  The path prefix; it is copied. A null pointer is rejected and
 *             leaves the reader unchanged.
 * @return  1 if the path was stored otherwise 0.
 *
 * XXX: NEED ERROR-CHECKING FOR VALID PATHS
 */
int FASTAReader::setPath(char* p) {

  if (p == 0) {
    return 0;
  }

  // Copy before releasing the old buffer so that p may alias it.
  const size_t len = strlen(p);
  char* copy = new char[len+1];
  memcpy(copy,p,len+1);

  delete[] path;
  path = copy;

  return 1;
}


/**
 * Reads one record from the FASTA file and returns it as a Sequence.
 *
 * Records are numbered by their '>' header lines, starting at 0. The text of a
 * record runs from the line after its header up to the next header line, a
 * line that starts with a newline, carriage return or space, or end of file.
 * Line terminators ('\n' and '\r') are removed; every other character is
 * passed to Sequence::addSymbol().
 *
 * @param   seqIndex  The 0-based index of the record to read.
 * @return  A newly allocated Sequence, or 0 if no file is set, the file cannot
 *          be opened, the record does not exist or is empty, or addSymbol()
 *          rejects a character. currentSequence is updated only on success.
 */
Sequence* FASTAReader::getSequence(int seqIndex) {

  // Prefer the full name tested by checkFullName(); fall back to the bare
  // filename when no full name has been built.
  const char* target = (fullName != 0) ? fullName : filename;
  if (target == 0 || seqIndex < 0) {
    return 0;
  }

  FILE* infile = fopen(target,"r");
  if (infile == NULL) {
    return 0;
  }

  char chunk[1024];          // one fgets() read; a long line arrives as several chunks
  char header[1024];         // first chunk of the requested record's '>' line
  header[0] = '\0';
  std::string residues;      // text of the requested record without line terminators
  int  recordIndex = -1;     // index of the record most recently entered
  bool lineStart = true;     // does the next chunk begin a new line?
  bool headerTail = false;   // still inside the remainder of an over-long '>' line?

  // Loop on the fgets() result rather than feof(): feof() only turns true
  // after a read has already failed, which would re-process a stale buffer.
  while (fgets(chunk,sizeof(chunk),infile) != NULL) {
    size_t chunkLen = strlen(chunk);
    const bool startsLine = lineStart;
    lineStart = (chunkLen > 0 && chunk[chunkLen-1] == '\n');

    if (startsLine && chunk[0] == '>') {
      if (recordIndex == seqIndex) {
        break;                           // the following record begins
      }
      recordIndex++;
      if (recordIndex == seqIndex) {
        memcpy(header,chunk,chunkLen+1);
      }
      headerTail = !lineStart;
      continue;
    }

    if (headerTail) {
      headerTail = !lineStart;           // header ends with the chunk holding '\n'
      continue;
    }

    if (recordIndex != seqIndex) {
      continue;                          // preamble or another record's text
    }

    if (startsLine &&
        (chunk[0] == '\n' || chunk[0] == '\r' || chunk[0] == ' ')) {
      break;                             // blank or indented line closes the record
    }

    // Strip only terminators that are actually present, so that a chunk cut
    // short by the buffer size or by end of file keeps its last character.
    while (chunkLen > 0 &&
           (chunk[chunkLen-1] == '\n' || chunk[chunkLen-1] == '\r')) {
      chunkLen--;
    }
    residues.append(chunk,chunkLen);
  }

  fclose(infile);

  if (residues.empty()) {
    return 0;
  }

  // NOTE(review): the name buffer is local, which assumes the Sequence
  // constructor copies the name. getAlignedSequence() already frees its name
  // buffer right after building an AlignedSequence -- confirm for Sequence.
  char nameBuf[1024];
  char* name = (getSequenceName(header,nameBuf) != 0) ? nameBuf : 0;

  // Sequence will contain whatever gaps exist in the FASTA
  //   sequence representation
  const int seqLen = static_cast<int>(residues.size());
  Sequence* seq = new Sequence(seqLen,alphabet,name);
  for (int i=0; i<seqLen; i++) {
    if (seq->addSymbol(residues[i]) == 0) {
      delete seq;
      return 0;
    }
  }

  currentSequence = seqIndex;

  return seq;
}


/**
 * Reads the record that follows the current sequence index.
 *
 * @return  The result of getSequence() for index currentSequence + 1.
 */
Sequence* FASTAReader::getNextSequence() {

  const int nextIndex = currentSequence + 1;
  return getSequence(nextIndex);
}


/**
 * Reads one record from the FASTA file and returns it as an AlignedSequence.
 *
 * Records are numbered by their '>' header lines, starting at 0. The text of a
 * record runs from the line after its header up to the next header line, a
 * line that starts with a newline, carriage return or space, or end of file.
 * Line terminators ('\n' and '\r') are removed; every other character is
 * passed to AlignedSequence::addSymbol().
 *
 * @param   seqIndex  The 0-based index of the record to read.
 * @return  A newly allocated AlignedSequence, or 0 if no full name is set, the
 *          file cannot be opened, the record does not exist or is empty, or
 *          addSymbol() rejects a character. currentSequence is updated only on
 *          success.
 */
AlignedSequence* FASTAReader::getAlignedSequence(int seqIndex) {

  if (fullName == 0 || seqIndex < 0) {
    return 0;
  }

  FILE* infile = fopen(fullName,"r");
  if (infile == NULL) {
    return 0;
  }

  char chunk[1024];          // one fgets() read; a long line arrives as several chunks
  char header[1024];         // first chunk of the requested record's '>' line
  header[0] = '\0';
  std::string residues;      // text of the requested record without line terminators
  int  recordIndex = -1;     // index of the record most recently entered
  bool lineStart = true;     // does the next chunk begin a new line?
  bool headerTail = false;   // still inside the remainder of an over-long '>' line?

  // Loop on the fgets() result rather than feof(): feof() only turns true
  // after a read has already failed, which would re-process a stale buffer.
  while (fgets(chunk,sizeof(chunk),infile) != NULL) {
    size_t chunkLen = strlen(chunk);
    const bool startsLine = lineStart;
    lineStart = (chunkLen > 0 && chunk[chunkLen-1] == '\n');

    if (startsLine && chunk[0] == '>') {
      if (recordIndex == seqIndex) {
        break;                           // the following record begins
      }
      recordIndex++;
      if (recordIndex == seqIndex) {
        memcpy(header,chunk,chunkLen+1);
      }
      headerTail = !lineStart;
      continue;
    }

    if (headerTail) {
      headerTail = !lineStart;           // header ends with the chunk holding '\n'
      continue;
    }

    if (recordIndex != seqIndex) {
      continue;                          // preamble or another record's text
    }

    if (startsLine &&
        (chunk[0] == '\n' || chunk[0] == '\r' || chunk[0] == ' ')) {
      break;                             // blank or indented line closes the record
    }

    // Strip only terminators that are actually present, so that a chunk cut
    // short by the buffer size or by end of file keeps its last character.
    while (chunkLen > 0 &&
           (chunk[chunkLen-1] == '\n' || chunk[chunkLen-1] == '\r')) {
      chunkLen--;
    }
    residues.append(chunk,chunkLen);
  }

  fclose(infile);

  if (residues.empty()) {
    return 0;
  }

  // The name only has to outlive this function: the buffer was previously
  // freed before returning, after the last addSymbol() call.
  char nameBuf[1024];
  char* name = (getSequenceName(header,nameBuf) != 0) ? nameBuf : 0;

  const int seqLen = static_cast<int>(residues.size());
  AlignedSequence* alSeq = new AlignedSequence(seqLen,alphabet,name);
  for (int i=0; i<seqLen; i++) {
    if (alSeq->addSymbol(residues[i]) == 0) {
      delete alSeq;
      return 0;
    }
  }

  currentSequence = seqIndex;

  return alSeq;
}


/**
 * Reads the aligned record that follows the current sequence index.
 *
 * @return  The result of getAlignedSequence() for index currentSequence + 1.
 */
AlignedSequence* FASTAReader::getNextAlignedSequence() {

  const int nextIndex = currentSequence + 1;
  return getAlignedSequence(nextIndex);
}


/**
 * Builds a SequenceAlignment from every record of the file currently set in the reader.
 *
 * The record count comes from getSequenceCount(); each record is then fetched in index order
 * with getAlignedSequence() and handed to SequenceAlignment::addSequence(). The alignment is
 * created with the length of the first record.
 *
 * @return  A pointer to the SequenceAlignment that was read, or NULL if the count could not be
 *          obtained or any record could not be read.
 *          NOTE: the caller is responsible for freeing the returned pointer when it is no longer
 *          needed.
 *
 * TODO XXX: NEED LOTS OF ERROR CATCHING FOR THINGS LIKE VARYING SEQUENCE LENGTH (MAYBE SHOULD BE
 *           IN SEQUENCEALIGNMENT)
 */
SequenceAlignment* FASTAReader::getSequenceAlignment() {

  currentSequence = 0;

  const int total = getSequenceCount();
  if (total == -1) {
    return NULL;
  }

  // The first record fixes the alignment length.
  AlignedSequence* row = getAlignedSequence(0);
  if (row == 0) {
    return 0;
  }

  SequenceAlignment* alignment = new SequenceAlignment(row->getLength(),total);
  alignment->addSequence(row);

  // Remaining records, indices 1 .. total-1.
  for (int index=1; index<total; index++) {
    row = getAlignedSequence(index);
    if (row == 0) {
      delete alignment;
      printf("alSeq: sequence %d not returned\n",index);
      return 0;
    }
    alignment->addSequence(row);
  }

  // A count below 1 means the index (which starts at 1) can never
  //   meet the count, so not all predicted sequences were retrieved.
  if (total < 1) {
    delete alignment;
    return 0;
  }

  return alignment;
}

/**
 * This method gets the number of sequences in the file currently set in the reader.
 *
 * A sequence is counted for every line whose first character is '>'.
 *
 * @return  The number of sequences in the file or -1 if no full name is set or the file
 *          cannot be opened.
 */
int FASTAReader::getSequenceCount() {

  if (fullName == 0) {
    return -1;
  }

  //Open the file.
  FILE* infile = fopen(fullName,"r");
  if (infile == NULL) {
    return -1;
  }

  int seqCount = 0;
  char chunk[1024];       // one fgets() read; a long line arrives as several chunks
  bool lineStart = true;  // does the next chunk begin a new line?

  // Loop on the fgets() result rather than feof(): feof() only turns true
  // after a read has already failed, and testing the stale buffer again
  // would count a trailing header line twice.
  while (fgets(chunk,sizeof(chunk),infile) != NULL) {
    if (lineStart && chunk[0] == '>') {
      seqCount++;
    }
    const size_t chunkLen = strlen(chunk);
    lineStart = (chunkLen > 0 && chunk[chunkLen-1] == '\n');
  }

  fclose(infile);

  return seqCount;
}


/**
 * Extracts the sequence name from a FASTA header line.
 *
 * The name is the first run of non-blank characters after the leading '>'.
 * Spaces and tabs between the '>' and the name are skipped, and the name ends
 * at the first space, tab, carriage return, newline or end of string.
 *
 * @param   tempStr  The null-terminated header line.
 * @param   name     The buffer that receives the null-terminated name. No size
 *                   is passed in, so the caller must supply a buffer at least
 *                   as large as the header line.
 * @return  1 if a name was written, or 0 if either pointer is null or the line
 *          does not start with '>' (name is left untouched in that case).
 */
int FASTAReader::getSequenceName(char* tempStr, char* name) {

  if (tempStr == 0 || name == 0 || tempStr[0] != '>') {
    return 0;
  }

  // Skip the '>' and any blanks that follow it.
  const char* cursor = tempStr + 1;
  while (*cursor == ' ' || *cursor == '\t') {
    cursor++;
  }

  // Copy up to the first blank or line terminator. '\r' is included so that
  // headers from CRLF files do not keep a trailing carriage return.
  char* out = name;
  while (*cursor != ' '  &&
         *cursor != '\t' &&
         *cursor != '\r' &&
         *cursor != '\n' &&
         *cursor != '\0') {
    *out = *cursor;
    out++;
    cursor++;
  }

  *out = '\0';

  return 1;
}


/**
 * This method make sure that the file currently set in the reader corresponds to an accessible file.
 *
 * If no full name has been built yet, it is formed by concatenating the path (when one is set)
 * and the filename with nothing in between. The full name is then opened for reading as the
 * accessibility test. When the file cannot be opened the full name is released and reset to null.
 *
 * @return  1 if the file is accessible otherwise 0.
 */
int FASTAReader::checkFullName() {

  if (filename == 0) {
    return 0;
  }

  if (fullName == 0) {
    const size_t pathLen = (path != 0) ? strlen(path) : 0;
    const size_t filenameLen = strlen(filename);
    fullName = new char[pathLen + filenameLen + 1];
    if (pathLen > 0) {
      memcpy(fullName,path,pathLen);
    }
    // filenameLen + 1 also copies the terminating '\0'.
    memcpy(fullName + pathLen,filename,filenameLen + 1);
  }

  FILE* infile = fopen(fullName,"r");
  if (infile == NULL) {
    // fullName is allocated with new[], so it must be released with delete[].
    delete[] fullName;
    fullName = 0;
    return 0;
  }
  fclose(infile);

  return 1;
}
