/***************************************************************************** * * (C) Copyright 2005 The Board of Trustees of the * University of Illinois * All Rights Reserved * ******************************************************************************/ /***************************************************************************** * RCS INFORMATION: * * $RCSfile: fastaReader.cpp,v $ * $Author: erobert3 $ $Locker: $ $State: Exp $ * $Revision: 1.1.2.2 $ $Date: 2005/01/03 06:16:43 $ * ******************************************************************************/ // $Id: fastaReader.cpp,v 1.1.2.2 2005/01/03 06:16:43 erobert3 Exp $ #include "fastaReader.h" // Constructor // FASTAReader::FASTAReader(Alphabet* alpha) : alphabet(alpha), filename(0), path(0), fullName(0), currentSequence(0) { //printf("FASTAReader::Constructing\n"); return; } // Destructor // FASTAReader::~FASTAReader() { delete filename; return; } /** * This method sets the filename of the FASTA file that should be read. * * @param fn The name of the file to be read by this reader. * @return 1 if the file is valid otherwise 0. * * TODO XXX: CHECK FOR VALID FILENAMES WITH NO '/' OR '\' MAYBE WITH STRTRUNC() */ int FASTAReader::setFilename(char* fn) { int len = strlen(fn); if (filename != 0) delete filename; filename = new char[len+1]; strncpy(filename,fn,len); filename[len] = '\0'; if (checkFullName() == 1) { return 1; } else { delete filename; filename = 0; } return 0; } // setPath // // XXX: NEED ERROR-CHECKING FOR VALID PATHS int FASTAReader::setPath(char* p) { int len = strlen(p); if (path != 0) delete path; path = new char[len+1]; strncpy(path,p,len); path[len] = '\0'; return 1; } Sequence* FASTAReader::getSequence(int seqIndex) { //printf("Entering getSequence\n"); FILE* infile = fopen(filename,"r"); if (infile == NULL) { return 0; } char* seqStr = new char[16384]; for (int i=0; i<16384; i++) { seqStr[i] = '\0'; } char* tempStr = new char[1024]; char* name = new char[1024]; int seqLen=0; // length of seqStr int tempLen=0; // length of tempStr int i=0; int flag = 0; // Read file until first sequence is found while (!feof(infile) && flag == 0) { fgets(tempStr,1023,infile); //printf("%s",tempStr); if (tempStr[0] == '>') { flag++; } } // Read file until seqIndex sequence is found while (!feof(infile) && i') { i++; } } // Read the name for seq from FASTA comment line if (tempStr != 0) { // XXX: NAME int check = getSequenceName(tempStr, name); if (check == 0) { delete name; name = 0; } } // // Read sequence information //printf(" Reading sequence information\n"); while (i==seqIndex && !feof(infile)) { fgets(tempStr,1023,infile); if (tempStr[0] == '>' || tempStr[0] == '\n' || tempStr[0] == ' ') { i++; } else if (feof(infile)) { // BLANK } else { tempLen = strlen(tempStr); tempLen--; // ignore newline strncat(seqStr,tempStr,tempLen); } } fclose(infile); seqLen = strlen(seqStr); if (seqLen == 0) { delete seqStr; delete tempStr; return 0; } // Sequence will contain whatever gaps exist in the FASTA // sequence representation Sequence* seq = new Sequence(seqLen,alphabet,name); int check=0; //printf(" Adding %d Symbols\n",seqLen); //printf("%s\n",seqStr); for (i=0; iaddSymbol(seqStr[i]); if (check == 0) { delete seqStr; delete tempStr; delete seq; return 0; } } delete seqStr; delete tempStr; currentSequence = seqIndex; //printf("Exiting getSequence\n"); return seq; return 0; } Sequence* FASTAReader::getNextSequence() { return getSequence(currentSequence+1); } // seqIndex is 0-indexed AlignedSequence* FASTAReader::getAlignedSequence(int seqIndex) { //printf("=>getAlignedSequence\n"); FILE* infile = fopen(fullName,"r"); if (infile == NULL) { return 0; } char* seqStr = new char[16384]; for (int i=0; i<16384; i++) { seqStr[i] = '\0'; } char* tempStr = new char[1024]; char* name = new char[1024]; int seqLen=0; // length of seqStr int tempLen=0; // length of tempStr int i=0; int flag = 0; // Read file until first sequence is found while (!feof(infile) && flag == 0) { fgets(tempStr,1023,infile); //printf("%s",tempStr); if (tempStr[0] == '>') { flag++; } } // Read file until seqIndex sequence is found while (!feof(infile) && i') { i++; } } // Read the name for alSeq from FASTA comment line if (tempStr != 0) { // XXX: NAME int check = getSequenceName(tempStr, name); if (check == 0) { delete name; name = 0; } } // Read sequence information //printf(" Reading sequence information\n"); while (i==seqIndex && !feof(infile)) { fgets(tempStr,1023,infile); if (tempStr[0] == '>' || tempStr[0] == '\n' || tempStr[0] == ' ') { i++; } else if (feof(infile)) { // BLANK } else { tempLen = strlen(tempStr); tempLen--; // ignore newline strncat(seqStr,tempStr,tempLen); } } //printf(" Read1\n"); fclose(infile); seqLen = strlen(seqStr); if (seqLen == 0) { delete seqStr; delete tempStr; return 0; } //printf(" Read2\n"); //printf(" seqLen = %d\n",seqLen); //printf(" alphabet = %s\n", alphabet->toString()); //printf(" name = %s\n",name); AlignedSequence* alSeq = new AlignedSequence(seqLen,alphabet,name); int check=0; //printf(" Adding %d Symbols\n",seqLen); //printf("%s\n",seqStr); for (i=0; iaddSymbol(seqStr[i]); if (check == 0) { delete seqStr; delete name; delete tempStr; delete alSeq; return 0; } } delete seqStr; delete name; delete tempStr; currentSequence = seqIndex; //printf("Exiting getAlignedSequence\n"); return alSeq; } AlignedSequence* FASTAReader::getNextAlignedSequence() { //currentSequence++; return getAlignedSequence(currentSequence+1); } /** * This method reads the sequence alignment from the file currently set in the reader. * * @return A pointer to the SequenceAlignment object containing the sequence alignment that was * read or NULL if an error occurred. * NOTE: the caller is responsible for freeing the returned pointer when it is no longer * needed. * * TODO XXX: NEED LOTS OF ERROR CATCHING FOR THINGS LIKE VARYING SEQUENCE LENGTH (MAYBE SHOULD BE * IN SEQUENCEALIGNMENT) */ SequenceAlignment* FASTAReader::getSequenceAlignment() { currentSequence = 0; int seqCount = getSequenceCount(); if (seqCount == -1) { return NULL; } //printf("FASTAReader seqCount: %d\n",seqCount); // Read in first AlignedSequence AlignedSequence* alSeq = getAlignedSequence(0); if (alSeq == 0) { return 0; } SequenceAlignment* seqAl = new SequenceAlignment(alSeq->getLength(),seqCount); seqAl->addSequence(alSeq); //printf(" Done reading first AlignedSequence\n"); int i=1; // index of sequence currently being read while (iaddSequence(alSeq); i++; } // Did not retrieve all sequences // (as predicted by getSequenceCount()) if (i != seqCount) { delete seqAl; return 0; } //printf("<=getSequenceAlignment()\n"); return seqAl; } /** * This method gets the number of sequences in the file currently set in the reader. * * @return The number of sequences in the file or -1 if an error occurred. */ int FASTAReader::getSequenceCount() { //Open the file. FILE* infile = fopen(fullName,"r"); if (infile == NULL) { return -1; } int seqCount = 0; char* tempStr = new char[1024]; while (!feof(infile)) { fgets(tempStr,1023,infile); if (tempStr[0] == '>') { seqCount++; //printf("seq[%d]: %s\n",seqCount,tempStr); } } fclose(infile); delete tempStr; //printf("<=getSequenceCount()\n"); //printf("seqCount: %d\n",seqCount); return seqCount; } int FASTAReader::getSequenceName(char* tempStr, char* name) { if (tempStr[0] != '>') { return 0; } int i=1; while (tempStr[i] == ' ') { i++; } int j=0; while (tempStr[i] != ' ' && tempStr[i] != '\n' && tempStr[i] != '\0') { name[j] = tempStr[i]; i++; j++; } name[j] = '\0'; return 1; } /** * This method make sure that the file currently set in the reader corresponds to an accessible file. * * @return 1 if the file is accessible otherwise 0. */ int FASTAReader::checkFullName() { if (filename == 0) { return 0; } if (fullName == 0) { if (path == 0) { FILE* infile = fopen(filename,"r"); if (infile == NULL) { return 0; } fclose(infile); int filenameLen = strlen(filename); fullName = new char[filenameLen + 1]; strncpy(fullName,filename,filenameLen); fullName[filenameLen] = '\0'; return 1; } else { int pathLen = strlen(path); int filenameLen = strlen(filename); fullName = new char[pathLen + filenameLen + 1]; strncpy(fullName,path,pathLen); fullName[pathLen] = '\0'; strncat(fullName,filename,filenameLen); fullName[pathLen + filenameLen] = '\0'; FILE* infile = fopen(fullName,"r"); if (infile == NULL) { delete fullName; fullName = 0; return 0; } fclose(infile); return 1; } } else { FILE* infile = fopen(fullName,"r"); if (infile == NULL) { delete fullName; fullName = 0; return 0; } fclose(infile); return 1; } }