// -*- C++ -*------------------------------------------------------------------
//  $Id: 
//
//  Class:              SimpleSequenceAlignment
//
//  Base Class(es):     SequenceAlignment
//
//  Derived Class(es):  -
//
//  Authors:            Eckart Bindewald
//
//  Description:
//    This class implements a simple alignment class.
//
// ---------------------------------------------------------------------------

#ifndef _SIMPLE_SEQUENCE_ALIGNMENT_H_
#define _SIMPLE_SEQUENCE_ALIGNMENT_H_

// Includes:
#include <iostream>
#include <string>
#include <SequenceAlignment.h>
#include <StringTools.h>

using namespace std;

/**
 * Simple in-memory implementation of SequenceAlignment.
 *
 * The alignment is stored as three parallel containers (sequences, names and
 * per-sequence properties) that are expected to have the same number of
 * entries. Alternatively the alignment can be held in a "compressed" state in
 * which the data is stored column-wise in run-length encoded form
 * (compressedCols); most modifying methods require the uncompressed state.
 */
class SimpleSequenceAlignment : public SequenceAlignment {

 public:

  /** TYPEDEFS */

  //   typedef unsigned int size_type;
  //   typedef SequenceAlignment::sequence_type sequence_type;
  //   typedef string::size_type sequence_size_type;
  //   typedef SequenceAlignment::sequence_container sequence_container;
  //   typedef Vec<string> name_container;
  //   typedef SequenceAlignment

  /** Run-length limit handed to runLengthDecode() for compressed columns.
   *  Chosen to avoid overlap with the ASCII code of the gap character '-'.
   *  NOTE: '-' is ASCII 45 (42 is '*'); 40 stays below both values. */
  enum { ENCODE_LEN_MAX = 40 };

  /** Creates an empty, uncompressed alignment with zeroed reference data and score. */
  SimpleSequenceAlignment() : compressed(false), referenceSequenceId(0), referenceSequenceStartCol(0), score(0.0) { }

  /** Copy constructor; all members are taken over from other via copy(). */
  SimpleSequenceAlignment(const SimpleSequenceAlignment& other) {
    copy(other);
  }

  /** Builds an alignment from the sequences and names of a generic SequenceAlignment.
   *  Reference id, reference start column and score are not taken from other,
   *  so they are explicitly initialized to zero here (copy() does not set them). */
  SimpleSequenceAlignment(const SequenceAlignment& other)
    : compressed(false), referenceSequenceId(0), referenceSequenceStartCol(0), score(0.0) {
    copy(other);
  }

  virtual ~SimpleSequenceAlignment() { }

  /** Assignment from an alignment of the same type; self-assignment is a no-op. */
  virtual SimpleSequenceAlignment& operator=(const SimpleSequenceAlignment& other) {
    if (this != &other) {
      copy(other);
    }
    return *this;
  }

  /** Assignment from a generic alignment (sequences and names only); self-assignment is a no-op. */
  virtual SimpleSequenceAlignment& operator=(const SequenceAlignment& other) {
    if (this != &other) {
      copy(other);
    }
    return *this;
  }

  /** Appends sequence s with the given name and an empty property set.
   *  Requires the alignment to be uncompressed. */
  virtual void addSequence(const SequenceAlignment::sequence_type& s,
                           const string& name) {
    PRECOND(!isCompressed());
    sequences.push_back(s);
    names.push_back(name);
    properties.push_back(properties_type());
    ASSERT(sequences.size() == properties.size());
    ASSERT(sequences.size() == names.size());
  }

  /** Appends sequence s with the given name and property set.
   *  Requires the alignment to be uncompressed. */
  virtual void addSequence(const SequenceAlignment::sequence_type& s,
                           const string& name,
                           const properties_type& seqProperties) {
    PRECOND(!isCompressed());
    sequences.push_back(s);
    names.push_back(name);
    properties.push_back(seqProperties);
    ASSERT(sequences.size() == properties.size());
    ASSERT(sequences.size() == names.size());
  }

  /** Removes all sequence data (sequences, names, properties, compressed columns)
   *  and returns to the uncompressed state. Reference data and score are left untouched. */
  virtual void clear() {
    sequences.clear();
    names.clear();
    properties.clear();
    compressedCols.clear();
    compressed = false;
  }

  /** Keeps only the first n sequences; all sequences with index >= n are removed.
   *  Requires the alignment to be uncompressed. */
  virtual void clipAfter(size_type n) {
    PRECOND(!isCompressed());
    while (size() > n) {
      removeSequence(size()-1);
    }
  }

  /** Removes all columns that correspond to a gap in sequence with specified number */
  virtual void collapse(size_type seqId);

  /** Compress internal state of sequence alignment.
   *  All methods should still work, albeit slower */
  virtual void compress();

  /** Returns the complement of a single character, mapping from fromAlphabet to toAlphabet
   *  (exchanges A->T, T,U->A, G->C, C->G). */
  static char computeReverseComplement(char c, const string& fromAlphabet, const string& toAlphabet);

  /** transforms to reverse complement (reverse order in sequence, exchanges A->T, T,U->A, G->C, C->G */
  static string computeReverseComplement(const string& s, const string& fromAlphabet, const string& toAlphabet);

  /** Takes over the complete state (data, compression state, reference data, score) of other. */
  virtual void copy(const SimpleSequenceAlignment& other) {
    compressed = other.compressed;
    sequences = other.sequences;
    names = other.names;
    properties = other.properties;
    compressedCols = other.compressedCols;
    referenceSequenceId = other.referenceSequenceId;
    referenceSequenceStartCol = other.referenceSequenceStartCol;
    score = other.score;
  }

  /** Takes over sequences and names of a generic alignment. The result is
   *  uncompressed. Properties are not copied; every sequence receives an empty
   *  property set so that the containers stay the same size (see validate()).
   *  Reference id, reference start column and score are left unchanged. */
  virtual void copy(const SequenceAlignment& other) {
    compressed = false;
    sequences = other.getSequences();
    names = other.getNames();
    compressedCols.clear();
    resetProperties(sequences.size());
  }

  /** counts number of characters and gaps. Careful: characters "X" and "N" are counted as characters. */
  virtual void countCharacters(sequence_size_type& numChars, sequence_size_type& numGaps) const {
    numChars = 0;
    numGaps = 0;
    // the per-sequence overload is called once per stored sequence with the same counters
    for (size_type i = 0; i < sequences.size(); ++i) {
      countCharacters(numChars, numGaps, i);
    }
  }

  /** counts number of characters and gaps in n'th sequence. Careful: characters "X" and "N" are counted as characters. */
  virtual void countCharacters(sequence_size_type& numChars, sequence_size_type& numGaps,
                               size_type n) const;

  /** counts number of characters of type c */
  virtual size_type countCharacter(char c) const;

  /** returns number of dinucleotides defined by two characters c1 and c2 */
  virtual size_type countDiCharacter(char c1, char c2) const;

  /** removes n'th column */
  virtual void deleteColumn(sequence_size_type n);

  /** Removes all columns that consist of gaps only.
   *  Requires the alignment to be uncompressed. */
  virtual void deleteGapColumns() {
    PRECOND(!isCompressed()); // FIXIT: not too hard to loosen restriction
    // walk from the last column to the first so that deleting a column
    // never shifts the columns that still have to be inspected
    for (sequence_size_type remaining = getLength(); remaining > 0; --remaining) {
      const sequence_size_type col = remaining - 1;
      if (isGapColumn(col)) {
        deleteColumn(col);
      }
    }
  }

  /** removes all gaps from each sequence. */
  virtual void deleteGaps();

  /** returns index of sequence with specified name. Careful:
   * change from find algorithm to compare !
   */
  virtual size_type findSequenceByName(const string& name) const;

  /** Finds sequence id for which the key-value pair of properties is fulfilled. If not found returns number of sequences. */
  virtual size_type findSequenceByProperty(const string& key, const string& value) const;

  /** returns n'th column */
  virtual sequence_type getColumn(sequence_size_type n) const;

  /** Returns the number of columns: the number of compressed columns in compressed
   *  state, otherwise the length of the first sequence (0 if there are no sequences). */
  virtual sequence_size_type getLength() const {
    if (isCompressed()) {
      return compressedCols.size();
    }
    else if (size() == 0) {
      return 0;
    }
    return sequences[0].size();
  }

  /** Returns the name of the n'th sequence. */
  virtual const string& getName(size_type n) const {
    PRECOND(n < names.size());
    return names[n];
  }

  /** Returns the names of all sequences. */
  virtual const name_container& getNames() const {
    return names;
  }

  /** Returns a copy of the n'th sequence. In compressed state the sequence is
   *  reassembled by decoding every column, which is slow. */
  virtual sequence_type getSequence(size_type n) const {
    PRECOND(n < size());
    if (isCompressed()) {
      const sequence_size_type len = getLength();
      sequence_type result(len, 'X');
      for (sequence_size_type col = 0; col < len; ++col) {
        result[col] = runLengthDecode(compressedCols[col], ENCODE_LEN_MAX)[n]; // FIXIT : improve speed
      }
      return result;
    }
    return sequences[n];
  }

  /** Returns all properties of the n'th sequence. */
  virtual const properties_type& getSequenceProperties(size_type n) const {
    PRECOND(n < properties.size());
    return properties[n];
  }

  /** Returns named property of n'th sequence. Returns empty string if property not found. */
  virtual string getSequenceProperty(size_type n, const string& key) const {
    PRECOND(n < properties.size());
    properties_type::const_iterator it = properties[n].find(key);
    if (it != properties[n].end()) {
      return it->second;
    }
    return ""; // returns empty string
  }

  /** Sets named property of n'th sequence to value; an existing value for key is overwritten. */
  virtual void setSequenceProperty(size_type n, const string& key, const string& value) {
    PRECOND(n < properties.size());
    properties[n][key] = value;
    ASSERT(getSequenceProperty(n, key) == value);
  }

  /** Returns all sequences. Requires the alignment to be uncompressed. */
  virtual const SequenceAlignment::sequence_container& getSequences() const {
    PRECOND(!isCompressed());
    return sequences;
  }

  /** Returns true if all sequences have the same length (also true for zero or one sequence).
   *  Requires the alignment to be uncompressed. */
  virtual bool hasEvenLengths() const {
    PRECOND(!isCompressed());
    for (size_type i = 1; i < size(); ++i) {
      if (sequences[i].size() != sequences[0].size()) {
        return false;
      }
    }
    return true;
  }

  /** Inserts a gap character at the specified position into every sequence
   *  that is long enough (pos <= length); shorter sequences are left unchanged.
   *  Requires the alignment to be uncompressed. */
  virtual void insertGap(sequence_size_type pos) {
    PRECOND(!isCompressed());
    for (size_type i = 0; i < size(); ++i) {
      if (pos <= sequences[i].size()) {
        sequences[i].insert(pos, 1, GAP_CHAR);
      }
    }
  }

  /** Returns true if sequence alignment is internally compressed. */
  virtual bool isCompressed() const {
    return compressed;
    // return (compressedCols.size() > 0) && (sequences.size() == 0);
  }

  /** returns true if correct start character is given.
   */
  virtual bool isFastaChar(char c) const;

  /** Returns true if there is a gap at the specified position for all sequences. */
  virtual bool isGapColumn(sequence_size_type pos) const {
    const string col = getColumn(pos);
    for (size_type i = 0; i < col.size(); ++i) {
      // ASSERT(sequences[i].size() > pos);
      if (col[i] != GAP_CHAR) {
        return false;
      }
    }
    return true;
  }

  /** converts '.' character to '-' */
  virtual void normalizeGap();

  /** Returns vector of size() elements, each element (string) containing the value corresponding to this sequence and the specified key.
   * If no property was found, the empty string is used.
   */
  virtual Vec<string> propertyValues(const string& key) const;

  /** leaves only fragment starting from position start and with specified length. */
  virtual void prune(sequence_size_type start, sequence_size_type length);

  /** shuffles sequences horizontally, columns do not remain intact  */
  virtual void randomShuffle();

  /** shuffles string such that gap pattern is preserved */
  static string randomShuffleStringGapPreserving(const string& origString);

  /** shuffles sequences vertically */
  virtual void randomShuffleColumns(bool keepFirst);

  /** shuffles the specified column such that the gap pattern is preserved */
  virtual void randomShuffleColumnGapPreserving(sequence_size_type col, bool keepFirstFixed);

  /** shuffles all columns such that the gap pattern is preserved */
  virtual void randomShuffleColumnsGapPreserving(bool keepFirstFixed);

  /** shuffles sequences horizontally, all rows are shuffled the same way, leaving columns intact */
  virtual void randomShuffleHoriz();

  /** reads FASTA formatted file */
  virtual void readFasta(istream& is);

  /** Removes the n'th sequence together with its name and its properties.
   *  Requires the alignment to be uncompressed. */
  virtual void removeSequence(SequenceAlignment::size_type n) {
    PRECOND(!isCompressed());
    PRECOND(n < sequences.size());
    sequences.erase(sequences.begin()+n);
    names.erase(names.begin()+n);
    // keep the property container parallel to sequences and names;
    // otherwise all later property sets would belong to the wrong sequence
    if (n < properties.size()) {
      properties.erase(properties.begin()+n);
    }
  }

  /** Brute-force way to remove all internally stored property values.
   *  Every sequence keeps an (empty) property set so that the property
   *  container stays as large as the name container (see validate()). */
  virtual void removeAllProperties() { resetProperties(names.size()); }

  /** exchange two characters in sequences */
  virtual void replace(char cOld, char cNew);

  /** Overwrites the column at the specified position; col must hold one
   *  character per sequence. Requires the alignment to be uncompressed. */
  virtual void setColumn(const string& col, sequence_size_type pos) {
    PRECOND(!isCompressed());
    PRECOND(col.size() == size());
    for (size_type i = 0; i < size(); ++i) {
      ASSERT(pos < sequences[i].size());
      sequences[i][pos] = col[i];
    }
  }

  /** Sets the name of the n'th sequence. */
  virtual void setName(const string& name, size_type n) {
    PRECOND(n < names.size()); // guard the container that is actually indexed
    names[n] = name;
  }

  /** Replaces n'th sequence and its name. Requires the alignment to be uncompressed. */
  virtual void setSequence(const SequenceAlignment::sequence_type& s, const string& name, size_type n) {
    PRECOND(!isCompressed());
    PRECOND(n < sequences.size());
    sequences[n] = s;
    names[n] = name;
  }

  /** Replaces n'th sequence, keeping its name. Requires the alignment to be uncompressed. */
  virtual void setSequence(const SequenceAlignment::sequence_type& s, size_type n) {
    PRECOND(!isCompressed());
    PRECOND(n < sequences.size());
    sequences[n] = s;
  }

  /** Returns the stored score. */
  virtual double getScore() const { return score; }

  /** Sets the stored score. */
  virtual void setScore(double _score) { score = _score; }

  /** Returns the stored start column of the reference sequence. */
  virtual sequence_size_type getReferenceSequenceStartCol() const { return referenceSequenceStartCol; }

  /** Sets the stored start column of the reference sequence. */
  virtual void setReferenceSequenceStartCol(sequence_size_type n) { referenceSequenceStartCol = n; }

  /** Returns the stored id of the reference sequence. */
  virtual size_type getReferenceSequenceId() const { return referenceSequenceId; }

  /** Sets the stored id of the reference sequence. */
  virtual void setReferenceSequenceId(size_type n) { referenceSequenceId = n; }

  /** Returns the number of sequences. In compressed state this is the height
   *  of the first decoded column, or 0 if there are no columns at all. */
  virtual size_type size() const {
    if (isCompressed()) {
      if (compressedCols.size() == 0) {
        return 0; // getColumn(0) would violate its precondition
      }
      return getColumn(0).size(); // FIXIT : slow!
    }
    return sequences.size();
  }

  /** Swaps sequences, names and properties of ids n1 and n2.
   *  Requires the alignment to be uncompressed. */
  virtual void swapSequences(size_type n1, size_type n2) {
    PRECOND(!isCompressed());
    string stmp = sequences[n2];
    string ntmp = names[n2];
    sequences[n2] = sequences[n1];
    names[n2] = names[n1];
    sequences[n1] = stmp;
    names[n1] = ntmp;
    // properties belong to a sequence and therefore have to travel with it
    if ((n1 < properties.size()) && (n2 < properties.size())) {
      properties_type ptmp = properties[n2];
      properties[n2] = properties[n1];
      properties[n1] = ptmp;
    }
  }

  /** transforms to reverse complement (reverse order in sequence, exchanges A->T, T,U->A, G->C, C->G */
  virtual void transformReverseComplement();

  /** Uncompress internal state of sequence alignment (implemented outside this header). */
  virtual void uncompress();

  /** converts sequence characters to upper case */
  virtual void upperCaseSequences();

  /** Ensures that containers have equal non-zero size. FIXIT: check that all sequences have lengths greater zero. */
  virtual bool validate() const {
    return (names.size() > 0)
      && (size() == names.size())
      && (properties.size() == names.size());
  }

  /** writes FASTA formatted file. */
  virtual void writeFasta(ostream& os) const;


  /** writes sequence properties */
  virtual void writeProperties(ostream& os) const;

 private:

  /** Replaces the property container by n empty property sets. */
  void resetProperties(size_type n) {
    properties.clear();
    for (size_type i = 0; i < n; ++i) {
      properties.push_back(properties_type());
    }
  }

  /** true if the alignment is currently held in compressed (column-wise) form */
  bool compressed;

  /* ATTRIBUTES */

  /** row-wise sequence data (used in uncompressed state) */
  sequence_container sequences;

  /** one name per sequence */
  name_container names;

  /** one property set per sequence */
  Vec<properties_type> properties;

  /** run-length encoded columns (used in compressed state) */
  sequence_container compressedCols;

  size_type referenceSequenceId;

  sequence_size_type referenceSequenceStartCol;

  double score;

};

/** Returns the n'th column as a string with one character per sequence.
 *  In compressed state the stored column is run-length decoded; otherwise the
 *  n'th character of every sequence is collected. Requires n < getLength(). */
inline
SequenceAlignment::sequence_type
SimpleSequenceAlignment::getColumn(SequenceAlignment::sequence_size_type n) const {
  PRECOND(n < getLength());
  if (isCompressed()) {
    return runLengthDecode(compressedCols[n], ENCODE_LEN_MAX);
  }
  // Uncompressed: read the stored rows directly. Going through getSequence()
  // would copy a complete sequence just to look at a single character.
  const size_type numSeqs = sequences.size();
  sequence_type column(numSeqs, 'X');
  for (size_type i = 0; i < numSeqs; ++i) {
    column[i] = sequences[i][n];
  }
  return column;
}


#endif
