// --*- C++ -*------x---------------------------------------------------------
#ifndef  __MAF_ALIGNMENT__
#define  __MAF_ALIGNMENT__

#include <string>
#include <iostream>
#include <Vec.h>
#include <set>
#include <SequenceAlignment.h>
#include <SimpleSequenceAlignment.h>
#include <StringTools.h>
#include <debug.h>
#include <algorithm>
#include <limits.h>
#include <BEDRegions.h>

using namespace std;

/** Starts a remark line on standard output: writes the prefix "# " to cout.
 *  The caller appends the actual message (and the line end) with further << operators. */
#define REMARK cout << "# "

// NOTE(review): ALIGNMENT_UNLOADED / ALIGNMENT_UPLOADED are not referenced anywhere in this header;
// presumably status codes used by the unload/upload implementation - confirm against the .cc file.
#define ALIGNMENT_UNLOADED 1

#define ALIGNMENT_UPLOADED 0

// Path separator string. Presumably used to build file names of unloaded alignment blocks - confirm.
#define SLASH "/"

/** This class represents a genomic alignment in UCSC MAF format. It is effectively a 
 * vector of regular sequence alignments, with each element corresponding to one MAF block.
 * The class provides the following convenience methods:
 * - reading of a complete genomic alignment (read, readMAF)
 * - converting between assembly coordinates and internal column coordinates
 * - obtaining alignment columns (slices) for all internal coordinates
 * FIXIT: this list of features is incomplete.
 */
class MAFAlignment : public Vec<SimpleSequenceAlignment> {

 public:

  typedef SequenceAlignment alignment_type;
  typedef SequenceAlignment::sequence_type sequence_type;
  typedef SequenceAlignment::sequence_size_type sequence_size_type;
  typedef SimpleSequenceAlignment alignment_imp_type;
  typedef Vec<SequenceAlignment>::size_type size_type;
  typedef BEDRegions::length_type length_type; // genomes can contain more than 10^9 nucleotides...
  typedef char row_type; // type of assembly row id
  typedef map<string, size_t> count_hash_type;

  enum { ALIGNMENT_SIZE_MAX = CHAR_MAX }; // maximum number of sequences in block alignments
  enum { STRAND_PLUS = 1, STRAND_MINUS = -1, STRAND_UNKNOWN = 0 };
  
 private:

  int strandMode; // one of STRAND_PLUS, STRAND_MINUS, STRAND_UNKNOWN
  mutable string::size_type pc; // used for reading alignments
  length_type totalLength; // total number of stored alignment columns
  size_type pruneAfter;
  size_type seqMin;
  set<string> assemblies;
  // mutable Vec<length_type> alignmentIds; // for n'th column in absolute column, which alignment block does it belong to
  // mutable Vec<length_type> alignmentColumnIds; // for n'th column in absolute counting, which column of an alignment block does it belong to?
  mutable Vec<map<string, row_type> > assemblyRowIdMaps; // one assembly-name -> row-id map per alignment block
  mutable Vec<length_type> assemblyToColumnMapping; // maps reference assembly positions to columnIds. Add assemblyToColumnOffset for correct answer
  mutable length_type assemblyToColumnOffset; 
  Vec<Vec<length_type> > chromStarts; // stores "chromStart" for n'th alignment and m'th sequence
  mutable Vec<length_type> refChromStarts; // like chromStarts, but only for reference sequence
  Vec<length_type> colStarts; // which column in absolute counting does first column of n'th alignment correspond to?
  bool fastConversionMode; // fastConversionMode == true currently does not give correct results
  string refAssembly; // which sequence type is the reference assembly? Typically the first one in each alignment
  /** Total length of reference assembly/chromosome as given in MAF file */
  mutable length_type refAssemblyTotLength;
  /** Name of chromosome of reference assembly */
  mutable string refAssemblyChrom; // reference assembly should be from same chromosome
  bool removePropertiesMode;
  set<string> requiredAssemblies;
  string residues;
  bool shuffleMode;
  set<string> tabooAssemblies;
  string unloadDir;
  static length_type unloadCharLimit; // unload only alignments with more than this many characters
  mutable Vec<size_type> unloadIds; // ids of alignments that have been unloaded
  string unloadPrefix;
  int verbose;

 public:
  
  /** Default constructor. Creates an empty alignment: plus strand mode, residue alphabet "ACGT",
   *  property removal enabled, verbosity 1. All scalar members are initialized
   *  (initializers are listed in member declaration order). */
  MAFAlignment()
    : Vec<alignment_imp_type>(),
      strandMode(STRAND_PLUS),
      pc(0),
      totalLength(0),
      pruneAfter(0),
      seqMin(0),
      assemblyToColumnOffset(0), // was left uninitialized before
      fastConversionMode(false),
      refAssemblyTotLength(0),
      removePropertiesMode(true),
      residues("ACGT"),
      shuffleMode(false),
      verbose(1) { }

  /** Copy constructor. Currently not supported: delegates to copy(), which raises an error. */
  MAFAlignment(const MAFAlignment& other) { copy(other); }
  
  /** Destructor. If any alignment ids are recorded in unloadIds, removeUnloadedAlignments() is called.
   *  NOTE(review): removeUnloadedAlignments() is currently only a stub consisting of ASSERT(false). */
  virtual ~MAFAlignment() { 
    if (unloadIds.size() > 0) {
      removeUnloadedAlignments();
    }
  }

  /** Copy method is not implemented; calling it raises an error via the ERROR macro. */
  virtual void copy(const MAFAlignment& other) { ERROR("Copying of MAF Alignments not yet implemented!"); }

  /** Assignment operator. Currently not supported: for a non-self assignment it delegates to copy(). */
  virtual MAFAlignment& operator = (const MAFAlignment& other) {
    if (&other  != this) {
      copy(other);
    }
    return *this;
  }

  /** Counts number of overlapping alignments. */
  virtual size_type countOverlapping() const;

  /** Appends other alignment to end of this object. Essentially this corresponds to concatenating two vectors of alignment blocks, 
   * but the method also updates the necessary internal variables. */
  virtual void append(const MAFAlignment& other);

  /** Adds constant to all "chromStart" values in all sequences. Important for concatenating alignments so that their coordinates do not overlap. */ 
  virtual void addChromStartOffset(length_type offset, const string& assembly);

  /** Collapses all alignments with respect to sequences of this assembly. */
  virtual void collapseAssembly(const string& assembly);

  /** Returns total number of sequence characters and gaps stored, computed as the sum over all blocks
   *  of (number of sequences) * (block length). */
  virtual unsigned long computeCharacterCount() const {
    unsigned long result = 0;
    for (size_type i = 0; i < size(); ++i) {
      result += (*this)[i].size() * (*this)[i].getLength();
    }
    return result;
  }

  /** Shuffle each alignment */
  virtual void dinucleotideShuffle(double normLimit, bool shuffleColumnMode);

  /** Total length of reference assembly/chromosome as given in MAF file */
  virtual length_type getRefAssemblyTotLength() const { return refAssemblyTotLength; }

  /** Name of chromosome of reference assembly */
  virtual string getRefAssemblyChrom() const { return refAssemblyChrom; }

  /** Returns for reference genome the "chromStarts", meaning at what position each block starts */
  virtual Vec<length_type> getRefChromStarts() const {
    return refChromStarts;
  }

  /** Returns residue alphabet (default: "ACGT").
   *  NOTE(review): this accessor is not declared const although it does not modify the object;
   *  the signature is kept as is so that existing overrides continue to match. */
  virtual const string& getResidues() { return residues; }

  /** Returns the strand mode (STRAND_PLUS, STRAND_MINUS or STRAND_UNKNOWN). */
  virtual int getStrandMode() const { return strandMode; }

  /** Returns a copy of the set of required assemblies. */
  virtual set<string> getRequiredAssemblies() const { return requiredAssemblies; }

  /** Returns a copy of the set of taboo assemblies. */
  virtual set<string> getTabooAssemblies() const { return tabooAssemblies; }

  /** Prepares for future unload operations (i.e. temporary storing of alignment blocks to disk in order to save memory) */
  virtual void initUnload(const string& dir, const string& prefix);

  /** Returns 1 if plus strand of reference assembly found, -1 if negative strand, 0 if not found or strand character other than "+" or "-" */
  static int isPlusStrand(const SequenceAlignment& ali, const string& assembly);

  /** Writes MAF formatted file. Requires properties called "assembly", "chromStart", "length", "totLength" to be defined for each sequence.
      FIXIT: prettier formatting.
  */
  virtual void writeMAF(ostream& os) const;

  /** Writes a single alignment block in MAF format. Requires properties called "assembly", "chromStart", "length", "totLength" to be defined for each sequence.
      FIXIT: prettier formatting.
  */
  static void writeMAFBlock(ostream& os, const SequenceAlignment& alignment);

  /** Reads MAF format file, writes out blocks that pass the filter */
  virtual size_type filterMAF(istream& is, ostream& os, const BEDRegions& bed) const;

  /** Prunes overlapping alignments. Alignment with lower alignment score is being shortened.
   * If scores are equal, the alignment with lower index is shortened.
   */
  virtual size_type pruneOverlapping(); 
  
  /** Reads from input stream in either UCSC MAF or FASTA format, using a default-constructed BEDRegions object. */
  virtual void read(istream& is) { BEDRegions bed; read(is, bed); }

  /** Reads from input stream in either UCSC MAF or FASTA format; blockMin and blockMax are both passed as 0. */
  virtual void read(istream& is, const BEDRegions& bed) { read(is, bed, 0, 0); }

  /** Reads from input stream in either UCSC MAF or FASTA format. */
  virtual void read(istream& is, const BEDRegions& bed, size_type blockMin, size_type blockMax);
  
  /** Reads UCSC MAF format, using a default-constructed BEDRegions object.
   *  NOTE(review): this delegates to read(is, bed) (MAF or FASTA), not to readMAF(is, bed);
   *  possibly a copy-paste slip - confirm before changing, callers may rely on it. */
  virtual void readMAF(istream& is) { BEDRegions bed; read(is, bed); }

  /** Reads UCSC MAF format */
  virtual void readMAF(istream& is, const BEDRegions& bed, size_type blockMin, size_type blockMax);

  /** Reads UCSC MAF format; blockMin and blockMax are both passed as 0. */
  virtual void readMAF(istream& is, const BEDRegions& bed) {
    readMAF(is, bed, static_cast<size_type>(0), static_cast<size_type>(0));
  }

  /** Reads from input stream in FASTA format. */
  virtual void readFASTA(istream& is);

  /** Judging by its name, removes alignment blocks with duplicate start ids and returns a count.
   *  NOTE(review): the previous comment here was a copy of the countOverlapping() comment;
   *  exact semantics of the return value should be confirmed against the implementation. */
  virtual size_type removeDuplicateStartIdBlocks();

  /** Returns sequence assembly name of seqId'th sequence in alignment aliId */
  virtual string getSequenceAssembly(length_type aliId, size_type seqId) const;

  /** Returns the names of all sequence assemblies of an alignment block */
  virtual Vec<string> getAlignmentAssemblies(length_type aliId) const;

  /** Generates for a given alignment block a unique hash string describing the used sequence assemblies */
  virtual string getAlignmentAssembliesHash(length_type aliId) const;

  /** Generates for each alignment block a unique hash string describing the used sequence assemblies. Adds up alignment lengths 
   * for all alignments corresponding to the same combination of assemblies. */
  virtual count_hash_type countAssembliesHashLengths() const;
  
  /** Returns set of all found assemblies */
  virtual set<string> getAssemblies() const { return assemblies; }

  /** Returns slice of n'th column. Uses internal column counting.
   *  The result contains one character per sequence of the alignment block the column belongs to.
   *  Precondition: 0 <= columnId < getTotalLength(). */
  virtual sequence_type getSlice(length_type columnId) const {
    PRECOND(columnId >= 0);
    PRECOND(columnId < getTotalLength());
    length_type aliId = 0;
    length_type aliColId = 0;
    findAlignmentIdColumnId(columnId, &aliId, &aliColId);
    ASSERT(aliColId < static_cast<length_type>((*this)[aliId].getLength()));
    string result = (*this)[aliId].getColumn(aliColId);
    ASSERT(result.size() == ((*this)[aliId]).size()); // must be equal to number of sequences of that alignment block
    return result;
  }

  /** Returns slice of n'th column such that it contains the nucleotides corresponding to the specified set. */
  virtual sequence_type getSlice(length_type columnId, const set<string>& assemblies) const;

  /** Returns the alignment id to which a certain column (in internal counting) belongs.
   *  Precondition: getTotalLength() > 0 and 0 <= columnId < getTotalLength().
   *  Relies on colStarts being sorted in ascending order (binary search). */
  virtual length_type getAlignmentId(length_type columnId) const { 
    PRECOND(getTotalLength() > 0);
    PRECOND(columnId >= 0);
    PRECOND(columnId < getTotalLength());
    // Searching for columnId + 1 yields the first block whose start column is strictly greater
    // than columnId; the block just before that one is the block containing the column.
    Vec<length_type>::const_iterator it = lower_bound(colStarts.begin(), colStarts.end(), columnId + 1);
    length_type result = static_cast<length_type>(distance(colStarts.begin(), it)); 
    ASSERT(result > 0); // fails if even the first block starts after columnId
    --result;
    POSTCOND((result >= 0) && (result < static_cast<length_type>(size())));
    return result;
  }

  /** Returns the column index within its alignment block for a column given in absolute counting. */
  virtual length_type getAlignmentColumnId(length_type columnId) const { 
    ASSERT((columnId >= 0) && (columnId < getTotalLength()));
    length_type aliId = getAlignmentId(columnId);
    length_type result = columnId - colStarts[aliId];
    ASSERT((result >= 0) && (result < static_cast<length_type>(((*this)[aliId]).getLength())));
    return result;
  }

  /** Finds and sets simultaneously the alignment id and alignment column index to which a certain column
   * (in absolute counting) belongs. Method is of critical importance!
   * @param columnId absolute column id, 0 <= columnId < getTotalLength()
   * @param aliId    output: index of the alignment block containing the column
   * @param aliColId output: index of the column within that block
   */
  virtual void findAlignmentIdColumnId(length_type columnId, length_type *aliId, length_type *aliColId) const { 
    ASSERT((columnId >= 0) && (columnId < getTotalLength()));
    if (size() == 1) {
      *aliId = 0;
      *aliColId = columnId; // special case if only one alignment set
      return;
    }
    // getAlignmentId performs the binary search on colStarts; no second search is needed here.
    *aliId = getAlignmentId(columnId);
    ASSERT((*aliId >= 0) && (*aliId < static_cast<length_type>(size())));
    // Check the id against colStarts before colStarts is indexed with it, so that the diagnostic
    // output below is produced instead of an out-of-range access.
    if ((*aliId) >= static_cast<length_type>(colStarts.size())) {
      REMARK << "Warning: Could not find alignment id for column " << (columnId + 1) <<  " total length of MAF: " << getTotalLength() << " " << (*aliId) << " " << colStarts.size() << endl;
      REMARK << "List of column starts of alignment blocks: " << endl;
      for (Vec<length_type>::const_iterator startIt = colStarts.begin(); startIt != colStarts.end(); ++startIt) {
        REMARK << (*startIt) << endl;
      }
    }
    ERROR_IF((*aliId) >= static_cast<length_type>(colStarts.size()),"Internal error: alignment id is greater than number of defined alignmnts.");
    // The column must lie between the start of its block and the start of the next block.
    ASSERT(columnId >= colStarts[*aliId]);
    if ((*aliId) + 1 < static_cast<length_type>(size())) {
      ASSERT(columnId < colStarts[*aliId + 1]);
    }
    *aliColId = columnId - colStarts[*aliId];
    ASSERT(!(*this)[*aliId].isCompressed());
    if ((*aliColId) >= static_cast<length_type>(((*this)[*aliId]).getLength())) {
      REMARK << "Bad Alignment column conversion: " << columnId << " " << getTotalLength() << " " << (*aliId) << " "  << size() << " " 
             << (*aliColId) << " " << ((*this)[*aliId].getLength()) << " " << colStarts[*aliId] << endl;
      REMARK << (*this)[*aliId] << endl;
    }
    ASSERT((*aliColId) < static_cast<length_type>(((*this)[*aliId]).getLength()));
  }

  /** Returns the position in assembly coordinates. Only works if sequence property "chromStart" is defined,
   * as is the case for alignments that were read in MAF format. */
  virtual length_type getAssemblyPosition(length_type columnId, const string& assembly) const;

  /** Returns the internal column id for a given position in assembly coordinates (using reference assembly).
   *  Uses the slow conversion unless fastConversionMode is set; the fast path is guarded by ASSERT(false)
   *  because its results are currently not correct. */
  virtual length_type convertAssemblyPositionToColumnId(length_type assemblyPosition) const{
    if (fastConversionMode) {
      ASSERT(false); // currently not supported because its results are not correct
      return convertAssemblyPositionToColumnIdFast(assemblyPosition);
    }
    return convertAssemblyPositionToColumnIdSlow(assemblyPosition);
  }

  /** Returns subsequence for the reference assembly and assembly coordinates. Both start and end are one-based  */
  virtual string extractAssemblySequence(length_type assemblyPosStart, length_type assemblyPosEnd) const;

  /** Returns map that indicates in which row a certain assembly for a certain alignment can be found.
   *  Note: returns a non-const reference to the (mutable) internal map from a const method. */
  virtual map<string, row_type>& getAssemblyRowIdMap(length_type alignmentId) const { 
    ASSERT(alignmentId >= 0);
    ASSERT(alignmentId < static_cast<length_type>(assemblyRowIdMaps.size()));
    return assemblyRowIdMaps[alignmentId];
  }

  /** Returns completely assembled sequence of one assembly. Gaps corresponding to areas that are not
   * present in data */
  string generateAssemblySequence(const string& assembly) const;

  /** Returns set of assemblies, that are in common between two alignments */
  virtual set<string> getCommonAssemblies(length_type alignmentId1, length_type alignmentId2) const;

  /** Returns set of assemblies, that are in common between an alignment of this object and an alignment of maf2 */
  virtual set<string> getCommonAssemblies(length_type alignmentId1, const MAFAlignment& maf2, length_type alignmentId2) const;

  /** Returns a string for the assemblies that are in common between two alignments.
   *  NOTE(review): presumably a hash string analogous to getAlignmentAssembliesHash - confirm. */
  virtual string getCommonAssembliesHash(length_type alignmentId1, length_type alignmentId2) const;

  /** Returns a string for the assemblies that are in common between an alignment of this object and an alignment of maf2.
   *  NOTE(review): presumably a hash string analogous to getAlignmentAssembliesHash - confirm. */
  virtual string getCommonAssembliesHash(length_type alignmentId1, const MAFAlignment& maf2, length_type alignmentId2) const;

  /** Returns in which row a certain assembly for a certain alignment can be found. */
  virtual size_type getAssemblyRowId(length_type alignmentId, const string& assembly) const;

  /** Returns chromStart value of MAF alignment. If assembly is not defined, return -1. If chromStart is not defined, return 0. */
  virtual length_type getChromStart(length_type alignmentId, const string& assembly) const;

  /** Lowest allowed assembly coordinate 0-based: chromStart of the reference assembly in the first block.
   *  NOTE(review): unlike getRefAssemblyEnd(), there is no guard for an empty alignment here;
   *  behavior for size() == 0 depends on getChromStart() - confirm. */
  virtual length_type getRefAssemblyBegin() const {
    return getChromStart(0, refAssembly);
  }

  /** Highest allowed assembly coordinate 1-based: chromStart of the reference assembly in the last block
   *  plus the length of the last block. Returns 0 if no blocks are stored. */
  virtual length_type getRefAssemblyEnd() const {
    if (size() == 0) {
      return 0;
    }
    return getChromStart(size()-1, refAssembly) + (*this)[size()-1].getLength();
  }

  /** Returns total number of defined columns. */
  virtual length_type getTotalLength() const { return totalLength; }

  /** Sets chromStart value of sequence */
  bool setChromStart(length_type alignmentId, const string& assembly, length_type chromStart);

  /** Returns reference assembly */
  virtual const string& getRefAssembly() const { return refAssembly; }

  /** Returns true if unload operation can be called: unloadIds has one entry per alignment block
   *  and both the unload directory and the unload prefix are non-empty. */
  virtual bool isUnloadable() const { return (unloadIds.size() == size()) && (unloadDir.size() > 0) && (unloadPrefix.size() > 0); }

  /** Sets minimum number of sequences for reading, after which sequences that are identical to a previously read sequence are ignored  */
  virtual void setPruneAfter(size_type _pruneAfter) { pruneAfter = _pruneAfter; }

  /** Sets reference assembly */
  virtual void setRefAssembly(const string& assembly) { refAssembly = assembly; }

  /** If set to false: retain sequence properties required for writing MAF format. Default: true. */
  virtual void setRemovePropertiesMode(bool mode) { removePropertiesMode = mode; }

  /** Sets the set of required assemblies. */
  virtual void setRequiredAssemblies(const set<string>& _requiredAssemblies) { requiredAssemblies = _requiredAssemblies; }

  /** Sets the set of taboo assemblies. */
  virtual void setTabooAssemblies(const set<string>& _tabooAssemblies) { tabooAssemblies = _tabooAssemblies; }

  /** Sets minimum number of sequences for reading */
  virtual void setSeqMin(size_type _seqMin) { seqMin = _seqMin; }

  /** Sets shuffle mode for reading */
  virtual void setShuffleMode(bool mode) { shuffleMode = mode; } 

  /** Sets the strand mode. Must be one of STRAND_PLUS, STRAND_MINUS, STRAND_UNKNOWN. */
  virtual void setStrandMode(int mode) {
    ASSERT((mode == STRAND_PLUS) || (mode == STRAND_MINUS) || (mode == STRAND_UNKNOWN));
    strandMode = mode;
  }

  /** Sets temporary disk space directory for alignment unloading */
  virtual void setUnloadDir(const string& dir) { unloadDir = dir; }

  /** Sets verbosity level. 0: silent, 1: default, > 1: increasingly verbose */
  virtual void setVerbose(int level) { verbose = level; }

  /** Shuffle each row of each block such that columns do not remain intact */
  void shuffleRows();

  /** Shuffle each row of each block such that columns remain intact */
  void shuffleHorizontal();

  /** Shuffle each alignment column of each block such that first sequence of each block remains unchanged */
  void shuffleVertical();

  /** Largest alignment blocks will be written to tmp disk space to save RAM memory */
  virtual void unload() const;

  /** Recomputes totalLength as the sum of the lengths of all alignment blocks. */
  virtual void updateTotalLength() { 
    totalLength = 0;
    for (size_type i= 0; i < size(); ++i) {
      totalLength += (*this)[i].getLength();
    }
  }
  
  /** Converts all alignments to upper case */
  virtual void upperCaseSequences() { 
    for (size_type i = 0; i < size(); ++i) {
      (*this)[i].upperCaseSequences();
    }
  }
  
  /** returns true if data structure is validated */
  virtual bool validate() const; 

  /** Judging by its name, validates the column id bookkeeping.
   *  NOTE(review): the previous comment here was a copy of the updateColumnIds() comment. */
  virtual bool validateColumnIds() const;

  // private: (deliberately disabled - the two methods below are still public)

  /** Returns the internal column id for a given position in assembly coordinates (using reference assembly). */
  virtual length_type convertAssemblyPositionToColumnIdSlow(length_type assemblyPosition) const;

  /** Judging by its name, an even slower variant of the assembly position to column id conversion.
   *  NOTE(review): the previous comment described generating a conversion array with chromStarts
   *  as precondition, which does not match this signature - confirm against the implementation. */
  length_type convertAssemblyPositionToColumnIdVerySlow(length_type assemblyId) const;

 private:

  /** Returns the internal column id for a given position in assembly coordinates (using reference assembly). */
  virtual length_type convertAssemblyPositionToColumnIdFast(length_type assemblyPosition) const;

  // Disabled declaration, kept for reference:
  // Returns chromStart value of MAF alignment. If assembly is not defined, return -1. If chromStart is not defined, return 0.
  // length_type getOriginalChromStart(length_type alignmentId, const string& assembly) const;

  /** Uses assembly and chromStart sequence properties to redefine sequence names. Used to save memory */
  bool prepareAlignment(SequenceAlignment& ali, const BEDRegions& bed, const string& refAssembly) const;

  /** reads one MAF alignment block if current line starts with "a "
   * internal use only
   */
  SimpleSequenceAlignment readAlignment(const Vec<string>& lines, size_type pruneAfter,
                                        const set<string>& requiredAssemblies, const set<string>& tabooAssemblies) const;

  /** Removes alignment blocks that have been written to tmp disk space. Not implemented yet. */
  void removeUnloadedAlignments() {
    ASSERT(false); // FIXIT
  }

  string unloadAlignmentName(size_type aliId) const;

  void unloadAlignment(size_type aliId) const;

  void uploadAlignment(size_type aliId) const;

  /** Updates the column id bookkeeping (originally documented as the alignmentIds and alignmentColumnIds
   *  attributes, which are now commented out; presumably colStarts is maintained instead - confirm). */
  void updateColumnIds();

  /** Judging by its name, updates the assembly row id map of alignment block aliId.
   *  NOTE(review): the previous comment here was a copy of the updateColumnIds() comment. */
  void updateAssemblyRowIdMap(size_type aliId);

  void updateAssemblyRowIdMaps();

  bool updateAssemblyToColumnMapping() const;

  /** Creates chromStarts datastructure */
  void updateChromStarts();

  /** Updates the reference assembly to be the first one from the first sequence of the first alignment block, unless it was previously specified */
  bool updateRefAssembly();

  bool updateRefChromStarts() const;

};


/** Writes all alignment blocks of a MAF alignment to the output stream.
 *  Each block is written with its writeFasta method and followed by an empty line;
 *  one more line end is written after the last block.
 *  The stream must be in a good state on entry; this is asserted again on exit. */
inline
ostream& 
operator << (ostream& os, const MAFAlignment& ali)
{
  PRECOND(os);
  unsigned int blockId = 0;
  while (blockId < ali.size()) {
    ali[blockId].writeFasta(os);
    os << endl; // separates consecutive blocks
    ++blockId;
  }
  os << endl;
  ASSERT(os);
  return os;
}

#endif
