// --*- C++ -*------x---------------------------------------------------------
#ifndef __HASH_CORRELATION_FINDER__
#define __HASH_CORRELATION_FINDER__

#include <string>
#include <debug.h>
#include <CorrelationFinder.h>
#include <CorrelationTools.h>
#include <MAFAlignment.h>
#include <MAFSearchTables.h>
#include <SearchRange.h>
#include <iomanip>
#include <NucleotideTools.h>

#ifdef COVARNA_CONCURRENT_VECTOR
#include <tbb/concurrent_hash_map.h>
#include <tbb/concurrent_vector.h>
#include <tbb/concurrent_unordered_set.h>
#endif

using namespace std;

// Not referenced anywhere in this header.
// NOTE(review): presumably an upper bound on the size of one cached element
// (see HashCorrelationFinder::intersectionCache) -- confirm against the implementation file.
#define CACHE_ELEMENT_SIZE_MAX 10000

// Hashing, equality and ordering operations for string keys.
// hash() and equal() follow the HashCompare interface from the TBB tutorial
// (used by concurrent_hash_map); operator() additionally makes the struct usable
// as the ordering functor of an ordered container such as std::map.
struct MyHashCompare {
  //! Returns a multiplicative/xor hash of the characters of x.
  //! The string is walked via c_str(), so hashing stops at the first NUL character.
  static std::size_t hash( const std::string& x ) {
    std::size_t h = 0;
    for( const char* s = x.c_str(); *s; ++s )
      h = (h*17)^*s;
    return h;
  }
  //! True if strings are equal
  static bool equal( const std::string& x, const std::string& y ) {
    return x==y;
  }
  //! Strict weak ordering (lexicographic "less than") on the keys.
  //! Required when this struct is passed as the comparator of std::map.
  bool operator () ( const std::string& x, const std::string& y ) const {
    return x < y;
  }
};

/** Correlation finder that searches a MAF alignment for complementary columns
 * using the search tables of a MAFSearchTables object.
 *
 * The object stores plain pointers to the alignment, the search tables and the
 * result bins; copying the finder copies these pointers, not the objects behind
 * them, and the destructor does not release any of them.
 * Many setters inherited from CorrelationFinder are not supported by this
 * implementation and only raise an error via the ERROR macro.
 */
class HashCorrelationFinder : public CorrelationFinder {

 public:

  typedef MAFSearchTables::set_type set_type;

  typedef MAFSearchTables::compressed_type compressed_type;

#ifdef COVARNA_CONCURRENT_VECTOR
  typedef concurrent_hash_map<string, Vec<length_type>, MyHashCompare > intersection_cache_type;
  typedef concurrent_vector<length_type> result_vector_type;
  typedef concurrent_vector<result_vector_type * > result_type;
#else
  typedef map<string, Vec<length_type>, MyHashCompare > intersection_cache_type;
  typedef Vec<length_type> result_vector_type;
  typedef Vec<result_vector_type * > result_type;
  // typedef Vec<set<length_type> * > result_type;
#endif

 private:

  MAFAlignment *maf; // alignment to be searched (not owned)
  result_type *resultBins; // vector of vector of results. First dimension corresponds to number of columns
  MAFSearchTables *tables; // search tables (not owned)
  Vec<Correlation>::size_type reservefac;
  string residues; // = "ACGT";
  size_type minNonGap; // minimum number of non-gap characters
  size_type nonGapCharacterMin; // this many different non-gap characters in an interesting column
  length_type corrDistMin; // minimum distance between correlations
  length_type outIntervall; // output every so often
  bool allowGu;
  bool allowGap;
  bool complementMode;
  bool removeIsolated;
  set_type::size_type searchColumnMax;
  int verbose;
  set<string> assemblies;
  
  static intersection_cache_type intersectionCache; // stores set intersections

  // static length_type ignoredCount;

 public:

  /** Standard constructor. Stores the three pointers (no copies are made) and sets default parameters.
   * All three pointers must be non-null, the tables must validate and the result bins
   * must contain exactly one entry per alignment column. */
  HashCorrelationFinder(MAFAlignment *_maf, MAFSearchTables *_tables, result_type *_resultBins) {
    // check all pointers before any of them is dereferenced:
    PRECOND((_maf != 0) && (_tables != 0) && (_resultBins != 0)
            && (_maf->getTotalLength() == static_cast<length_type>(_resultBins->size())));
    PRECOND(_tables->validate());
    maf = _maf; // set pointer to MAF alignment. Avoid copying!
    tables = _tables;
    resultBins = _resultBins;
    setDefaultValues();
    // ignoredCount = 0;
    POSTCOND(validate());
  }

  /** Copy constructor. The copy shares alignment, tables and result bins with the original. */
  HashCorrelationFinder(const HashCorrelationFinder& other) {
    copy(other);
  }

  /** Destructor. Releases nothing. Consider take-down of resultBins array? */ 
  virtual ~HashCorrelationFinder() { }

  /** Assignment operator. Not expected to be called: it prints a message and trips an assertion
   * before copying. */
  virtual HashCorrelationFinder& operator = (const HashCorrelationFinder& other) {
    cout << "Called HashCorrelationFinder::= operator!" << endl;
    ASSERT(false);
    if (this != &other) {
      copy(other);
    }
    return *this;
  }

  /** Copy method. Copies all member attributes; pointer members are copied as pointers. */
  virtual void copy(const HashCorrelationFinder& other) {
    if (other.verbose > 0) {
      cout << "Copying correlation finder object..." << endl;
    }
    maf = other.maf; // copies POINTER to alignment
    resultBins = other.resultBins; // vector of vector of results. First dimension corresponds to number of columns
    tables = other.tables;
    reservefac = other.reservefac;
    residues = other.residues; // = "ACGT";
    minNonGap = other.minNonGap; // minimum number of non-gap characters
    nonGapCharacterMin = other.nonGapCharacterMin; // this many different non-gap characters in an interesting column
    corrDistMin = other.corrDistMin; // minimum distance between correlations
    outIntervall = other.outIntervall; // output every so often
    allowGu = other.allowGu;
    allowGap = other.allowGap;
    complementMode = other.complementMode;
    removeIsolated = other.removeIsolated;
    searchColumnMax = other.searchColumnMax;
    verbose = other.verbose;
    assemblies = other.assemblies;
    // assemblyPairFraction = other.assemblyPairFraction;  // what fraction of all possible assembly pairs should be stored as hash tables? Between 0 and 1
    if (other.verbose > 2) {
      cout << "Finished copying correlation finder object." << endl;
    }
  }

  // static length_type getIgnoredCount() { return ignoredCount; }

  /** Not implemented: raises an error. */
  virtual size_type getBasepairTypeMin() const { ERROR("Internal error: HashCorrelationFinder::getBasepairTypeMin not implemented."); return 0;  }

  /** Not implemented: raises an error.
   * NOTE(review): if ERROR ever returns, the dummy object below is heap-allocated and never freed. */
  virtual SingleLinkage2DProgressiveFilter& getClusterFilter() const { 
    ERROR("Not implemented!");
    return *(new SingleLinkage2DProgressiveFilter(1,1));// dummy
  }

  /** Returns the pointer to the alignment this finder works on. */
  MAFAlignment* getMaf() const { return maf; }

  /** Collects and returns results; */
  virtual result_container getResults() const;

  /** Filters resultBins datastructure. The "const" is somewhat misleading but technically correct. */
  virtual void filterIsolatedCorrelation3(length_type colid) const; 

  /** Check if i,j co-occurs with i-1,j+1 or i+1,j-1. Makes sense for RNA helix interactions */
  static set<length_type> filterNeighborCorrelations(const set<length_type>& leftSet, const set<length_type>& middleSet, const set<length_type>& righSet);

  virtual void setAllowGu(bool mode) { allowGu = mode; }

  /** Sets base pair type minimum number of different correlated mutations per column pair. Not implemented: raises an error. */
  virtual void setBasepairTypeMin(size_type n) { ERROR("Internal error: HashCorrelationFinder::setBasepairTypeMin not implemented."); }

  /** Sets mode for checking "wrong" diagonals NOT to be complementary. Not implemented: raises an error. */
  virtual void setCheckAntiNeighborMode(int mode) {
    ERROR("Internal error: setCheckAntiNeighborMode not implemented.");
  }

  /** Sets complement mode. Only true is supported; passing false stores the flag and then raises an error. */
  virtual void setComplementMode(bool flag) { 
    complementMode = flag; 
    ERROR_IF(!flag, "Complement mode not implemented in HashCorrelationFinder!");
  }

  /** Sets the "active" status of the cluster filter. Not implemented: raises an error. */
  virtual void setClusterFilterActive(bool active) {
    ERROR("Internal error: Method HashCorrelationFinder::setClusterFilter is not implemted."); 
  }

  /** Sets minimum size of clusters that pass the initial cluster filter. Not implemented: raises an error. */
  virtual void setClusterFilterSizeMin(size_t sizeMin) {
    ERROR("Internal error: Method HashCorrelationFinder::setClusterFilter is not implemted."); 
  }

  /** Sets cutoff of cluster filter. Not implemented: raises an error. */
  virtual void setClusterCutoffAndInit(length_type cutoff) {
    ERROR("Internal error: setClusterCutoff not implemented.");
  }

  /** Sets minimum distance between correlated columns. */
  virtual void setCorrDistMin(length_type distMin) { corrDistMin = distMin; }

  /** Sets default member attribute values. Requires that maf has already been set,
   * because the assembly names are read from the alignment. */
  virtual void setDefaultValues() {
    residues = "ACGT";
    minNonGap = 10; // minimum number of non-gap characters
    nonGapCharacterMin = 2; // this many different characters required in a column
    corrDistMin = 3; // minimum size of loop
    allowGu = true;
    allowGap = false;
    complementMode = true;
    removeIsolated = true;
    reservefac = 10; // correlation expected every this many nucleotides
    searchColumnMax = 100000;
    verbose = 1;
    assemblies = maf->getAssemblies();
    outIntervall = 100000; 
  }

  /** Require this many non-gap characters in a column */
  virtual void setNonGapMin(size_type n) { minNonGap = n; } 

  /** Sets intermediate output intervall. */
  virtual void setOutIntervall(length_type intervall) { outIntervall = intervall; }

  /** Sets the removeIsolated flag: iff true, remove isolated correlations. */
  virtual void setRemoveIsolated(bool flag) { removeIsolated = flag; }

  /** Not implemented: raises an error. */
  virtual void setReverseMode(bool flag) { ERROR("Internal error: Reverse mode not yet implemented for HashCorrelationFinder."); }

  /** Sets maximum number of columns to be searched with linear search */ 
  virtual void setSearchColumnMax(set_type::size_type value) { searchColumnMax = value; }


  /** Not implemented: raises an error. */ 
  virtual void setSearchColumnSplit(set_type::size_type value) { ERROR("Method setSearchColumnSplit is not implemented in class HashCorrelationFinder!"); }

  /** Minimum number of consecutive correlations. Not implemented: raises an error. */
  virtual void setStemLengthMin(Stem::index_type len) { 
    ERROR("Setting minimum stem lengths is not yet implemented in HasCorrelationFinder class!");
  }

  /** Sets verbosity level. 0: silent, 1: default, > 1: more and more output */
  virtual void setVerbose(int level) { verbose = level; }

  /** Runs algorithm in defined index vector set. Here: indices must be consecutive. They are counted as internal column number ,
   * this is different than the external genome position. */ 
  virtual void run(const range_type& range) const;

  /** Runs the algorithm on all columns of the alignment. */
  virtual void run() const {
    // blocked_range<length_type> range(0, maf->getTotalLength(), maf-> getTotalLength());
    range_type range(0, maf->getTotalLength());
    run(range);
  }

  /** Function-call operator; forwards to run(range) so that the object can be used as a functor. */ 
  template <typename _Range>
  void operator () (const _Range& range) const {
    run(range);
  }

  friend class HashCorrelationFinderTest;
  
  /** Returns true, iff run() method can be run on this object:
   * all pointers are set, alignment and tables validate, assemblies and residues are
   * non-empty and there is exactly one result bin per alignment column.
   */
  virtual bool validate() const {
    // pointer checks come first so that no null pointer is dereferenced below
    return (maf != 0) && (tables != 0) && (resultBins != 0)
      && tables->validate() && (assemblies.size() > 0) && (residues.size() > 0)
      && maf->validate() && (maf->getTotalLength() == static_cast<length_type>(resultBins->size()));
  }

  /** Returns column ids of MAF columns that are compatible with the given column-assembly search */
  virtual set<length_type> searchMafColumn(const string& column, const Vec<string>& colAssemblies,
                                           length_type colId) const;

  /** Computes densities (hits per area) assuming that search is finished. Not implemented: raises an error. */
  virtual  void augmentDensities(double_hash_type& densities, length_type searchMax) const {
    ERROR("Internal error: Method augmentDensities not implemented in HashCorrelationFinder class.");
  }

  /** Computes densities (hits per area) assuming that search is finished. Not implemented: raises an error. */
  virtual double_hash_type computeDensities(length_type searchMax, bool addEmpy, ostream * os) const {
    ERROR("Internal error: Method computeDensities not implemented in HashCorrelationFinder class.");
    double_hash_type result;
    return result;
  }

  /** Returns natural logarithm of  probability of a particular stem to start at a particular position. Multiply with number of possible positions
   * to obtain E-value (either (N*(N-1))/2 for one MAF alignment (N == totalLength), or N*M for two MAF alignments */ 
  virtual double computeLogPValue(const Stem& stem) const;

  /** Returns natural logarithm of  probability of a particular stem to start at a particular position. Multiply with number of possible positions
   * to obtain E-value (either (N*(N-1))/2 for one MAF alignment (N == totalLength), or N*M for two MAF alignments */ 
  virtual double computeForwardLogPValue(const Stem& stem) const;

  /** Writes contents of results datastructure. Further filtering will be applied, but this helps to estimate the density of hits.
   * Not implemented: raises an error.
   */
  virtual void writeRawResults(ostream& os, length_type searchMax) const {
    ERROR("Internal error: HasCorrelationFinder::writeRawResults is not implemented.");
  }

 private:

  bool isCorrelationFound(length_type i, length_type j) const;

  bool isCorrelationIsolated3(length_type i, length_type j) const;

  /** Resets all start positions to beginning of hash tables */
  void resetPositionHashStarts() const;
  
  void createSearchHashTable(const set<string>& assemblies);


};

/** Runs algorithm in defined index vector set. Here: indices must be consecutive. They are counted as internal column number ,
 * this is different than the external genome position. */ 
inline
void
HashCorrelationFinder::run(const range_type& range) const {
  // resetPositionHashStarts(); // THREADISSUE
  set<string> assemblies = maf->getAssemblies();
  length_type startcol = range.begin();
  length_type endcol = range.end();
  if (verbose > 0) {
    cout << "Starting search for complementary alignment columns in region " << (startcol + 1) << " - " << (endcol + 1) << " ..." << endl;
  }
  ASSERT((startcol >= 0) && (endcol <= maf->getTotalLength()));
  ASSERT(endcol >= startcol);
  Vec<set<length_type > > currCorrelations(3);
  Vec<size_type> perm(3);
  perm[0] = 0;
  perm[1] = 1;
  perm[2] = 2;
  size_type resultSetCount = 0;
  Vec<string> colAssemblies;
  length_type aliOldId = maf->size();
  for (length_type colid = startcol; colid < endcol; ++colid) {
    if (((colid - startcol) % outIntervall) == 0) {
      if (verbose > 0) {
	cout << "Progress: Column " << (colid+1) << " ( " << (startcol+1) << " - " << endcol << " , "
	     << setprecision(3) << (100.0 * (colid-startcol) / static_cast<double>(endcol-startcol)) 
	     << "%). Found correlations for this task: " << resultSetCount << endl;
      }
    }
    // copy all content from MIDDLE id to main container:
    // rotate(currCorrelations.begin(), currCorrelations.begin()+1, currCorrelations.end()); // rotate left by one element
    rotate(perm.begin(), perm.begin()+1, perm.end());
    currCorrelations[perm[2]].clear(); // make room for new elements to be found and filled
    ASSERT(colid >= 0);
    ASSERT(colid < maf->getTotalLength());
    length_type aliId = maf->getAlignmentId(colid);
    ASSERT(aliId < static_cast<length_type>(maf->size()));
    // length_type colId = maf->getAlignmentColumnId(colid); // column id with respect to individual alignment
    string slice = maf->getSlice(colid); // getAlignmentSlice((*maf)[[aliId]], colId)
    string column = NucleotideTools::dnaComplement(slice); // use T instead of U
    size_type nonGapCount = NucleotideTools::countNonGapsInChars(column);
    if (nonGapCount < minNonGap) {
      // ++ignoredCount; // THREADISSUE ?
      continue;
    }
    if (NucleotideTools::nongapCharacterCount(slice) < nonGapCharacterMin) {
      // ++ignoredCount; // THREADISSUE ?
      continue; // ignore too highly conserved columns
    }
    ASSERT(column.size() == slice.size());
    // get all assembly words. Concerned about speed
    if (aliId != aliOldId) { // alignment has changed in current column compared to previous column
      colAssemblies = maf->getAlignmentAssemblies(aliId); // (*maf)[aliId].propertyValues("assembly"); 
      aliOldId = aliId;
    }
    ASSERT(colAssemblies.size() > 0);
    ASSERT(colAssemblies.size() == (*maf)[aliId].size());
    ASSERT(column.size() == colAssemblies.size());
    ASSERT(column.size() == colAssemblies.size());
    currCorrelations[perm[2]] = searchMafColumn(column, colAssemblies, colid); // central command; finds complementary columns!
    if ((colid - startcol) >= 2) {
      // filter MIDDLE of currCorrelations:
      if (removeIsolated) {
	currCorrelations[perm[1]] = filterNeighborCorrelations(currCorrelations[perm[0]], currCorrelations[perm[1]], 
							       currCorrelations[perm[2]]);
      }
      // now add to main container: // should be no more THREADISSUE 
      ASSERT(colid > 0);
      for (set<length_type>::const_iterator it = currCorrelations[perm[1]].begin(); it != currCorrelations[perm[1]].end(); it++) {
	ASSERT(*it != (colid - 1));
	++resultSetCount;
	if ((*it) > (colid - 1) ) {
	  // Correlation corr(colid - 1, *it);
	  // if (resultSet.find(corr) == resultSet.end()) { // check if it exists already
	  // results.push_back(corr);
	  if ((*resultBins)[colid-1] == 0) {
	    (*resultBins)[colid-1] = (new result_vector_type());
	  }
	  (*resultBins)[colid-1]->push_back(*it); // it is possible that one duplicate is being inserted
	// }
	} else { // found correlation with position smaller than query:
	  if ((*resultBins)[*it] == 0) {
	    (*resultBins)[*it] = (new result_vector_type());
	  }
	  (*resultBins)[*it]->push_back(colid-1); // it is possible that one duplicate is being inserted
	}
      }
    } else if ((colid == startcol) || ((colid + 1) == endcol)) {
      // special case: add results of first and last column // THREADISSUE
      for (set<length_type>::const_iterator it = currCorrelations[perm[2]].begin(); it != currCorrelations[perm[2]].end(); it++) {
	ASSERT(*it != colid);
	++resultSetCount;
	if ((*it) > colid) {
	  // Correlation corr(colid, *it);
	  if ((*resultBins)[colid] == 0) {
	    (*resultBins)[colid] = (new result_vector_type());
	  }
	  (*resultBins)[colid]->push_back(*it);
	} else {
	  // Correlation corr(colid, *it);
	  if ((*resultBins)[*it] == 0) {
	    (*resultBins)[*it] = (new result_vector_type());
	  }
	  (*resultBins)[*it]->push_back(colid);
	}
      }      
    }
    if (((colid - startcol) >= 5) && ((colid + 1) <= endcol) && ((*resultBins)[colid-3] != 0)) {
      length_type n1 = (*resultBins)[colid-3]->size();
      filterIsolatedCorrelation3(colid - 3); // used to save memory. Final filtering is performed in getResults
      length_type n2 = 0;
      if ((*resultBins)[colid-3] != 0) {
	n2 = (*resultBins)[colid-3]->size();
      }
      length_type nDiff = n1 - n2;
      resultSetCount -= nDiff; // subtract again from total number of hits
    }
  }
}

#endif
