// --*- C++ -*------x---------------------------------------------------------
// $Id: CompensationScorer.h,v 1.1.1.1 2006/07/03 14:43:19 bindewae Exp $
//
// Class:           CompensationScorer
// 
// Base class:      -
//
// Derived classes: - 
//
// Author:          Eckart Bindewald
//
// Description:     given two columns in a multiple sequence alignment
//                  the class computes a score for compensatory base changes.
// 
// Reviewed by:     -
// -----------------x-------------------x-------------------x-----------------

#ifndef __COMPENSATION_SCORER_H__
#define __COMPENSATION_SCORER_H__

// Includes

#include <iostream>
#include <Vec.h>
#include <debug.h>
#include <generalNumerics.h>
#include <Random.h>
#include <utility>

using namespace std;

/** Nucleotide letters of an RNA sequence. */
const string RNA_ALPHABET("ACGU");

/** Nucleotide letters of a DNA sequence. */
const string DNA_ALPHABET("ACGT");

/** Computes scores for compensatory base changes of two columns in a
    multiple sequence alignment. For the score functions documented that
    way: 1 means very confident base pair, 0 means worst case.

    A column is passed as a string holding one character per aligned
    sequence. Where a Vec<double> of weights is accepted, the unweighted
    convenience overloads defined in this header pass one weight of 1.0 per
    column entry.

    Only declarations and a few trivial inline members live in this header;
    the exact formulas are in the implementation file.

    @author Eckart Bindewald
    @review - */
class CompensationScorer {
public:

  /** Character used for a gap in an alignment column. */
  enum { GAP_CHAR = '-' };

  CompensationScorer();

  CompensationScorer(const CompensationScorer& orig);

  virtual ~CompensationScorer();

  /* OPERATORS */

  /** Assignment operator. */
  CompensationScorer& operator = (const CompensationScorer& orig);

  /** Stream output of a scorer (format defined in the implementation). */
  friend ostream& operator << (ostream& os, const CompensationScorer& rval);

  /** Stream input of a scorer (format defined in the implementation). */
  friend istream& operator >> (istream& is, CompensationScorer& rval);

  /* PREDICATES */

  /** Is current state valid? */
  virtual bool isValid() const;

  /** How big is object? Returns the number of letters in the alphabet. */
  virtual unsigned int size() const {
    // explicit narrowing: string::size_type may be wider than unsigned int
    return static_cast<unsigned int>(alphabet.size());
  }

  /** returns true if pair should be skipped */
  virtual bool skipPair(char c1, char c2) const;

  /** returns entropy of a single alignment column.
      NOTE(review): exact definition and value range are in the
      implementation file. */
  virtual double entropy(const string& col) const;

  /** presumably the largest value entropy() can return for one column;
      TODO confirm against the implementation. */
  virtual double entropyColumnMax() const;

  /** returns the sum of the single-column entropies of col1 and col2,
      i.e. entropy(col1) + entropy(col2). */
  virtual double independentEntropy(const string& col1,
                                    const string& col2) const {
    const double first = entropy(col1);
    const double second = entropy(col2);
    return first + second;
  }

  /** returns entropy of combined pairs of column 1 and 2 */
  virtual double pairwiseEntropy(const string& col1,
                                 const string& col2) const;

  /** returns entropy of combined pairs of column 1 and 2.
      @param w weights; presumably one per column entry - TODO confirm */
  virtual double pairwiseEntropy(const string& col1,
                                 const string& col2,
                                 const Vec<double>& w) const;

  /** returns entropy of combined columns 1 and 2.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy3(const string& col1, const string& col2) const;

  /** returns entropy of combined columns 1 and 2, using weights wvec.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy3(const string& col1, const string& col2,
                          const Vec<double>& wvec) const;

  /** returns entropy of combined columns 1 and 2, using weights wvec.
      Only use P log P  like Tom Schneider.
      NOTE(review): by its name a faster variant of pairwiseEntropy3;
      confirm that results agree. */
  double pairwiseEntropy3fast(const string& col1, const string& col2,
                              const Vec<double>& wvec) const;

  /** returns entropy of combined columns 1 and 2.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy4(const string& col1, const string& col2) const;

  /** returns entropy of combined columns 1 and 2, using weights wvec.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy4(const string& col1, const string& col2,
                          const Vec<double>& wvec) const;

  /** returns entropy of combined columns 1 and 2, using weights wvec.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy5(const string& col1, const string& col2,
                          const Vec<double>& wvec) const;

  /** returns entropy of combined columns 1 and 2.
      Only use P log P  like Tom Schneider */
  double pairwiseEntropy5(const string& col1, const string& col2) const;

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides */
  double pairwiseEntropy7(const string& col1, const string& col2,
                          const Vec<double>& wVec) const;

  /** Unweighted convenience overload: forwards to the weighted
      pairwiseEntropy7 with a weight of 1.0 for each entry of col1. */
  double pairwiseEntropy7(const string& col1, const string& col2) const {
    const Vec<double> unitWeights(col1.size(), 1.0);
    return pairwiseEntropy7(col1, col2, unitWeights);
  }

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides.
      NOTE(review): how the "Raw" variant differs from
      pairwiseEntropy8Fast is defined in the implementation. */
  double pairwiseEntropy8RawFast(const string& col1, const string& col2,
                                 const Vec<double>& wVec) const;

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides */
  double pairwiseEntropy8Fast(const string& col1, const string& col2,
                              const Vec<double>& wVec) const;

  /** covariance score according to Hofacker 2002 */
  double covariance1(const string& col1, const string& col2) const;

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider */
  double singleEntropy3(const string& col1,
                        const Vec<double>& wVec) const;

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider. Use pseudocounts and
      small sample correction. */
  double singleEntropy4(const string& col1,
                        const Vec<double>& wVec) const;

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider, no correction, but use
      pseudo counts */
  double singleEntropy5(const string& col1,
                        const Vec<double>& wVec) const;

  /** returns entropy of a column, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction */
  double singleEntropy6(const string& col1,
                        const Vec<double>& wVec) const;

  /** Unweighted convenience overload: forwards to the weighted
      singleEntropy6 with a weight of 1.0 for each entry of col1. */
  double singleEntropy6(const string& col1) const {
    const Vec<double> unitWeights(col1.size(), 1.0);
    return singleEntropy6(col1, unitWeights);
  }

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides
  */
  double singleEntropy7(const string& col1,
                        const Vec<double>& wVec) const;

  /** Unweighted convenience overload: forwards to the weighted
      singleEntropy7 with a weight of 1.0 for each entry of col1. */
  double singleEntropy7(const string& col1) const {
    const Vec<double> unitWeights(col1.size(), 1.0);
    return singleEntropy7(col1, unitWeights);
  }

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides
  */
  double singleEntropy8(const string& col1,
                        const Vec<double>& wVec) const;

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides.
      NOTE(review): how the "Raw" variant differs from singleEntropy8 is
      defined in the implementation. */
  double singleEntropy8Raw(const string& col1,
                           const Vec<double>& wVec) const;

  /** returns entropy of a single column, using weights wVec.
      Only use P log P  like Tom Schneider,
      EXACT correction due to random sampling,
      do NOT use Bayesian pseudocounts,
      NOT only matching nucleotides */
  double singleEntropy9(const string& col1,
                        const Vec<double>& wVec) const;

  /** returns entropy of a single column (no weight vector supplied).
      Only use P log P  like Tom Schneider,
      EXACT correction due to random sampling,
      do NOT use Bayesian pseudocounts,
      NOT only matching nucleotides */
  double singleEntropy9(const string& col1) const;

  /** presumably the maximum possible pairwise entropy for n entries;
      TODO confirm the meaning of n against the implementation. */
  virtual double pairwiseEntropyMax(unsigned int n) const;

  /** presumably the minimum possible pairwise entropy for n entries;
      TODO confirm the meaning of n against the implementation. */
  virtual double pairwiseEntropyMin(unsigned int n) const;

  /** returns which letter in the alphabet the character c is, i.e. the
      index of its first occurrence in the alphabet string.
      If c is not part of the alphabet, the result is string::npos
      converted to unsigned int, which is larger than any valid index;
      callers should test the result against size(). */
  virtual unsigned int letterId(char c) const {
    // explicit narrowing keeps the historic return type and values
    return static_cast<unsigned int>(alphabet.find(c));
  }

  /** returns 1 for highly reliable prediction,
      0 for totally incompatible prediction */
  virtual double compatibilityScore(const string& col1,
                                    const string& col2) const;

  /** returns 1 for highly reliable prediction,
      0 for totally incompatible prediction */
  virtual double compatibilityScore2(const string& col1,
                                     const string& col2) const;

  /** returns 1 for highly reliable prediction,
      0 for totally incompatible prediction.
      Returns fraction of pairs that are compatible,
      weighted by binomial distribution
  */
  virtual double compatibilityScore3(const string& col1,
                                     const string& col2) const;

  /** returns an error score for a pair of columns, using per-sequence
      weights aliSequenceWeights.
      NOTE(review): the original comment gave the "1 reliable, 0
      incompatible" convention here; confirm that it really applies to
      this function. */
  double errorScore(const string& col1,
                    const string& col2,
                    const Vec<double>& aliSequenceWeights) const;

  /** returns 1 for highly reliable prediction,
      0 for totally incompatible prediction.
      @param wVec weights; presumably one per column entry - TODO confirm */
  virtual double compensationScore(const string& col1,
                                   const string& col2,
                                   const Vec<double>& wVec) const;

  /** returns 1 for highly reliable prediction,
      0 for totally incompatible prediction.
      @param singleScore1 presumably a precomputed single-column score of
                          col1 (see singleScore) - TODO confirm
      @param singleScore2 same for col2
      @param wVec weights; presumably one per column entry - TODO confirm */
  virtual double compensationScore(const string& col1,
                                   const string& col2,
                                   double singleScore1,
                                   double singleScore2,
                                   const Vec<double>& wVec) const;

  /** returns uncorrected score */
  double uncorrectedScore(const string& col1,
                          const string& col2,
                          const Vec<double>& aliSequenceWeights) const;

  /** returns the score of a single column, using weights wVec.
      NOTE(review): the original comment gave the "1 reliable, 0
      incompatible" convention here; confirm for single columns. */
  virtual double singleScore(const string& col1,
                             const Vec<double>& wVec) const;

  /** returns true, if c1 and c2 are found in allowedPairs */
  virtual bool isAllowedPair(char c1, char c2) const;

  /** returns id of allowed pair, otherwise size of all allowed pairs */
  virtual unsigned int numAllowedPair(char c1, char c2) const;

  /** counts number of matching nucleotides */
  virtual unsigned int countAllowedPairs(const string& s1, const string& s2) const;

  /** counts gaps (occurrences of gapChar) in the two columns.
      NOTE(review): whether a row with gaps in both columns is counted
      once or twice is decided in the implementation. */
  unsigned int countGaps(const string& s1, const string& s2, char gapChar) const;

  /** returns two alignment columns without gaps.
      @param gapChar character treated as a gap */
  static pair<string, string> filterGaps(const string& col1Orig, const string& col2Orig, char gapChar);

  /** returns frequency of a certain character in column */
  static double frequency(const string& col, char c);

  /** returns frequency of a certain pair of letters */
  static double frequency(const string& col1, const string& col2,
                          char c1, char c2);

  /** returns frequency of a certain character in column, using weights wVec */
  static double frequency(const string& col, char c, const Vec<double>& wVec);

  /** returns frequency of a certain pair of letters, using weights wVec */
  static double frequency(const string& col1, const string& col2, char c1, char c2,
                          const Vec<double>& wVec);

  /** returns frequency of a certain character in column. Do not count gap rows. */
  static double frequency2(const string& col, char c) ;

  /** returns frequency of a certain character in column, using weights
      wVec. Do not count gap rows. */
  static double frequency2(const string& col, char c, const Vec<double>& wVec) ;

  /** returns frequency of a certain pair of letters. Do not count gap rows. */
  static double frequency2(const string& col1, const string& col2, char c1, char c2);

  /** returns frequency of a certain pair of letters, using weights wVec.
      Do not count gap rows. */
  static double frequency2(const string& col1, const string& col2, char c1, char c2,
                           const Vec<double>& wVec) ;

  /** returns frequency of a certain character in column. Do not count gap rows.
      Use pseudo counts.
      @param pseudoTop    presumably the pseudo count added to the
                          numerator - TODO confirm
      @param pseudoBottom presumably the pseudo count added to the
                          denominator - TODO confirm */
  static double frequency3(const string& col, char c,
                           unsigned int pseudoTop,
                           unsigned int pseudoBottom);

  /** returns frequency of a certain character in column, using weights
      wVec. Do not count gap rows. Use pseudo counts. */
  static double frequency3(const string& col, char c,
                           unsigned int pseudoTop,
                           unsigned int pseudoBottom,
                           const Vec<double>& wVec);

  /** returns frequency of a certain pair of letters. Do not count gap rows.
      Use pseudo counts. */
  static double frequency3(const string& col1, const string& col2, char c1, char c2,
                           unsigned int pseudoTop,
                           unsigned int pseudoBottom);

  /** returns frequency of a certain pair of letters, using weights wVec.
      Do not count gap rows. Use pseudo counts. */
  static double frequency3(const string& col1, const string& col2, char c1, char c2,
                           unsigned int pseudoTop,
                           unsigned int pseudoBottom,
                           const Vec<double>& wVec);

  /** returns expected uncertainty of 2 columns */
  double expectedUncertainty2Columns(const string& alph, unsigned int effSize, double error,
                                     unsigned int randomSampleSquareNum, unsigned int minN) const;

  /** returns standard deviation of single value of expected uncertainty of 2 columns */
  double expectedUncertainty2ColumnsStddev(const string& alph, unsigned int effSize, double error,
                                           unsigned int randomSampleSquareNum,
                                           unsigned int minN) const;

  /** writes a random sequence over the letters of alph into result,
      drawing from rnd.
      NOTE(review): the generated length is not visible here; presumably
      the incoming size of result is kept - confirm. */
  static void generateRandomSequence(string& result, const string& alph, Random& rnd);

  /** gets used algorithm id for score */
  virtual int getAlgorithm() const { return algorithm; }

  /** returns the base frequencies (same length as the alphabet) */
  virtual const Vec<double>& getBaseFrequencies() const { return baseFrequencies; }

  /** returns allowed pairs */
  virtual const Vec<string>& getAllowedPairs() const { return allowedPairs; }

  /** returns true if same pair independent of order */
  static bool isSamePair(const string& s1, const string& s2);

  /** returns allowed pairs with duplicates removed.
      NOTE(review): presumably pairs that differ only in order (see
      isSamePair) are treated as duplicates - confirm. */
  virtual Vec<string> getUniqueAllowedPairs() const;

  /** returns alphabet */
  virtual const string& getAlphabet() const { return alphabet; }

  /** return substitution matrix */
  virtual const Vec<Vec<int> >& getSubMatrix() const {
    return subMatrix;
  }

  /* begin module calehnb */
  /** NOTE(review): presumably a port of Tom Schneider's "calehnb"
      routine; n is a sequence count, gna/gnc/gng/gnt are per-letter
      counts, and results are written through hg, ehnb and varhnb.
      Confirm against the implementation before relying on this. */
  void calehnb(long n, long gna, long gnc, long gng, long gnt,
               double* hg, double* ehnb, double* varhnb) const;

  /** writes mean and deviation computed for column col into the
      reference parameters mean and dev.
      NOTE(review): the meaning of the returned count is defined in the
      implementation. */
  unsigned int computeMeanAndDeviation(const string& col,
                                       double& mean, double& dev) const;

  /** exact way to compute expected uncertainty.
   * Adapted almost verbatim from Pascal implementation by Tom Schneider
   * at web site:
   * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
   */
  double exactSingleExpectedUncertainty(int n,
                                        const Vec<double>& letterFrequencies) const;

  /** use formula of Goebel and Valencia */
  double pairwiseCorrelation(const string& col1,
                             const string& col2) const;

  /** returns relative entropy of columns 1 and 2.
      NOTE(review): exact definition is in the implementation. */
  double relativeEntropy(const string& col1, const string& col2) const;

  /** returns entropy of combined columns 1 and 2,
      but only take the part that is matching */
  double pairwiseEntropy2(const string& col1, const string& col2) const;

  /** returns entropy of combined columns 1 and 2, using weights w,
      but only take the part that is matching */
  double pairwiseEntropy2(const string& col1, const string& col2,
                          const Vec<double>& w) const;

  /** returns entropy of combined columns 1 and 2, using weights w,
      no sample correction, no pseudo counts */
  double pairwiseEntropy6(const string& col1, const string& col2,
                          const Vec<double>& w) const;

  /** returns entropy of combined columns 1 and 2,
      no sample correction, no pseudo counts */
  double pairwiseEntropy6(const string& col1, const string& col2) const;

  /** counts number of matching nucleotides,
      given a randomly renamed definition of allowed pairs!
      The columns are taken by value. */
  unsigned int countRandomRenamedAllowedPairs(string s1, string s2) const;

  /** counts number of matching nucleotides,
      given average over a randomly renamed definition of allowed pairs! */
  double averageRandomRenamedAllowedPairs(const string& s1,
                                          const string& s2) const;

  /* MODIFIERS */

  /** sets used algorithm for score */
  virtual void setAlgorithm(int a) { algorithm = a; }

  /** sets the alphabet. If the stored base frequency vector does not have
      one entry per letter of the new alphabet, it is replaced by a uniform
      distribution (1 / number of letters for every letter); otherwise the
      existing base frequencies are kept. */
  virtual void setAlphabet(const string& s) {
    alphabet = s;
    const string::size_type letterCount = s.size();
    if (letterCount != baseFrequencies.size()) {
      // the division is only evaluated as the fill value; for an empty
      // alphabet the resulting vector has no elements
      baseFrequencies = Vec<double>(letterCount,
                                    1.0 / static_cast<double>(letterCount));
    }
  }

  /** sets the base frequencies; no check against the alphabet length is
      made here. */
  virtual void setBaseFrequencies(const Vec<double>& _baseFrequencies) {
    baseFrequencies = _baseFrequencies;
  }

  /** sets the allowed pairs */
  virtual void setAllowedPairs(const Vec<string>& pairs) {
    allowedPairs = pairs;
  }

  /** sets the compatibility exponent */
  virtual void setCompatibilityExponent(double val) {
    compatibilityExponent = val;
  }

  /** sets the substitution matrix (must match the alphabet; not checked
      here) */
  virtual void setSubMatrix(const Vec<Vec<int> >& sub) {
    subMatrix = sub;
  }

  /* ATTRIBUTES */

  /** id of the scoring algorithm; see getAlgorithm / setAlgorithm.
      Declared mutable, so const members may change it. */
  mutable int algorithm;

  int verboseLevel;

  double compMin;

  double energyWeight;

  double entropyStdMax;

  double entropyWeight;

  /** if less than this many gaps at a column, set correlation to zero */
  double gapFracMin;

  /** usage defined by user */
  double threshold;

  double userMean;

  double userDev; // used for z-score computation in algorithm 19&20

protected:

  /* OPERATORS  */

  /* PREDICATES */

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides.
      @param singleVal1 presumably the precomputed single-column entropy
                        of col1 - TODO confirm
      @param singleVal2 same for col2 */
  double pairwiseEntropy7(const string& col1, const string& col2,
                          double singleVal1, double singleVal2,
                          const Vec<double>& wVec) const;

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides.
      @param singleVal1 presumably the precomputed single-column entropy
                        of col1 - TODO confirm
      @param singleVal2 same for col2 */
  double pairwiseEntropy8Fast(const string& col1, const string& col2,
                              double singleVal1, double singleVal2,
                              const Vec<double>& wVec) const;

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider, no pseudocounts, no correction,
      not only matching nucleotides.
      @param singleVal1 presumably the precomputed single-column entropy
                        of col1 - TODO confirm
      @param singleVal2 same for col2 */
  double pairwiseEntropy9Fast(const string& col1, const string& col2,
                              double singleVal1, double singleVal2,
                              const Vec<double>& wVec) const;

  /** returns entropy of combined columns 1 and 2, using weights wVec.
      Only use P log P  like Tom Schneider,
      not only matching nucleotides */
  double pairwiseEntropy9Fast(const string& col1, const string& col2,
                              const Vec<double>& wVec) const;

  /* MODIFIERS  */

  /** copies the state of other into this object.
      NOTE(review): presumably shared by the copy constructor and the
      assignment operator - confirm. */
  void copy(const CompensationScorer& other);

private:

  /* OPERATORS  */

  /* PREDICATES */

  /* MODIFIERS  */

  /** resets the frequency look-up data for alphabet alph. Declared const;
      the look-up members are mutable. */
  void clearLookUp(const string& alph) const;

  /** adds the (weighted) counts of the column pair to the frequency
      look-up data. Declared const; the look-up members are mutable. */
  void addCountsToLookUp(const string& col1, const string& col2,
                         const Vec<double>& wVec) const;

  /** compute factorial.
   * TODO: faster implementation with lookup tables
   */
  double computeFactorial(int n) const;

  /** compute log of factorial.
   * TODO: faster implementation with lookup tables
   */
  double computeFactorialLog(int n) const;

  /** Probability of certain composition of sequence of length n
   * corresponds to equation 11 at web site:
   * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
   */
  double compositionProbability(int n,
                                const Vec<int>& letterCounts,
                                const Vec<double>& letterFrequencies) const;

  /** Computes Shannon's uncertainty for sequence of length n
   * corresponds to equation 12 at web site:
   * @see http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986/latex/node28.html
   */
  double compositionUncertainty(int n, const Vec<int>& letterCounts) const;

  /* PRIVATE ATTRIBUTES */

  string alphabet;

  Vec<double> baseFrequencies; // same length as alphabet

  Vec<string> allowedPairs;

  Vec<Vec<int> > subMatrix; // substitution matrix must match alphabet

  double compatibilityExponent;

  // frequency look-up cache; mutable so that const members
  // (clearLookUp, addCountsToLookUp) can update it
  mutable int freqLookUpGaps;

  mutable double freqLookUpNorm;

  mutable Vec<Vec<double> > freqLookUp;

  unsigned int randomSampleNum;

  unsigned int randomSampleSquareNum;

};

#endif /* __COMPENSATION_SCORER_H__ */

