// --*- C++ -*------x---------------------------------------------------------
// $Id: KnnNode2.cc,v 1.2 2010/06/21 14:46:07 bindewae Exp $
//
// Class:           KnnNode2
// 
// Base class:      -
//
// Derived classes: - 
//
// Author:          Eckart Bindewald
//
// Project name:    -
//
// Date:            $Date: 2010/06/21 14:46:07 $
//
// Description:     K-nearest neighbor search using ANN library implementation
// -----------------x-------------------x-------------------x-----------------

#include <KnnNode2.h>
#include <clusterAlgorithms.h>
#include <StringTools.h>
#include <Random.h>
#include <vectornumerics.h>
#include <math.h> // for exp function

// #define DEBUG_VERBOSE

// ---------------------------------------------------------------------------
//                                   KnnNode2
// -----------------x-------------------x-------------------x-----------------

/* CONSTRUCTORS */

/**
 * Default constructor.
 * Sets the default parameters (k = 10, two classes, gaussian weighting off)
 * and allocates the two fixed-size ANN result buffers with KNN_KMAX entries
 * each. No ANN query point, point set or kd-tree exists until data has been
 * set or read.
 */
KnnNode2::KnnNode2() : clusterCutoff(0.001), gaussDev(0.0), kk(10), numClasses(2),
		       verboseLevel(1), noSelfMode(false), statisticsRunning(false),
		       annBucketSize(1), annDim(0), annNumPoints(0), annEps(0.0),
		       annQuery(NULL), annPoints(NULL), annDists(NULL), annIndices(NULL), annTree(NULL)
{
  ASSERT(annQuery == NULL);
  annDists =  new ANNdist[KNN_KMAX]; 
  annIndices = new ANNidx[KNN_KMAX];
  // Zero both buffers, as the copy constructor does: copy() reads all
  // KNN_KMAX entries of its source object (for instance when clear() assigns
  // from a freshly constructed node), so they must not stay indeterminate.
  for (unsigned int i = 0; i < KNN_KMAX; ++i) {
    annDists[i] = 0.0;
    annIndices[i] = 0;
  }
}

/**
 * Copy constructor.
 * All members first get the same defaults as in the default constructor, the
 * two ANN result buffers are allocated and zeroed, and copy() then takes over
 * the complete state of other.
 * @todo memory handling was flagged by the original author as not yet
 *       verified.
 */
KnnNode2::KnnNode2(const KnnNode2& other) : clusterCutoff(0.001), gaussDev(0.0), kk(10), numClasses(2),
		       verboseLevel(1), noSelfMode(false), statisticsRunning(false),
        	       annBucketSize(1), annDim(0), annNumPoints(0), annEps(0.0),
		       annQuery(NULL), annPoints(NULL), annDists(NULL), annIndices(NULL), annTree(NULL)
{
  // result buffers always have the fixed size KNN_KMAX
  annDists = new ANNdist[KNN_KMAX];
  annIndices = new ANNidx[KNN_KMAX];
  unsigned int slot = 0;
  while (slot < KNN_KMAX) {
    annDists[slot] = 0.0;
    annIndices[slot] = 0;
    ++slot;
  }
  // copy() overwrites the defaults and the zeroed buffers with other's state
  copy(other);
}

/**
 * Destructor.
 * Releases the kd-tree, the ANN query point, the ANN point set and the two
 * result buffers owned by this node.
 */
KnnNode2::~KnnNode2() 
{ 
  // release the tree before the point array it was built from
  if ((annNumPoints > 0) && (annDim > 0) && (annTree != NULL)) {
    delete annTree;
  }
  if ((annDim > 0) && (annQuery != NULL)) {
    annDeallocPt(annQuery);
  }
  if ((annNumPoints > 0) && (annPoints != NULL)) {
    annDeallocPts(annPoints);
  }
  // both buffers are allocated with new[] in the constructors, so they have
  // to be released with delete[] (scalar delete is undefined behavior here)
  delete[] annDists;
  delete[] annIndices;
}

/* OPERATORS */

/** Assignment operator: takes over the state of orig via copy(); assigning
    an object to itself does nothing. */
KnnNode2& 
KnnNode2::operator = (const KnnNode2& orig)
{
  if (this == &orig) {
    return *this; // self-assignment
  }
  copy(orig);
  return *this;
}

/** Stream output of a KnnNode2. Not implemented: the call only reports an
    error through ERROR and hands the stream back unchanged. */
ostream& operator << (ostream& os, const KnnNode2& rval)
{
  ERROR("Ouput operator not yet implemented!");
  return os;
}

/** Stream input of a KnnNode2. Not implemented: the call only reports an
    error through ERROR and hands the stream back unchanged. */
istream& operator >> (istream& is, KnnNode2& rval)
{
  ERROR("Input operator not yet implemented!");
  return is;
}

/* PREDICATES */

/** Returns true if the node can be used: k is positive, at least one data
    vector is present and there is exactly one class label per data vector. */
bool
KnnNode2::isValid() const { 
  if (!(kk > 0)) {
    return false; // no neighbors requested
  }
  if (!(data.size() > 0)) {
    return false; // no training data
  }
  return dataClasses.size() == data.size();
}

/**
 * Predicts class probabilities for the feature vector v by a k-nearest
 * neighbor vote over the clustered training data.
 *
 * The ANN kd-tree delivers the nearest cluster representatives; the members
 * of those clusters are then visited in order until kkUse members have been
 * counted. Every counted member adds 1/(kkUse-nStart) to its class; if
 * gaussDev > 0 this contribution is multiplied by exp(-d/gaussDev^2), with d
 * the scaled squared euclidian distance between v and the member.
 *
 * If a lookup table (dataLookupProb) is loaded, no vote takes place: the
 * value stored for the first visited member is returned as probability of
 * class 1 (class 0 gets 1 minus that value).
 *
 * In noSelfMode one additional neighbor is requested and the closest one is
 * skipped, because it might be the query vector itself.
 *
 * @param v feature vector; must have the dimension of the training data
 * @return vector with numClasses entries; in voting mode normalized with
 *         probabilityNormalize. The result is also stored in lastPrediction.
 */
Vec<double> 
KnnNode2::predictClassProb(const Vec<double>& v) const 
{
  ERROR_IF(!isValid(), "KnnNode2 not sufficiently defined for prediction!");
  ERROR_IF(data.size() == 0, "No data defined for node!");
  if (v.size() != data[0].size()) {    
    cout << "Incompatible size of knn input vector: found "
         << v.size() << " expected " << data[0].size() << endl;
    ERROR("Incompatible size of knn input vector: found ");
  }
  ERROR_IF(static_cast<int>(v.size()) != annDim, "Internal error in line 87!");
  ERROR_IF(numClasses < 2, "At least two classes have to be defined!");
  int kkUse = kk;
  int nStart = 0; // start with nearest neighbor
  double gaussDev2 = gaussDev * gaussDev; // use square
  if (noSelfMode) {
    ++kkUse; // use one more nearest neighbor, but ignore closest one, it might be hit against itself
    ++nStart; // when noSelfMode is true, ignore closest neighbor
  }
  if (kkUse > annNumPoints) {
    kkUse = annNumPoints;
  }
  // annDists and annIndices hold exactly KNN_KMAX entries; asking the tree
  // for more neighbors would write past the end of both buffers.
  ERROR_IF(kkUse > static_cast<int>(KNN_KMAX),
           "Number of requested neighbors exceeds KNN_KMAX!");
  ASSERT(annQuery != NULL);
  ERROR_IF(annTree == NULL, "ANN search tree not initialized!");
  for (unsigned int i = 0; i < v.size(); ++i) {
    annQuery[i] = v[i];
  }
  // fills annIndices/annDists with the kkUse nearest cluster representatives
  annTree->annkSearch(annQuery, kkUse, annIndices, annDists, annEps);
  Vec<double> result(numClasses, 0.0);
  if(kkUse <= nStart) {
    cout << "kkUse: " << kkUse << " nstart: " << nStart 
         << " datasize: " << data.size() << endl;
    ERROR("Internal errror in line 173");
  }
  // contribution of a single counted neighbor
  double addTerm = 1.0 / static_cast<double>(kkUse-nStart);
  ERROR_IF(!isReasonable(addTerm), "Internal error in line 176!");
  double dist = 0.0;

#ifdef DEBUG_VERBOSE
  if ((dataClasses[annIndices[0]] == 1) && (dataClasses[annIndices[1]] == 1)) {
    cout << "Debug results for data point: " << nStart << " " << kkUse << " " << data.size() << endl;
    for (unsigned int i = 0; i < v.size(); ++i) {
      cout << annQuery[i] << " ";
    }
    cout << endl;
    for (int ii = 0; ii < kkUse; ++ii) {
      cout << ii << " " 	     << annIndices[ii] << " " << annDists[ii] << " " 
           << dataClasses[annIndices[ii]] << " " << data[annIndices[ii]];
    }
    cout << "end of debug output" << endl;
  }
#endif
  int kkEff = 0; // number of cluster members counted so far
  unsigned int cl;
  unsigned int tmpId = 0;
  bool dataLookupMode = false;
  if (dataLookupProb.size() > 0) {
    dataLookupMode = true;
  }
  ERROR_IF ((dataLookupProb.size() > 0)
            && (dataLookupProb.size() != data.size()),
            "Lookup table does not have same size as data!");
  if (gaussDev <= 0.0) {
    // plain voting: every counted neighbor has the same weight
    for (int i = nStart; i < kkUse; ++i) {
      if (annIndices[i] >= static_cast<int>(clusters.size())) {
        cout << "Error: too large knn result index: " << i << " " 
             << annIndices[i] << " " << dataClasses.size() << endl;
        for (int ii = nStart; ii < kkUse; ++ii) {
          cout << ii << " " 	     << annIndices[ii] << " " << dataClasses.size() << endl;
        }
        ERROR( "Internal error in line 184!");
      }
      // annIndices[i] is a cluster id; walk through the members of that cluster
      for (unsigned int j = 0; j < clusters[annIndices[i]].size(); ++j) {
        tmpId = clusters[annIndices[i]][j];
        ERROR_IF(tmpId >= data.size(), "Internal error in line 210!");
        if (dataLookupMode) {
          // lookup table overrides the vote: answer with the stored value
          result[1] = dataLookupProb[tmpId];
          result[0] = 1.0 - result[1];
          lastPrediction = result;
          return result;
        }
        cl = dataClasses[tmpId];
        ERROR_IF(cl >= result.size(), "Class id larger than excepted!");
        result[cl] += addTerm;
        ERROR_IF(!isReasonable(result[cl]), "Unreasonable number in prediction encountered!");
        ++kkEff; // effective kk value
        if (kkEff >= kkUse) {
          break;
        }
      }
      if (kkEff >= kkUse) {
        break;
      }
    }
    for (unsigned int i = 0; i < result.size(); ++i) {
      if (result[i] > 1.0) {
        ASSERT(result[i] < 1.1); // only rounding errors allowed
        result[i] = 1.0;
      }
    }
  }
  else { // if gaussian weighting activated
    ERROR_IF(gaussDev2 <= 0.0, "Internal error in line 230!");
    for (int i = nStart; i < kkUse; ++i) {
      ERROR_IF(i >= KNN_KMAX,
               "Internal error in line 236!");
      if (annIndices[i] >= static_cast<int>(clusters.size())) {
        cout << "Error: too large knn result index: " << i << " " 
             << annIndices[i] << " " << dataClasses.size() << endl;
        for (int ii = nStart; ii < kkUse; ++ii) {
          cout << ii << " " 	     << annIndices[ii] << " " << dataClasses.size() << endl;
        }
        ERROR("Internal error in line 199!");
      }
      for (unsigned int j = 0; j < clusters[annIndices[i]].size(); ++j) {
        tmpId = clusters[annIndices[i]][j];
        ASSERT(tmpId < data.size());
        if (dataLookupMode) {
          // lookup table overrides the vote: answer with the stored value
          result[1] = dataLookupProb[tmpId];
          result[0] = 1.0 - result[1];
          lastPrediction = result;
          return result;
        }
        cl = dataClasses[tmpId];
        ASSERT(v.size() == data[tmpId].size());
        ASSERT(v.size() == scaling.size());
        // distance is recomputed with scaling; annDists is not used here
        dist = euclidianDistanceSquare(v, data[tmpId], scaling);
        ERROR_IF(cl >= result.size(), "Class id larger than excepted!");
        result[cl] += addTerm * exp(-dist/gaussDev2); // gaussian weighting
        ERROR_IF(!isReasonable(result[cl]), "Unreasonable number in prediction encountered!");
        ++kkEff;
        if (kkEff >= kkUse) {
          break;
        }
      }
      if (kkEff >= kkUse) {
        break;
      }
    }
    for (unsigned int i = 0; i < result.size(); ++i) {
      ERROR_IF(!isReasonable(result[i]), "Unreasonable number in prediction encountered!");
    }
  }
  probabilityNormalize(result); // normalize such that sum of values is 1.0
  if (verboseLevel > 4) {
    cout << "KnnNode2: result: " << result << " End of predictClassProb for " << v << endl;
  }
#ifdef DEBUG_VERBOSE
  for (unsigned int i = 0; i < result.size(); ++i) {
    if (!isReasonable(result[i])) {
      cout << "Warning: Unreasonable knn prediction caught: " << result  << endl;
      for (unsigned int j = 0; j < result.size(); ++j) {
        result[j] = 0.0;
      }
      result[0] = 1.0; //set to "no contact"
      break;
    }
  }
#endif
  // cache values:
  lastPrediction = result;
  return result;
}

/**
 * Same prediction as predictClassProb(v). If statistics collection is
 * running, the prediction is additionally passed to updateStatistics
 * together with the known class of v.
 */
Vec<double> 
KnnNode2::predictClassProb(const Vec<double>& v, unsigned int knownClass) const 
{
  Vec<double> prediction = predictClassProb(v);
  if (!statisticsRunning) {
    return prediction;
  }
  updateStatistics(prediction, knownClass);
  return prediction;
}

/**
 * Chooses the training vectors that estimateAccuracy uses as queries:
 * a subset of data indices obtained from generateRandomIndexSubset, with at
 * most 250000 entries (all entries if there is less data).
 *
 * An earlier variant, which paired every class-1 vector with its closest
 * vector of another class and added 500 random vectors, was abandoned
 * because it skews the data distribution.
 */
Vec<unsigned int>
KnnNode2::initEstimateAccuracy() const
{
  const unsigned int sizeLimit = 250000;
  unsigned int subsetSize = sizeLimit;
  if (data.size() < sizeLimit) {
    subsetSize = data.size(); // fewer vectors than the limit: take that many
  }
  return generateRandomIndexSubset(subsetSize, data.size(), 0);
}

/** returns prediction accuracy using leave one out estimation (numTrial times) */
double
KnnNode2::estimateAccuracy(unsigned int numTrials) const
{
  if ((!isValid()) || (kk < 2)) {
    cout << "KnnNode2 not sufficiently defined for accuracy estimation!"
	 << endl;
    return -1000.0;
  }
  // cout << "Starting estimateAccuracy!" << endl;
  if (estimateSet.size() == 0) {
    estimateSet = initEstimateAccuracy();
  }
  // cout << "Starting KnnNode2::estimateAccuracy with set size " << estimateSet.size() << endl;
  
  unsigned int correctPredCounter = 0;
  unsigned int tp = 0;
  unsigned int fp = 0;
  unsigned int tn = 0;
  unsigned int fn = 0;
  for (unsigned int ii = 0; ii < estimateSet.size(); ++ii) {
    unsigned int j = estimateSet[ii]; // use j'th training vector as query
    // cout << "Testing " << ii << " " << j << endl;
    ERROR_IF(j >= dataClasses.size(), "Internal error in line 155!");
    ERROR_IF(j >= data.size(), "Internal error in line 157!");
    const Vec<double>& v = data[j];
    // Vec<unsigned int> nearest = kNearestNeighbors(v, data, kk+1, scaling); // kk+1: later ignore hit with itself
//     Vec<unsigned int> nearest = kNearestNeighbors(v, data, kk+1, scaling,
//  						  clusters, clusterCutoff);
//     Vec<unsigned int> nearest = kNearestNeighbors(v, data, clustData, 
// 						  kk+1, scaling,
// 			  clusters, subClusters,
// 				  clusterCutoff, clusterCutoff2);
    ASSERT(static_cast<int>(v.size()) == annDim);
    for (unsigned int i = 0; i < v.size(); ++i) {
      annQuery[i] = v[i];
    }
    if (verboseLevel > 4) {
      cout << "Ann search!" << endl;
      cout << "kk, Point: " << kk << " " << v << endl;
      cout << annNumPoints << " " << annDim << endl;
    }
    int kkUse = kk;
    if (kkUse > annNumPoints) {
      kkUse = annNumPoints;
    }
    annTree->annkSearch(annQuery, kkUse, annIndices, annDists, annEps);
    // cout << "ANN search finished !" << endl;
    ERROR_IF(numClasses < 2, "Internal error in line 279!");
    Vec<double> result(numClasses, 0.0);
    double addTerm = 1.0 / static_cast<double>(kk-1.0);
    double dist = 0.0;
    unsigned int cl;
    int kkEff = 0;
    int tmpId = 0;
    if (gaussDev <= 0.0) {
      for (int i = 0; i < kkUse; ++i) {
	for (unsigned int k = 0; k < clusters[annIndices[i]].size(); ++k) {
	  tmpId = clusters[annIndices[i]][k];
	  if (tmpId == static_cast<int>(j)) {
	    continue; // ignore hit with "self"
	  }
	  cl = dataClasses[tmpId];
	  ASSERT(cl < result.size());
	  result[cl] += addTerm;
	  ++kkEff;
	  if (kkEff >= kkUse) {
	    break;
	  }
	}
	if (kkEff >= kkUse) {
	  break;
	}
      }
    }
    else { // if gaussian weighting is switched on
      if (verboseLevel > 4) {
	cout << "Ok, here are results for : " << v << dataClasses[j] << endl;
      }
      for (int i = 0; i < kkUse; ++i) {
	for (unsigned int k = 0; k < clusters[annIndices[i]].size(); ++k) {
	  tmpId = clusters[annIndices[i]][k];
	  if (verboseLevel > 4) {
	    cout << annIndices[i] << " " << tmpId << " " << data[tmpId] << " "
		 << annDists[i] << endl;
	  }
	  if (tmpId == static_cast<int>(j)) {
	    continue; // ignore hit with "self"
	  }
	  ERROR_IF(tmpId >= static_cast<int>(data.size()),
		   "Too large indices found!");
	  cl = dataClasses[tmpId];
	  ERROR_IF(v.size() != data[annIndices[i]].size(),
		   "Data vectors with different dimensions found!");
	  dist = euclidianDistanceSquare(v, data[annIndices[i]]); // annDists[i]; // 
	  ASSERT(cl < result.size());
	  result[cl] += addTerm * exp(-dist/(gaussDev*gaussDev)); // gaussian weighting
	  ++kkEff;
	  if (kkEff >= kkUse) {
	    break;
	  }
	}
	if (kkEff >= kkUse) {
	  break;
	}
      }
      probabilityNormalize(result); // normalize such that sum of values is 1.0
    }
    // find highest class:
    // cout << "Class result counting: " << result << endl;
    unsigned int maxIndex = findMaxIndex(result);
    if (maxIndex == dataClasses[j]) {
      ++correctPredCounter; // correct prediction!
      if (dataClasses[j] == 1) {
	++tp;
      }
      else {
	++tn;
      }
    }
    else {
      if (dataClasses[j] == 1) {
	++fn;
      }
      else {
	++fp;
      }
    }
    // cout << result << " End of predictClassProb for " << v << endl;
  }  
  // cout << "Ending estimateAccuracy!" << endl;
  
  //   return static_cast<double>(correctPredCounter)/static_cast<double>(estimateSet.size());
  return computeMathews(tp, fp, tn, fn);
}

/** Returns copies of all data rows whose class label equals dataClass, in
    the order in which they are stored. */
Vec<Vec<double> >
KnnNode2::getData(unsigned int dataClass) const
{
  // size the result up front by counting the matching labels
  unsigned int numMatches = count(dataClasses.begin(), dataClasses.end(),
                                  dataClass);
  Vec<Vec<double> > rows(numMatches);
  unsigned int next = 0; // next free slot in rows
  for (unsigned int idx = 0; idx < dataClasses.size(); ++idx) {
    if (dataClasses[idx] != dataClass) {
      continue;
    }
    rows[next] = data[idx];
    ++next;
  }
  return rows;
}

/** Returns, in ascending order, the indices of all data rows whose class
    label equals dataClass. */
Vec<unsigned int>
KnnNode2::getDataIndices(unsigned int dataClass) const
{
  // size the result up front by counting the matching labels
  unsigned int numMatches = count(dataClasses.begin(), dataClasses.end(),
                                  dataClass);
  Vec<unsigned int> indices(numMatches);
  unsigned int next = 0; // next free slot in indices
  for (unsigned int idx = 0; idx < dataClasses.size(); ++idx) {
    if (dataClasses[idx] != dataClass) {
      continue;
    }
    indices[next] = idx;
    ++next;
  }
  return indices;
}

/**
 * Returns, per class and statistics bin, the ratio trueCount/falseCount.
 * Bins with falseCount == 0 stay at 0.0. If the counters do not have the
 * expected dimensions (numClasses x NUM_STAT_BINS), the histogram filled so
 * far is returned; that is all zeros when the mismatch is detected before
 * the first class is processed.
 */
Vec<Vec<double> >
KnnNode2::getUsageHistogram() const {
  Vec<Vec<double> > hist(numClasses, Vec<double>(NUM_STAT_BINS, 0.0));
  // falseCount is indexed below as well, so it needs the same checks as
  // trueCount
  if ((trueCount.size() != numClasses) || (falseCount.size() != numClasses)) {
    return hist;
  }
  for (unsigned int i = 0; i < numClasses; ++i) {
    if ((trueCount[i].size() != NUM_STAT_BINS)
        || (falseCount[i].size() != NUM_STAT_BINS)) {
      return hist;
    }
    for (unsigned int j = 0; j < NUM_STAT_BINS; ++j) {
      if (falseCount[i][j] > 0) { // avoid division by zero
        hist[i][j] = static_cast<double>(trueCount[i][j])
          / static_cast<double>(falseCount[i][j]);
      }
    }
  }
  return hist;
}

/** Writes one line per data vector to os: all vector components, each
    followed by a blank, and then the class label of the vector. */
void
KnnNode2::writeData(ostream& os) const
{
  for (unsigned int row = 0; row < data.size(); ++row) {
    const Vec<double>& values = data[row];
    for (unsigned int col = 0; col < values.size(); ++col) {
      os << values[col] << " ";
    }
    os << dataClasses[row] << endl;
  }
}



/* MODIFIERS */

void
KnnNode2::clear()
{
  *this = KnnNode2();
}

/**
 * Replaces the training data of the node and rebuilds clusters and ANN
 * structures with recluster().
 * @param mtx      data vectors, one per row; all rows need the same length
 * @param dClasses class label for each row of mtx (same size as mtx)
 * @param _nClass  number of classes
 * @param _scale   scaling vector
 */
void
KnnNode2::setData(const Vec<Vec<double> >& mtx, 
		  const Vec<unsigned int>& dClasses,
		  unsigned int _nClass,
		  const Vec<double>& _scale) { 
  PRECOND(mtx.size() == dClasses.size());
  ERROR_IF(!isRectangle(mtx),
           "Set data vectors not all with same dimensions!");
  data = mtx;
  dataClasses = dClasses;
  numClasses = _nClass;
  scaling = _scale;
  // clusters, query point, point set and kd-tree depend on the data
  recluster();
}

/* copy method */
void 
KnnNode2::copy(const KnnNode2& other)
{
  // cout << "starting KnnNode2::copy !" << endl;
  estimateSet = other.estimateSet;
  clusterCutoff = other.clusterCutoff;
  // clusterCutoff2 = other.clusterCutoff2;
  gaussDev = other.gaussDev;
  //  kk = other.kk; // copy later
  numClasses = other.numClasses;
  verboseLevel = other.verboseLevel;
  scaling = other.scaling;
  data = other.data;
  lastPrediction = other.lastPrediction;
  dataClasses = other.dataClasses;
  dataLookupProb = other.dataLookupProb;
  // clustData = other.clustData;
  clusters = other.clusters;
  noSelfMode = other.noSelfMode;
  statisticsRunning = other.statisticsRunning;
  trueCount = other.trueCount;
  falseCount = other.falseCount;

  annBucketSize = other.annBucketSize;
  annDim = other.annDim;
  annNumPoints = other.annNumPoints;
  annEps = other.annEps;

  if (annQuery != NULL) {
    annDeallocPt(annQuery);
    annQuery = NULL;
  }
  if ((other.annDim > 0) && (other.annQuery != NULL)) {
    annQuery = annCopyPt(other.annDim, other.annQuery);
  }
  // if (kk != other.kk) {
//   if (annDists != NULL) {
//     delete annDists;
//     if (other.kk > 0) {
//       annDists =  new ANNdist[other.kk + 1];
//     }
//     else {
//       annDists = NULL;
//     }
//   }
//     // }
//   if (annIndices != NULL) {
//     delete annIndices;
//     if (other.kk > 0) {
//       annIndices = new ANNidx[other.kk + 1];
//     }
//     else {
//       annIndices = NULL;
//     }
//   }
  // }
  if (annTree != NULL) {
    delete annTree;
    annTree = NULL;
  }
  ASSERT(other.annIndices != NULL);
  for (unsigned int i = 0; i < KNN_KMAX; ++i) {
    annDists[i] = other.annDists[i];
    annIndices[i] = other.annIndices[i];
  }
  kk = other.kk;
  if (annPoints != NULL) {
    annDeallocPts(annPoints);
    annPoints = NULL;
  }

  if ((annNumPoints > 0) && (annDim > 0)) {
    ASSERT(other.annPoints != NULL);
    annPoints = annAllocPts(annNumPoints, annDim);
    for (int i = 0; i < annNumPoints; ++i) {
      for (int j = 0; j < annDim; ++j) {
	// write into allocated array
	annPoints[i][j] = other.annPoints[i][j]; 
      }
    }
    annTree = new ANNkd_tree(annPoints, annNumPoints, annDim, annBucketSize);
  }
  // cout << "Finished KnnNode2::copy!" << endl;
}

/** read input data */
/*
void
KnnNode2::readData(istream& is,
		  unsigned int startCol,
		  unsigned int endCol,
		  unsigned int classCol)
{
  if (verboseLevel > 1) {
    cout << "KnnNode2: Starting readData (1)!" << endl;
  }
  string line;
  ERROR_IF(endCol <= startCol, "Inconsistent definition of start and end column for readData!");
  unsigned int dim = endCol - startCol;
  Vec<double> dataVec(dim, 0.0);
  unsigned int numClasses = 1;
  unsigned int highestClass = 0; // highest defined class id so far
  while (is) {
    line = getLine(is);
    vector<string> tokens = getTokens(line);
    for (unsigned int i = startCol; i < endCol; ++i) {
      ERROR_IF(i >= tokens.size(), "Undefined data column!");
      dataVec[i-startCol] = stod(tokens[i]);
    }
    ERROR_IF(classCol >= tokens.size(), "Undefined class column!");
    unsigned int dataClass = static_cast<unsigned int>(stod(tokens[classCol]) + 0.499);
    if (dataClass > highestClass) {
      highestClass = dataClass;
      numClasses = highestClass + 1; // counting starts from zero
    }
    
    data.push_back(dataVec);
    scaling = Vec<double>(data[0].size(), 1.0);
    dataClasses.push_back(dataClass);
  }
  // clusters = simpleRepresentativeLinkage(data, clusterCutoff);
  recluster();
  if (verboseLevel > 1) {
    cout << data.size() << " entries data lines read! " << clusters.size() 
	 << " clusters and ";
    unsigned int sum = 0;
    for (unsigned int kk = 0; kk < subClusters.size(); ++kk) {
      sum += subClusters[kk].size();
    }
    cout << sum << " subclusters formed." << endl;
  }
}
*/

/**
 * Reads training data from a stream, clusters it and builds the ANN search
 * structures.
 *
 * Each line is split into tokens. The columns listed in mask form the
 * feature vector; the last token of the line is the class label (converted
 * to a number and rounded by adding 0.499). Lines whose token count does not
 * exceed the class column index found on the previously accepted line are
 * skipped; this includes empty lines. The vectors are appended to data
 * already stored in the node. numClasses becomes the highest class id read
 * plus one.
 *
 * @param is   input stream
 * @param mask column indices (zero-based) used as features; must not be empty
 */
void
KnnNode2::readData(istream& is,
		  const Vec<unsigned int>& mask)
{
  if (verboseLevel > 1) {
    cout << "KnnNode2: Starting readData (2)!" << endl;
  }
  ERROR_IF(mask.size()==0, "Inconsistent definition of readData!");
  string line;
  unsigned int classCol = 0; 
  unsigned int dim = mask.size();
  Vec<double> dataVec(dim, 0.0);
  this->numClasses = 1;
  unsigned int highestClass = 0; // highest defined class id so far
  while (is) {
    line = getLine(is);
    vector<string> tokens = getTokens(line);
    if (tokens.size() <= classCol) {
      continue;
    }
    classCol = tokens.size()-1; // class label is the last column
    for (unsigned int i = 0; i < dim; ++i) {
      ERROR_IF(mask[i] >= tokens.size(), "Undefined data column!");
      dataVec[i] = stod(tokens[mask[i]]);
    }
    ERROR_IF(classCol >= tokens.size(), "Undefined class column!");
    unsigned int dataClass = static_cast<unsigned int>(stod(tokens[classCol]) + 0.499);
    if (dataClass > highestClass) {
      highestClass = dataClass;
      numClasses = highestClass + 1; // counting starts from zero
    }
    data.push_back(dataVec);
    dataClasses.push_back(dataClass);
  }
  // Validate before the data is used: data[0] below must exist, and the
  // clustering expects vectors of equal length.
  ERROR_IF(data.size() == 0, "No KnnNode2 data read!");
  ERROR_IF(!isRectangle(data),
           "Read data vectors not all with same dimensions!");

  scaling = Vec<double>(data[0].size(), 1.0);
  if (verboseLevel > 1) {
    cout << "Starting simpleSortLinkage: " 
         << data.size() << " " << clusterCutoff << endl;
  }
  clusters = simpleSortLinkage(data, clusterCutoff);
  Random& rnd = Random::getInstance();
  if (verboseLevel > 1) {
    cout << "End of simpleSortLinkage!" << endl;
    cout << "starting random suffling cluster ids!" << endl;
  }
  // random order of the members within each cluster
  for (unsigned int i = 0; i < clusters.size(); ++i) {
    random_shuffle(clusters[i].begin(), clusters[i].end(), rnd);
  }
  if (verboseLevel > 1) {
    cout << "end of random suffling cluster ids!" << endl;
  }

  if (verboseLevel > 1) {
    cout << data.size() << " entries data lines read! " 
         << clusters.size() << " clusters formed!" << endl;
  }

  // generate ANN datastructures:
  if (verboseLevel > 2) {
    cout << "Starting to generate ANN datastructures!" << endl;
  }
  annDim = mask.size(); // dimension of points
  annNumPoints = clusters.size(); // number of points
  // release old structures, the tree before the points it was built from
  if (annTree != NULL) {
    delete annTree;
    annTree = NULL;
  }
  if (annQuery != NULL) {
    annDeallocPt(annQuery);
    annQuery = NULL;
  }
  if (annPoints != NULL) {
    annDeallocPts(annPoints);
    annPoints = NULL;
  }
  annQuery = annAllocPt(annDim);
  annPoints = annAllocPts(annNumPoints, annDim);
  // each cluster is represented in the tree by its first member
  for (int i = 0; i < annNumPoints; ++i) {
    for (int j = 0; j < annDim; ++j) {
      annPoints[i][j] = data[clusters[i][0]][j]; // write into allocated array
    }
  }
  annTree = new ANNkd_tree(annPoints, annNumPoints, annDim, annBucketSize);
}

/**
 * Reads the probability lookup table: for every non-empty line of the
 * stream, the last token is converted to a number and appended to
 * dataLookupProb.
 * @param is   input stream
 * @param mask must not be empty; its content is not used here
 */
void
KnnNode2::readLookupDataProb(istream& is,
			     const Vec<unsigned int>& mask)
{
  if (verboseLevel > 1) {
    cout << "KnnNode2: Starting readLookupDataProb!" << endl;
  }
  ERROR_IF(mask.size()==0, "Inconsistent definition of readLookupData!");
  string line;
  while (is) {
    line = getLine(is);
    vector<string> tokens = getTokens(line);
    if (tokens.size() > 0) {
      // the value is stored in the last column
      dataLookupProb.push_back(stod(tokens.back()));
    }
  }
}

void
KnnNode2::recluster() {
#ifdef DEBUG_VERBOSE
  cout << "starting recluster!" << endl;
#endif
  // clusters = simpleRepresentativeLinkage(data, clusterCutoff);
  clusters = simpleSortLinkage(data, clusterCutoff);
  Random& rnd = Random::getInstance();
  for (unsigned int i = 0; i < clusters.size(); ++i) {
    if (clusters[i].size() > 1) {
      random_shuffle(clusters[i].begin(), clusters[i].end(), rnd);
    }
  }

  //   cout << "Clusters: " << clusters << endl;
  // cout << "Subclusters: " << subClusters << endl;

  annNumPoints = clusters.size(); // number of points
  if (data.size() > 0) {
    annDim = data[0].size();
    if (scaling.size() != data[0].size()) {
      scaling = Vec<double>(annDim, 1.0);
    }
  }
  else {
    annDim = 0;
    scaling.clear();
    cout << "Zero dim set!" << endl;
  }
  // cout << "Mark 2" << endl;
  if (annQuery != NULL) {
    annDeallocPt(annQuery);
    annQuery = NULL;
  }
  // cout << "Mark 3" << endl;
  if (annPoints != NULL) {
    annDeallocPts(annPoints);
    annPoints = NULL;
  }
  // cout << "Mark 4" << endl;
  if (annTree != NULL) {
    delete annTree;
    annTree = NULL;
  }
  // cout << "Mark 5" << endl;
  if (annDim > 0) {
    annQuery = annAllocPt(annDim);
  }
  // cout << "Mark 6" << endl;
  if ((annDim > 0) && (annNumPoints > 0)) {
    annPoints = annAllocPts(annNumPoints, annDim);
    // cout << "Mark 6a" << endl;
    for (int i = 0; i < annNumPoints; ++i) {
      for (int j = 0; j < annDim; ++j) {
	annPoints[i][j] = data[clusters[i][0]][j]; // write into allocated array
      }
    }
    if (annNumPoints > 0) {
      // cout << "Mark 6b" << endl;
      annTree = new ANNkd_tree(annPoints, annNumPoints, annDim, annBucketSize);
    }
  }
#ifdef DEBUG_VERBOSE
  cout << "recluster finished!" << endl;
#endif
}

/** optimize scaling of node using simple Monte Carlo steps */
void
KnnNode2::optimizeScaling(int numSteps, 
			 int verboseLevel,
			 double stepWidth,
			 unsigned int numTrials)
{
  if (verboseLevel > 1) {
    cout << "Starting optimize scaling of knnNode with scaling " << scaling 
	 << " and k " << kk << endl;
  }
  Vec<double> origScaling = scaling;
  Vec<double> bestScaling = scaling;
  double origScore = estimateAccuracy(numTrials);
  double bestScore = origScore;
  double score;
  if (verboseLevel > 0) {
    cout << "Score of original scaling: " << origScore << endl;
  }
  Random& rnd = Random::getInstance();

  // optimize kk:
  unsigned int kkOrig = kk;
  unsigned int kBest = kk;
  unsigned int kMin = 3;
  unsigned int kDiff = 1; // only allow for minimal changes of k
  if (kk > (kMin+kDiff)) {
    kMin = kkOrig - kDiff;
  }
  unsigned int kMax = kkOrig + kDiff + 1;
  for (unsigned int k = kMin; k < kMax; ++k) {
    kk = k;
    score = estimateAccuracy(numTrials);    
    if (score > bestScore) {
      kBest = k;
      bestScore = score;
      if (verboseLevel > 0) {
	cout << "Found better k: " << kBest << " " << bestScore << endl;
      }
    }
  }
  if (verboseLevel > 0) {
    cout << "Now using optimized k: " << kBest << " " << bestScore << endl;
  }
  kk = kBest;

  for (unsigned int i = 0; static_cast<int>(i) < numSteps; ++i) {
    scaling = bestScaling;
    for (unsigned int j = 0; j < scaling.size(); ++j) {
      scaling[j] += stepWidth * rnd.getGaussian();
      if (scaling[j] < 0.0) {
	scaling[j] *= (-1); // invert again to positive value
      }
    }
    normalizeEuclidian(scaling);
    score = estimateAccuracy(numTrials);
    if (verboseLevel > 1) {
      cout << i + 1 << " " << score << " " << scaling << endl;
    }
    if (score > bestScore) {
      bestScore = score;
      bestScaling = scaling;
      if (verboseLevel > 0) {
	cout << i + 1 << " New best scaling: " << bestScore << " " 
	     << bestScaling << endl;
      }
    }
  }
  scaling = bestScaling;

  for (unsigned int k = kMin; k < kMax; ++k) {
    kk = k;
    score = estimateAccuracy(numTrials);    
    if (score > bestScore) {
      kBest = k;
      bestScore = score;
      if (verboseLevel > 0) {
	cout << "Found better k: " << kBest << " " << bestScore << endl;
      }
    }
  }
  if (verboseLevel > 0) {
    cout << "Now using optimized k: " << kBest << endl;
  }
  kk = kBest;

  if (verboseLevel > 0) {
    cout << "End result best scaling: " << kk << " " << bestScore << " " 
	 << bestScaling << endl;
  }
}

/** Sets the number of nearest neighbors used for predictions; k has to be
    greater than zero. The ANN result buffers keep their fixed size. */
void
KnnNode2::setK(unsigned int k)
{
  PRECOND(k > 0);
  if (k == kk) {
    return; // nothing to change
  }
  kk = k;
}

void
KnnNode2::startStatistics() 
{
  ERROR_IF(numClasses == 0, "No classes defined when trying to start statistics mode!");
  trueCount = Vec<Vec<unsigned int long> >(numClasses, Vec<unsigned int long>(NUM_STAT_BINS, 0));
  falseCount = Vec<Vec<unsigned int long> >(numClasses, Vec<unsigned int long>(NUM_STAT_BINS, 0));
  statisticsRunning = true;
}

/* Single thinning pass: removes training points that sit in neighbourhoods
   whose members all carry the same class.

   Outline of the pass:
     1. Return immediately if thinK >= size().
     2. Temporarily set k to thinK (the old value is restored at the end).
     3. Repeatedly pick a random data point, query the ANN tree for its
        nearest neighbours and test whether the neighbourhood contains
        more than one class. Points from class-pure neighbourhoods are
        collected into groups ("deleteClusters"); a point joins an existing
        group if its neighbourhood overlaps with a member of that group.
     4. From every group with more than thinK members, all but thinK
        randomly chosen members are erased from data and dataClasses.
     5. recluster() is called on the remaining points.

   @param thinK neighbourhood size used for the search; also the number of
                members kept per group and the lower bound on data.size()
                below which no further points are erased.

   Writes progress messages to cout. Modifies data, dataClasses, the ANN
   query/result buffers and whatever recluster() and setK() modify.
   NOTE(review): annIndices/annDists are allocated with KNN_KMAX entries in
   the constructors; this routine does not itself check kkUse against that
   size -- confirm that setK() or the callers guarantee thinK <= KNN_KMAX. */
void
KnnNode2::thinInner(unsigned int thinK)
{
  cout << "Starting knnThin! " << thinK << " " << data.size() << " " 
       << clusters.size() << endl;

  // nothing to thin if the requested neighbourhood is not smaller than the data set
  if (thinK >= size()) {
    return;
  }
  //   Vec<unsigned int> newIndices = knnThin(data, dataClasses, thinK);
  //   data = getSubset(data, newIndices);
  //   dataClasses = getSubset(dataClasses, newIndices);
  //  recluster();

  unsigned int oldK = kk; // store k value, replace temporarily
  setK(thinK);
  Vec<int> toBeDeleted; // data indices finally selected for erasure
  Vec<Vec<int> > deleteClusters; // groups of data indices from class-pure neighbourhoods
  Random& rnd = Random::getInstance();
  int kkUse = thinK;
  unsigned int tmpCounter;
  int i; // index of the randomly chosen data point of the current iteration
  Vec<int> checkedAlreadyFlags(data.size(), 0); // set to one if checked already
  // Vec<unsigned int> remainingIds = generateStair(data.size());
  // unsigned int ri = 0;
  // data is not modified inside this loop, so it runs data.size() times;
  // ii only counts iterations, the examined point is the random index i
  for (int ii = data.size()-1; (ii >= 0) && (data.size() > thinK); --ii) {
    // Vec<unsigned int> result = kNearestNeighbors(data[i], data, k);
    tmpCounter = 0;
    // draw a random point, preferring one that was not examined yet;
    // the number of redraws is bounded, so an already examined point
    // may still be used if no fresh one turns up
    do {
      i = rnd.getRand(data.size());
      //        i = remainingIds[ri];
      //        remainingIds.erase(remainingIds.begin(), remainingIds.begin()+ri);
    } while ((checkedAlreadyFlags[i] != 0)
	      && (tmpCounter++ < 10));  // try again this many times
    // copy the chosen point into the ANN query buffer
    for (int j = 0; j < annDim; ++j) {
      annQuery[j] = data[i][j];
    }
    checkedAlreadyFlags[i] = 1;
    // never ask the tree for more neighbours than it holds points
    kkUse = thinK;
    if (kkUse > annNumPoints) {
      kkUse = annNumPoints;
    }
    unsigned int tmpId;
//     cout << "Starting new ANN tree search with " 
// 	 << kkUse << " " << annNumPoints << " "
// 	 << data.size() << endl;
    // results are written to annIndices / annDists; the indices are used
    // below as indices into clusters
    annTree->annkSearch(annQuery, kkUse, annIndices, annDists, annEps);
    // cout << "ANN tree search finished!" << endl;
    //   Vec<unsigned int> nearest = kNearestNeighbors(v, data,
    // 						clustData, kkUse, scaling,
    // 						clusters, 
    // 		subClusters, clusterCutoff, clusterCutoff2);
    bool found = false; // true if a neighbour with a different class was seen
    bool collision = false; // true if the neighbourhood overlaps an existing delete group
    int kkEff = 0; // number of cluster members inspected so far
    // check if classes diverse:
    // the class of the first member of the nearest cluster (neighbour 0) is
    // the reference; members of neighbours 1..kkUse-1 are compared against it
    // until a different class is found or kkUse members have been inspected
    for (int j = 1; j < kkUse; ++j) {
      for (unsigned int k = 0; k < clusters[annIndices[j]].size(); ++k) {
	tmpId = clusters[annIndices[j]][k];
	if (dataClasses[tmpId] != dataClasses[clusters[annIndices[0]][0]]) {
	  found = true; 
	  break;
	}
	++kkEff;
	if (kkEff >= kkUse) {
	  break;
	}
      }
      if (found || (kkEff >= kkUse)) {
	break;
      }
    }
    if (!found) { // classes not diverse, point can probably be safely deleted
      // cout << "deletable point found in thinning!" << i << " size: " << data.size() << endl;
      // cout << "So far to be deleted: " << toBeDeleted << endl;
      // look for the first member of any neighbouring cluster (including
      // neighbour 0) that already belongs to a delete group; if one exists,
      // point i is appended to that group and the search stops
      for (int j = 0; j < kkUse; ++j) {
	for (unsigned int k = 0; k < clusters[annIndices[j]].size(); ++k) {
	  tmpId = clusters[annIndices[j]][k];
	  // search in clusters so far marked for deletion:
	  for (unsigned int m = 0; m < deleteClusters.size(); ++m) {
	    for (unsigned int n = 0; n < deleteClusters[m].size(); ++n) {
	      if (deleteClusters[m][n] == static_cast<int>(tmpId)) {
		deleteClusters[m].push_back(i);
		collision = true;
		break;
	      }
	    }
	    if (collision) {
	      break;
	    }
	  }
	  if (collision) {
	    break;
	  }
	}
	if (collision) {
	  break;
	}
      }
      // start new cluster
      if (!collision) {
	deleteClusters.push_back(Vec<int>(1, i));
      }
    }
  }
  //       cout << "Erasing point " << i + 1 << " because not diverse!"
  // 	   << endl;
  // if (collision) { // delete all points marked for deletion
  // cout << "Collision!" << endl;
  // this should work because highest indices are deleted first
  // cout << "Delete clusters found so far: " << deleteClusters << endl;

  // from every sufficiently large group keep thinK randomly chosen members
  // and mark the rest for deletion
  for (unsigned int m = 0; m < deleteClusters.size(); ++m) {
    if (deleteClusters[m].size() <= thinK) {
      continue; // cluster too small, skip
    }
    // rnd is passed as the random number generator of the shuffle
    random_shuffle(deleteClusters[m].begin(), deleteClusters[m].end(), rnd);
    for (unsigned int n = 0; n + thinK < deleteClusters[m].size(); ++n) {
      toBeDeleted.push_back(deleteClusters[m][n]);
    }
  }
  // a point may have been added to a group more than once; remove duplicates
  toBeDeleted = uniqueSet(toBeDeleted);
  cout << "All points checked! Erasing " << toBeDeleted.size() 
       << " points!" << endl;
  // cout << toBeDeleted << endl;
  // erase in descending index order so that the remaining indices stay valid
  sort(toBeDeleted.begin(), toBeDeleted.end());
  reverse(toBeDeleted.begin(), toBeDeleted.end());
  for (unsigned int j = 0; j < toBeDeleted.size(); ++j) {  
    // cout << "Deleting point " << j << " " << toBeDeleted[j] << endl;
    ERROR_IF(toBeDeleted[j] >= static_cast<int>(data.size()), 
	     "Internal error in line 1112!");
    data.erase(data.begin()+toBeDeleted[j]);
    dataClasses.erase(dataClasses.begin()+toBeDeleted[j]);
    // checkedAlreadyFlags.erase(checkedAlreadyFlags.begin()+toBeDeleted[j]);
    // never shrink the data set to thinK points or fewer... stop once reached
    if (data.size() <= thinK) {
      break;
    }
  }
  toBeDeleted.clear();
  cout << "Reclustering of " << data.size() << " points!" << endl;
  recluster();

  setK(oldK); // replace again with old value
  cout << "thin finished!" << endl;
}

/* Thins the stored data by running thinInner three times in a row
   with the same thinK. */
void
KnnNode2::thin(unsigned int thinK)
{
  unsigned int passesLeft = 3;
  while (passesLeft > 0) {
    thinInner(thinK);
    --passesLeft;
  }
}

/* Records one prediction in the per-class statistics histograms.

   The score predicted for the known class, prediction[knownClass], is
   mapped to one of NUM_STAT_BINS bins. For that single bin, trueCount is
   incremented for the known class and falseCount is incremented for every
   other class.

   @param prediction  predicted scores, indexed by class; must contain an
                      entry for knownClass.
   @param knownClass  index of the correct class.

   Bin mapping: scores in [0, 1) map to bin floor(NUM_STAT_BINS * score).
   Scores outside that range are clamped to the first or last bin. The
   previous implementation used a modulo instead, which put a score of
   exactly 1.0 into bin 0 and could yield an out-of-range index for
   negative scores.

   Expects trueCount and falseCount to hold numClasses rows of
   NUM_STAT_BINS counters, as set up by startStatistics(). */
void
KnnNode2::updateStatistics(const Vec<double>& prediction, unsigned int knownClass) const
{
  const unsigned int numBins = static_cast<unsigned int>(NUM_STAT_BINS);
  const double scaled = numBins * prediction[knownClass];
  // Clamp before converting to an integer: converting an out-of-range or NaN
  // double is undefined. NaN fails both comparisons and ends up in bin 0.
  unsigned int bin = 0;
  if (scaled >= static_cast<double>(numBins)) {
    bin = numBins - 1;
  }
  else if (scaled > 0.0) {
    bin = static_cast<unsigned int>(scaled);
  }
  for (unsigned int i = 0; i < numClasses; ++i) {
    if (i == knownClass) {
      ++trueCount[i][bin];
    }
    else {
      ++falseCount[i][bin];
    }
  }
}
