#!/usr/bin/perl
#Script that calculates the amount of conservation at each position based upon
#the paper on Sequence logos
#Sequence logos: a new way to display consensus sequences, Thomas Schneider, Michael Stephens
#Author: Mugdha Khaladkar
#Date: 12/05/05

# Command-line arguments:
#   1. input file containing the multiple sequence alignment
#   2. output file that receives the per-position conservation scores
die "Usage: $0 <alignment_file> <output_file>\n" unless @ARGV >= 2;

$File1 = $ARGV[0];
$File3 = $ARGV[1];

# Three-argument open keeps the mode separate from the file name, so a name
# that starts with '<', '>' or '|' cannot change how the file is opened.
# $! is included so the user can see why the open failed.
open(IN, '<', $File1) or die "Cannot open file $File1: $!";
open(TEMP, '>', $File3) or die "Cannot open file $File3: $!";

my @seq_name;       # sequence identifiers, one per alignment row
my @seq_line;       # aligned sequence strings, parallel to @seq_name

# The parentheses are required: "my @A, @C, ..." declares only the first
# array lexically and leaves the others as package globals.
# @A/@C/@G/@T hold the frequency of each base at every alignment column,
# @H is the uncertainty measure and @R the information at each column.
my (@A, @C, @G, @T, @H, @R);
# Height of each base at every alignment column.
my (@height_A, @height_C, @height_G, @height_T);
my $max_height;
my $conserved;      # string that holds the conserved region

my @total;          # sum of the four base heights at every column

# Split one alignment line into (name, sequence).
# Returns an empty list for lines that carry no usable row: lines without
# any word character, and lines that have a name but no sequence column.
# Splitting on ' ' (rather than /\s+/) ignores leading whitespace, so an
# indented row no longer produces an empty name with the name stored as the
# sequence.
sub _parse_alignment_row {
    my ($line) = @_;
    return unless defined $line && $line =~ /\w/;
    my ($name, $seq) = split ' ', $line;
    return unless defined $seq;
    return ($name, $seq);
}

print "\n Computing the factors for the multiple alignment in $File1...\n";

# Read every row of the alignment into the parallel arrays
# @seq_name / @seq_line.
while (my $line = <IN>) {
    chomp $line;
    my ($name, $seq) = _parse_alignment_row($line) or next;
    push @seq_name, $name;
    push @seq_line, $seq;
}

# Number of sequences in the alignment.
$no = @seq_name;
#print "\nno: $no";
#####Calculate the maximum height possible###

#$max_height = $no * (2 - (-$no * (log($no)/log(2))));
#print "\nMax: $max_height";
#############################################


# Echo every parsed alignment row to STDOUT as "name<TAB>sequence".
foreach my $row (0 .. $no - 1) {
    print "\n$seq_name[$row]\t$seq_line[$row]";
}
# Count the bases found in one alignment column.
#   $rows - reference to the array of aligned sequence strings
#   $col  - zero-based column index
# Returns the counts of A, C, G and T (in that order). Bases are compared
# case-insensitively; gaps and any other characters are ignored, as are
# rows that are too short to reach the column.
sub _column_base_counts {
    my ($rows, $col) = @_;
    my %count = (A => 0, C => 0, G => 0, T => 0);
    for my $row (@$rows) {
        next unless defined $row && $col < length $row;
        my $base = uc substr($row, $col, 1);
        $count{$base}++ if exists $count{$base};
    }
    return @count{qw(A C G T)};
}

# The first sequence defines the alignment length.
$length_of_align = length($seq_line[0]);

# Frequency of each base at every column. The divisor is the total number
# of sequences, so rows with a gap at a column lower all four frequencies.
for ($j = 0; $j < $length_of_align; $j++) {
    my ($n_a, $n_c, $n_g, $n_t) = _column_base_counts(\@seq_line, $j);
    $A[$j] = $n_a / $no;
    $C[$j] = $n_c / $no;
    $G[$j] = $n_g / $no;
    $T[$j] = $n_t / $no;
}
# Uncertainty measure, in bits, of a list of frequencies:
#   H = -sum(p * log2(p))
# Zero frequencies are skipped because log(0) is undefined.
sub _shannon_entropy {
    my $sum = 0;
    for my $p (@_) {
        next if $p == 0;
        $sum += $p * (log($p) / log(2));
    }
    return -$sum;
}

# $count numbers the output rows; it only advances on columns where the
# first sequence has no gap.
$count = 1;
for ($j = 0; $j < $length_of_align; $j++) {
    # Uncertainty and information at this column. R = 2 - H, where 2 bits
    # is the maximum for four bases; no other correction term is applied.
    $H[$j] = _shannon_entropy($A[$j], $C[$j], $G[$j], $T[$j]);
    $R[$j] = 2 - $H[$j];

    # Height of each base: its frequency scaled by the column information.
    $height_A[$j] = $A[$j] * $R[$j];
    $height_C[$j] = $C[$j] * $R[$j];
    $height_G[$j] = $G[$j] * $R[$j];
    $height_T[$j] = $T[$j] * $R[$j];

    $total[$j] = $height_A[$j] + $height_C[$j] + $height_G[$j] + $height_T[$j];

    # Write "position<TAB>score"; the score is the total height divided
    # by 2. Columns where the first sequence has a gap are not written.
    if (substr($seq_line[0], $j, 1) ne '-') {
        print TEMP $count, "\t", ($total[$j] / 2), "\n"
            or die "Cannot write to file $File3: $!";
        $count++;
    }
}


#######################################EXTRA STUFF##############################################################################
###Normalise using mean########################
#$mean = 0;

#for($j = 0; $j<$length_of_align; $j++){
#	$mean += $total[$j];
#}
#$mean = $mean/$length_of_align;
#
#$std_dev = 0;
#for($j = 0; $j<$length_of_align; $j++){
#	$std_dev += (($total[$j]-$mean)**2);
#}
#$std_dev = sqrt(($std_dev)/($length_of_align-1));


#print "\n$seq_line[0]\n$conserved\n";
#print "mean: $mean \tstd_dev: $std_dev\n";
#my @total_mean;				
#print TEMP "\n";
#for($j = 0; $j<$length_of_align; $j++){
#	$total_mean[$j] = ($total[$j]-$mean)/$std_dev;
#	print TEMP "$total_mean[$j]\t";
#}
#############Normalise using the Median#############################

#@sorted = sort(@total);
#$median = $sorted[$length_of_align/2];

#$std_dev = 0;
#for($j = 0; $j<$length_of_align; $j++){
#	$std_dev += (($total[$j]-$median)**2);
#}
#$std_dev = sqrt(($std_dev)/($length_of_align-1));

#my @total_median;	
#print TEMP "\n";
#for($j = 0; $j<$length_of_align; $j++){
#	$total_median[$j] = ($total[$j]-$median)/$std_dev;
#	print TEMP "$total_median[$j]\t";
#}
########################Normalise using the GMean######################
#@total_gmean;
#$gmean = 0;
#for($j = 0; $j<$length_of_align; $j++){
#	if($total[$j]!=0){
#	$total_gmean[$j] = log($total[$j]);
#	$gmean += $total_gmean[$j];
#	}
#}
#$gmean = $gmean/$length_of_align;
#$gmean = exp($gmean);

#$std_dev = 0;
#for($j = 0; $j<$length_of_align; $j++){
#	$std_dev += (($total[$j]-$gmean)**2);
#}

#print TEMP "\n";
#for($j = 0; $j<$length_of_align; $j++){
#	$total_gmean[$j] = ($total[$j]-$gmean)/$std_dev;
#	print TEMP "$total_gmean[$j]\t";
#}
#for($i = 0; $i<$no; $i++){
#	print OUT ">$seq_name[$i]\n$seq_line[$i]\n";	
	
#}
#print OUT $conserved;

# Close both handles before reporting success: buffered write errors on the
# output file only surface when it is closed.
close(IN);
close(TEMP) or die "Cannot close file $File3: $!";

print "\n\n Done! Factors written to file $File3\n\n";
