#!/usr/local/bin/perl -w
#
#analyze SNP hits from polybayes
#
#count the number of unique SNPs
#count the number of unique contigs
#count the number of SNPs where P_SNP > x
#get the length of every contig
#calculate SNPs / base for each contig
#calculate total bases from contigs
#calculate total SNPs / bases
#calculate total SNPS > x / bases
#
#do the hokey pokey, turn yourself around
#
#3 Jun 2004 by Lee
#
#useage
#SNP_analysis (threshhold) (polybayesfile)
#
#where
#(threshhold) is the P_SNP cutoff to look for
#and
#(polybayesfile) is the output from polybayes
#
use strict;
my $p_snp_cutoff = $ARGV[0];
my $polybayesfile = $ARGV[1];

my $good_line_count;
my $bad_line_count;

my %known_contigs;
my %snps_for_contigs;
my %good_snps_for_contigs;
my @contig_list;
my $contig_count = 0;
my $good_snp_count;
my $total_contig_bases = 0;
my $longest_contig = 0;
my $shortest_contig = 4000;

open (IN, "<$polybayesfile")|| die "cannot read from $polybayesfile\n";
while (<IN>){
    if (/(CLUSTER.*ALIGNMENT\:\s+)(Contig\d+)(\s+.*PADDED_LENGTH\:\s+)(\d+)(\s+.*P_SNP\:\s+)(\d?\.?\d+)(\s+.*)/){
	my $contig = $2;
	my $contig_length = $4;
	my $p_snp = $6;
        $good_line_count++;
	unless (exists $known_contigs{$contig}){
	    push (@contig_list, $contig);
	    $known_contigs{$contig} = $contig_length;
	    $total_contig_bases += $contig_length;
	    $contig_count++;
	    $snps_for_contigs{$contig} = 1;
	}else{
	    my $tmp_snp_count = $snps_for_contigs{$contig};
	    $tmp_snp_count++;
	    $snps_for_contigs{$contig} = $tmp_snp_count;
	};
	if ($p_snp >= $p_snp_cutoff){
	    $good_snp_count++;
	    unless (exists $good_snps_for_contigs{$contig}){
		$good_snps_for_contigs{$contig} = 1;
	    }else{
		my $good_contig_snp_count = $good_snps_for_contigs{$contig};
		$good_contig_snp_count++;
		$good_snps_for_contigs{$contig} = $good_contig_snp_count;
	    };
	};
	if ($contig_length > $longest_contig){
	    $longest_contig = $contig_length;
	}elsif ($contig_length < $shortest_contig){
	    $shortest_contig = $contig_length;
	};
    }else{
	$bad_line_count++;
    };

};

my $most_snps_per_contig = 0;
my $least_snps_per_contig = 40;
foreach my $snp_ed_contig (keys %snps_for_contigs){
    my $snp_count_on_contig = $snps_for_contigs{$snp_ed_contig};
    if ($snp_count_on_contig > $most_snps_per_contig){
	$most_snps_per_contig = $snp_count_on_contig;
    }elsif ($snp_count_on_contig < $least_snps_per_contig){
	$least_snps_per_contig = $snp_count_on_contig;
    };
};

my $most_good_snps = 0;
my $least_good_snps = 40;
foreach my $good_contig_snp (keys %good_snps_for_contigs){
    my $good_snp_count = $good_snps_for_contigs{$good_contig_snp};
    if ($good_snp_count > $most_good_snps){
	$most_good_snps = $good_snp_count;
    }elsif ($good_snp_count < $least_good_snps){
	$least_good_snps = $good_snp_count;
    };
};

my $grand_average = $good_snp_count / $total_contig_bases;
my $per_base = 1 / $grand_average;
my $rough_average = $good_line_count / $total_contig_bases;
my $rough_per_base = 1 / $rough_average;
my $avg_contig_length = $total_contig_bases / $contig_count;
my $avg_snps_per_contig = $good_line_count / $contig_count;
my $avg_good_snps = $good_snp_count / $contig_count;

#print "there were $contig_count contigs in $polybayesfile\n";
#print "total contig length $total_contig_bases\n";
#print "total $good_line_count candidate SNPs\n";
#print "and $good_snp_count SNPs that were >=  $p_snp_cutoff\n";
#print "average $rough_average candidate SNPs per base\n";
#print "or one candidate SNP every $rough_per_base bases\n";
#print "average $grand_average SNPs >= $p_snp_cutoff per base\n";
#print "or one qualifying SNP every $per_base bases\n";
#print "--------\n";
#print "shortest contig $shortest_contig bases long\n";
#print "longest contig $longest_contig bases long\n";
#print "average contig $avg_contig_length bases long\n";
#print "--------\n";
#print "the most candidate SNPs on a contig was $most_snps_per_contig\n";
#print "the fewest was $least_snps_per_contig\n";
#print "the average contig had $avg_snps_per_contig candidate SNPs\n";
#print "--------\n";
#print "the most SNPs >= $p_snp_cutoff on a contig was $most_good_snps\n";
#print "the fewest was $least_good_snps\n";
#print "the average contig had $avg_good_snps candidates >= $p_snp_cutoff\n";

print "$contig_count\t$total_contig_bases\t$good_line_count\t$p_snp_cutoff\t$most_snps_per_contig\t$least_snps_per_contig\t$avg_snps_per_contig\t$most_good_snps\t$least_good_snps\t$avg_good_snps\t$rough_per_base\t$per_base\n";
