#!/usr/local/bin/perl
#
# Script to output the results of the Fisher's tests, Permutation tests and POSOC in a way that illustrates the genes and terms that constitute each POSOC Cluster, as well as the outcomes of the two statistical tests.
# The three output file names must be supplied as command line arguments, in the order as follows: POSOC Clustering output; Fisher's Tests output; Permutation Tests output. A false discovery rate for the Benjamini Hochberg multiple testing correction may optionally be entered as a fourth argument, or else the default FDR is 0.1.
# Ian Grieve
# 15/08/05

use strict;
use Statistics::R;

my $posoc_in = "";
my @posoc_in = "";
my $fisher_in = "";
my @fisher_in = "";
my $permutation_in = "";
my @permutation_in = "";
my $direction_in = "";
my @direction_in = "";
my $false_discovery_rate = 0.1;

if ($#ARGV < 2)
{
    die print "Please supply the command line arguments required\n";
}

else
{
    $posoc_in = $ARGV[0];
    $fisher_in = $ARGV[1];
    $permutation_in = $ARGV[2];
}

if ($ARGV[3] > 0)
{
    $false_discovery_rate = $ARGV[3];
}

$direction_in = 'de_direction.txt';
our $results = 'results.txt';
our $pvalues = 'c:/program files/r/rw2011/pvalues.txt';

open (POSOC_IN, $posoc_in) || die print "Cannot open $posoc_in in Results Outputting Script";
my @posoc_in_tosplit = <POSOC_IN>;
my $posoc_in_tosplit = "@posoc_in_tosplit";
@posoc_in = split /-{45}/, $posoc_in_tosplit;

open (FISHER_IN, $fisher_in) || die print "Cannot open $fisher_in in Results Outputting Script";
my @fisher_in_tosplit = <FISHER_IN>;
my $fisher_in_tosplit = "@fisher_in_tosplit";
@fisher_in = split /##/, $fisher_in_tosplit;
shift (@fisher_in);

open (PERMUTATION_IN, $permutation_in) || die print "Cannot open $permutation_in in Results Outputting Script";
my @permutation_in_tosplit = <PERMUTATION_IN>;
my $permutation_in_tosplit = "@permutation_in_tosplit";
@permutation_in = split /##/, $permutation_in_tosplit;

open (DIRECTION_IN, $direction_in) || print "No Differential Expression Direction file present";
my @direction_in_tosplit = <DIRECTION_IN>;
my $direction_in_tosplit = "@direction_in_tosplit";
@direction_in = split /##/, $direction_in_tosplit;

open (RESULTS, ">$results") || die print "Cannot open $results in Results Outputting Script";

open (PVALUES, ">$pvalues") || die print "Cannot open $pvalues in Results Outputting Script";

# To put in results table:
# From POSOC_IN (for each Cluster)
# -> The number of genes in the Cluster
# -> The number of GO terms that are in the cluster
# -> The GO Terms that are in the Cluster
# -> The Genes that are in the Cluster
# -> The POSOC Score of the Cluster
# -> The name of the Cluster (Cluster description)

# From FISHER_IN
# -> The P value

# From PERMUTATION_IN
# -> The P-values from the Permutation Test

# From DE_DIRECTION
# -> The proportions of genes in each Cluster that are upregulated/downregulated under the experimental conditions compared to the controls.

# From this file
# -> The outcome of the Benjamini-Hochberg Multiple Testing Correction.


# This section gets the Cluster Genes & Summary Information from the POSOC Output File.

my $clusters_genes = "@posoc_in[2]";
my @clusters_genes = split/Cluster /, $clusters_genes;

my $how_many_clusters = "";
my $cluster_number = "";
my $cluster_name = "";
my $cluster_score = "";
my $cluster_genes_no = "";
my $cluster_terms_no = "";
my $cluster_gene_affyid = "";
my $cluster_gene_topush = "";
#my @cluster_genes = "";
my $cluster_genes_to_hash = "";
my @cluster_all_genes = "";
my $asterisk = "*";
my $element_separate = "##";
my $cluster_summary_to_hash = "";
my $cluster_to_hash = "";
my @cluster_to_hash = "";
my %cluster_summary_hash = "";

foreach my $cluster (@clusters_genes)
{
    my @cluster_genes = "";
    #print "\n";
    $cluster =~ /(\d+)\((\d+) genes\):\s+\*?GO:\d+\s+([\S ]*)\s+score = (\d+.?\d*)\s*\n\s*genes = \d+\s+\d+.?\d*\s+\d+\s+\d+.?\d*\s+terms = \d+\s+\d+.?\d*\s+(\d+)\s+\d+.?\d*\s+/;
    $cluster_number = $1;
    $cluster_name = $3;
    $cluster_score = $4;
    $cluster_genes_no = $2;
    $cluster_terms_no = $5;
    $cluster_number =~ s/ //g; 
    $cluster_summary_to_hash = "$cluster_number$asterisk$cluster_name$asterisk$cluster_score$asterisk$cluster_genes_no$asterisk$cluster_terms_no$asterisk";
    my @cluster_genes_list = split (/\n/, $cluster);
    foreach my $cluster_gene (@cluster_genes_list)
    {
    	if ($cluster_gene =~ /([\S]*_[a|s]t)/)
	{
	    $cluster_gene_affyid = $1;
	    $cluster_gene_topush = "$cluster_gene_affyid$asterisk";
	    push (@cluster_genes, $cluster_gene_topush);	    
	}
    }
    $cluster_genes_to_hash = "@cluster_genes";
    $cluster_to_hash = "$cluster_summary_to_hash$cluster_genes_to_hash$element_separate";
    push (@cluster_to_hash, $cluster_to_hash);
}
my $cluster_to_hash_tosplit = "@cluster_to_hash";
my @cluster_summary_hash_split = split /##/, $cluster_to_hash_tosplit;
shift @cluster_summary_hash_split;

foreach my $cluster_summary_hash_split (@cluster_summary_hash_split)
{
    $cluster_summary_hash_split =~ s/^ //g;
}

%cluster_summary_hash = map {split /\*/, $_, 2} @cluster_summary_hash_split;


# This section gets the list of GO terms associated with each POSOC Cluster

my $clusters_terms = "@posoc_in[3]";
my @clusters_terms = split/Cluster /, $clusters_terms;

my $cluster_number = "";
my $cluster_term_goid = "";
my $cluster_details_to_hash = "";
my @cluster_to_hash = "";
my $cluster_term_topush = "";
my $cluster_terms_to_hash = "";
my $termshash = "";
my %clusterterms_hash = "";
$how_many_clusters = $#clusters_terms;

foreach my $cluster (@clusters_terms)
{
    my @cluster_terms = "";
    #print "\n";
    $cluster =~ /(\d+)\((\d+) terms\):/;
    $cluster_number = $1;
    $cluster_terms_no = $2;
    $cluster_details_to_hash = "$cluster_number$asterisk$cluster_terms_no$asterisk";
    my @cluster_terms_list = split (/\n/, $cluster);

    foreach my $cluster_term (@cluster_terms_list)
    {
    	if ($cluster_term =~ /^\s*(GO:\d+)/)
	    # Remove the '^\s*' to include the head node in the output even if this is not in the nodes list in the POSOC output.
	{
	    $cluster_term_goid = $1;
	    $cluster_term_topush = "$cluster_term_goid$asterisk";
	    push (@cluster_terms, $cluster_term_topush);	    
	}
    }
    $cluster_terms_to_hash = "@cluster_terms";
    $termshash = "$cluster_details_to_hash$cluster_terms_to_hash$element_separate";
    push (@cluster_to_hash, $termshash);
}

my $clusterterms_to_hash_tosplit = "@cluster_to_hash";
my @clusterterms_hash_split = split /##/, $clusterterms_to_hash_tosplit;
shift @clusterterms_hash_split;

foreach my $clusterterms_hash_split (@clusterterms_hash_split)
{
    $clusterterms_hash_split =~ s/^ //g;
}

%clusterterms_hash = map {split /\*/, $_, 2} @clusterterms_hash_split;


# This section gets the Fisher's Test Results information from the Fisher's Test Outputs File

my $fisher_cluster_number = "";
my $fisher_cluster_name = "";
my $fisher_pvalue = "";
my $fisher_to_hash = "";
my @fisher_to_hash = "";
my %fisher_hash = "";

foreach my $fisheroutput (@fisher_in)
{
    $fisheroutput =~ /([\S ]*)\s+Cluster:\s+(\d+)\n*\s*P-value = ([\d .\-e]*)/;
    $fisher_cluster_name = $1;
    $fisher_cluster_number = $2;
    $fisher_pvalue = $3;
    $fisher_cluster_number =~ s/ //g;
    $fisher_cluster_name =~ s/ //;
    $fisher_to_hash = "$fisher_cluster_number$asterisk$fisher_cluster_name$asterisk$fisher_pvalue$element_separate";
    push (@fisher_to_hash, $fisher_to_hash);  
}

my $fisher_to_hash_tosplit = "@fisher_to_hash";
my @fisher_hash_split = split /##/, $fisher_to_hash_tosplit;

foreach my $fisher_hash_split (@fisher_hash_split)
{
    $fisher_hash_split =~ s/^ //g;
}

%fisher_hash = map {split /\*/, $_, 2} @fisher_hash_split;


# This section gets the Permutation Test Results information from the Permutation Test Outputs File

my $permutation_cluster_number = "";
my $permutation_lessthan_pvalue = "";
my $permutation_greaterthan_pvalue = "";
my $permutation_to_hash = "";
my @permutation_to_hash = "";
my %permutation_hash = "";

foreach my $permutationoutput (@permutation_in)
{
    $permutationoutput =~ /(\d+)\s+\d+\n*\s*Less[\w -]*:\s+([\d .\-e]*)\n*\s*Greater[\w -]*:\s+([\d .\-e]*)/;
    $permutation_cluster_number = $1;
    $permutation_lessthan_pvalue = $2;
    $permutation_greaterthan_pvalue = $3;
    $permutation_cluster_number =~ s/ //g;
    $permutation_to_hash = "$permutation_cluster_number$asterisk$permutation_lessthan_pvalue$asterisk$permutation_greaterthan_pvalue$element_separate";
    push (@permutation_to_hash, $permutation_to_hash);  
}

my $permutation_to_hash_tosplit = "@permutation_to_hash";
my @permutation_hash_split = split /##/, $permutation_to_hash_tosplit;
shift @permutation_hash_split;

foreach my $permutation_hash_split (@permutation_hash_split)
{
    $permutation_hash_split =~ s/^ //g;
}

%permutation_hash = map {split /\*/, $_, 2} @permutation_hash_split;


# This section parses information about directionality of differential expression from the DE Direction file

my $DE_cluster_number = "";
my $DE_cluster_genes = "";
my $DE_genes_upregulated = "";
my $DE_genes_downregulated = "";
my $direction_to_hash = "";
my @direction_to_hash = "";
my %direction_hash = "";

foreach my $cluster_de_direction (@direction_in)
{ 
    $cluster_de_direction =~ /Cluster\s*(\d+)/;
    $DE_cluster_number = $1;

    $cluster_de_direction =~ /\% DE Genes Upregulated:\s+(\d+.?\d*\%)/;
    $DE_genes_upregulated = $1;

    $cluster_de_direction =~ /Cluster DE Genes:\s+(\d+)/;
    $DE_cluster_genes = $1;
    
    $cluster_de_direction =~ /\% DE Genes Downregulated:\s+(\d+.?\d*\%)/;
    $DE_genes_downregulated = $1;

    $direction_to_hash = "$DE_cluster_number$asterisk$DE_cluster_genes$asterisk$DE_genes_downregulated$asterisk$DE_genes_upregulated$element_separate";
    push (@direction_to_hash, $direction_to_hash);

    my $direction_to_hash_tosplit = "@direction_to_hash";
    my @direction_hash_split = split /##/, $direction_to_hash_tosplit;
    shift @direction_hash_split;
    
    foreach my $direction_hash_split (@direction_hash_split)
    {
	$direction_hash_split =~ s/^ //g;
    }
    
    %direction_hash = map {split /\*/, $_, 2} @direction_hash_split;
}

my $i = 1;
my $to_print_fisher = "";
my $to_print_permutation = "";
my $to_print_direction = "";
my $to_resort_clustersgenes = "";
my @clustergenes_resort = "";
my @bh_rejected_clusters = "";
my $cluster_score = "";
my $cluster_genesno = "";
my $cluster_termsno = "";
my $cluster_name = "";
my $fisher_pvalue = "";
my $bh_verdict = "";
my $bh_rejected_max = "";
my $fisher_qvalue = "";
my $to_get_pvalue = "";
my $pvalue = "";
my $to_enter_pvalues = "";
my @pvalues = "";
my $permutation_lessthan_pvalue = "";
my $permutation_greaterthan_pvalue = "";
my $direction_under_percent = "";
my $direction_over_percent = "";
my $de_genes_no = "";
my $quotation = "\"";
my $asterisk = "*";
my $separator = "##";
my $number = "No.";
my $name = "Cluster Name";
my $score = "Score";
my $nogenes = "No. Genes";
my $noterms = "No. Terms";
my $fisherp = "p-value";
my $rejectnull = "B-H Reject Null?";
my $permp1 = "Perm. Test <";
my $permp2 = "Perm. Test >";
my $degenes = "No. DE Genes";
my $under = "% DE Genes Underexp.";
my $over = "% DE Genes Overexp.";

my $numheading = "$quotation$number$quotation";
my $scoreheading = "$quotation$score$quotation";
my $nogenesheading = "$quotation$nogenes$quotation";
my $notermsheading = "$quotation$noterms$quotation";
my $fisherpheading = "$quotation$fisherp$quotation";
my $rejectnullheading = "$quotation$rejectnull$quotation";
my $permp1heading = "$quotation$permp1$quotation";
my $permp2heading = "$quotation$permp2$quotation";
my $degenesheading = "$quotation$degenes$quotation";
my $underexpheading = "$quotation$under$quotation";
my $overexpheading = "$quotation$over$quotation";
my $nameheading = "$quotation$name$quotation";

printf RESULTS "%4s%12s%12s%12s%16s%20s%20s%20s%16s%24s%22s%90s\n", $numheading, $scoreheading, $nogenesheading, $notermsheading, $fisherpheading, $rejectnullheading, $permp1heading, $permp2heading, $degenesheading, $underexpheading, $overexpheading, $nameheading;

for ($i=1; $i<=$how_many_clusters; $i=$i+1)
{
    $to_get_pvalue = $fisher_hash{"$i"};
    $to_get_pvalue =~ /[^\*]*\*([\d. \-e]*)/;
    $pvalue = $1;
    if ($pvalue =~ /\d+.?\d*e?-?\d*/)
    {
	$to_enter_pvalues = "$pvalue$asterisk$i$separator";
	push (@pvalues, $to_enter_pvalues);
    }
    
    print PVALUES $pvalue;
    print PVALUES "\n";
}

my @pvalues_sorted = sort { (split /\*/, $a)[0] <=> (split /\*/, $b)[0] } @pvalues;

sub benjamini_hochberg
{
    my $bh_m = $#pvalues_sorted;
    my $bh_q = $false_discovery_rate;
    my $bh_k = 0;
    my $i = 0;
    my @bh_rejected_null = "";
   
    foreach my $bh_array_element (@pvalues_sorted)
    {
	$bh_array_element =~ /([\d. \-e]*)\*(\d+)##/;
	my $bh_pvalue = $1;
	my $bh_clusterno = $2;
	
	my $bh_testvalue = $bh_q * $bh_k / $bh_m;
	    
	if ($bh_testvalue > $bh_pvalue)
		
		{
		    push (@bh_rejected_null, $bh_k);
		}

		$bh_k = $bh_k + 1;
	
    }
    
    $bh_rejected_max = pop (@bh_rejected_null);

    until ($i == $bh_rejected_max + 1)
    {
	$pvalues_sorted[$i] =~ /(\d+.?\d*)\*(\d+)/;
	$i = $i + 1;
	push (@bh_rejected_clusters, $2);
    }
    
}

benjamini_hochberg;
print "Clusters for which null hypothesis is rejected: ";
print "@bh_rejected_clusters";
print "\n";
print "End Of File\n";


for ($i=1; $i<=$how_many_clusters; $i=$i+1)
{
    $to_resort_clustersgenes = $cluster_summary_hash{"$i"};
    @clustergenes_resort = split (/\*/, $to_resort_clustersgenes);
    $cluster_name = $clustergenes_resort[0];
    $cluster_genesno = $clustergenes_resort[2];
    $cluster_score = $clustergenes_resort[1];
    $cluster_termsno = $clustergenes_resort[3];

    $to_print_fisher = $fisher_hash{"$i"};
    
    if ($to_print_fisher =~ /([^\*]*)\*([\d .\-e]*)/)
    {
	$fisher_pvalue = $2;
	$bh_verdict = 0;
    }

    else
    {
	$fisher_pvalue = "-";
	$bh_verdict = 0;
    }

    foreach my $bh_rejected_cluster (@bh_rejected_clusters)
    {
	if ($bh_rejected_cluster =~ /^$i$/)
	{
	    $bh_verdict = 1;
	    next;
	}
    }

#    my $qvalue_tosplit = $qvalues[$i];
#    $qvalue_tosplit =~ /\s*\d+.?\d*\s+(\d+.?\d*)/;
#    $fisher_qvalue = $1;
    
    $to_print_permutation = $permutation_hash{"$i"};
    if ($to_print_permutation =~ /([\d .\-e]*)\*([\d .\-e]*)/)
    {
	$permutation_lessthan_pvalue = $1;
	$permutation_greaterthan_pvalue = $2;
    }

    else
    {
	$permutation_lessthan_pvalue = "-";
	$permutation_greaterthan_pvalue = "-";
    }

    $to_print_direction = $direction_hash{"$i"};
    if ($to_print_direction =~ /(\d+)\*(\d+.?\d*\%)\*(\d+.?\d*\%)/)
    {
       $de_genes_no = $1;
       $direction_under_percent = $2;
       $direction_over_percent = $3;
    }

    else
    {
       $de_genes_no = "-";
       $direction_under_percent = "-";
       $direction_over_percent = "-";   
    }

    my $cnameq = "$quotation$cluster_name$quotation";

    $cnameq =~ s/ \"/\"/g;
    printf RESULTS "%4s%12s%12s%12s%16s%20s%20s%20s%16s%24s%22s%90s\n", $i, $cluster_score, $cluster_genesno, $cluster_termsno, $fisher_pvalue, $bh_verdict, $permutation_lessthan_pvalue, $permutation_greaterthan_pvalue, $de_genes_no, $direction_under_percent, $direction_over_percent, $cnameq;
    print PVALUES_LESS_THAN $permutation_lessthan_pvalue;
    print PVALUES_LESS_THAN "\n";
    print PVALUES_GREATER_THAN $permutation_greaterthan_pvalue;
    print PVALUES_GREATER_THAN "\n";
}

#sub r_qvalues_lessthan
#{
#    my $R_send_1 = "library(qvalue)\n";
#    my $R_send_2 = "p <- scan(\"c:/program files/r/rw2011/pvalues_less_than.txt\")\n";
#    my $R_send_3 = "qobj <- qvalue(p)\n";
#    my $R_send_4 = "qplot(qobj)\n";
#    my $R_send_5 = "qwrite(qobj, filename=\"c:/program files/r/rw2011/qvalues_less_than.txt\")\n";
    
#    my $R = Statistics::R->new();
    
#    $R -> startR();
#    $R->send(qq'$R_send_1');
#    print (qq'$R_send_1');
#    print "\n";
#    $R->send(qq'$R_send_2');
#    print (qq'$R_send_2');
#    print "\n";
#    $R->send(qq'$R_send_3');
#    print (qq'$R_send_3');
#    print "\n";
#    $R->send(qq'$R_send_4');
#    print (qq'$R_send_4');
#    print "\n";
#    $R->send(qq'$R_send_5');
#    print (qq'$R_send_5');
#    $R -> stopR();
#}

#sub r_qvalues_greaterthan
#{
#    my $R_send_1 = "library(qvalue)\n";
#    my $R_send_2 = "p <- scan(\"c:/program files/r/rw2011/pvalues_greater_than.txt\")\n";
#    my $R_send_3 = "qobj <- qvalue(p)\n";
#    my $R_send_4 = "qplot(qobj)\n";
#    my $R_send_5 = "qwrite(qobj, filename=\"c:/program files/r/rw2011/qvalues_greater_than.txt\")\n";
    
#    my $R = Statistics::R->new();
    
#    $R -> startR();
#    $R->send(qq'$R_send_1');
#    print (qq'$R_send_1');
#    print "\n";
#    $R->send(qq'$R_send_2');
#    print (qq'$R_send_2');
#    print "\n";
#    $R->send(qq'$R_send_3');
#    print (qq'$R_send_3');
#    print "\n";
#    $R->send(qq'$R_send_4');
#    print (qq'$R_send_4');
#    print "\n";
#    $R->send(qq'$R_send_5');
#    print (qq'$R_send_5');
#    $R -> stopR();
#}

#r_qvalues_lessthan;
#r_qvalues_greaterthan;

#r_qvalues;

#our $qvalues = 'c:/program files/r/rw2011/qvalues.txt';
#open (QVALUES, $qvalues) || die print "Results Outputting Script cannot open $qvalues";
#my @qvalues = <QVALUES>;
#shift (@qvalues);
#shift (@qvalues);
#print $qvalues[0];

#sub r_qvalues
#{
#    my $R_send_1 = "library(qvalue)\n";
#    my $R_send_2 = "p <- scan(\"c:/program files/r/rw2011/pvalues.txt\")\n";
#    my $R_send_3 = "qobj <- qvalue(p)\n";
#    my $R_send_4 = "qplot(qobj)\n";
#    my $R_send_5 = "qwrite(qobj, filename=\"c:/program files/r/rw2011/qvalues.txt\")\n";
    
#    my $R = Statistics::R->new();
    
#    $R -> startR();
#    $R->send(qq'$R_send_1');
#    print (qq'$R_send_1');
#    print "\n";
#    $R->send(qq'$R_send_2');
#    print (qq'$R_send_2');
#    print "\n";
#    $R->send(qq'$R_send_3');
#    print (qq'$R_send_3');
#    print "\n";
#    $R->send(qq'$R_send_4');
#    print (qq'$R_send_4');
#    print "\n";
#    $R->send(qq'$R_send_5');
#    print (qq'$R_send_5');
#    $R -> stopR();
#}
