#!/usr/bin/perl 

# ----------------------------------------------------------------------------------------------
# Script to generate results html output plus Ka/Ks pairwise comparisons for Search-DOGS results
# Copyright Sean O hEigeartaigh <oheigeas@tcd.ie>
# Adapted from yn00.pl script by Kevin Byrne <kevin.byrne@tcd.ie>
# ----------------------------------------------------------------------------------------------

#use strict;

### Run pairwise Ka/Ks comparisons ###
my $yn_on = 1;

my $run = $ARGV[0]; 
my $date = $ARGV[1];
my $blastall = $ARGV[2];
my $clustalo = $ARGV[3];
my $copygaps = "./".$ARGV[4];
#my $copygaps = "./copygaps.pl";
my $yn00 = $ARGV[5]; my $clean_run = $ARGV[6];

# print "$run $date $blastall $clustalo $copygaps $yn00\n";

my $store = $run."/bac_stored_".$run;

my $blast_option = 1;

### direct error messages in here: #####
open ERROR, '>'.$run."/error_step3.txt" or die $!;
STDERR->fdopen( \*ERROR, 'w' ) or die $!;

### Create information hashes ###
my (%spec_name, %aa, %nt_seq_all, %map, %species, %chr, %gene_start, %fasta, %gene_stop, %gene_num, %IG, %pillar, %ps_start, %ps_stop, %alt_start, %alt_s_pill, %aa_seq_om, %nt_seq_om, %NT_OG, %adj_l, %adj_r, %o1, %o2, %allfeat, %allfeat_start, %allfeat_stop, %arrow, %strand_af);

open IN, "<".$store."/spec_name.tab"; my @sn = <IN>; close IN;
foreach my $line (@sn) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $spec_name{$split[0]} = $split[1];
}
open IN, "<".$store."/map_C.tab"; my @mapc = <IN>; close IN;
foreach my $line (@mapc) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $map{$split[0]} = $split[1];
}
open IN, "<".$store."/pillar_C.tab"; my @pc = <IN>; close IN;
foreach my $line (@pc) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $pillar{$split[0]}{$split[1]} = $split[2];
}
open IN, "<".$store."/NT_seq_all.tab"; my @nts = <IN>; close IN;
foreach my $line (@nts) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $nt_seq_all{$split[0]} = $split[1];
    $nt_seq_om{$split[0]} = $split[1];
    my $start_cod = substr($split[1], 0, 3);
    if ($start_cod eq "GTG") { $alt_s_pill{$map{$split[0]}} = "A"; $alt_start{$split[0]} = "GTG"; }# print "start codon: $start_cod";}
    if ($start_cod eq "TTG") { $alt_s_pill{$map{$split[0]}} = "A"; $alt_start{$split[0]} = "TTG"; }# print "start codon: $start_cod";}
    if ($start_cod eq "ATT") { $alt_s_pill{$map{$split[0]}} = "A"; $alt_start{$split[0]} = "ATT"; }# print "start codon: $start_cod";}
    if ($start_cod eq "CTG") { $alt_s_pill{$map{$split[0]}} = "A"; $alt_start{$split[0]} = "CTG"; }
    if ($start_cod eq "ATC") { $alt_s_pill{$map{$split[0]}} = "A"; $alt_start{$split[0]} = "ATC"; }
}

open IN, "<".$store."/AA_seq_all.fa"; my @aas = <IN>; close IN;
for (my $i=0; $i<=$#aas; $i++) {
    if ($aas[$i] =~ /^>/) {
	my @split = split (/>/, $aas[$i]);
	chomp $split[0]; chomp $split[1]; chomp $aas[$i+1];
	$aa{$split[1]} = $aas[$i+1];
	$aa_seq_om{$split[1]} = $aas[$i+1]."*";
    }
}
#}
open IN, "<".$store."/species_all.tab"; my @spec = <IN>; close IN;
foreach my $line (@spec) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $species{$split[0]} = $split[1];
}

open IN, "<".$store."/adj_l_all.tab"; my @adjl = <IN>; close IN;
foreach my $line (@adjl) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $adj_l{$split[0]} = $split[1];
}
open IN, "<".$store."/adj_r_all.tab"; my @adjr = <IN>; close IN;
foreach my $line (@adjr) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $adj_r{$split[0]} = $split[1];
}

open IN, "<".$store."/chr_all.tab"; my @chr = <IN>; close IN;
foreach my $line (@chr) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $chr{$split[0]} = $split[1];
}
open IN, "<".$store."/gene_start_all.tab"; my @gs = <IN>; close IN;
foreach my $line (@gs) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $gene_start{$split[0]} = $split[1];
}
open IN, "<".$store."/gene_stop_all.tab"; my @gst = <IN>; close IN;
foreach my $line (@gst) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $gene_stop{$split[0]} = $split[1];
}
open IN, "<".$store."/strand_all.tab"; my @str = <IN>; close IN;
foreach my $line (@str) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    if ($split[1] eq "plus") { $arrow{$split[0]} = "->"; } else { $arrow{$split[0]} = "<-"; }
}
open IN, "<".$store."/strand_all_af.tab"; my @str_af = <IN>; close IN;
foreach my $line (@str_af) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $strand_af{$split[0]}{$split[1]} = $split[2];
}
open IN, "<".$store."/gene_num_all.tab"; my @gn = <IN>; close IN;
foreach my $line (@gn) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $gene_num{$split[0]} = $split[1];
}
open IN, "<".$store."/IG_all.tab"; my @ig = <IN>; close IN;
foreach my $line (@ig) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $IG{$split[0]} = $split[1];
}
open IN, "<".$store."/allfeat_all.tab"; my @af = <IN>; close IN;
foreach my $line (@af) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $allfeat{$split[0]}{$split[1]} = $split[2];
}
open IN, "<".$store."/allfeat_start_all.tab"; my @af_start = <IN>; close IN;
foreach my $line (@af_start) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $allfeat_start{$split[0]}{$split[1]} = $split[2];
}
open IN, "<".$store."/allfeat_stop_all.tab"; my @af_stop = <IN>; close IN;
foreach my $line (@af_stop) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $allfeat_stop{$split[0]}{$split[1]} = $split[2];
}
#}
foreach my $spec (keys %spec_name) {
    open IN, "<".$store."/fasta_".$spec_name{$spec}.".tab"; my @fa = <IN>; close IN;
    foreach my $line (@fa) {
        my @split = split (/\t/, $line);
        chomp $split[0]; chomp $split[1];
        $fasta{$split[0]} = $split[1];
    }
}

open IN, "<".$run."/NT_all.txt"; my @NTA = <IN>; close IN;
foreach my $line (@NTA) {
    my @split = split (/\t/, $line); chomp $split[0];
    my @split2 = split (/\s/, $split[0]); my $og = $split2[0]."-".$split2[1];
    chomp $split[1]; chomp $split[2];
    $NT_OG{$og}{$split[1]} = $split[2];
}    

foreach my $gene (keys %aa) {
    if (defined $adj_l{$gene}) {
	my $gene1_stop = $gene_stop{$adj_l{$gene}};
	if ($gene1_stop > $gene_start{$gene}) { my $olap_left =  $gene1_stop - $gene_start{$gene}; $o1{$gene} = int(($olap_left+2)/3); }
    }
    if (defined $adj_r{$gene}) {
        my $gene2_start = $gene_start{$adj_r{$gene}};
        if ($gene2_start < $gene_stop{$gene}) { my $olap_right = $gene_stop{$gene}-$gene2_start; $o2{$gene} = int(($olap_right+2)/3); }
    }
}

my %alph; my %beta;
$alph{'1'} = "ORFA";
$alph{'2'} = "ORFB";
$alph{'3'} = "ORFC";
$alph{'4'} = "ORFD";
$alph{'5'} = "ORFE";

$beta{'1'} = "altA";  
$beta{'2'} = "altB";
$beta{'3'} = "altC";
$beta{'4'} = "altD";
$beta{'5'} = "altE";

### make directories ###
my $nt_dir= $run."/nt_seq"; unless (-e $nt_dir) { system "mkdir $nt_dir"; }
my $omega_dir= $run."/omega_files"; unless (-e $omega_dir) { system "mkdir $omega_dir"; }
my $blast_new_dir= $run."/Blast_neworfs"; unless (-e $blast_new_dir) {  system "mkdir $blast_new_dir"; }
my $blast_orig= $run."/blast_saves"; my $blast_form = $run."/blast_html"; unless (-e $blast_form) {  system "mkdir $blast_form"; }
my $results_dir= "Results"; unless (-e $results_dir) { system "mkdir $results_dir"; }

############# Stage 3.1 #########################################

## Results file from stage 2 ##
my $file = $run."/newORF-res_".$date."_NT.txt";
open IN, "<".$file; my @in = <IN>; close IN;

### break up, get info from David's results file ###


my (%ORFseq, %start_comp, %stop_comp, %mark, %count_OGS, %Neworf_inf, %Neworf_seq, %count_sec, %alt_p, %count_name, %extrapill, %correct, %real_len, %oldname, %coord_fin, %reject_mark);
for (my $i=0; $i<=$#in; $i++) {
    if ($in[$i] =~ /^>/) {      
#	print "$in[$i]";
        my @first = split (/\t/, $in[$i]);
#	print $first[0].$first[1].$first[2].$first[3]."end\n";
        my $seq = $in[$i+1];  ###ORF aa sequence.
        my @break = split (/\>/, $first[0]);
	my @break2 = split (/\s/, $break[1]); ## genes in OGS.
	my @break_hits = split (/\s/, $first[1]); ### split genes that hit the OGS fragment.
        my $pillhit = $map{$break_hits[0]};
	my $gene1_stop = $gene_stop{$break2[0]}; my $gene2_start = $gene_start{$break2[1]}; 
        my $OGS = $break2[0]."-".$break2[1];
#	print "OGS $OGS $break2[0] $break2[1]\n";
	$count_name{$OGS}++; my $num = $count_name{$OGS} -1;
	my $OGS_old = $OGS;
	if ($num > 0) {
	    $OGS = $OGS."_".$alph{$num}; ## Update OGS name with _A, _B etc
	}
	my $chrom = $chr{$break2[0]}; ###chromosome of OGS fragment

        my $spec_hit;
        foreach my $specnum (keys %spec_name) {
            if ($first[0] =~ /$spec_name{$specnum}/) {
                $spec_hit = $spec_name{$specnum}; ### species of OGS fragment.
            }
        }	
        unless (defined $pillar{$pillhit}{$spec_hit}) {
            $species{$OGS} = $spec_hit;
	    $pillar{$pillhit}{$spec_hit} = $OGS;  ## add OGS_add name to pillars
	    $map{$OGS} = $pillhit;  ##Update map coordinate
	} else { ### marking situations where we're finding (a) orfs in a pillar already containing a gene (b) 2 orfs in same pillar in same species.
	    if ($OGS =~ /$pillar{$pillhit}{$spec_hit}/) { ## cases where it's multiple ORFS within the one OGS
		$count_sec{$pillhit}++;
                my $pill_a = $pillhit."_".$beta{$count_sec{$pillhit}};
		$map{$OGS} = $pill_a;  ##Update map coordinate
		$pillar{$pill_a}{$spec_hit} = $OGS; ## replace existing with new gene
		$mark{$pill_a} = "1";
		$extrapill{$pill_a}{$spec_hit} = $OGS;
	    } else {  ## Cases where multiple (nearby syntenic) ORFS hit one pillar ##
		$count_sec{$pillhit}++;
		my $pill_a = $pillhit."_".$beta{$count_sec{$pillhit}};
                $map{$OGS} = $pill_a; ##Update map coordinate
		$pillar{$pill_a}{$spec_hit} = $OGS; ## replace existing with new gene
                $extrapill{$pill_a}{$spec_hit} = $OGS;
                $mark{$pill_a} = "1";
		$alt_p{$pillhit} = $pill_a; ### mark that alternate pillar structures exist here.
		$alt_p{$pill_a} = $pillhit;
	    }
	}
	
	$gene_num{$OGS} = $gene_num{$break2[0]}+1;
	$adj_l{$OGS} = $break2[0];
	$adj_r{$OGS} = $break2[1];
	$adj_l{$break2[1]} = $OGS;
	$adj_r{$break2[0]} = $OGS;
	$chr{$OGS} = $chr{$break2[0]};
	$oldname{$OGS} = $OGS_old;
	
	my $sm_loc; my $no_m; my $seq_fin; my $nt_seq_fin; my $H_loc;
	my $nt_seq = $NT_OG{$OGS_old}{$first[1]};
	$nt_seq_all{$OGS} = $nt_seq;	
	chomp $seq;
        if ($first[2] =~ /complement/) { ##If on complement strand
            $arrow{$OGS} = "<-";
            my @break_coord = split (/_/, $first[2]);
            my @coord1 = split (/\(/, $break_coord[0]); ### coordinates.
            my @coord2 = split (/\)/, $break_coord[1]);
            $gene_start{$OGS} = $coord1[1];  ## update gene start and stop hashes
            $gene_stop{$OGS} = $coord2[0];
        } else { ### on normal strand
            $arrow{$OGS} = "->";
            my @break_coord = split (/_/, $first[2]);
            $gene_start{$OGS} = $break_coord[0];  ## update gene start and stop hashes
            $gene_stop{$OGS} = $break_coord[1];   ##
        }

	if ($seq =~ /m/i) { #print "start present\t"; 
			    my @sm_split = split(/m/i, $seq); $sm_loc = (length($sm_split[0])+1); } else { $no_m++; 
												       }
	if ($seq =~ /[A-Z]/) { my @split_hsp = split (/[A-Z]/, $seq); $H_loc=(length($split_hsp[0])+1); 
			   #    print "HSP start: $H_loc\n";
			   } 
	unless (defined $alt_s_pill{$pillhit}) { ## disabling no M rejection rule if alternative START codon present ##
	    unless ($no_m ==1) { 
		if ((length($seq)-$sm_loc+1) >10 ) {
                    $seq_fin = substr($seq, ($sm_loc-1), (length($seq)-$sm_loc+1));
                    $nt_seq_fin = substr($nt_seq, (($sm_loc-1)*3), ((length($seq)-$sm_loc+1)*3));
		    $real_len{$OGS} = length($seq_fin); ## subtract 1 here for aa length
                    if ($first[2] =~ /complement/) { $coord_fin{$OGS} = "complement(".$gene_start{$OGS}."..".($gene_stop{$OGS}- (3*($sm_loc-1))).")";
                                                 } else { $coord_fin{$OGS} = $gene_start{$OGS}+(3*($sm_loc-1))."..".$gene_stop{$OGS}; }
                } else { $reject_mark{$OGS} = 1; }

	    } else { 
	#	print "no start\t"; 
		$seq_fin = $seq; $nt_seq_fin = $nt_seq; #}
	    }
	} else {
	    $seq_fin = substr($seq, ($H_loc-1), (length($seq)-$H_loc+1));
	    $nt_seq_fin = substr($nt_seq, (($H_loc-1)*3), ((length($seq)-$H_loc+1)*3));
	    $real_len{$OGS} = (length($seq_fin))."?"; ## subtract 1 here for aa length
	    if ($first[2] =~ /complement/) { $coord_fin{$OGS} = "complement(".$gene_start{$OGS}."..".($gene_stop{$OGS}- (3*($H_loc-1))).")";
					 } else { $coord_fin{$OGS} = $gene_start{$OGS}+(3*($H_loc-1))."..".$gene_stop{$OGS}; }
	    
	}
#	print "loc of m $sm_loc alt? $alt_s_pill{$pillhit} seq_fin $seq_fin\n";
	chomp $in[$i+1];
	chop $in[$i+1];
	$aa{$OGS} = $in[$i+1];
	if ((defined $first[3]) && ($first[3] ne "")) { $correct{$OGS} = $first[3]; } else { $correct{$OGS} = "None"; }
#	print "BEFORE: o1 $o1{$OGS} o2 $o2{$OGS\n";
	if ($gene1_stop > $gene_start{$OGS}) { my $olap_left =  $gene1_stop - $gene_start{$OGS}; if ($first[2] =~ /complement/) { $o2{$OGS} = int(($olap_left+2)/3); } else { $o1{$OGS} = int(($olap_left+2)/3); } }
        if ($gene2_start < $gene_stop{$OGS}) { my $olap_right = $gene_stop{$OGS}-$gene2_start;  if ($first[2] =~ /complement/) { $o1{$OGS} = int(($olap_right+2)/3); } else { $o2{$OGS} = int(($olap_right+2)/3); }}
#	print "$OGS o1 $o1{$OGS} o2 $o2{$OGS} (1start $gene_start{$break2[0]}) 1stop $gene1_stop 2start $gene2_start (stop $gene_stop{$break2[1]}) OGSstart $gene_start{$OGS} OGSstop $gene_stop{$OGS}\n";

	$nt_seq_om{$OGS} = $nt_seq_fin; chomp $seq_fin; $aa_seq_om{$OGS} = $seq_fin;
	$mark{$pillhit} = "1"; 
	$Neworf_inf{$OGS} = $in[$i];
	chomp $in[$i+1];
	$Neworf_seq{$OGS} = $in[$i+1];
	$species{$OGS} = $spec_hit;
	$IG{$OGS} = $IG{$OGS_old};
#	print "OGS2 $OGS\n";
	
	my @nt_arr; my @IG_arr;  ### Make fasta files for intergenic region, and nt sequence of new ORF.
	for (my $l =0; $l <= length($nt_seq); $l=$l+60) {
	    push @nt_arr, substr($nt_seq, $l, 60);
	}
        for (my $l =0; $l <= length($IG{$OGS}); $l=$l+60) {
            push @IG_arr, substr($IG{$OGS}, $l, 60);
        }
	open OUT, ">".$nt_dir."/nt_seq_".$OGS.".html";
	print OUT "<html><head><title><Nucleotide sequence of ".$OGS."</title></head>\n<body><font face = 'courier new'>\>$OGS<br>\n";
	foreach my $elt (@nt_arr) {
	    print OUT "$elt<br>\n"
	    }
	print OUT "</font></body></html>\n";
	close OUT;

        open OUT, ">".$nt_dir."/IG_seq_".$OGS.".html";
        print OUT "<html><head><title>Intergenic sequence between adjacent genes in OGS fragment ".$OGS."</title></head>\n<body><font face = 'courier new'>\>$OGS<br>\n";
        foreach my $elt (@IG_arr) {
            print OUT "$elt<br>\n"
            }
        print OUT "</font></body></html>\n";
        close OUT;
    }

}


foreach my $pill_e (keys %extrapill) {
    foreach my $pill_p (keys %pillar) {
	my @array = split (/_/, $pill_e);
	my $key_temp = $array[0];
	if ($key_temp eq $pill_p) { 
	    foreach my $spec (keys %{$pillar{$pill_p}}) {
		unless (defined $extrapill{$pill_e}{$spec}) {
                    $pillar{$pill_e}{$spec} = $pillar{$pill_p}{$spec};
#                    print "pille $pill_e $spec $pillar{$pill_e}{$spec} $pillar{$pill_p}{$spec}";
		}
	    }
	}
    }
}

### Run yn00 on whole set of pillars containing potential ORFS
#open ALN, ">yn_aln_test_25";
#open CPG, ">yn_cpg_test_25";
#open PHY, ">yn_phy_test_25";

my %cons_test; my %cons_query;
if ($yn_on == 1) {
    my $count_yn=0;
    open TEST1, ">$store/seqs_for_yn.txt";
    open YN1, ">".$store."/cons_test.tab";
    open YN2, ">".$store."/cons_query.tab";
    foreach my $pill (keys %pillar) {
	print TEST1 "pill $pill ";
	if (defined $mark{$pill}) {
	    print TEST1 "mark $mark{$pill} ";
	    foreach my $spec (keys %{$pillar{$pill}}) {
		foreach my $spec2 (keys %{$pillar{$pill}}) {
		    if ($spec ne $spec2) {
			    my $gene = $pillar{$pill}{$spec};
			    my $gene2 = $pillar{$pill}{$spec2};
			    print TEST1 "gene1 $gene gene2 $gene2\n";
			    unless (($reject_mark{$gene} ==1) || ($reject_mark{$gene2} ==1)) {
				my $aa_seq1 = uc($aa_seq_om{$gene}); my $aa_seq2 = uc($aa_seq_om{$gene2});
				open NT, ">yn00.nt" || die; #NEED TO MAKE AN AA AND NUC FILE WITH EACH GENE IN PILLAR
				open AA, ">yn00.aa" || die;
				
				print NT ">One\n";
				print NT "$nt_seq_om{$gene}\n";
				print AA ">One\n";
				print AA "$aa_seq1\n";
				
				print NT ">Two\n";
				print NT "$nt_seq_om{$gene2}\n";
				print AA ">Two\n";
				print AA "$aa_seq2\n";
			    
				print YN1 ">$gene One\n$nt_seq_om{$gene}\n$aa_seq1\n>$gene2 Two\n$nt_seq_om{$gene2}\n$aa_seq2\n";
#				print "$gene one $gene2 two\n";
				close AA;
				close NT;
				
				system "$clustalo -i yn00.aa -o yn00.aln --outfmt=clu > /dev/null"; # clustalo aa align
				system "$copygaps yn00.aln yn00.nt yn00.cpg  > /dev/null"; # get nt align
####				system "$clustalo --outfmt=phy -i yn00.cpg -o yn00.phy > /dev/null";
				my $cpg_file = "yn00.cpg"; &phy_maker($cpg_file);
				open SEQ, "<yn00.phy"; my @phy = <SEQ>; close SEQ; 
			#	print "phy0 $phy[0]\n";
				open SEQ, ">yn00.phy"; chop $phy[0]; print SEQ "$phy[0] I\n"; foreach my $p (@phy[1..$#phy]) { print SEQ $p; } close SEQ;
				open SEQ, ">yn00.ctl";
				print SEQ "      seqfile = yn00.phy * sequence data file name\n";
				print SEQ "      outfile = yn00.yn  * main result file\n";
				print SEQ "      verbose = 0  * 1: detailed output (list sequences), 0: concise output\n\n";
				print SEQ "        icode = 0  * 0:universal code; 1:mammalian mt; 2-10:see below\n\n";
				print SEQ "    weighting = 0  * weighting pathways between codons (0/1)?\n";
				print SEQ "   commonf3x4 = 0  * use one set of codon freqs for all pairs (0/1)?\n";
				close SEQ;

				system "$yn00 yn00.ctl > /dev/null"; # get Ka
				open SEQ, "<yn00.yn"; my @seq = <SEQ>; close SEQ; my @l = split(/\s+/,$seq[$#seq-20]);				
#				print "$seq[0]"; 
				my $yn_test; unless ($seq[0] =~ "YN00") { $yn_test = "n/a"; }
				print YN1 "$seq[0] $yn_test\n";
#				exit;
				if ($yn_test eq "n/a") {
				    $cons_test{$pill}{$gene}{$gene2}{"KA"} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'KS'} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'OM'} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'N'} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'S'} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'KS_SE'} = "undef2";
				    $cons_test{$pill}{$gene}{$gene2}{'KA_SE'} = "undef2";
				} else {
				    my $ks=$l[($#l-2)]; chop $ks;
				
				    $cons_test{$pill}{$gene}{$gene2}{'KA'}=$l[($#l-5)];
				    $cons_test{$pill}{$gene}{$gene2}{'KS'}=$ks;
				    $cons_test{$pill}{$gene}{$gene2}{'OM'}=$l[($#l-6)];
				    $cons_test{$pill}{$gene}{$gene2}{'N'}=$l[($#l-9)];
				    $cons_test{$pill}{$gene}{$gene2}{'S'}=$l[($#l-10)];
				    $cons_test{$pill}{$gene}{$gene2}{'KS_SE'}=$l[($#l-0)];
				    $cons_test{$pill}{$gene}{$gene2}{'KA_SE'}=$l[($#l-3)];
				#    print "test $pill $gene $gene2 KA $cons_test{$pill}{$gene}{$gene2}{'KA'} KS $cons_test{$pill}{$gene}{$gene2}{'KS'} omega $cons_test{$pill}{$gene}{$gene2}{'OM'}\n";
				    print YN1 "$pill\t$gene\t$gene2\t";
				    if (($cons_test{$pill}{$gene}{$gene2}{"OM"} == "99.00") || ($cons_test{$pill}{$gene}{$gene2}{"KS"} > "5.00")) {
					foreach my $stat (keys %{$cons_test{$pill}{$gene}{$gene2}}) {
					$cons_test{$pill}{$gene}{$gene2}{$stat} = "undef1";				
					}
				    }
				} 
				system "/bin/rm -rf yn00.nt yn00.aa yn00.aln yn00.dnd yn00.cpg yn00.phy yn00.ctl yn00.yn"; # tidy up files we use
			    }
		    }
		}
	    }
##	    system "/bin/rm -rf yn00.nt yn00.aa yn00.aln yn00.dnd yn00.cpg yn00.phy yn00.ctl yn00.yn"; # tidy up files we use
	}
    }

## 'mark' the query ORFs ###
    foreach my $pill (keys %pillar) {
	if (defined $mark{$pill}) {
	    foreach my $spec (keys %{$pillar{$pill}}) {
		if ($pillar{$pill}{$spec} =~ /-/) {
		    $cons_query{$pill}{$pillar{$pill}{$spec}} = "Q";
		    print YN2 "$pill\t$pillar{$pill}{$spec}\t$cons_query{$pill}{$pillar{$pill}{$spec}}\n";
		}
	    }
	}
    }
    close TEST1;
 #   close YN1; close YN2;
}

#close ALN; close CPG; close PHY;
#### Stage 3.2 Blast results for newORF against syntenic pillar ###

if ($blast_option == 1) {
    foreach my $ORF (keys %Neworf_inf) {
	unless ($reject_mark{$ORF} ==1) {
	    my @split = split(/-/, $ORF);
	    my $gene = $split[0];
	    my @first = split (/\t/, $Neworf_inf{$ORF});
	    my @break_hits = split (/\s/, $first[1]); ### split genes that hit the OGS fragment.
	    my $pillhit = $map{$break_hits[0]};
	    
	    open OUT, ">".$blast_form."/".$ORF."_".$pillhit."_form.html";
	    open IN, "<".$blast_orig."/out-seq".$gene."_".$pillhit."_full.fullblast"; my @or = <IN>; close IN;

	    print OUT "<html>";
	    print OUT "<pre>";
	    foreach my $line (@or) {
		print OUT "$line";
	    }
	    print OUT "</html>";
	    close OUT;
	}
    }
}
###################################### Stage 3.2 work out omega averages, significances, print out html files for these #########

my (%om_average, %q_average, %sig, %allsig, %mostsig, %sig_word1, %SE_hash, %differ, %bounds1, %bounds2);

### work out whether dS-DN != 0 statistically ###

open TESTF, ">$store/om_qav_test.out";

foreach my $pill (keys %cons_test) { # in cases where yn00 works one way but not reciprocally, this module plugs in the "working" values.
    foreach my $gene (keys %{$cons_test{$pill}}) { 
           foreach my $gene2 (keys %{$cons_test{$pill}{$gene}}) { #fix non-recip problem	
	       if ((defined $cons_test{$pill}{$gene}{$gene2}{"OM"}) && ($cons_test{$pill}{$gene2}{$gene}{"OM"} eq "undef2")) { ## do foreach $stat
		   foreach my $stat (keys %{$cons_test{$pill}{$gene}{$gene2}}) {		       
		       $cons_test{$pill}{$gene2}{$gene}{$stat} = $cons_test{$pill}{$gene}{$gene2}{$stat};} }
	       if ((defined $cons_test{$pill}{$gene2}{$gene}{"OM"}) && ($cons_test{$pill}{$gene}{$gene2}{"OM"} eq "undef2")) {
		   foreach my $stat2 (keys %{$cons_test{$pill}{$gene}{$gene2}}) {
		       $cons_test{$pill}{$gene}{$gene2}{$stat2} = $cons_test{$pill}{$gene2}{$gene}{$stat2};} }
	   }
    }
}
close YN1;

foreach my $pill (keys %cons_test) { 
    foreach my $gene (keys %{$cons_test{$pill}}) { 
        my $count_pil=0;
        if (defined $cons_query{$pill}{$gene}) { ### If this is a Q
            foreach my $gene2 (keys %{$cons_test{$pill}{$gene}}) {
                $count_pil++;
                my $failcount=0;
                unless ($cons_test{$pill}{$gene}{$gene2}{"OM"} =~ /undef/) {##
                    my $N = $cons_test{$pill}{$gene}{$gene2}{"N"};
                    my $S = $cons_test{$pill}{$gene}{$gene2}{"S"};
                    my $ka = $cons_test{$pill}{$gene}{$gene2}{"KA"};
                    my $ks = $cons_test{$pill}{$gene}{$gene2}{"KS"};
                    my $ka_se = $cons_test{$pill}{$gene}{$gene2}{"KA_SE"};
                    my $ks_se = $cons_test{$pill}{$gene}{$gene2}{"KS_SE"};

#		    print "test $pill $gene $gene2 KA $cons_test{$pill}{$gene}{$gene2}{'KA'} KS $cons_test{$pill}{$gene}{$gene2}{'KS'} omega $cons_test{$pill}{$gene}{$gene2}{'OM'}\n";
                    my $diff = $ks-$ka;
                    my $SE = sqrt(($ka_se*$ka_se) + ($ks_se*$ks_se)); #Standard error of the difference
                    $SE_hash{$gene}{$gene2} = sprintf("%.4f", $SE);
                    my $ninetyfive = 1.96*$SE;
                    my $bound_A=$diff-$ninetyfive; my $bound_B=$diff+$ninetyfive; # 95% confidence interval
                    $differ{$gene}{$gene2} = $diff;

                    my $small; my $big;
                    if ($bound_A > $bound_B) {
                        $small = $bound_B; $big=$bound_A;
                    } else {
                        $small = $bound_A; $big=$bound_B;
                    }
                    if (($small < 0) && ($big > 0)) {
                        $sig_word1{$gene}{$gene2} = "NO";
                    } if (($small > 0) && ($big > 0)) {
                        $sig_word1{$gene}{$gene2} = "YES";
                    }
                }
            }
	
        }
    }
}

### work out average omega for pillar (excluding potential ORFS ###
foreach my $pill (keys %cons_test) {
    my $count_om=0;
    my $om_add;
    foreach my $gene (keys %{$cons_test{$pill}}) {
        unless (defined $cons_query{$pill}{$gene}) {
            foreach my $gene2 (keys %{$cons_test{$pill}{$gene}}) {
                unless (defined $cons_query{$pill}{$gene2}) {
                    unless ($cons_test{$pill}{$gene}{$gene2}{"OM"} =~ /undef/) {
                        $count_om++;
                        $om_add += $cons_test{$pill}{$gene}{$gene2}{"OM"};
                    }
                }
            }
        }
        unless (($count_om == 0) || ($om_add eq undef)) {
            my $om_av = $om_add/$count_om;
            $om_average{$gene} = $om_av;
        }
    print TESTF "$pill $gene $om_average{$gene}\n"; 
    }
}

### work out average omega for each potential ORF vs rest of pillar ###
foreach my $pill (keys %cons_test) {
    foreach my $gene (keys %{$cons_test{$pill}}) {
	my $count_q=0;
	my $q_add;
        if (defined $cons_query{$pill}{$gene}) {
            foreach my $gene2 (keys %{$cons_test{$pill}{$gene}}) {
		    unless ($gene eq $gene2) {
			my $om1 =$cons_test{$pill}{$gene}{$gene2}{"OM"}; ##
			print TESTF "\nQ1 $gene $gene2 $om1 ";
			    unless ($cons_test{$pill}{$gene}{$gene2}{"OM"} =~ /undef/) {##

				$count_q++;
				$q_add += $cons_test{$pill}{$gene}{$gene2}{"OM"};
				print TESTF "cq $count_q qadd $q_add ";
			}
		    }
            }
            unless (($count_q == 0) || ($q_add eq undef)) {
                my $q_av = $q_add/$count_q;
                $q_average{$gene} = $q_av;
            }
        }
	print TESTF "$pill $gene $q_average{$gene}\n";
    }
}

### make output page with table of omega values and standard errors ### ##check this
open OM, ">$store/query_and_hit_omegas.out";
foreach my $pill (keys %cons_test) {
    foreach my $gene (keys %{$cons_test{$pill}}) {
	unless ($reject_mark{$gene} ==1) {
        if (defined $cons_query{$pill}{$gene}) {
            open OUT, ">".$omega_dir."/omega_".$gene.".html";

            print OUT "<html><head><title>$gene Ka Ks values</title></head>\n<body>\n";
            print OUT "<br>Pillar <b>$pill</b>\nGenes in pillar:<b>";
            foreach my $gene1 (keys %{$pillar{$pill}}) {
                print OUT "$pillar{$pill}{$gene1} ";
            }
	    my $om = sprintf("%.4f", $om_average{$gene});
	    my $q = sprintf("%.4f", $q_average{$gene});

           if ((defined $om_average{$gene}) && ($om_average{$gene} ne "")) {
#	       print "pill $pill gene $gene om_average $om_average{$gene} ";
	       print OM "pill $pill gene $gene om_average $om_average{$gene} ";
	       print OUT "</b><br>\nAverage Ka/Ks (pairwise) between known genes in the pillar:<b> $om</b>\n";
            } else {
                print OUT "</b><br>\nAverage Ka/Ks (pairwise) between known genes in the pillar:<b> undefined</b>\n";
            }
            if ((defined $q_average{$gene}) && ($q_average{$gene} ne "")) {
		print OM "q_average $q_average{$gene} ";
#		print "q_average $q_average{$gene} ";
		print OUT "<br>Average Ka/Ks of new ORF against known genes in pillar:<b> $q</b>\n";
            } else {
                print OUT "<br>Average Ka/Ks of new ORF against known genes in pillar:<b> undefined</b>\n";
            }
	    print OM "\n";
            print OUT "<br><br>\n";
            print OUT "<table border=2 align=centre size=80% cellpadding=3 cellspacing=2><tr>\n";
            print OUT "<td>gene1</td><td>gene2</td><td>Ka</td><td>Ks</td><td>omega(Ka/Ks)</td><td>Ks-Ka</td><td>SE of Ks</td><td>SE of Ka</td><td>SE of Ks-Ka</td><td>p<0.05 statistical significance?*</td></tr>\n";
            foreach my $gene2 (keys %{$cons_test{$pill}{$gene}}) {
                my $ks = $cons_test{$pill}{$gene}{$gene2}{'KS'};
                my $ka = $cons_test{$pill}{$gene}{$gene2}{'KA'};
                my $om = $cons_test{$pill}{$gene}{$gene2}{'OM'};
                my $kase = $cons_test{$pill}{$gene}{$gene2}{'KA_SE'};
                my $ksse = $cons_test{$pill}{$gene}{$gene2}{'KS_SE'};
		print OM "TEST $gene $gene2 $ks $ka $om $kase $ksse $SE_hash{$gene}{$gene2}\n";
		
                print OUT "<td>$gene</td><td>$gene2</td><td>$ka</td><td>$ks</td><td>$om</td><td>$differ{$gene}{$gene2}</td><td>$ksse</td><td>$kase</td><td>$SE_hash{$gene}{$gene2}</td><td>$sig_word1{$gene}{$gene2}<td></tr>\n";
            }
            print OUT "</table>\n";
            print OUT "<br>Ks significantly greater than Ka (and therefore <b>a (Ks-Ka) value significantly greater than 0</b>) is an indicator of protein conservation. Statistical significance of (Ks-Ka)>0 is carried out as follows:<br>\n";
            print OUT "1. Calculate SE of (Ks-Ka) by the formula:<b> SE(Ks-Ka) = sqrt[(SE(Ka)^2) + (SE(Ks)^2)]\n</b><br>";
            print OUT "2. Calculate a 95% confidence interval for (Ks-Ka) by the formula:<b> Ks-Ka +- 1.96(SE(Ks-Ka))\n</b><br>";
            print OUT "3. If the entire confidence interval is >0, then (Ks-Ka) is statistically significantly greater than 0 and there is evidence of protein sequence conservation\n\n";

	    print OUT "<br><br><b>UNDEF1</b>: When yn00 returns an omega value of 99, or a KS value of >5, this is indicative of an uninformative alignment - results are discarded\n";
	    print OUT "<br><b>UNDEF2</b>: In some instances (e.g. where a stop codon readthrough has been allowed in one of the sequences) yn00 fails to run. In these instances 'undef2' is returned\n";
            print OUT "</body></html>\n";
            close OUT;
        }
	}
    }
}
close OM; close TESTF;

### Stage 3.2b - Creates the data for the output pages

my (%pill_med, $i);
foreach my $pill (sort {$a<=>$b} keys %pillar) { # figure out median length of genes in pillar
    my @len_array; my @sort_len; my $len_tot=0; my $len_av=0; my $pil_mem=0; my $med;
    foreach my $spec (sort {length($aa{$pillar{$pill}{$a}}) <=> length($aa{$pillar{$pill}{$b}})} keys %{$pillar{$pill}}) {
	unless ($pillar{$pill}{$spec} =~ /-/) {
	    push @len_array, (length($aa{$pillar{$pill}{$spec}})+1);
	}
    }
    my $el_num = @len_array;
    if ($el_num % 2 == 0) {
        $med = ($len_array[$el_num/2] + $len_array[$el_num/2 - 1])/2;
    } else {
        $med = $len_array[int($el_num/2)];
    }    
    $pill_med{$pill} = $med;
#    print "$pill test_med: median $med\n"; 
}

##### Stage 3.3
my (%genetag, %descr, %gb_acc, %OGS_nt_all, %chromlist); #load data for outputs
open IN, "<".$store."/gene_tag_all.tab"; my @gt = <IN>; close IN;
foreach my $line (@gt) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $genetag{$split[0]} = $split[1];
}
open IN, "<".$store."/descr_all.tab"; my @ds = <IN>; close IN;
foreach my $line (@ds) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $descr{$split[0]} = $split[1];
}
open IN, "<".$store."/gb_acc_all.tab"; my @gb = <IN>; close IN;
foreach my $line (@gb) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $gb_acc{$split[0]} = $split[1];
}
open IN, "<".$store."/OGS_nt_seq_all.tab"; my @ont = <IN>; close IN;
foreach my $line (@ont) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $OGS_nt_all{$split[0]} = $split[1];
}
open IN, "<".$store."/chromlist_all.tab"; my @cl = <IN>; close IN;
foreach my $line (@cl) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $chromlist{$split[0]}{$split[1]} = $split[2];
}

my ($element, %perc_hash, %key_hash, %outhash, %rev_spec, %pilltable, %lghash, %rghash, %new_in_pill);
my $string1 = "<font color='green'><b>m</b></font>"; my $string2 = "<font color='green'><b>M</b></font>"; my $string3 = "<font color='red'><b>*</b></font>"; my $string4 = "<font color='red'><b>X</b></font>"; my $string5 = "<font color='purple'><b>M</b></font>"; my $string6 = "<font color='orange'><b>M</b></font>"; my $string7 = "<font color='0000FF'><b>M</b></font>"; my $string8 = "<font color='000000'><b>M</b></font>"; my $ul_start ="<u>"; my $ul_stop="</u>";
my $string9 = "<font color='purple'><b>"; my $string10 = "<font color='orange'><b>"; my $string11 = "<font color='0000FF'><b>"; my $string12 = "<font color='000000'><b>"; my $string13 = "<font color='FF0000'><b>M</b></font>";  my $string14 = "<font color='FF0000'><b>"; my $string_end = "</b></font>"; 
foreach my $OGS (keys %Neworf_inf) {
    my $pillarno = $map{$OGS};
    $new_in_pill{$pillarno}{$species{$OGS}} = $OGS; ### hash of new genes per pillar
}

#open TEST_ALT, ">testalt.tab";

### Make data structures for output page.

foreach my $OGS (keys %Neworf_inf) { ###split and arrange info from Neworf_in key: OGS hit ###    
    unless ($reject_mark{$OGS} ==1) {
    my $focal_gene = $OGS;
    my @genes = split (/-/, $OGS); my $lgene= $genes[0]; my $rgene=$genes[1];    ### make table of genes in pillar to left (inc left OGS gene) right and centre ###
#    print "lgene0 $lgene rgene0 $rgene\n";
    my @first = split (/\t/, $Neworf_inf{$OGS});
    my @break_hits = split (/\s/, $first[1]); ### split genes that hit the OGS fragment.
    my $pillhit = $map{$break_hits[0]};
    my $pillarno = $map{$OGS}; ###pillar for OGS ##
    my @coords = split (/_/, $first[2]); ## replace _ with .. in coordinates ##
    foreach my $k (keys %alph) { ##get rid of any _A, _B tails;
        my $m = "_".$alph{$k};
        if ($rgene =~ /$m/) {
            my @rgene_arr = split (/$m/, $rgene);
            $rgene = $rgene_arr[0];
        }
    }
    $first[2] = "Candidate stop to stop ORF:<br>\n<a href=".$nt_dir."/nt_seq_".$OGS.".html>".$coords[0]."..".$coords[1]."</a>\n<br><br>\nStart to stop ORF<br>$coord_fin{$OGS}<br><br>\nIntergenic Sequence:<br>\n<a href=".$nt_dir."/IG_seq_".$OGS.".html>".($gene_stop{$lgene}+1)."..".($gene_start{$rgene}-1)."</a><br><br>Correction to make full-length ORF?<br>$correct{$OGS}";
    pop @first;
    pop @first;

    shift @first;  ### unshift link to hits onto front of in array ###

    my $gtl = $genetag{$lgene}; my $gtr = $genetag{$rgene};
    if ((defined $genetag{$lgene}) && ($genetag{$lgene} ne "")) {
        $gtl = "(<i><font color='red'>".$gtl."</font></i>)";
    }
    if ((defined $genetag{$rgene}) && ($genetag{$rgene} ne "")) {
        $gtr = "(<i><font color='red'>".$gtr."</font></i>)";
    }
#    print "lgene $lgene rgene $rgene gtl $gtl gtr $gtr\n";
    my $OGS_orig; my $string = "_ORF";
    if ($OGS =~ $string) { 
	my @OGS_ar = split (/$string/, $OGS); $OGS_orig = $OGS_ar[0]; 
    } else { $OGS_orig = $OGS; } # OGS_orig cuts off "_ORFA" points "_ORFA" hits to the original BLAST (new ORF against pillar) file made in previous step
    unshift @first, "<a href=".$blast_new_dir."/".$OGS_orig."_".$pillhit."_form.html>$OGS</a><br><br>ORF in intergenic region between $lgene $gtl and $rgene $gtr<br><br><a href=".$blast_form."/".$OGS."_".$pillhit."_form.html>Original BlastX result";
    unshift @first, $pillarno; ## first column - pillar - will be table with left, middle, and right pillar ###
    if ((defined $q_average{$OGS}) && ($q_average{$OGS} ne "")) { # || ($q_average{$OGS} == 0)) {
	my $q = sprintf("%.4f", $q_average{$OGS});
#	print "loading: qav: $q_average{$OGS} q: $q\n";
        push @first, "Omega average: $q <a href=".$omega_dir."/omega_".$OGS.".html>details</a>";
    } else {
        push @first, "Omega average cannot be calculated <a href=".$omega_dir."/omega_".$OGS.".html>details</a>";
    }
    if (defined $allfeat{$oldname{$OGS}}) {
	my $allfeat_co; my $feat_string="";
	foreach my $num (sort {$allfeat_start{$oldname{$OGS}}{$a}<=>$allfeat_start{$oldname{$OGS}}{$b}} keys %{$allfeat_start{$oldname{$OGS}}}) {
	    if ($strand_af{$OGS}{$num} eq "plus") { 
#		print "plus $strand_af{$OGS}{$num}\n"; 
		$allfeat_co = $allfeat_start{$oldname{$OGS}}{$num}."..".$allfeat_stop{$oldname{$OGS}}{$num}."\n";  
	  }  else { 
#		print "minus $strand_af{$OGS}{$num}\n"; 
		$allfeat_co = "complement(".$allfeat_start{$oldname{$OGS}}{$num}."..".$allfeat_stop{$oldname{$OGS}}{$num}.")\n"; 
	    } 
	    $feat_string.=$allfeat{$oldname{$OGS}}{$num}."<br>".$allfeat_co."<br>";
	}
	push @first, $feat_string;
    } else {
	push @first, "NO";
    }
    my $len_orf = length($Neworf_seq{$OGS});
    push @first, $real_len{$OGS};
    my $perc_med;
    unless (($pill_med{$pillarno} eq undef) || ($pill_med{$pillarno} == 0)) {
	$perc_med = ($len_orf*100/$pill_med{$pillarno});
    }
    $perc_med = sprintf "%.1f", $perc_med; ##get percentage length of pillar median for ORF.
#    print "OGS $OGS map $map{$OGS} $pillarno length orf = $len_orf median = $pill_med{$pillarno}  qav $q_average{$OGS} pm $perc_med\n";
    push @first, $perc_med."%";

    ### making table of predicted and known genes in a pillar plus lengths ###
    foreach my $spec (keys %{$pillar{$pillarno}}) {
	my $genepil = $pillar{$pillarno}{$spec};
        unless ($aa{$genepil} =~ /\*/) {
            $aa{$genepil} .= "*"
            }
	unless ($pillar{$pillarno}{$spec} =~ /-/) {
            my $l = length($aa{$pillar{$pillarno}{$spec}}); ##subtract 1 from both of these if giving AA length not codon length
            $pilltable{$pillarno}{$spec} = "K ".$l; ### keys pillar, species. tag value length of known genes
        } else  {
            my $l = $real_len{$pillar{$pillarno}{$spec}};
	    $pilltable{$pillarno}{$spec} = "P ".$l; ## length of predicted genes
        }
    }
    foreach my $spec (keys %spec_name) {
        unless (defined $pilltable{$pillarno}{$spec_name{$spec}}) { ### making "blanks" where no genes exist###
            $pilltable{$pillarno}{$spec_name{$spec}} = " ";
        }

    }
    ### making the table ###
    my $tablestring = "<table border=2 align=centre size=80% cellpadding=3 cellspacing=2>\n<tr>\n";
    foreach my $kspec (keys %{$pilltable{$pillarno}}) {
        $tablestring .= "<td><font size='2'>$kspec</td>"; #species row
    }
    $tablestring .= "</tr>\n<tr>\n";
    foreach my $kspec (keys %{$pilltable{$pillarno}}) {
        if ($pilltable{$pillarno}{$kspec} =~ /P/) {
            $tablestring .= "<td bgcolor='red'><font size='2'>$pilltable{$pillarno}{$kspec}</td>"; #orfs row;
        }
        if ($pilltable{$pillarno}{$kspec} =~ /K/) {
            $tablestring .= "<td bgcolor='yellow'><font size='2'>$pilltable{$pillarno}{$kspec}</td>";
        }
        if ($pilltable{$pillarno}{$kspec} eq " ") {
            $tablestring .= "<td><font size='2'>$pilltable{$pillarno}{$kspec}</td>";
        }
    }
    $tablestring .= "</tr>\n</table>\n";
    push @first, $tablestring;

    my $seqstring; ### add sequence of all genes inc predicted ORFs to end.
    $aa{$OGS} = $Neworf_seq{$OGS};
    my @order;
    foreach my $spec (keys %{$pillar{$pillarno}}) { if ($pillar{$pillarno}{$spec} eq $OGS) { 
#	print "OGS - $pillar{$pillarno}{$spec}"; 
	push @order, $spec; 
    } }
    foreach my $spec (keys %{$pillar{$pillarno}}) { if ((defined $Neworf_inf{$pillar{$pillarno}{$spec}}) && ($pillar{$pillarno}{$spec} ne $OGS)) { 
#	print "non OGS - $pillar{$pillarno}{$spec}"; 
	push @order, $spec; 
    } }
    foreach my $spec (keys %{$pillar{$pillarno}}) { unless (defined $Neworf_inf{$pillar{$pillarno}{$spec}}) { 
#	print "other OGS - $pillar{$pillarno}{$spec}"; 
	push @order, $spec;
    } }
    
    foreach my $spec (@order) {
        my @aa_arr; my $seqstring_ind; my $seqstring_ind2; my $seqstring_ind3;
        my $genepil = $pillar{$pillarno}{$spec};       
	for (my $l =0; $l < length($aa{$genepil}); $l=$l+60) {
	    push @aa_arr, substr($aa{$genepil}, $l, 60);
	} 
	foreach my $elt (@aa_arr) { $seqstring_ind .= $elt."<br>"; } 	    
#	print "$OGS $pillar{$pillarno}{$spec} spec $spec o1 $o1{$pillar{$pillarno}{$spec}} o2 $o2{$pillar{$pillarno}{$spec}}\n";
	my $seqstring_x=$seqstring_ind;
	if (defined $o1{$genepil}) { ###problem: using o1{genepil} when should be using o1{gene-spec}!!! figure out how to change ###
	    my $o1_n = $o1{$genepil}-4; 
	    for (my $i=0; $i <$o1{$genepil}; $i+=60) { $o1_n+=4; }
	    my $sub1 = substr($seqstring_ind, 0, $o1_n); # print "sub1 $sub1\n"; 
	    my $ul_sub1 = $ul_start.$sub1.$ul_stop;
#	    print "overlap1 $ol1_n $ul_sub1\n";
	    $seqstring_ind2 = $ul_sub1.substr($seqstring_ind, ($o1_n), (length($seqstring_ind) - $o1_n));
	    $seqstring_x = $seqstring_ind2;
	}
	if (defined $o2{$genepil}) { 
	    my $o2_n = length($aa{$genepil})-$o2{$genepil};
	    my $o2_n_old = $o2_n; 
	    $o2_n=$o2_n-4; 
	    for (my $i=0; $i <$o2_n_old; $i+=60) { $o2_n +=4; } if ((defined $o1{$genepil}) && (($o2{$genepil}+$o1{$genepil}) <= length($aa{$genepil}))) { $o2_n +=7; }
#	    print "double overlap: o1 $o1{$genepil} o2 $o2{$genepil} len ".length($aa{$genepil})."\n";
	    my $sub2 = substr($seqstring_x, ($o2_n), (length($seqstring_x)-$o2_n));#  print "sub2 $sub2\n";  ## add 4 for each <br>\n; 
	    my $ul_sub2 = $ul_start.$sub2.$ul_stop;
	    $seqstring_ind3 = substr($seqstring_x, 0, $o2_n).$ul_sub2;
	    $seqstring_x = $seqstring_ind3;
	}
	    
	my $seqstring_ind_4=$seqstring_x;
	if (defined $alt_s_pill{$map{$genepil}}) { 
#	    print TEST_ALT "$genepil $alt_s_pill{$map{$genepil}} $seqstring_ind_4 $nt_seq_all{$genepil}\n";
	    my $mark=0;
	    for (my $i=3; $i<length($nt_seq_all{$genepil}); $i=$i+3) {
		my $seqstring_ind_tmp;
		my $cod = substr($nt_seq_all{$genepil}, $i, 3);
		my $aa_mark = (($i+3)/3); my $aa_mark_orig = $aa_mark;
#		print TEST_ALT "$genepil $i $cod $aa_mark\n";
		for (my $j=60; $j<$aa_mark_orig; $j+=60) { $aa_mark +=4; } 
		for (my $k=1; $k<=$mark; $k++) { $aa_mark +=(length($string9)+length($string_end));}
                if ((defined $o1{$genepil}) && ($aa_mark_orig <= $o1{$genepil})) { $aa_mark +=3; }
		if ((defined $o1{$genepil}) && ($aa_mark_orig > $o1{$genepil})) { $aa_mark +=7; }
		if ((defined $o2{$genepil}) && ($aa_mark_orig > (length($aa{$genepil})-$o2{$genepil}))) { $aa_mark +=3; }
                if ($cod eq "GTG") {
		    $mark++;
		    $seqstring_ind_tmp = (substr($seqstring_ind_4, 0, $aa_mark-1)).$string9.substr($seqstring_ind_4, $aa_mark-1, 1).$string_end.(substr($seqstring_ind_4, ($aa_mark), length($seqstring_ind_4)-$aa_mark));
		    $seqstring_ind_4 = $seqstring_ind_tmp;
		}
		if ($cod eq "TTG") {
                    $mark++;
                    $seqstring_ind_tmp = (substr($seqstring_ind_4, 0, $aa_mark-1)).$string10.substr($seqstring_ind_4, $aa_mark-1, 1).$string_end.(substr($seqstring_ind_4, ($aa_mark), length($seqstring_ind_4)-$aa_mark));
		    $seqstring_ind_4 = $seqstring_ind_tmp;
                }
		if ($cod eq "CTG") {
                    $mark++;
                    $seqstring_ind_tmp = (substr($seqstring_ind_4, 0, $aa_mark-1)).$string11.substr($seqstring_ind_4, $aa_mark-1, 1).$string_end.(substr($seqstring_ind_4, ($aa_mark), length($seqstring_ind_4)-$aa_mark));
		    $seqstring_ind_4 = $seqstring_ind_tmp;
                }
		if ($cod eq "ATC") { 
		    $mark++; 
		    $seqstring_ind_tmp = (substr($seqstring_ind_4, 0, $aa_mark-1)).$string12.substr($seqstring_ind_4, $aa_mark-1, 1).$string_end.(substr($seqstring_ind_4, ($aa_mark), length($seqstring_ind_4)-$aa_mark));
		    $seqstring_ind_4 = $seqstring_ind_tmp;
		}
                if ($cod eq "ATT") {
                    $mark++;
                    $seqstring_ind_tmp = (substr($seqstring_ind_4, 0, $aa_mark-1)).$string14.substr($seqstring_ind_4, $aa_mark-1, 1).$string_end.(substr($seqstring_ind_4, ($aa_mark), length($seqstring_ind_4)-$aa_mark));
                    $seqstring_ind_4 = $seqstring_ind_tmp;

                }

	    }
#	    print TEST_ALT "seq_fin $seqstring_ind_4\n";
	}
	$seqstring_ind = $seqstring_ind_4;
	$seqstring_ind =~ s/m/$string1/g;
	$seqstring_ind =~ s/M/$string2/g;
	$seqstring_ind =~ s/\*/$string3/g;
	$seqstring_ind =~ s/X/$string4/g;
	$seqstring_ind = ">$genepil<br>\n".$seqstring_ind;
        unless (defined $Neworf_inf{$genepil}) {
            if ($alt_start{$genepil} eq "GTG") { $seqstring_ind =~ s/$string2/$string5/; } #print "alt_start $genepil $alt_start{$genepil}\n"; }
            if ($alt_start{$genepil} eq "TTG") { $seqstring_ind =~ s/$string2/$string6/; }#print "alt_start $genepil $alt_start{$genepil}\n"; }
            if ($alt_start{$genepil} eq "CTG") { $seqstring_ind =~ s/$string2/$string7/; }#print "alt_start $genepil $alt_start{$genepil}\n"; }
            if ($alt_start{$genepil} eq "ATC") { $seqstring_ind =~ s/$string2/$string8/; }#print "alt_start $genepil $alt_start{$genepil}\n"; }
	    if ($alt_start{$genepil} eq "ATT") { $seqstring_ind =~ s/$string2/$string13/;}# print "alt_start $genepil $alt_start{$genepil}\n"; }
        }
	$seqstring.=$seqstring_ind;
    }
    
    my $linekey=''; ## putting together info line again.
    foreach $element (@first) {
        $linekey .= $element."\t";
    }
    $linekey .= $seqstring;

    $lghash{$OGS} = $lgene;
    $rghash{$OGS} = $rgene;
    $perc_hash{$OGS} = $perc_med;
    %rev_spec = reverse(%spec_name);
    $outhash{$species{$OGS}}{$OGS} = $linekey;
    }
}
#close TEST_ALT;


#### Make Web output ###

open OUT, ">".$results_dir."/Species_".$run."_".$date.".html"; ###make Species 'overpage' first###

print OUT "<html><head><title>Orthologous ORFs identified by species/strain</title></head>\n";
print OUT "<body>\n";
print OUT "<br><b><h2 align='center'><font color='red'>Bacterial Search-DOGS Results: Orthologous ORFs identified by species/strain</b></font></h2><br><br>\n";

print OUT "<table border=2 align='center' size=80% cellpadding=3 cellspacing=2>\n";

print OUT "<tr><td>Link:</td><td>Genome:</td><td>Genbank accession:</td></tr>\n";
foreach my $spec (reverse sort {$rev_spec{$a}<=>$rev_spec{$b}} keys %outhash) {
    print OUT "<tr>\n";
    my $link = "a href=".$spec."_".$run."_".$date.".html";
    print OUT "<td><font size='3'><$link><b>$spec</b></a></font></td><td>";
    foreach my $chr (reverse sort {$gb_acc{$a} <=> $gb_acc{$b}} keys %{$chromlist{$spec}}) {
        print OUT "$descr{$chr}<br>";
    }
    print OUT "</td><td>";
    foreach my $chr (reverse sort {$gb_acc{$a} <=> $gb_acc{$b}} keys %{$chromlist{$spec}}) {
        print OUT "<b>$gb_acc{$chr}</b><br>";
    }
    print OUT "</td>";
    print OUT "</tr>\n";
    
}
print OUT "</table>\n";
print OUT "</body></html>\n";
close OUT;

foreach my $spec (reverse sort {$rev_spec{$a}<=>$rev_spec{$b}} keys %outhash) { ###species-specific pages###
    open OUT, ">$results_dir/".$spec."_".$run."_".$date.".html";
    print OUT "<html><head><title>Orthologous $spec_name{$spec} ORFs identified</title></head>\n";
    print OUT "<body>\n";
    print OUT "<br><b><h2 align='center'><font color='red' face='courier new'>Bacterial Search-DOGS Results: Orthologous $spec_name{$spec} ORFs identified</b></font></h2><br><br>\n";
    
    print OUT "<br><b>Table Of Results:</b></br>\n";
    print OUT "<table border=2 align=centre size=80% cellpadding=3 cellspacing=2>\n";

    print OUT "<tr>\n<td><font size='2' face='courier new'><b>Pillar</b></td><td><font size='2' face='courier new'><b>BLASTP Results</b></td><td><font size='2' face='courier new'><b>Hit by in TblastN</b></td><td><font size='2' face='courier new'><b>Coordinates:</b></td><td><font size='2'><b>Ka/Ks</b></td><td><font size='2'><b>Annotated feature?</b></td><td><font size='2' face='courier new'><b>Start to stop Length (codons)</b></td><td><font size='2' face='courier new'><b>% of pillar median</b></td><td><font size='2' face='courier new'><b>Known (K) and predicted (P) gene lengths (codons)</b></td><td><font size='2' face='courier new'><b>Amino acid sequence</b> (Predicted and Known genes)<br>(Methionines in predicted genes highlighted in green, stop codons and frameshift points in red. Methionines highlighted in <font color='purple'><b>purple</b></font> in annotated genes indicate alternative GTG (V) start codons. <font color='orange'><b>Orange</b></font> =  TTG (L) start, <font color='blue'><b>Blue</b></font> = CTG (L) start, <font color='FF0000'><b>Red</b></font> = ATT (I) start, <font color='000000'><b>Black</b></font> = ATC (I) start)</td></tr>\n";
    foreach my $OGS (reverse sort {$perc_hash{$a}<=>$perc_hash{$b}} keys %{$outhash{$spec}}) {
        my %L_dist; my %R_dist; my @t_array;

        my @nhood; ### make table of genes in pillar to left (inc left OGS gene) right and centre ###
        print OUT "<tr>\n";
#	print "outhash $outhash{$spec}{$OGS}\n";
        my @split = split (/\t/, $outhash{$spec}{$OGS});

        print OUT "<td><table border=3 width=940 align=centre size=80% cellpadding=3 cellspacing=2>\n";
        print OUT "<col width=180>\n <col width=80>\n <col width=180>\n<col width=80>\n <col width=180>\n";
        push @nhood, $map{$lghash{$OGS}};
        push @nhood, $split[0];
        push @nhood, $map{$rghash{$OGS}};
        my %orient; my %orient1; my %orient2;
        foreach my $sp (keys %spec_name) {

            if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $pillar{$nhood[0]}{$spec_name{$sp}})) {
                my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $lpgene = $pillar{$nhood[0]}{$spec_name{$sp}};
                if ($gene_num{$lpgene} < $gene_num{$gene}) {
                    $orient1{$sp} = "+";
                } else {
                    $orient1{$sp} = "-";
                }
            }

            if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $pillar{$nhood[2]}{$spec_name{$sp}})) {
                my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $rpgene = $pillar{$nhood[2]}{$spec_name{$sp}};
                if ($gene_num{$rpgene} > $gene_num{$gene}) {
                    $orient2{$sp} = "+";
                } else {
                    $orient2{$sp} = "-";
                }
            }
            if (($orient1{$sp} eq $orient2{$sp}) || ((defined $orient1{$sp}) && ($orient2{$sp} eq undef)))  {
                $orient{$sp} = $orient1{$sp};
            }
            if ((defined $orient2{$sp}) && ($orient1{$sp} eq undef)) {
                $orient{$sp} = $orient2{$sp};
            }

            if ($orient{$sp} eq "+") {
                if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $adj_l{$pillar{$nhood[1]}{$spec_name{$sp}}})) {
                    my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $lgene = $adj_l{$pillar{$nhood[1]}{$spec_name{$sp}}};		    

                    my $l_ing = $gene_start{$gene} - $gene_stop{$lgene};
                    $L_dist{$sp} = $l_ing;
                }

                if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $adj_r{$pillar{$nhood[1]}{$spec_name{$sp}}})) {
                    my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $rgene = $adj_r{$pillar{$nhood[1]}{$spec_name{$sp}}};

                    my $l_ing = $gene_start{$rgene} - $gene_stop{$gene};
                    $R_dist{$sp} = $l_ing;
                }
             }

            if ($orient{$sp} eq "-") {

                if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $adj_r{$pillar{$nhood[1]}{$spec_name{$sp}}})) {
                    my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $lgene = $adj_r{$pillar{$nhood[1]}{$spec_name{$sp}}};

                    my $l_ing = $gene_start{$lgene} - $gene_stop{$gene};
                    $L_dist{$sp} = $l_ing;
                }

                if ((defined $pillar{$nhood[1]}{$spec_name{$sp}}) && (defined $adj_l{$pillar{$nhood[1]}{$spec_name{$sp}}})) {
                    my $gene = $pillar{$nhood[1]}{$spec_name{$sp}}; my $rgene = $adj_l{$pillar{$nhood[1]}{$spec_name{$sp}}};

                    my $l_ing = $gene_start{$gene} - $gene_stop{$rgene};
                    $R_dist{$sp} = $l_ing;
                }
            }

        }
        print OUT "<tr>\n<td><font size='2' face='courier new'>adjacent left</td><td><font size='2' face='courier new'><b>Distance(nt)</b> to adjacent gene</td><td><font size='2' face='courier new'><b>pillar hit ($map{$OGS}) </b></td><td><font size='2' face='courier new'><b>Distance(nt)</b> to adjacent gene</td><td><font size='2' face='courier new'>adjacent right</td></font></tr>\n";
        foreach my $sp (keys %spec_name) {
            my $num = $sp-1;
            if (defined $pillar{$nhood[0]}{$spec_name{$sp}}) {
                $t_array[$num][0] = $pillar{$nhood[0]}{$spec_name{$sp}};
            } else {
                $t_array[$num][0] = "&nbsp;";
            }
            if (defined $L_dist{$sp}) {
                $t_array[$num][1] = $L_dist{$sp};
            } else {
                $t_array[$num][1] = "&nbsp;";
            }
            if (defined $pillar{$nhood[1]}{$spec_name{$sp}}) {
                $t_array[$num][2] = $pillar{$nhood[1]}{$spec_name{$sp}};
            } else {
                $t_array[$num][2] = "&nbsp;";
            }
            if (defined $R_dist{$sp}) {
                $t_array[$num][3] = $R_dist{$sp};
            } else {
                $t_array[$num][3] = "&nbsp;";
            }
            if (defined $pillar{$nhood[2]}{$spec_name{$sp}}) {
                $t_array[$num][4] = $pillar{$nhood[2]}{$spec_name{$sp}};
            } else {
                $t_array[$num][4] = "&nbsp;";
            }
        }
        foreach my $sp (@t_array) {
            print OUT "<tr>\n";
            foreach my $element (@$sp) {
                if ($element =~ /-/) {
                    print OUT "<td><font size='2' face='courier new'><font color='red'>$element</font><font color='blue'> $arrow{$element}</font></td>\n";
                } else {
                    print OUT "<td><font size='2' face='courier new'>$element <font color='blue'>$arrow{$element}</font></td>\n"
                    }
            }
	    print OUT "</tr>\n";
        }
        print OUT "</table>\n";
        if (defined $alt_p{$map{$OGS}}) {
            print OUT "<font color='red'>Note: alternative $species{$OGS} candidate exists for this location, see pillar $alt_p{$map{$OGS}}</font>\n";
        }
        print OUT "</td>\n";
        for (my $z=1; $z<=$#split; $z++) {
            print OUT "<td><font size='2' face='courier new'>$split[$z]</font></td>\n"; ### print out all the other sections ###
        }
        print OUT "</tr>\n";
    }
    print OUT "</table>\n";
    print OUT "</body></html>\n";
    close OUT;
}
system "mv formatdb.log $store";
system "rm $store/seqs_for_yn.txt";
system "rm $store/cons_test.tab $store/cons_query.tab";
system "rm -R $blast_orig";
system "rm $store/aa_* $store/fasta_* $store/AA*";
system "rm 2YN.* rst rst1 rub";
system "rm -r $run/Blast_Out";
system "rm $run/ogsinpillars.txt $store/test_* $run/pillarsinogs.txt"; 
$x="rmdir $run/current"; system($x);

if ($clean_run == 1) {
    $x="rm $store/*"; system($x);
    system "rm $run/*.txt";
}
close ERROR;

system "mv $run Results";

print "Step 3 (creation of outputs) complete. Please find the result html files in the Results folder.\n The file 'Species_".$run."_".$date.".html acts as a starting point, linking to the other files, and is best opened in a web browser.\n\n Thank you for using SearchDOGS Bacteria. For queries email oheigeas@tcd.ie\n";

sub retrieve_nt {  #retrieve nucleotide sequence of query ORF    
    my ($start, $stop, $tag, $fasta) = @_;
    my $seq_l = substr($fasta, $start-1, ($stop-$start+1));

    return $seq_l;
}

sub phy_maker { #convert cpg file to phylip format
    open IN, "<".$_[0]; my @in = <IN>; close IN;
    my %seqs; my $string1; my $string2; my $name1; my $name2;

    if ($in[0] =~ /^>/) {
#	print "start";
	my $i; my $mark; my @split1 = split (/\>/, $in[0]); 
	chomp $split1[1]; $name1 = $split1[1]; 
	for ($i=1; $i<=$#in; $i++) {
	    if ($in[$i]=~ /^>/) {
		$mark = $i;
		last;
	    } else {
	    chomp $string1; $string1 .= $in[$i]; 
	    }
	}    
	#print "start2";
	my @split2 = split (/\>/, $in[$mark]);	
	chomp $split2[1]; $name2 = $split2[1];
	for ($i=$mark+1; $i<=$#in; $i++) {
	    if ($in[$i]=~ /" "/) {
		last;
	    } else {
		chomp $string2; $string2 .= $in[$i];
	    }
	}
    }
    chomp $string1; chomp $string2;
    $seqs{$name1} = $string1; $seqs{$name2} = $string2;
    
    #print "$name1 $seqs{$name1}\n";
    #print "$name2 $seqs{$name2}\n";
    
    open OUT, ">yn00.phy";
    my $leng = length($seqs{$name1});
    
    my $sub1 = substr($seqs{$name1}, 0, 50); my $sub2 = substr($seqs{$name2}, 0, 50);
    print OUT " 2  $leng\n";
    print OUT "$name1       $sub1\n";
    print OUT "$name2       $sub2\n";
    for (my $j = 50; $j<$leng; $j=$j+50) {
	my $sub1x = substr($seqs{$name1}, $j, 50); my $sub2x = substr($seqs{$name2}, $j, 50);
	print OUT "\n$sub1x\n$sub2x\n";
    }    
    close OUT;
}


exit;
