#!/usr/bin/perl

use warnings;
use strict;
#use Storable;


### Date ###
# Build the short date stamp handed to Find_ORFs.pl and Create_output.pl:
# month abbreviation followed by the day of month, e.g. "Sep9".
# scalar localtime always yields "Thu Sep  9 12:00:00 2024" (English names,
# independent of the locale), so no external `date` call or temporary file
# in the working directory is needed.
my ( undef, $month_abbr, $day_of_month ) = split ' ', scalar localtime;
my $date = $month_abbr . $day_of_month;

### Variables ###

# E-value limits for the reciprocal BLAST searches.
my ( $blast_eval1, $blast_eval2 ) = (
    0.00001,    # 'Strong' BLAST hit limit
    10,         # 'Weak' BLAST hit limit
);

# E-value limit passed to the ORF-finding step.
my $blast_orf_eval = 10;

# Syntenoblast settings.
my ( $neighbour_step, $synt_step ) = (
    5,          # number of steps out looking at neighbours for synteny
    2,          # number of rounds of syntenoblast
);

# Helper script name passed to the output step.
my $copygaps = "copygaps.pl";

### store ###
# Run counter (starts at 1) and the run name that is derived from it later.
my ( $count_run, $run ) = (1);

### introduction and instructions

# Program banner followed by the list of software prerequisites.
my @introduction = (
    "\n------------------------------SearchDOGS Bacteria------------------------------\n\n",
    "SearchDOGS Bacteria automatically identifies unannotated genes in bacterial genomes using BLAST sequence similarity and conserved synteny across genomes. ",
    "Genomes are required in genbank (.gb) format.\n",
    "Before beginning, please check that Bio::Perl (Bio::Seq, Bio::SeqIO), PAML, ClustalO, BLAST (version: legacy) and formatdb software are installed on your system.\n\n",
    "The program will initially look for these in [/usr/bin/]. ",
    "If it doesn't find them here it will request their location.",
    "If these programs are not installed SearchDOGS will not run\n\n",
);
print join( '', @introduction );

## counts up for each 'run' of the program to avoid results mixing ##
# Take the first N (starting from the current $count_run) for which
# Results/run_N does not exist yet.
# NOTE(review): the check looks under Results/ while the working directory
# created below is ./run_N -- presumably a later step places the finished run
# under Results/; confirm against the output script.
$count_run++ while -e "Results/run_" . $count_run;
$run = "run_" . $count_run;

# Create the working directories with the builtin mkdir so that a failure
# (read-only directory, name taken by a plain file, ...) stops the program
# here instead of surfacing later as missing output.
foreach my $dir ( $run, "Results" ) {
    next if -e $dir;
    mkdir $dir or die "Cannot create directory [$dir]: $!\n";
}

# Per-run storage folder for the species tables written further down.
my $store = $run . "/bac_stored_" . $run;
unless ( -e $store ) {
    mkdir $store or die "Cannot create directory [$store]: $!\n";
}

### Instructions for inputting species to be tested  ###

# How the genome files must be named and where they must be placed.
my @instruction_text = (
    "Instructions:\n",
    "1. Please place genomes to be analysed in a folder called [Genomes].\n\n",
    " 2. Genomes to be analysed should be in genbank format [.gb], and should have a four letter name e.g. shigella flexneri could be [SHFL.gb].\n\n",
    " 3. Occasionally, bacterial genomes will have genes on multiple plasmids, e.g. vibrio cholerae. If one of your genomes has multiple plasmids, please title your genbank files [4letter name]_[number] for each genbank file,  e.g. [VCHO_1.gb], [VCHO_2.gb].\n\n",
    "Recap: SearchDOGS will look for genomes in the format [SETY.gb] or [VCHO_1.gb] in a folder called [Genomes].\n\n\n",
);
print @instruction_text;

# Tell the user which run name was selected.
print "Run: $run\n\n";

## find genomes folder ###
# Accept the documented folder name [Genomes] plus two common variants.
# The first existing directory wins and its exact name is remembered, because
# the genome files are later read from "$genome_store/<file>".
my $genome_store;
my $store_found = 0;
foreach my $candidate ( "Genomes", "genomes", "Genome" ) {
    next unless -d $candidate;    # must be a directory, not just any file
    $genome_store = $candidate;
    $store_found  = 1;
    last;
}

die "Can't find [Genomes] folder! Please check the location of the folder. SearchDOGS exiting."
    unless $store_found == 1;
print "Success! $genome_store folder found.\n";

# Tables built from the contents of the genome folder:
#   %taglist   tag => tag                 (one entry per genome)
#   %spec_loc  tag => { chrom => path }   (one path per chromosome/plasmid)
#   %spec_name number => tag, %spec_num tag => number
my @files;
my %spec_name;
my %spec_loc;
my %taglist;

opendir( my $genome_dh, $genome_store )
    or die "Cannot read genome folder [$genome_store]: $!\n";
@files = readdir $genome_dh;
closedir $genome_dh;

# Look for genomes in the right format.
# If file has ABCD.gb format, TAG=ABCD and the file is chromosome 1.
# If file has ABCD_1.gb format, TAG=ABCD and the file is chromosome/plasmid 1.
foreach my $file ( sort @files ) {
    next if $file eq '.' or $file eq '..';
    next unless $file =~ /\.gb/;    # NOTE(review): unanchored, so e.g. ".gbk" is accepted too

    # The name is everything before the first dot; an optional "_<number>"
    # suffix on it selects the chromosome/plasmid.
    my ($stem) = split /\./, $file;
    next unless defined $stem && length $stem;    # e.g. a hidden file called ".gb"

    my ( $tag, $chrom ) = split /_/, $stem;
    next unless defined $tag && length $tag;
    $chrom = '1' unless defined $chrom && length $chrom;

    my $path = $genome_store . "/" . $file;
    if ( exists $spec_loc{$tag}{$chrom} ) {
        warn "Warning: $path and $spec_loc{$tag}{$chrom} both map to genome $tag, chromosome/plasmid $chrom; using $path\n";
    }
    $taglist{$tag}          = $tag;
    $spec_loc{$tag}{$chrom} = $path;
}

# Number the genomes 1..N. Tags are sorted so that the numbering is the same
# on every run instead of following the random hash order.
my $i = 0;
my %spec_num;
foreach my $tag ( sort keys %taglist ) {
    $i++;
    $spec_name{$i}  = $tag;
    $spec_num{$tag} = $i;
}
my $genome_count_i = $i;

# Ask how many genomes the user expects and compare that with the number of
# genomes found in the genome folder.
print "Please enter number of genomes:";
my $num;
while (1) {
    $num = <STDIN>;

    # End of input (closed or piped STDIN): stop instead of prompting forever.
    die "SearchDOGS: no input available for the number of genomes. Exiting\n"
        unless defined $num;

    $num =~ s/^\s+|\s+$//g;    # drop the newline and any stray blanks
    last if $num =~ /^[0-9]+$/;

    print "Number of genomes must be numeric!\n";
    print "Please re-enter number of genomes:";
}

# $num is known to be numeric from here on, so the comparison is safe.
my $count_matches = ( $num == $genome_count_i );
if ($count_matches) {
    print "$genome_count_i genomes found\n";
}
else {
    print "Looking for $num genomes, but SearchDOGS finds $genome_count_i genomes:\n";
}

# List every genome file that will be used, in genome-number order.
foreach my $tag ( sort { $spec_num{$a} <=> $spec_num{$b} } keys %taglist ) {
    foreach my $chrom ( sort keys %{ $spec_loc{$tag} } ) {
        print "Genome $spec_num{$tag} $tag chromosome/plasmid: $spec_loc{$tag}{$chrom}\n";
    }
}

# On a mismatch the user decides once whether to carry on with what was found.
unless ($count_matches) {
    print "Remember that genomes need to be in [ABCD.gb] or [EFGH_1.gb] format. CONTINUE (performing the analysis using the genomes listed) (y/n) ?\n";
    my $ans = <STDIN>;

    # Any answer starting with y/Y (y, Y, yes, Yes, ...) continues; anything
    # else, including end of input, aborts.
    if ( defined $ans && $ans =~ /^\s*y/i ) {
        print "Continuing using the $genome_count_i genomes located in $genome_store\n";
    }
    else {
        print "SearchDOGS aborted. Exiting\n";
        exit;
    }
}
## look for blastall, yn00, formatdb, clustalo at /usr/bin/. If not there, ask for location ##

# Return the path of an external program.
#   $tool     - program file name, e.g. "blastall"
#   $software - software name used in the prompt, e.g. "PAML"
#   $label    - optional name used in the "... found" message (defaults to $tool)
# /usr/bin/$tool is used when it exists; otherwise the user is asked for the
# path until an existing one is entered. Dies when STDIN reaches end of input,
# because no further answer can arrive.
sub locate_tool {
    my ( $tool, $software, $label ) = @_;
    $label = $tool unless defined $label;

    my $default = "/usr/bin/$tool";
    my $path    = $default;

    unless ( -e $path ) {
        print "$software software must be installed on your computer. Please enter location of $tool (e.g. $default):";
        while (1) {
            $path = <STDIN>;
            die "\nSearchDOGS: no input available for the location of $tool. Exiting\n"
                unless defined $path;
            chomp $path;
            last if -e $path;
            print "$tool not found, please check path\n";
            print "Please re-enter location of $tool:";
        }
    }

    print "$label found\n";
    return $path;
}

my $blastall = locate_tool( "blastall", "BLAST (version:legacy)", "Blastall" );
my $yn00     = locate_tool( "yn00",     "PAML" );
my $format   = locate_tool( "formatdb", "formatdb" );
my $clustalo = locate_tool( "clustalo", "CLUSTALO" );

# Save the species tables in the run's storage folder as tab-separated files:
#   spec_name.tab  <genome number> <tag>
#   spec_loc.tab   <tag> <chromosome/plasmid> <path of the genbank file>
# Rows are written in sorted order so the files are reproducible.
my $spec_name_file = $store . "/spec_name.tab";
open( my $spec_fh, '>', $spec_name_file )
    or die "Cannot write [$spec_name_file]: $!\n";
foreach my $specnum ( sort { $a <=> $b } keys %spec_name ) {
    print {$spec_fh} "$specnum\t$spec_name{$specnum}\n";
}
# Buffered write errors only show up at close, so check it.
close $spec_fh or die "Cannot finish writing [$spec_name_file]: $!\n";

my $spec_loc_file = $store . "/spec_loc.tab";
open( my $loc_fh, '>', $spec_loc_file )
    or die "Cannot write [$spec_loc_file]: $!\n";
foreach my $spectag ( sort keys %spec_loc ) {
    foreach my $chrom_num ( sort keys %{ $spec_loc{$spectag} } ) {
        print {$loc_fh} "$spectag\t$chrom_num\t$spec_loc{$spectag}{$chrom_num}\n";
    }
}
close $loc_fh or die "Cannot finish writing [$spec_loc_file]: $!\n";

#### Clean run? Delete all files not used in final results to save memory ####

my $clean_run = 1;    # 0 allows additional data files to be kept, 1 provides only files used in final results output.

##############################################################################

# Flush STDOUT after every print so each step banner appears before the
# output of the script it announces.
$| = 1;

print "Running SearchDOGS now. Please be patient. For example, a 3 species comparison on a standard laptop may be expected to take approximately 1 hour. Note that the more species are used, the longer the runtime. Thank you!\n";

# The three pipeline scripts, run in order. Each command is an argument list,
# so tool paths containing spaces or shell metacharacters reach the script as
# a single, unmodified argument.
my @pipeline = (
    {
        banner  => "Step 1: Create pillars of orthologous genes between genomes using reciprocal blast searches and syntenoblast, and create other data structures.\n",
        command => [ "./Create_structures.pl", $run, $blast_eval1, $blast_eval2, $neighbour_step, $synt_step, $blastall ],
    },
    {
        banner  => "Step 2: Find unannotated ORFs by BLASTX searches between intergenic regions and syntenic genes in other species.\n",
        command => [ "./Find_ORFs.pl", $run, $date, $blastall, $blast_orf_eval ],
    },
    {
        banner  => "Step 3: create outputs and information for each unannotated ORF identified, including Ka/Ks comparisons.\n",
        command => [ "./Create_output.pl", $run, $date, $blastall, $clustalo, $copygaps, $yn00, $clean_run ],
    },
);

foreach my $step (@pipeline) {
    print $step->{banner};

    my @command = @{ $step->{command} };
    my $status  = system @command;

    # Later steps are launched only after the previous one succeeded, so stop
    # at the first failure rather than running them after a failed step.
    if ( $status == -1 ) {
        die "SearchDOGS: could not start [$command[0]]: $!\n";
    }
    elsif ( $status != 0 ) {
        die sprintf( "SearchDOGS: [%s] failed (exit status %d). Exiting\n", $command[0], $status >> 8 );
    }
}
# print "Step 3: Results output at: Results/Species_".$run."_".$date.".html\n";

exit;
