#!/usr/bin/perl
# Writted by D.Armisen and S.OhEigeartaigh 101016
# Adapted to work with bacterial genomes
use strict;
# use Benchmark;
use Bio::Seq;
use Bio::SeqIO;
#use warnings;

#print "Exiting. Comment this line to run it. Remember to create the AA.fsa file for ORF search\n";exit;
#my $startime=new Benchmark;

my $retrieveOGSandpillarsinogs=1;	# 0= No	1=Yes
my $retrieveOGS=1;		        # 0= No	1=Yes
my $launchtheblastsearch=1;		# 0= No	1=Yes

my $updown=10;
my $mincoverage=0.4;
#my $minsize=60;
my $min_2=0.5;
#my $min_hsp =30;

my $run = $ARGV[0];
my $date = $ARGV[1];
my $blastall = $ARGV[2];
my $evalue = $ARGV[3];
my $store = $run."/bac_stored_".$run;

### direct errors in here ###
open ERROR, '>'.$run."/error_step2.txt" or die $!;
STDERR->fdopen( \*ERROR, 'w' ) or die $!;

# print "$run $date $blastall $evalue $store\n";

my (@input,$x,$y,$z,$t,$id,$ref,%hash,%sp,%pillars,%coord,%chr,%seq,%onoff,%genesinpillar,%order,%ordersp,@geneid,$currentspecies,$currentgene,@temp,@totest,%temphash);
my (%genesinspecies,@ogslist,%pillarsinogs,%ogsinpillars,%sphash);
my ($start,$stop,$stt,$stp,$intgen,$current,@temporal,@split);
my ($test,$sequence,@allgenes,@hsp,@values,%results,@mediansize,$median,$currentpillar);
my (%alt_start, %nt_seq_all, $count_alt_GTG, $count_alt_TTG, $count_alt_ATT, $count_alt_CTG, $count_alt_ATC);

my (%spec_name, %spec_loc);
open IN, "<".$store."/spec_name.tab"; my @sn = <IN>; close IN;
foreach my $line (@sn) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $spec_name{$split[0]} = $split[1];
}
open IN, "<".$store."/spec_loc.tab"; my @sl = <IN>; close IN;
foreach my $line (@sl) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1]; chomp $split[2];
    $spec_loc{$split[0]}{$split[1]} = $split[2];
}

my @species;
foreach my $specnum (keys %spec_name) {
    push @species, $spec_name{$specnum};
}

my %translation;
%translation= (
	GCT => 'A' ,GCC => 'A' ,GCA => 'A' ,GCG => 'A' ,
	CGT => 'R' ,CGC => 'R' ,CGA => 'R' ,CGG => 'R' ,AGA => 'R' ,AGG => 'R' ,
	AAT => 'N' ,AAC => 'N' ,
	GAT => 'D' ,GAC => 'D' ,
	TGT => 'C' ,TGC => 'C' ,
	CAA => 'Q' ,CAG => 'Q' ,
	GAA => 'E' ,GAG => 'E' ,
	GGT => 'G' ,GGC => 'G' ,GGA => 'G' ,GGG => 'G' ,
	CAT => 'H' ,CAC => 'H' ,
	ATT => 'I' ,ATC => 'I' ,ATA => 'I' ,
	TTA => 'L' ,TTG => 'L' ,CTT => 'L' ,CTC => 'L' ,CTA => 'L' ,CTG => 'L' ,
	AAA => 'K' ,AAG => 'K' ,
	ATG => 'M' ,
	TTT => 'F' ,TTC => 'F' ,
	CCT => 'P' ,CCC => 'P' ,CCA => 'P' ,CCG => 'P' ,
	TCT => 'S' ,TCC => 'S' ,TCA => 'S' ,TCG => 'S' ,AGT => 'S' ,AGC => 'S' ,
	ACT => 'T' ,ACC => 'T' ,ACA => 'T' ,ACG => 'T' ,
	TGG => 'W' ,
	TAT => 'Y' ,TAC => 'Y' ,
	GTT => 'V' ,GTC => 'V' ,GTA => 'V' ,GTG => 'V' ,
	TAA => '*' ,TGA => '*' ,TAG => '*'
);

my $pathsave=$run; unless (-e $pathsave) { system "mkdir $pathsave"; }
my $curr = $pathsave."/current"; unless (-e $curr) { system "mkdir $curr";}
unless (-e $pathsave."/blast_saves") { system "mkdir ".$pathsave."/blast_saves"; }
my $blast_new_dir= $run."/Blast_neworfs"; unless (-e $blast_new_dir) {  system "mkdir $blast_new_dir"; }

### Load information from genbank
#open COORD, ">".$pathsave."/gene_coords.tab";
foreach $y (@species){
    my $count;
    my $count_chr;
    
    foreach my $chrnum (keys %{$spec_loc{$y}}) {
	$count_chr++;
	my $file = $spec_loc{$y}{$chrnum};
	my $count_ps;
#	print "file: $file\n";
	
	open IN, $file; my @in = <IN>; close IN; ## Lazy way to get chromosome id.
	my $chromtag;
	foreach my $line (@in) {
	    if ($line =~ /^VERSION/) {
		my @split = split (/\s/, $line);
		$chromtag = $split[5];
		last;
	    }
	}
	
	my $seqio_object = Bio::SeqIO->new(-file => "<$file", '-format' => "GenBank");
	while (my $seq_object = $seqio_object->next_seq) {
	    $seq{$chromtag} = $seq_object->seq;
	    
	    for my $feat_object ($seq_object->get_SeqFeatures) { # run over all features (id source and 3xfeature(gene,mRNA,CDS) per real 'feature')
		if ($feat_object->primary_tag eq "CDS") { 
		    my $note = '';
		    for my $tag ($feat_object->get_all_tags) {
                    if ($tag eq "pseudo" ) {
                        $count_ps++;
                        $note = 'pseudo';
		    }
                }
		    unless ($note eq 'pseudo') {
			$count++;
			my $start=$feat_object->location->start; my $stop =$feat_object->location->end;
			
			for my $tag ($feat_object->get_all_tags) {
			    for my $value ($feat_object->get_tag_values($tag)) {
				if ($tag eq "locus_tag" ) { my $loc=$value; $hash{$count}{'locus'} = $loc; }
				if ($tag eq "protein_id" ) { my $pid=$value; $hash{$count}{'protein_id'} = $pid; }
				if ($tag eq "translation" ) { my $aa=$value; $hash{$count}{'aa'} = $aa; }
				if ($tag eq "gene" ) { my $genetag=$value; $hash{$count}{'genetag'} = $genetag; }
			    }
			}
			# Load coordinates of each gene
#			print "pid $hash{$count}{'protein_id'}\n";
			my $prot_id = $hash{$count}{'protein_id'};
			$coord{$y.'_'.$prot_id} = "($start..$stop)";
			my $strand = $feat_object->location->strand;
#			print COORD "$prot_id $start $stop $strand\n";
			if ($strand == -1) { $coord{$y.'_'.$prot_id} = "complement".$coord{$y.'_'.$prot_id}; }
			# Load chromosome of each gene
			$chr{$y.'_'.$prot_id} = $chromtag; ###id
			# Load the order of each gene
			$t=$y.'_'.$prot_id;
			$order{$y."_".$hash{$count}{'protein_id'}}=$z;$ordersp{$y."_".$z}=$t;$genesinspecies{$y}=$z;
			$z++;
			# Load information about hits to avoid spurious hits in pillars  # We lack this information in bacteria so always ON
			$onoff{$y."_".$hash{$count}{'protein_id'}}="ON";
		    }
		}
	    }
	}
    }
}
#close COORD;

# print "pillar file: <".$store."/pillar_C.tab";
open (LOAD,"<".$store."/pillar_C.tab");

while (<LOAD>){
    $x=$_;
    if ($x ne undef){
	chomp $x;
	@temp=split (/\t/,$x);
	$pillars{$temp[2]}=$temp[0];
	$genesinpillar{$pillars{$temp[2]}}=$genesinpillar{$pillars{$temp[2]}}."$temp[2] ";
	$sp{$temp[2]}=$temp[1];
#	print "$pillars{$temp[2]} $genesinpillar{$pillars{$temp[2]}} $sp{$temp[2]}\n";
    }
}
close (LOAD);

open IN, "<".$store."/NT_seq_all.tab"; my @nts = <IN>; close IN;
foreach my $line (@nts) {
    my @split = split (/\t/, $line);
    chomp $split[0]; chomp $split[1];
    $nt_seq_all{$split[0]} = $split[1];
    my $start_cod = substr($split[1], 0, 3);
    if ($start_cod eq "GTG") { $alt_start{$pillars{$split[0]}} = "GTG"; $count_alt_GTG++; }
    if ($start_cod eq "TTG") { $alt_start{$pillars{$split[0]}} = "TTG"; $count_alt_TTG++; }
    if ($start_cod eq "ATT") { $alt_start{$pillars{$split[0]}} = "ATT"; $count_alt_ATT++; }
    if ($start_cod eq "CTG") { $alt_start{$pillars{$split[0]}} = "CTG"; $count_alt_CTG++; }
    if ($start_cod eq "ATC") { $alt_start{$pillars{$split[0]}} = "ATC"; $count_alt_ATC++; }
}


if ($retrieveOGSandpillarsinogs==1){
    if ($retrieveOGS==1){
	# Retrieve all OGS
	foreach $y (@species){
	    for ($z=1;$z<=$genesinspecies{$y};$z++){
		$id=0;
		$x=1;
		if ($onoff{$ordersp{$y."_".$z}} eq "ON" && $ordersp{$y."_".$z}!~ /trna/ && $ordersp{$y."_".$z}!~ /snR/){
		    while ($id==0){
			if ($chr{$ordersp{$y."_".$z}} eq $chr{$ordersp{$y."_".($z+$x)}}){
			    if ($onoff{$ordersp{$y."_".($z+$x)}} eq "ON" && $ordersp{$y."_".($z+$x)}!~ /trna/ && $ordersp{$y."_".($z+$x)}!~ /snR/){
				$id=$ordersp{$y."_".$z}."-".$ordersp{$y."_".($z+$x)};
				push @ogslist, $id;
				$id=1;
			    }
			    else {
				$x++;
			    }
			}
			else {
			    $id=1;
			}
		    }
		    if ($x>1){$z=$z+$x-1;}
		}
	    }
	}
    }

    if ($retrieveOGS==0){
	@ogslist=qw(Shigella_boydii_YP_407404.1-Shigella_boydii_YP_407405.1);
    }

    my $cont=0;
    foreach (@ogslist){
	$cont++;
#	if ($cont%10==0){  print "cont1 $cont $_\n";}
	
	# Select the region where should lie the putative gene
	@_= split (/-/,$_);
	$geneid[0]=$_[0];
	$geneid[1]=$_[1];
	
	%hash=();
	foreach $currentspecies (@species){
	    # Retrieve the pillars
	    @temp=();
	    my @tempdecide=();
	    my @tempsense=();
	    my %genehash=();
	    @_=split (/ /,$genesinpillar{$pillars{$geneid[0]}});
	    foreach (@_){
		if ($currentspecies eq $sp{$_}){
		    $tempdecide[0]=$order{$_};
		    $genehash{$tempdecide[0]}=$geneid[0];
		}
	    }
	    @_=split (/ /,$genesinpillar{$pillars{$geneid[1]}});
	    foreach (@_){
		if ($currentspecies eq $sp{$_}){
		    $tempdecide[2]=$order{$_};
		    $genehash{$tempdecide[2]}=$geneid[1];
		}
	    }
	    
	    # Deleted the step where we have pre and post duplication species
	    
	    # Look at neighbours genes to decide the sense
	    # Tempdecide[0]
#	    print "tempdecide: $tempdecide[0]\n";
	    if ($tempdecide[0] ne undef){
		for ($x=1;$x<=$updown;$x++){
		    @temp=();
		    @_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.($tempdecide[0]-$x)}}});
		    foreach (@_){
			if ($currentspecies eq $sp{$_} && $chr{$ordersp{$currentspecies.'_'.($tempdecide[0])}} eq $chr{$_}){$temp[0]=1;}
			if ($sp{$_} eq $sp{$genehash{$tempdecide[0]}} && $chr{$genehash{$tempdecide[0]}} eq $chr{$_}){$temp[1]=$_;}
		    }
		    if ($temp[0]==1 && $temp[1] ne undef && $order{$temp[1]}<$order{$genehash{$tempdecide[0]}}){
			$tempsense[0]=1;
			$x=$updown+1;
			if (abs($order{$temp[1]}-$order{$ordersp{$currentspecies.'_'.$tempdecide[0]}})>$updown){$temp[2]=1;}
		    }
		    elsif ($temp[0]==1 && $temp[1] ne undef && $order{$temp[1]}>=$order{$genehash{$tempdecide[0]}}){
			$tempsense[0]=0;
			$x=$updown+1;
			if (abs($order{$temp[1]}-$order{$ordersp{$currentspecies.'_'.$tempdecide[0]}})>$updown){$temp[2]=1;}
		    }
		}			
		if ($tempsense[0] eq undef || $temp[2]==1){
		    for ($x=1;$x<=$updown;$x++){
			@temp=();
			@_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.($tempdecide[0]+$x)}}});
			foreach (@_){
			    if ($currentspecies eq $sp{$_} && $chr{$ordersp{$currentspecies.'_'.($tempdecide[0])}} eq $chr{$_}){$temp[0]=1;}
			    if ($sp{$_} eq $sp{$genehash{$tempdecide[0]}} && $chr{$genehash{$tempdecide[0]}} eq $chr{$_}){$temp[1]=$_;}
			}
			if ($temp[0]==1 && $temp[1] ne undef && $order{$genehash{$tempdecide[0]}}<=$order{$temp[1]}){
			    $tempsense[0]=1;
			    $x=$updown+1;
			}
			elsif ($temp[0]==1 && $temp[1] ne undef && $order{$genehash{$tempdecide[0]}}>$order{$temp[1]}){
			    $tempsense[0]=0;
			    $x=$updown+1;
			}
		    }
		}
	    }
	    
	    # Tempdecide[2]
	    if ($tempdecide[2] ne undef){
		for ($x=1;$x<=$updown;$x++){
		    @temp=();
		    @_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.($tempdecide[2]+$x)}}});
		    foreach (@_){
			if ($currentspecies eq $sp{$_} && $chr{$ordersp{$currentspecies.'_'.($tempdecide[2])}} eq $chr{$_}){$temp[0]=1;}
			if ($sp{$_} eq $sp{$genehash{$tempdecide[2]}} && $chr{$genehash{$tempdecide[2]}} eq $chr{$_}){$temp[1]=$_;}
		    }
		    if ($temp[0]==1 && $temp[1] ne undef && $order{$genehash{$tempdecide[2]}}<=$order{$temp[1]}){
			$tempsense[2]=0;
			$x=$updown+1;
			if (abs($order{$temp[1]}-$order{$ordersp{$currentspecies.'_'.$tempdecide[2]}})>$updown){$temp[2]=1;}
		    }
		    elsif ($temp[0]==1 && $temp[1] ne undef && $order{$genehash{$tempdecide[2]}}>$order{$temp[1]}){
			$tempsense[2]=1;
			$x=$updown+1;
			if (abs($order{$temp[1]}-$order{$ordersp{$currentspecies.'_'.$tempdecide[2]}})>$updown){$temp[2]=1;}
		    }
		}
		if ($tempsense[2] eq undef || $temp[2]==1){
		    for ($x=1;$x<=$updown;$x++){
			@temp=();
			@_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.($tempdecide[2]-$x)}}});
			foreach (@_){
			    if ($currentspecies eq $sp{$_} && $chr{$ordersp{$currentspecies.'_'.($tempdecide[2])}} eq $chr{$_}){$temp[0]=1;}
				if ($sp{$_} eq $sp{$genehash{$tempdecide[2]}} && $chr{$genehash{$tempdecide[2]}} eq $chr{$_}){$temp[1]=$_;}
			}
			if ($temp[0]==1 && $temp[1] ne undef && $order{$temp[1]}<$order{$genehash{$tempdecide[2]}}){
			    $tempsense[2]=0;
			    $x=$updown+1;
			}
			elsif ($temp[0]==1 && $temp[1] ne undef && $order{$temp[1]}>=$order{$genehash{$tempdecide[2]}}){
			    $tempsense[2]=1;
			    $x=$updown+1;
			}
		    }			
		}
	    }
	    
	    
	    
	    if ($tempdecide[0] ne undef && $tempsense[0] ne undef){
		if ($tempsense[0]==0){
		    for ($x=$tempdecide[0]-1;$x>$tempdecide[0]-$updown;$x--){
			@_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.$x}}});
			$y=0;
			%sphash=();
			$y=0;%temphash=();
			foreach (@_){
			    if (($sp{$_} eq $sp{$genehash{$tempdecide[0]}} && $chr{$_} eq $chr{$genehash{$tempdecide[0]}}) || ($chr{$ordersp{$currentspecies.'_'.$x}} ne $chr{$ordersp{$currentspecies.'_'.$tempdecide[0]}})){$y=1;}
			    elsif ($sp{$_} eq $sp{$genehash{$tempdecide[0]}}){$sphash{$sp{$_}}=$sphash{$sp{$_}}+1;if ($sphash{$sp{$_}}==1){$y=1;}}
			    elsif ($onoff{$ordersp{$currentspecies.'_'.$x}} eq "ON"){$temphash{$pillars{$ordersp{$currentspecies.'_'.$x}}}=1;}
			}
			    if ($y==0 && %temphash ne undef){foreach (keys %temphash){$hash{$_}=$temphash{$_};}}
			
			if ($y==1){$x=$tempdecide[0]-$updown-1;}	# Exit also if we already have one gene of the current species in the pillar
		    }
		}
		elsif ($tempsense[0]==1){
		    for ($x=$tempdecide[0]+1;$x<$tempdecide[0]+$updown;$x++){
			@_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.$x}}});
			$y=0;
			%sphash=();
			$y=0;%temphash=();
			    foreach (@_){
				if (($sp{$_} eq $sp{$genehash{$tempdecide[0]}} && $chr{$_} eq $chr{$genehash{$tempdecide[0]}}) || ($chr{$ordersp{$currentspecies.'_'.$x}} ne $chr{$ordersp{$currentspecies.'_'.$tempdecide[0]}})){$y=1;}
				elsif ($sp{$_} eq $sp{$genehash{$tempdecide[0]}}){$sphash{$sp{$_}}=$sphash{$sp{$_}}+1;if ($sphash{$sp{$_}}==1){$y=1;}}
				elsif ($onoff{$ordersp{$currentspecies.'_'.$x}} eq "ON"){$temphash{$pillars{$ordersp{$currentspecies.'_'.$x}}}=1;}
			    }
			if ($y==0 && %temphash ne undef){foreach (keys %temphash){$hash{$_}=$temphash{$_};}}
			if ($y==1){$x=$tempdecide[0]+$updown+1;}	# Exit also if we already have one gene of the current species in the pillar
		    }
		}
	    }
	    
	    if ($tempdecide[2] ne undef && $tempsense[2] ne undef){
		if ($tempsense[2]==0){
		    for ($x=$tempdecide[2]-1;$x>$tempdecide[2]-$updown;$x--){
			@_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.$x}}});
			$y=0;
			%sphash=();
			$y=0;%temphash=();
			foreach (@_){
			    if (($sp{$_} eq $sp{$genehash{$tempdecide[2]}} && $chr{$_} eq $chr{$genehash{$tempdecide[2]}}) || ($chr{$ordersp{$currentspecies.'_'.$x}} ne $chr{$ordersp{$currentspecies.'_'.$tempdecide[2]}})){$y=1;}
			    elsif ($sp{$_} eq $sp{$genehash{$tempdecide[2]}}){$sphash{$sp{$_}}=$sphash{$sp{$_}}+1;if ($sphash{$sp{$_}}==1){$y=1;}}
			    elsif ($onoff{$ordersp{$currentspecies.'_'.$x}} eq "ON"){$temphash{$pillars{$ordersp{$currentspecies.'_'.$x}}}=1;}
			}
			if ($y==0 && %temphash ne undef){foreach (keys %temphash){$hash{$_}=$temphash{$_};}}
			if ($y==1){$x=$tempdecide[2]-$updown-1;}	# Exit also if we already have one gene of the current species in the pillar
		    }
		}
		elsif ($tempsense[2]==1){
		    for ($x=$tempdecide[2]+1;$x<$tempdecide[2]+$updown;$x++){
			    @_=split (/ /,$genesinpillar{$pillars{$ordersp{$currentspecies.'_'.$x}}});
			    $y=0;
			    %sphash=();
			    $y=0;%temphash=();
			    foreach (@_){
				if (($sp{$_} eq $sp{$genehash{$tempdecide[2]}} && $chr{$_} eq $chr{$genehash{$tempdecide[2]}}) || ($chr{$ordersp{$currentspecies.'_'.$x}} ne $chr{$ordersp{$currentspecies.'_'.$tempdecide[2]}})){$y=1;}
				elsif ($sp{$_} eq $sp{$genehash{$tempdecide[2]}}){$sphash{$sp{$_}}=$sphash{$sp{$_}}+1;if ($sphash{$sp{$_}}==1){$y=1;}}
				elsif ($onoff{$ordersp{$currentspecies.'_'.$x}} eq "ON"){$temphash{$pillars{$ordersp{$currentspecies.'_'.$x}}}=1;}
			    }
			    if ($y==0 && %temphash ne undef){foreach (keys %temphash){$hash{$_}=$temphash{$_};}}
			    if ($y==1){$x=$tempdecide[2]+$updown+1;}	# Exit also if we already have one gene of the current species in the pillar
			}
		}
	    }
	    
	    
	    
	    
	}
	# Save the pillars for each OGS			
	foreach (keys %hash){
		if ($pillarsinogs{"$geneid[0]-$geneid[1]"} eq undef){$pillarsinogs{"$geneid[0]-$geneid[1]"}=$_;}
		else {$pillarsinogs{"$geneid[0]-$geneid[1]"}=$pillarsinogs{"$geneid[0]-$geneid[1]"}." $_";}
	    }
    }
    
    foreach my $gene_seg (keys %pillarsinogs){
	@_=split (/ /,$pillarsinogs{$gene_seg});
	foreach (@_){
	    if ($ogsinpillars{$_} eq undef){$ogsinpillars{$_}=$gene_seg;}
		else {$ogsinpillars{$_}=$ogsinpillars{$_}." $gene_seg";}
	}
    }
    open (SAVE,">$pathsave/pillarsinogs.txt");
    foreach (keys %pillarsinogs){print SAVE "$_\t$pillarsinogs{$_}\n";}
    close (SAVE);
    open (SAVE,">$pathsave/ogsinpillars.txt");
    foreach (keys %ogsinpillars){print SAVE "$_\t$ogsinpillars{$_}\n";}
    close (SAVE);
}


my %count_b;
if ($launchtheblastsearch==1){
    open TEST1, ">$store/test_aa_hspNT_".$date.".tab";
    open TEST2, ">$store/test_intenexpNT".$date.".tab";
#    open FAIL, ">fail_aa_hspNT".$date.".tab";
    my $file_present=0;
    my (%inlist_hsp2, %inlist_hsp1, %inlen_hsp2, %inlen_hsp1, %incoord_hsp2, %incoord_hsp1, %inseq_hsp2, %inseq_hsp1, %in_ntseq_hsp2, %in_ntseq_hsp1, %in_correct_2h);
    if (-e $pathsave."/Inf_".$date.".txt") {
#	print TEST2 $pathsave."/Inf_".$date.".txt present\n";
	open IN, "<".$pathsave."/Inf_".$date.".txt"; my @infile = <IN>; close IN;
        for (my $i=0; $i<$#infile; $i++) {
#            print TEST2 "line $infile[$i]\n";
            if ($infile[$i] =~ /^>/) {
                my @split = split (/>/, $infile[$i]);
                my @split2 = split (/\t/, $split[1]);
                my @split_1h = split (/\t/, $infile[$i+1]);
                $inlist_hsp2{$split2[0]}{$split2[1]}= $split2[2]; $inlen_hsp2{$split2[0]}{$split2[1]}= $split2[3]; $incoord_hsp2{$split2[0]}{$split2[1]} = $split2[4];
                $inseq_hsp2{$split2[0]}{$split2[1]}= $split2[6]; $in_ntseq_hsp2{$split2[0]}{$split2[1]}= $split2[7]; $in_correct_2h{$split2[0]}{$split2[1]} = $split2[5]; 
                $inlist_hsp1{$split2[0]}{$split2[1]}= $split2[2]; $inlen_hsp1{$split2[0]}{$split2[1]}= $split_1h[0]; $incoord_hsp1{$split2[0]}{$split2[1]}= $split_1h[1];
                $inseq_hsp1{$split2[0]}{$split2[1]}= $split_1h[2]; $in_ntseq_hsp1{$split_1h[0]}{$split2[1]}= $split_1h[3];
#                print TEST2 "$split2[0] $split2[1] $split2[2] $split2[3] $split2[4] $split2[5] $split_1h[0] $split_1h[1] $split_1h[2] $split_1h[3] $split_1h[0]\n";
            }
        }
	$file_present = 1;
    }
    if ($file_present == 0) {
 #       print TEST2 $pathsave."/Inf_".$date.".txt absent\n";
#	open OUT2, ">$pathsave/Inf_".$date.".txt";
    }
#    open FIN1, ">$pathsave/seqfin1_".$date.".txt";
#    open FIN2, ">$pathsave/seqfin2_".$date.".txt";

    if ($retrieveOGSandpillarsinogs==0){
	%pillarsinogs=();
	open (LOAD,"<$pathsave/pillarsinogs.txt");
	while (<LOAD>){
	    chomp $_;
	    @_=split (/\t/,$_);
	    $pillarsinogs{$_[0]}=$_[1];
	}
	close (LOAD);
    }
    
    my $cont=0; 
    my (%seq2hsp_hash, %seq1hsp_hash, %sequence_all, %coord_2h_hash, %coord_1h_hash, %coord_all, %gene_hash, %gene_broke, %med_hash, %rejects_2h, %rejects_1h, %reject2_reason, %hit_all, %hit_2hsp, %hit_1hsp, %nt_all, %hsp_all, %listofhits, %correct_2h, ,%correct_2h_orig, %correct_1h, %correct_all);
    my (%reject1_reason, %nt_2h, %nt_1h);
    open (SAVE5,">$pathsave/newORF-res_".$date."_NT.txt");
#    open (BAD,">$pathsave/badNT.txt");
    open NT, ">$pathsave/NT_all.txt";

    foreach my $gene_seg (keys %pillarsinogs){
#	print TEST1 "gene_seg $gene_seg pillarsin $pillarsinogs{$gene_seg}\n";
	$cont++;
#	if ($cont%10==0) { print "cont2 $cont $_ gene_seg $gene_seg\n"; }
	
	@geneid=();$start=();$stop=();$stt=();$stp=();my $intgen=(); my $intgen_exp=(); my $A_start; my $B_stop;
	
	# Select the region where should lie the putative gene
	@_= split (/-/,$gene_seg);
	$geneid[0]=$_[0];
	$geneid[1]=$_[1];
#	print TEST1 "geneid0 $geneid[0] 1 $geneid[1]\n";

	$x = $coord{$geneid[0]};
	$x =~ s/[complement()]//g;
	if ($x =~ /,/){
	    @split= split(/\,/,$x);
	    @_= split(/\.\./,$split[0]);
	    if ($_[0]>$_[1]){$temp[0]=$_[0];}else {$temp[0]=$_[1];}
	    @_= split(/\.\./,$split[$#split]);
	    if ($_[0]>$_[1]){$temp[1]=$_[0];}else {$temp[1]=$_[1];}
	    $_[0]=$temp[0];$_[1]=$temp[1];
	}
	else {@_= split(/\.\./,$x);}
	if ($_[1]>$_[0]){$start=$_[1]; $A_start= $_[0]; }else{$start=$_[0]; $A_start = $_[1]; }	# < and > not exchanged because we would not test possible overlapings
	
	$x = $coord{$geneid[1]};
	$x =~ s/[complement()]//g;
	if ($x =~ /,/){
	    @split= split(/\,/,$x);
	    @_= split(/\.\./,$split[0]);
	    if ($_[0]>$_[1]){$temp[0]=$_[0];}else {$temp[0]=$_[1];}
	    @_= split(/\.\./,$split[$#split]);
	    if ($_[0]>$_[1]){$temp[1]=$_[0];}else {$temp[1]=$_[1];}
	    $_[0]=$temp[0];$_[1]=$temp[1];
	}
	else {@_= split(/\.\./,$x);}
	if ($_[0]<$_[1]){$stop=$_[0]; $B_stop = $_[1];}else{$stop=$_[1]; $B_stop=$_[0];}	
	
	# Erase previous files
	if ((-e "$pathsave/current/seq") || (-e "$pathsave/current/intgen")) { 
	    $x="rm $pathsave/current/*";system($x);
	}
	# Retrieve intergenic sequence
	unless ($start > $stop ) {
	    $intgen=substr($seq{$chr{$geneid[0]}},$start,$stop-$start-1);		
	    $intgen_exp=substr($seq{$chr{$geneid[0]}},($A_start),($B_stop-$A_start-1));
	} else { $intgen=(); }
	
	if (length($intgen)>30000){$intgen=()};	# If intgen is too big (probably missed coordinates so recovering the whole chr) we don't do anything
	if (length($intgen_exp)>30000){$intgen_exp=()}; 
	
	if ($intgen ne undef){
	    open (INTGEN,">$pathsave/current/intgen");print INTGEN ">Intgen\n$intgen\n";close (INTGEN); 
	    # Keep genes to test deleting those with annotation problems to not disturb the predictions as i.e. lacking start Met or with multiple stop codons or lacking stop codon.
	    my @temp_array=split (/ /,$pillarsinogs{$gene_seg});
	    my @temporal_array;
	    @totest=();
	    $current=();
	    $x=();$y=();$z=();$t=();
	    foreach (@temp_array){
		@temporal_array=split (/ /,$genesinpillar{$_});
		@temporal=(); # 100107 Added to avoid that two or more pillars in the same intergenic regions are tested together
		foreach $current (@temporal_array){
		    $x=$coord{$current};
		    $x=~ s/\(//g;$x=~ s/\)//g;$x=~ s/complement//g;
		    @temp=split(/\,/,$x);
		    $t=();
		    foreach $y (@temp){
			@_=split (/\.\./,$y);
			$z=substr($seq{$chr{$current}},$_[0]-1,$_[1]-$_[0]+1);
			if ($t eq undef){$t=$z;}
			else{$t=$t.$z;}
		    }
		    $t = uc $t; # Added on 100902 because sometimes the secuence is in lowercase (new bug, not in the other programs)
		    if ($coord{$current}=~ /complement/){$t=reverse $t; $t=~ tr/[ATCG]/[TAGC]/;}
		    @temp=split(//,$t);$y=();
		    for ($x=0;$x<$#temp;$x=$x+3){
			$y=$y.$translation{$temp[$x].$temp[$x+1].$temp[$x+2]};
		    }
		    push @temporal,$current;
		}
		if (@temporal ne undef){
		    $_=join(' ',@temporal);push @totest,$_;
		}
	    } 
	
    

	    
	    
	    ## Add pillars of OGS
	    foreach $test (@totest){
		$sequence=();
		$current=();
		@allgenes=();
		@hsp=();
		@values=();
		@mediansize=(); 
		my $testpil;
		
		if ($test ne undef){
		    $currentpillar=();
		    @allgenes= split(/ /,$test);
		    foreach $current (@allgenes){
			if ($currentpillar eq undef){$currentpillar=$pillars{$current};}
			$x=$coord{$current};
			$x=~ s/\(//g;
			$x=~ s/\)//g;
			$x=~ s/complement//g;
			@temp=split(/\,/,$x);
			$t=();
			foreach $y (@temp){
			    @_=split (/\.\./,$y);
			    $z=substr($seq{$chr{$current}},$_[0]-1,$_[1]-$_[0]+1);
			    if ($t eq undef){$t=$z;}
			    else{$t=$t.$z;}
			}
			$t = uc $t; # Added on 100902 because sometimes the secuence is in lowercase (new bug, not in the other programs)
			if ($coord{$current}=~ /complement/){$t=reverse $t; $t=~ tr/[ATCG]/[TAGC]/;}
			@temp=split(//,$t);
			$sequence=();
			for ($x=0;$x<$#temp;$x=$x+3){
			    $sequence=$sequence.$translation{$temp[$x].$temp[$x+1].$temp[$x+2]};
			}
			push (@mediansize,length($sequence)-1);
			open (SEQ,">>$pathsave/seq");
			$testpil = $pillars{$current};
			print SEQ ">$current\n$sequence\n";
			close (SEQ); 
		    }
				
		    @mediansize=sort(@mediansize);
		    if (($#mediansize+1)%2==1){$median=$mediansize[$#mediansize/2];}
		    else {$median=($mediansize[$#mediansize/2]+$mediansize[($#mediansize/2)+1])/2;}
				
		    $x="mv $pathsave/seq $pathsave/current/seq";system($x);
					# Build database of proteins seq
   #                 unless (-e "$pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar.".fullblast") {
			$x="formatdb -i $pathsave/current/seq -p T";system($x); 	    
	 
			
			# Launch BLASTP to find similarities to putative ORF and parse it
			#################################################################
#			print "blast1 $gene_seg d $sequence -i $intgen\n";
			$x="$blastall -p blastx -e $evalue -F \"\" -d $pathsave/current/seq -i $pathsave/current/intgen -o $pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar.".fullblast -m 8";system($x);
#		    }
 #                   unless (-e "$pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar."_full.fullblast") {
                        $x="formatdb -i $pathsave/current/seq -p T";system($x);
			$x="$blastall -p blastx -e $evalue -F \"\" -d $pathsave/current/seq -i $pathsave/current/intgen -o $pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar."_full.fullblast";system($x);
#		    }
		    open (BLAST,"<$pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar.".fullblast"); my @in = <BLAST>; close BLAST;		    		   
		    system "rm $pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar.".fullblast";
#                    system "rm $pathsave/blast_saves/out-seq".$geneid[0]."_".$currentpillar."_full.fullblast";
		    my $chromseq = $seq{$chr{$geneid[0]}};
		    print TEST2 "0 $geneid[0] $geneid[1] $currentpillar Istart $start Istop $stop Segstart $A_start Segstop $B_stop\n";
		    my ($seq_2hsp, $seq_1hsp, $ntseq_2hsp, $ntseq_1hsp, $coord_2h, $coord_1h, $two_hsp_len, $one_hsp_len, $list_hits, $correct2h);
                    if ($file_present == 1) {
                        $seq_2hsp= $inseq_hsp2{"$geneid[0] $geneid[1]"}{$currentpillar}; $ntseq_2hsp= $in_ntseq_hsp2{"$geneid[0] $geneid[1]"}{$currentpillar};
                        $coord_2h= $incoord_hsp2{"$geneid[0] $geneid[1]"}; $two_hsp_len= $inlen_hsp2{"$geneid[0] $geneid[1]"}{$currentpillar};
                        $list_hits=$inlist_hsp2{"$geneid[0] $geneid[1]"}{$currentpillar}; $correct2h= $in_correct_2h{"$geneid[0] $geneid[1]"}{$currentpillar};
                        $seq_1hsp= $inseq_hsp1{"$geneid[0] $geneid[1]"}{$currentpillar}; $ntseq_1hsp= $in_ntseq_hsp1{"$geneid[0] $geneid[1]"}{$currentpillar};
                        $coord_1h= $incoord_hsp1{"$geneid[0] $geneid[1]"}{$currentpillar}; $one_hsp_len= $inlen_hsp1{"$geneid[0] $geneid[1]"}{$currentpillar};
                        print TEST2 "from file seq_2hsp $seq_2hsp seq_1hsp $seq_1hsp\n";
                    } else {
			($seq_2hsp, $seq_1hsp, $ntseq_2hsp, $ntseq_1hsp, $coord_2h, $coord_1h, $two_hsp_len, $one_hsp_len, $list_hits, $correct2h) = &ORFmaker(\@in, $median, $intgen_exp, $mincoverage, $min_2, \@geneid, \%translation, $start, $stop, $A_start, $B_stop, $chromseq);
		    }		    
		    $coord_2h_hash{$currentpillar}{$gene_seg}= $coord_2h; $coord_1h_hash{$currentpillar}{$gene_seg}= $coord_1h; $gene_hash{$currentpillar}{$gene_seg}=$geneid[0]."-".$geneid[1]; 
		    $correct_2h{$currentpillar}{$gene_seg} = $correct2h; $correct_2h_orig{$currentpillar}{$gene_seg} = $correct2h;
		    $med_hash{$currentpillar} = $median;
		    $listofhits{$currentpillar}{$gene_seg}=$list_hits;
		    if (defined $ntseq_2hsp) { $nt_2h{$currentpillar}{$gene_seg}=$ntseq_2hsp; }
		    if (defined $ntseq_1hsp) { $nt_1h{$currentpillar}{$gene_seg}=$ntseq_1hsp; }
		    $gene_broke{$currentpillar}{$gene_seg} = "$geneid[0] $geneid[1]";		    
		    print TEST2 "test hash 2h $nt_2h{$currentpillar}{$gene_seg} 1h $nt_1h{$currentpillar}{$gene_seg} aa1h $seq_1hsp\n";
		    print TEST2 "correct 2hsp outofloop $correct2h\n";
		    
		    my $reject_2h; my $reject_1h; my $smallm; my $smallm_1h; my $bigm_1h; my $M_loc; my $S_loc_2h; my $M_loc_1h; my $S_loc_1h;
		    ### rejecting ORF if start is too far into HSP or too far in front of HSP: 2hsp version###
		    my $H_loc_2h; my $no_m=0;  my $seq_fin_2hsp; my $ntseq_fin_2hsp;
		    if ((defined $seq_2hsp) && ($seq_2hsp ne "")) {
			my $stop_front=length($seq_2hsp); my $am_loc_2h =length($seq_2hsp); my $stop_cut;
			my @split_X = split (/X/, $seq_2hsp); my $X_loc = (length($split_X[0])+1);
                        unless ($seq_2hsp =~ /^[A-Z]/) { my @split_hsp = split(/[A-Z]/, $seq_2hsp); $H_loc_2h=(length($split_hsp[0])+1); } else {$H_loc_2h = 1; }
			if ($seq_2hsp =~ /m/) { $smallm = "present"; my @sm_split = split(/m/, $seq_2hsp); $am_loc_2h = (length($sm_split[0])+1); } else { $no_m++; }
			if ($seq_2hsp =~ /M/) { my @M_split = split(/M/, $seq_2hsp); $M_loc = (length($M_split[0])+1); } else { $no_m++; }
			if ((defined $M_loc) && ($M_loc < $am_loc_2h)) { $am_loc_2h = $M_loc; }

			## length at end of HSP to reach stop##
			my $end_extend=$stop_front-$two_hsp_len-$H_loc_2h+1;

			unless (defined $alt_start{$currentpillar}) { ## disabling no M rejection rule if alternative START codon present ##
			    $seq_fin_2hsp = substr($seq_2hsp, ($am_loc_2h-1), (length($seq_2hsp)-$am_loc_2h));
			    $ntseq_fin_2hsp = substr($ntseq_2hsp, (($am_loc_2h-1)*3), ((length($seq_2hsp)-$am_loc_2h)*3));
			} else {
			    $seq_fin_2hsp = $seq_2hsp; $ntseq_fin_2hsp = $ntseq_2hsp;			    
			}
			my $count_stop=0;
			if ($seq_2hsp =~ /^\*/) { $stop_cut =3; } else { $stop_cut =2; } ## because sequences all start with * now
                        if ($seq_2hsp =~ /\*/) {
                            my @seq_2split = split(//, $seq_2hsp);
                            for (my $i=0; $i<=$#seq_2split; $i++) {
                                if ($seq_2split[$i] eq "\*") {
				    $count_stop++;
                                    print TEST2 "STOP $i $seq_2split[$i] countstop $count_stop\n";
				    if ($count_stop == $stop_cut) {
					$stop_front=$i+1;
					$S_loc_2h = abs($two_hsp_len-($stop_front-$end_extend-$H_loc_2h+1));
					last; #figure out how far into HSP stop is
				    }
                                }
                            }
                        } else {$S_loc_2h =0; }
			print TEST2 "Sloc2h $S_loc_2h no_m_2h $no_m\n";
			
			my $error_2h=0; ##Counting errors in sequence ##
			if ($S_loc_2h > 30) {  print TEST2 "R2h S_loc >30\n"; 
					      $reject2_reason{$currentpillar}{$gene_seg} = "STOP30"; $error_2h++; } 					    
                        if (($stop_front-$H_loc_2h) < (0.5*$two_hsp_len)) { $reject2_reason{$currentpillar}{$gene_seg} = "Stop in first 50% of ORF"; $error_2h++; }
                        unless (defined $alt_start{$currentpillar}) { ## disabling no M rejection rule if alternative START codon present ##
			    if ($stop_front < $am_loc_2h) { print TEST2 "STOP<START\n"; 
				$reject2_reason{$currentpillar}{$gene_seg} = "STOP<START"; $error_2h++; }
			    if ($X_loc < $am_loc_2h) { print TEST2 "X<START\n"; 
				$reject2_reason{$currentpillar}{$gene_seg} = "X<START"; $error_2h++; }
			    if ((defined $M_loc) && ($smallm ne "present") && ((($M_loc-$H_loc_2h)/$two_hsp_len) >0.2)) { $reject2_reason{$currentpillar}{$gene_seg} = "M loc"; $error_2h++ }
			}
                        if ($X_loc > $stop_front) { print TEST2 "stop <X\n"; 
						    $reject2_reason{$currentpillar}{$gene_seg} = "STOP>X"; $error_2h++; }
			if ($no_m == 2) { $reject2_reason{$currentpillar}{$gene_seg} = "No M"; $error_2h++; }
			
                        if ($error_2h >0) { $reject_2h = "rej"; $rejects_2h{$currentpillar}{$gene_seg} = $seq_2hsp; }
			if (defined $S_loc_2h) { if ($S_loc_2h < 30) { $correct_2h{$currentpillar}{$gene_seg} .= " Stop <30 aa from end of protein"; }
						 if ($S_loc_2h > 30) { $correct_2h{$currentpillar}{$gene_seg} .= " Stop >30 aa from end of protein"; } }			
			print TEST2 "correct 2hsp outofloop2 $correct_2h{$currentpillar}{$gene_seg}\n";
		    }

		    ### rejecting ORF if start is too far into HSP or too far in front of HSP: 1hsp version###
		    my $H_loc_1h; my $no_m_1h=0; my $seq_fin_1hsp; my $ntseq_fin_1hsp;
		    if ((defined $seq_1hsp) && ($seq_1hsp ne "")) {
			my $stop_front=length($seq_1hsp); my $am_loc_1h = length($seq_1hsp); my $stop_cut;
			if ($seq_1hsp =~ /m/) { $smallm_1h = "present"; my @sm_split = split(/m/, $seq_1hsp); $am_loc_1h = (length($sm_split[0])+1);} else { $no_m_1h++; }
			if ($seq_1hsp =~ /M/) { $bigm_1h = "present"; my @M_split = split(/M/, $seq_1hsp); $M_loc_1h = (length($M_split[0])+1); } else { $no_m_1h++; }
			if ((defined $M_loc_1h) && ($M_loc_1h < $am_loc_1h)) { $am_loc_1h = $M_loc_1h; }
			unless ($seq_1hsp =~ /^[A-Z]/) { my @split_hsp = split(/[A-Z]/, $seq_1hsp); $H_loc_1h =(length($split_hsp[0])+1); }  else {$H_loc_1h = 1; }

			## length at end of HSP to reach stop##
			my $end_extend=$stop_front-$one_hsp_len-$H_loc_1h+1;
                        unless ((defined $alt_start{$currentpillar}) || (($smallm_1h != "present") && ($bigm_1h != "present"))) { ## disabling no M rejection rule if alternative START codon present ##
 			    $seq_fin_1hsp = substr($seq_1hsp, ($am_loc_1h-1), (length($seq_1hsp)-$am_loc_1h));
                            $ntseq_fin_1hsp = substr($ntseq_1hsp, (($am_loc_1h-1)*3), ((length($seq_1hsp)-$am_loc_1h)*3));
                        } else {
                            $seq_fin_1hsp = $seq_1hsp; $ntseq_fin_1hsp = $ntseq_1hsp;
			}
                        if ($seq_1hsp =~ /^\*/) { $stop_cut =3; } else { $stop_cut =2; } ## because sequences all start with * now
			my $count_stop=0;
			if ($seq_1hsp =~ /\*/) {
			    my @seq_1split = split(//, $seq_1hsp);
			    for (my $i=0; $i<=$#seq_1split; $i++) {
				if ($seq_1split[$i] eq "\*") {
				    $count_stop++;
#                                    print TEST2 "STOP $i $seq_1split[$i] countstop $count_stop\n";
				    if ($no_m_1h <2) { ## if Met present, allow readthrough stop codon! 
					if ($count_stop == ($stop_cut)) { $correct_1h{$currentpillar}{$gene_seg} = "Readthrough of stop codon"; }
					if ($count_stop == $stop_cut) { 
					    $stop_front=$i+1; 
					    $S_loc_1h = abs($one_hsp_len-($stop_front-$H_loc_1h-$end_extend+1));
					    print TEST2 "Sloc1h $S_loc_1h ($one_hsp_len-($stop_front-$H_loc_1h-$end_extend+1))\n";
					    last; #figure out how far into HSP stop is 
					}
				    }
				    if (($no_m_1h == 2) && ($alt_start{$currentpillar} eq undef)) { ## If no Met present, do NOT allow readthrough as long as no alternative STARTs exist;
					$correct_1h{$currentpillar}{$gene_seg} = "No starting ATG present";
                                        if ($count_stop == ($stop_cut-1)) { $stop_front=$i+1; $S_loc_1h = $one_hsp_len-($stop_front-$end_extend-$H_loc_1h+1);
                                                                print TEST2 "Sloc1h $S_loc_1h ($one_hsp_len-($stop_front-$end_extend-$H_loc_1h+1))\n";
                                                                last; }
				    }
				    
				}
			    }
			} else { $S_loc_1h =0; }
			print TEST2 "Sloc1h $S_loc_1h no_m_1h $no_m_1h am_loc $am_loc_1h full length $stop_front aa1fin $seq_fin_1hsp\n";
			print TEST2 "correct 1hsp 1 $correct_1h{$currentpillar}{$gene_seg}\n";
			my $S_perc = $S_loc_1h/($one_hsp_len);

			my $error_1h = 0; ### Counting errors in sequence ##
			if ($no_m_1h == 2) { $reject1_reason{$currentpillar}{$gene_seg} = "No M"; $error_1h++; }
			if ($S_loc_1h > 30) { # print TEST2 "R1h S_loc >30 $am_loc_1h stop $stop_front\n"; 
					      $reject1_reason{$currentpillar}{$gene_seg} = "STOP30"; $error_1h++; }
			if (($stop_front-$H_loc_1h) < (0.5*$one_hsp_len)) { $reject1_reason{$currentpillar}{$gene_seg} = "Stop in first 50% of ORF"; $error_1h++; }
			unless (defined $alt_start{$currentpillar}) { ## disabling no M rejection rule if alternative START codon present ##
			    if ((defined $M_loc_1h) && ($smallm_1h ne "present") && ((($M_loc_1h-$H_loc_1h)/$one_hsp_len) >0.2)) { $reject1_reason{$currentpillar}{$gene_seg} = "M_loc"; $error_1h++; }
			    if (defined $am_loc_1h && $am_loc_1h > $stop_front) { $reject1_reason{$currentpillar}{$gene_seg} = "Stop before first start"; $error_1h++; }
			}

			if ($error_1h >1) { $reject_1h = "rej"; $rejects_1h{$currentpillar}{$gene_seg} = $seq_1hsp; }
                        if (defined $S_loc_1h) { if ($S_loc_1h < 30) { $correct_1h{$currentpillar}{$gene_seg} .= " Stop <30 aa from end of protein"; }
						 if ($S_loc_1h > 30) { $correct_1h{$currentpillar}{$gene_seg} .= " Stop >30 aa from end of protein"; } }
			print TEST2 "errorcount: 1h $error_1h\n";
		    }		 
		    print TEST2 "correct 1hsp 2 $correct_1h{$currentpillar}{$gene_seg}\n";
		    print TEST2 "R2hsp smallm $smallm mloc $M_loc hloc $H_loc_2h $two_hsp_len A: (($M_loc-$H_loc_2h)/$two_hsp_len) A2: ($H_loc_2h/$two_hsp_len) B: ($M_loc/$two_hsp_len) C: (A2) $reject_2h\n";
		    print TEST2 "R1hsp smallm $smallm_1h mloc $M_loc_1h hloc $H_loc_1h $one_hsp_len A: (($M_loc_1h-$H_loc_1h)/$one_hsp_len) A2: ($H_loc_1h/$one_hsp_len) B: ($M_loc_1h/$one_hsp_len) C: (A2) $reject_1h\n";
		    
			if (((defined $seq_2hsp) && ($reject_2h ne "rej")) && ((defined $seq_1hsp) && ($reject_1h ne "rej"))) {
                        print TEST2 "both present 2h $seq_2hsp 1h $seq_1hsp\n";
                        print TEST2 "fin2 $seq_fin_2hsp fin1 $seq_fin_1hsp\n";
			open OUT, ">$pathsave/tmp_neworf_aa_query1.tab"; print OUT ">$gene_seg\n$seq_fin_1hsp"; close OUT;
			open OUT, ">$pathsave/tmp_neworf_aa_query2.tab"; print OUT ">$gene_seg\n$seq_fin_2hsp"; close OUT;
			my $BLASTOUT1 = $blast_new_dir."/".$gene_seg."_".$currentpillar."_1.html"; my $BLASTOUT2 = $blast_new_dir."/".$gene_seg."_".$currentpillar."_2.html";
			my $BLASTOUT_full = $blast_new_dir."/".$gene_seg."_".$currentpillar.".html"; my $BLASTOUT_form = ">".$blast_new_dir."/".$gene_seg."_".$currentpillar."_form.html";
#			print TEST2 "testpil $testpil currentpillar $currentpillar\n";
			system "formatdb -i $pathsave/current/seq -p T -o T";
		#	print "blast3 $gene_seg d $sequence -i $seq_fin_1hsp\n";
			system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query1.tab -o $BLASTOUT1 -e 10 -v 100 -b 100 -F F -m 8";
		#	print "blast4 $gene_seg d $sequence -i $seq_fin_2hsp\n";
			system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query2.tab -o $BLASTOUT2 -e 10 -v 100 -b 100 -F F -m 8";
#			system "rm $pathsave/tmp_neworf_aa_query*";
			open IN, "<".$blast_new_dir."/".$gene_seg."_".$currentpillar."_1.html"; my @in1 = <IN>; close IN;
			open IN, "<".$blast_new_dir."/".$gene_seg."_".$currentpillar."_2.html"; my @in2 = <IN>; close IN;
			
			my @blast_split1 = split(/\t/, $in1[0]); chomp $blast_split1[11]; my $bit_score1 = $blast_split1[11];
                        my @blast_split2 = split(/\t/, $in2[0]); chomp $blast_split2[11]; my $bit_score2 = $blast_split2[11];
			if ($bit_score2 > $bit_score1) {			    
			    print TEST2 "$geneid[0] $geneid[1] $currentpillar bit2 $bit_score2 > bit1 $bit_score1\n";
			    $sequence_all{$currentpillar}{$gene_seg} = $seq_2hsp;
                            $coord_all{$currentpillar}{$gene_seg}= $coord_2h;
			    $nt_all{$currentpillar}{$gene_seg}=$ntseq_2hsp;
                            $hsp_all{$currentpillar}{$gene_seg}= $two_hsp_len;
			    $correct_all{$currentpillar}{$gene_seg}= $correct_2h{$currentpillar}{$gene_seg};			    
	#		    print "blast5 $gene_seg d $pathsave/current/seq -i $seq_fin_2hsp\n";			    
			    system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query2.tab -o $BLASTOUT_full -e 10 -v 100 -b 100 -F F";
                            open OUT, ">".$BLASTOUT_form;
			    open IN, "<".$BLASTOUT_full; my @in = <IN>; close IN;
			    print OUT "<html><pre>"; foreach my $line (@in) { print OUT "$line"; } print OUT "</html>"; close OUT;
                            system "rm ".$BLASTOUT_full;

#			    print TEST2 "All1 $correct_all{$currentpillar}{$gene_seg} $nt_all{$currentpillar}{$gene_seg}\n";
			} else {
#			    print TEST2 "$geneid[0] $geneid[1] $currentpillar bit1 $bit_score1 > bit2 $bit_score2\n";
#                            print TEST2 length($seq_2hsp)." < ".length($seq_1hsp)."\n";
			    $sequence_all{$currentpillar}{$gene_seg} = $seq_1hsp;
                            $coord_all{$currentpillar}{$gene_seg}= $coord_1h;
                            $nt_all{$currentpillar}{$gene_seg}=$ntseq_1hsp;
                            $hsp_all{$currentpillar}{$gene_seg}= $one_hsp_len;
                            $correct_all{$currentpillar}{$gene_seg}= $correct_1h{$currentpillar}{$gene_seg};
	#		    print "blast6 $gene_seg d $pathsave/current/seq -i $seq_fin_1hsp\n";
			    system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query1.tab -o $BLASTOUT_full -e 10 -v 100 -b 100 -F F";
                            open OUT, ">".$BLASTOUT_form;
                            open IN, "<".$BLASTOUT_full; my @in = <IN>; close IN;
                            print OUT "<html><pre>"; foreach my $line (@in) { print OUT "$line"; } print OUT "</html>"; close OUT;
			    system "rm ".$BLASTOUT_full;

 #                           print TEST2 "All2 $correct_all{$currentpillar}{$gene_seg} $nt_all{$currentpillar}{$gene_seg}\n";
			}
		    }
#			print TEST1 "outside coord2h $coord_2h coord1h $coord_1h\n";
		    
			if ((defined $seq_2hsp) && ($reject_2h ne "rej")) {
 #                       print TEST2 "2 present\n";
			$seq2hsp_hash{$currentpillar}{$gene_seg} = $seq_2hsp;
			if (($seq_1hsp eq undef) || ($reject_1h eq "rej")) {
			    open OUT, ">$pathsave/tmp_neworf_aa_query.tab"; print OUT ">$gene_seg\n$seq_fin_2hsp"; close OUT;
	#		    print ">$gene_seg\n$seq_fin_2hsp\n";
			    my $BLASTOUT_full = $blast_new_dir."/".$gene_seg."_".$currentpillar.".html"; my $BLASTOUT_form = ">".$blast_new_dir."/".$gene_seg."_".$currentpillar."_form.html";
			    system "formatdb -i $pathsave/current/seq -p T -o T";
		#	    print "blast7 $gene_seg d $sequence -i $seq_fin_2hsp\n";
                            system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query.tab -o $BLASTOUT_full -e 10 -v 100 -b 100 -F F";
                            open OUT, ">".$BLASTOUT_form;
                            open IN, "<".$BLASTOUT_full; my @in = <IN>; close IN;
                            print OUT "<html><pre>"; foreach my $line (@in) { print OUT "$line"; } print OUT "</html>"; close OUT;
                            system "rm ".$BLASTOUT_full;

#			    print TEST2 " 1 undef/rej\n";
			    $sequence_all{$currentpillar}{$gene_seg} = $seq_2hsp;
			    $coord_all{$currentpillar}{$gene_seg}= $coord_2h;
			    $nt_all{$currentpillar}{$gene_seg}=$ntseq_2hsp;
			    $hsp_all{$currentpillar}{$gene_seg}= $two_hsp_len;
                            $correct_all{$currentpillar}{$gene_seg}= $correct_2h{$currentpillar}{$gene_seg};
 #                           print TEST2 "All3 $correct_all{$currentpillar}{$gene_seg} $nt_all{$currentpillar}{$gene_seg}\n";
			}
		    }
			if ((defined $seq_1hsp) && ($reject_1h ne "rej")) {			
			    $seq1hsp_hash{$currentpillar}{$gene_seg} = $seq_1hsp;
#			print TEST2 "1 present\n";
			if (($seq_2hsp eq undef) || ($reject_2h eq "rej")) {
			    print TEST2 "$gene_seg 1 $seq_1hsp 1f $seq_fin_1hsp\n";
                            open OUT, ">$pathsave/tmp_neworf_aa_query.tab"; print OUT ">$gene_seg\n$seq_fin_1hsp"; close OUT;
	#		    print ">$gene_seg\n$seq_fin_1hsp\n";
                            my $BLASTOUT_full = $blast_new_dir."/".$gene_seg."_".$currentpillar.".html"; my $BLASTOUT_form = ">".$blast_new_dir."/".$gene_seg."_".$currentpillar."_form.html";
                            system "formatdb -i $pathsave/current/seq -p T -o T";
	#		    print "blast8 $gene_seg d $sequence -i $seq_fin_1hsp -ii $seq_1hsp\n";
                            system "$blastall -p blastp -d $pathsave/current/seq -i $pathsave/tmp_neworf_aa_query.tab -o $BLASTOUT_full -e 10 -v 100 -b 100 -F F";
                            open OUT, ">".$BLASTOUT_form;
                            open IN, "<".$BLASTOUT_full; my @in = <IN>; close IN;
                            print OUT "<html><pre>"; foreach my $line (@in) { print OUT "$line"; } print OUT "</html>"; close OUT;
                            system "rm ".$BLASTOUT_full;

#			    print TEST2 "2 undef/rej\n";
			    $sequence_all{$currentpillar}{$gene_seg} = $seq_1hsp;
			    $coord_all{$currentpillar}{$gene_seg}= $coord_1h;
                            $nt_all{$currentpillar}{$gene_seg}=$ntseq_1hsp;
                            $hsp_all{$currentpillar}{$gene_seg}= $one_hsp_len;
                            $correct_all{$currentpillar}{$gene_seg}= $correct_1h{$currentpillar}{$gene_seg};
#                            print TEST2 "All4 $correct_all{$currentpillar}{$gene_seg} $nt_all{$currentpillar}{$gene_seg}\n";
			}
			}
		    if ((-e $pathsave."/tmp_neworf_aa_query.tab") || (-e $pathsave."/tmp_neworf_aa_query1.tab") || (-e $pathsave."/tmp_neworf_aa_query2.tab")) { system "rm $pathsave/tmp_neworf_aa_query*"; }	    
		}	   
	    }
	}
#	else { print BAD "$gene_seg\n";}
    }
    
    foreach my $pill (keys %sequence_all){ foreach my $gene_seg (keys %{$sequence_all{$pill}}) {
	if (defined $sequence_all{$pill}{$gene_seg}) { print SAVE5 ">".$gene_broke{$pill}{$gene_seg}."\t$listofhits{$pill}{$gene_seg}\t$coord_all{$pill}{$gene_seg}\t$correct_all{$pill}{$gene_seg}\t\n$sequence_all{$pill}{$gene_seg}\n";
						   } } }
        foreach my $pill (keys %nt_all){ foreach my $gene_seg (keys %{$nt_all{$pill}}) { print NT $gene_broke{$pill}{$gene_seg}."\t$listofhits{$pill}{$gene_seg}\t$nt_all{$pill}{$gene_seg}\n"; } }

    close SAVE5; 
# close SAVE2; close SAVE3; close REJ;close (SAVE); close (BAD); 
close TEST1; close TEST2;  

    close NT;
}

close ERROR;

#my $endtime=new Benchmark;
#my $totaltime=timediff($endtime,$startime);
#print "The code took: ",timestr($totaltime),"\n";
print "Step 2 (find ORFs) completed\n\n";

sub ORFmaker{
    my ($blast_in_ref, $median, $intgen_full, $min_1, $min_2, $genepair_ref, $translation_ref, $int_start, $int_stop, $seg_start, $seg_stop, $seq_chrom) = @_;
    my %translation = %{$translation_ref}; my @blast_in = @{$blast_in_ref}; my @genepair = @{$genepair_ref};
    my $z1=0; my $z2=0; my $coord_2hsp; my $coord_1hsp;
    my $intgen_original = $intgen_full;
    my $len; my $count_hsp=0;
    my $small1; my $small2; my $big1; my $big2;
    my $small1_A; my $small2_A; my $big1_A; my $big2_A;
    my @start; my @stop; my @hstart; my @hstop;
    my $or1; my $or2; my $total_olap =0; my $olap; my $dist; my $aa_1; my $aa_2;
    my @temp; my $sequence_tot; my $sequence_tot_1hsp; my $nt_sequence_tot; my $nt_sequence_tot_1hsp;

    chomp $blast_in[0];
    my @temp_l1= split(/\t/,$blast_in[0]);
    my $top = $temp_l1[1];
    
    my @top_temp= split(/\t/,$blast_in[0]);
    my $toplen = abs($top_temp[8]-$top_temp[9]);
    my $hitlist = $top; my $lasthit = $top;

    for (my $n=0; $n<2; $n++) { ## looking at top 2 results;
        if (defined $blast_in[$n]) {
            chomp $blast_in[$n];
            @temp= split(/\t/,$blast_in[$n]);
	    unless ($temp[1] eq $lasthit) { $hitlist .= " $temp[1]"; }
            my $x=abs($temp[8]-$temp[9]);
            if ($temp[1] eq $top) { #provided both hits are to same protein.
                push @start, $temp[6]; push @stop, $temp[7]; #query start and stop
                push @hstart, $temp[8]; push @hstop, $temp[9]; ##hit start and stop
                if ($x>10) { ## if protein length hit is > 10aa
                    $z2=1;
                    $len +=$x; ## overall length of protein hit
                    $count_hsp++; ### count of number of eligable hsps
#                    print "$genepair[0] $top x $x len $len\n";
                }
            }
	    $lasthit = $temp[1];
        }
    }
    ### big enough relative to median? ###
    my $ly2=$median;
    my $lx2 = $len;
    if ($ly2<$lx2){($lx2,$ly2)=($ly2,$lx2);}
    if ($lx2/$ly2>$min_2){
        $z2=1;
    }
    my $lz2 = $toplen;
    if ($ly2<$lz2){($lz2,$ly2)=($ly2,$lz2);}
    if ($lz2/$ly2>$min_1){
        $z1=1;
    }
    my $begin; my $end; my $correct_2hsp;
    if (($count_hsp == 2) && ($z2 == 1)) { ## 2hsp case (and long enough) "tie" them together
        ### order coordinates so going from "smaller" to "bigger" regardless of direction of hsp ###
        if ($stop[0]<$start[0]){($small1,$big1)=($stop[0],$start[0]); $or1="-"; } else { ($small1,$big1)=($start[0],$stop[0]); $or1="+";}
        if ($stop[1]<$start[1]){($small2,$big2)=($stop[1],$start[1]); $or2="-";} else { ($small2,$big2)=($start[1],$stop[1]); $or2="+";}

        ### hsps need to not overlap too much, be close enough together and orient in the same direction ###
        if ((($small1>=$small2) && ($big1<=$big2)) || (($small2>=$small1) && ($big2<=$big1))) {
            $total_olap = 1; ## mark that a total overlap exists.
        }
        my $hsm1; my $hbig1; my $hsm2; my $hbig2;
        if (($or1 eq $or2) && ($total_olap == 0)) {
            if ($small1<$small2){($small1_A,$small2_A)=($small1,$small2);} else { ($small1_A,$small2_A)=($small2,$small1);}
            if ($big1<$big2) { ## order query and hit coordinates
                ($big1_A,$big2_A)=($big1,$big2); $hsm1=$hstart[0]; $hbig1 =$hstop[0]; $hsm2 = $hstart[1]; $hbig2 = $hstop[1];
            } else {
                ($big1_A,$big2_A)=($big2,$big1); $hsm1=$hstart[1]; $hbig1 =$hstop[1]; $hsm2 = $hstart[0]; $hbig2 = $hstop[0];
            }
	    $begin = $int_start-$seg_start+$small1_A; $end=$int_start-$seg_start+$big2_A;
            $dist = $small2_A - $big1_A; my $dist_hsp;
	    if ($hstart[1]>$hstart[0]) { $dist_hsp = ($hstart[1]-$hstop[0]); } else { $dist_hsp = ($hstart[0]-$hstop[1]); } 
	    my $range = $big2_A-$small1_A; my $pot_olap = $big1_A-$small2_A;
            if (($dist <120) && (abs($dist_hsp) <= abs(2*$dist/3)) && (abs($dist_hsp) >= abs(0.5*($dist/3)))) {  ##HSPs no more than Xnts apart!
#		print TEST1 "$genepair[0] $genepair[1] $top 2 ($small1_A $big1_A $small2_A $big2_A) ($hsm1 $hbig1 $hsm2 $hbig2)\n";
		print TEST1 "Dist $dist\n";
		print TEST1 "INTGEN_EXP: $intgen_full\n";
		my $ptA= ($int_start-$seg_start)+$small1_A; my $ptB= ($int_start-$seg_start)+$big1_A; my $ptC= ($int_start-$seg_start)+$small2_A; my $ptD=($int_start-$seg_start)+$big2_A;
		my $seq_1 = substr($intgen_full, ($ptA-1), ($ptB-$ptA+1)); ### NT Sequence of first HSP
		my $seq_2 = substr($intgen_full, ($ptC-1), ($ptD-$ptC+1)); ### NT Sequence of second HSP
		
		if ($or1 eq "-") { ### reverse sequences and coordinates. ###
		    my $prev_ptA =$ptA; my $prev_ptB =$ptB; my $prev_ptC =$ptC; my $prev_ptD =$ptD;
		    ($hsm2,$hsm1) = ($hsm1,$hsm2); ($hbig2,$hbig1)=($hbig1,$hbig2);
		    $intgen_full = &rev_comp($intgen_full);
		    ($seq_1,$seq_2)=($seq_2,$seq_1);
		    $seq_1=&rev_comp($seq_1); $seq_2 = &rev_comp($seq_2);
		    $ptA = length($intgen_full)-$prev_ptD+1; $ptB = length($intgen_full)-$prev_ptC+1; $ptC = length($intgen_full)-$prev_ptB+1; $ptD = length($intgen_full)-$prev_ptA+1; 
		}
		
		my $pre_aa; my $post_aa; my $pre_valid=0;
		my $sequence = ""; my $nt_seq = "";
		if (($hsm1 < $hsm2) && ($hbig1 < $hbig2)) { ## Check  hits are in right order
		    my $preseq = substr($intgen_full, 0, ($ptA-1)); my $postseq = substr($intgen_full, $ptD, (length($intgen_full)-($ptD)));
		    
		    my @temp_s1 = split(//, $seq_1); my @temp_s2 = split(//, $seq_2);
		    my @temp_pre = split(//, $preseq); my @temp_post = split(//, $postseq);
		    my $last_aa; my $pre_seq_tr; my $post_seq_tr;
		    for (my $x1=0;$x1<$#temp_s1;$x1=$x1+3){  ## aa seqs of first and second hsps
			$aa_1=$aa_1.$translation{$temp_s1[$x1].$temp_s1[$x1+1].$temp_s1[$x1+2]};
		    }
		    my $first_aa = $translation{$temp_s1[0].$temp_s1[1].$temp_s1[2]};
		    for (my $x2=0;$x2<$#temp_s2;$x2=$x2+3){
			$aa_2=$aa_2.$translation{$temp_s2[$x2].$temp_s2[$x2+1].$temp_s2[$x2+2]};
			$last_aa = $translation{$temp_s2[$x2].$temp_s2[$x2+1].$temp_s2[$x2+2]};
		    }
		    unless ($last_aa eq "*") { ### extend hsp to "finish" and extend to "start";
			for (my $x3=0;$x3<$#temp_post;$x3=$x3+3) {
			    if ($translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]} eq "*") {
				$post_aa = $post_aa.$translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]};
                                $post_seq_tr .= $temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2];
				last;
			    } else {
				$post_aa = $post_aa.$translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]};
				$post_seq_tr .= $temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2];
			    }
			}
		    }
		    for (my $x3=$#temp_pre-2; $x3>=0; $x3=$x3-3) {
			if ($translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]} eq "*") {
			    $pre_aa = $translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]}.$pre_aa;
			    $pre_seq_tr = $temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2].$pre_seq_tr;
			    last;
			} else {
			    $pre_aa = $translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]}.$pre_aa;
                            $pre_seq_tr = $temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2].$pre_seq_tr;
			}
		    }
		    print TEST1 "pre-pre_aa $pre_aa\n";

		    $pre_aa=lc($pre_aa); $post_aa=lc($post_aa);
		    my $len_preaa=length($pre_aa); my $len_postaa=length($post_aa); my $len_prent=length($preseq); my $len_postnt=length($postseq);
		    print TEST1 "top $or1 aa1 $aa_1 aa2 $aa_2 preaa $pre_aa postaa $post_aa\n";
		    print TEST1 "seq_1 $seq_1\n seq_2 $seq_2\n preseq $preseq\n postseq $postseq lpreaa $len_preaa lpostaa $len_postaa lprent $len_prent lpostnt $len_postnt\n";
#		    print TEST1 "pre_aa $pre_aa post_aa $post_aa\n";
		    my $midseq;
		    if ($dist > 3) { ##non-overlap cases		       
			$midseq = substr($intgen_full, $ptB, ($dist-1));
			my @temp_midseq = split(//, $midseq);
			my $rem = ($dist-1) % 3; my $mid_1; my $mid_2; my $nt_mid1; my $nt_mid2;
			for (my $m=0;$m<$#temp_midseq;$m=$m+3) { ## middle aaseq, synced to first hsp, then to second.
			    $mid_1 = $mid_1.$translation{$temp_midseq[$m].$temp_midseq[$m+1].$temp_midseq[$m+2]};
			    $nt_mid1 = $nt_mid1.$temp_midseq[$m].$temp_midseq[$m+1].$temp_midseq[$m+2];
			}
			for (my $m=$rem;$m<$#temp_midseq;$m=$m+3) {
			    $mid_2 = $mid_2.$translation{$temp_midseq[$m].$temp_midseq[$m+1].$temp_midseq[$m+2]};
			    $nt_mid2 = $nt_mid2.$temp_midseq[$m].$temp_midseq[$m+1].$temp_midseq[$m+2];
			}
			if ($rem == 0) {
#			    print TEST2 "2HSPs in frame:<br> 2 HSPs separated by $dist nt region of low similarity in query\n";
			    $correct_2hsp = "2HSPs in frame:<br> 2 HSPs separated by $dist nt region of low similarity in query, ".$dist_hsp." nt region in hit";
			    $sequence = $aa_1.$mid_1.$aa_2;
			    $nt_seq = $seq_1.$nt_mid1.$seq_2;
			} else {			
#			    print TEST2 "2HSPs out of frame:<br>  2 non-overlapping HSPs separated by $dist\\n";
                            $correct_2hsp = "2HSPs out of frame:<br>  2 non-overlapping HSPs separated by $dist nt region in query, ".$dist_hsp." nt region in hit, frameshift corrected";
			    my $halflen = int((length($mid_1)+1)/2);
			    my @midcut1 = split (/\*/, $mid_1); my $mid_sh1 = $midcut1[0]; ## where do we hit stops?
			    my @midcut2 = split (/\*/, $mid_2); my $lastent = $#midcut2; my $mid_sh2 = $midcut2[$lastent];
			    my $crossover = length($mid_sh1)-(length($mid_2)-length($mid_sh2));
			    my $between_pt = (length($mid_2)-length($mid_sh2))+ int((abs($crossover)+1)/2); 
#			    print TEST1 "mid_1 $mid_1 mid_2 $mid_2 ntmid1 $nt_mid1 ntmid2 $nt_mid2 $mid_sh1 $mid_sh2\n"; 
			    if ($crossover >= 0) {
				my $mid_c1 = substr($mid_1, 0, $between_pt-1); $mid_c1 = lc($mid_c1);
				my $mid_c2 = substr($mid_2, ($between_pt), (length($mid_2)-$between_pt)); $mid_c2 = lc($mid_c2); 
				$sequence = $aa_1.$mid_c1."X".$mid_c2.$aa_2; ### both midbits extend beyond halfway
				my $nt_midc1 = substr($nt_mid1, 0, ($between_pt*3-3)); my $nt_midc2 = substr($nt_mid2, ($between_pt*3), (length($nt_mid2)-($between_pt*3)));
				my $n_add; for (my $n=0; $n<3; $n++) { $n_add = $n_add."N"; }
				$nt_seq = $seq_1.$nt_midc1.$n_add.$nt_midc2.$seq_2;
			    }
			    if ($crossover < 0) {
				my $mid_c1 = substr($mid_1, 0, ($halflen-1)); $mid_c1 = lc($mid_c1);
				my $mid_c2 = substr($mid_2, ($halflen), (length($mid_2)-$halflen)); $mid_c2 = lc($mid_c2);
                                my $nt_midc1 = substr($nt_mid1, 0, ($halflen*3-3)); my $nt_midc2 = substr($nt_mid2, ($halflen*3), (length($nt_mid2)-($halflen*3)));
                                my $n_add; for (my $n=0; $n<3; $n++) { $n_add = $n_add."N"; }
                                $nt_seq = $seq_1.$nt_midc1.$n_add.$nt_midc2.$seq_2;
				$sequence = $aa_1.$mid_c1."X".$mid_c2.$aa_2; ### both midbits extend beyond halfway
			    }
			}
			if (($dist >0) && ($dist <=3)) {
#			    print TEST2 "2HSPs out of frame:<br> 2 non-overlapping HSPs separated by $dist nt\n";
                            $correct_2hsp = "2HSPs out of frame:<br> 2 non-overlapping HSPs separated by $dist nt region in query, ".$dist_hsp." nt region in hit, frameshift corrected";
			    chop $aa_1;
			    $sequence = $aa_1."X".$aa_2;
                            my $n_add; for (my $n=0; $n<3; $n++) { $n_add = $n_add."N"; }
			    $nt_seq = $seq_1.$n_add.$seq_2;
#			    print TEST1 "$or1 E: $sequence length_seq ".length($sequence)."length_nt ".length($nt_seq)." dist $dist midaalen 1 midntlen ".length($n_add)."\n";
			}
		    }
		    print TEST1 "$genepair[0] $genepair[1] hit $top dist $dist or1 or2 $or1 $or2 total_olap $total_olap s1 $small1 b1 $big1 s2 $small2 b2 $big2 s1A $small1_A b1A $big1_A s2A $small2_A b2A $big2_A\n";
#		    print TEST1 "$or1 1: $aa_1\n2: $aa_2\nSeq1 $seq_1\nSeq2 $seq_2\nMidseq: $midseq\n";
#		    print TEST1 "$pot_olap $range ".($pot_olap/$range)."\n";
		    
		    if ($dist <=0) {
			if (abs($pot_olap/$range) <= 0.7) {			    
#			    print TEST2 "Overlap 2HSPs out of frame:<br> 2 Overlapping HSPs separated by $dist nt\n";
                            $correct_2hsp = "Overlap 2HSPs out of frame:<br> 2 Overlapping HSPs separated by $dist nt region in query, ".$dist_hsp." nt region in hit, frameshift corrected";
			    my $n_to_add = 3;
			    my $olap1 = $ptC-$ptA; my $olap1_corr;
			    if ($olap1 % 3 == 1) { $olap1_corr = (($olap1+2)/3); }
			    if ($olap1 % 3 == 2) { $olap1_corr = (($olap1+1)/3); }
			    if ($olap1 % 3 == 0) { $olap1_corr = ($olap1/3); }
			    
			    my $olap2 = $ptB-$ptC; my $olap2_corr;
			    if ($olap2 % 3 == 1) { $olap2_corr = (($olap2-1)/3); }
			    if ($olap2 % 3 == 2) { $olap2_corr = (($olap2-2)/3); }
			    if ($olap2 % 3 == 0) { $olap2_corr = ($olap2/3); }
			    
			    my $y1_o1 = substr($aa_1, 0, $olap1_corr);
			    my $y2_o2 = substr($aa_2, $olap2_corr, (length($aa_2)-$olap2_corr));
			    my $ovseq1 = substr($aa_1, $olap1_corr, (length($aa_1)-$olap1_corr));
			    my $ovseq2 = substr($aa_2, 0, $olap2_corr);			    
			    my $nt_y1o1 = substr($seq_1, 0, $olap1_corr*3); my $nt_y2o2 = substr($seq_2, ($olap2_corr*3), (length($seq_2)-($olap2_corr*3)));
			    my $nt_ov1 = substr($seq_1, ($olap1_corr*3), (length($seq_1)-($olap1_corr*3))); my $nt_ov2 = substr($seq_2, 0, ($olap2_corr*3));
			    			   
			    my $halflen1 = int((length($ovseq1)+1)/2);
			    my $halflen2 = int((length($ovseq2)+1)/2); my $mid_ov1; my $mid_ov2;
			    unless ($ovseq1 =~ /^\*/) { if ($ovseq1 =~ /\*/) { my @ovcut1 = split (/\*/, $ovseq1); $mid_ov1 = $ovcut1[0]; } else { $mid_ov1= $ovseq1; } } else { $mid_ov1 = ""; } 
			    unless (reverse($ovseq2) =~ /^\*/) { if ($ovseq2 =~ /\*/) { my @ovcut2 = split (/\*/, $ovseq2); my $lastent = $#ovcut2; $mid_ov2 = $ovcut2[$lastent]; 
										   } else { $mid_ov2 = $ovseq2; } } else { $mid_ov2 = ""; }			
#			    print TEST1 "$or1 olap1 $olap1 olap2 $olap2 olap1_corr $olap1_corr olap2_corr $olap2_corr ovseq_1 $ovseq1 ovseq_2 $ovseq2\n";
			    if ($dist >-3) {
				chop $aa_1;
				$aa_2 = substr($aa_2,1,length($aa_2)-1);
				$sequence = $aa_1."X".$aa_2;
				my $nt_2 = substr($seq_2,3,length($seq_2)-3);
				my $n_add; for (my $n=0; $n<$n_to_add; $n++) { $n_add = $n_add."N"; }
                                $nt_seq = $seq_1.$n_add.$nt_2;
#				print TEST1 "1 codon overlap, sequence: $sequence lenseq ".length($sequence)."length nt ".length($nt_seq). "dist $dist midaalen 1 midntlen ".length($n_add). "\n";
			    } else {
				my $crossover = length($mid_ov1)-(length($ovseq2)-length($mid_ov2));
				my $between_pt = (length($ovseq2)-length($mid_ov2))+int((abs($crossover)+1)/2);
				if ($crossover >= 0) {
				    my $o1_frag = substr($ovseq1, 0, $between_pt-1);
				    my $o2_frag = substr($ovseq2, ($between_pt), (length($ovseq2)-$between_pt));
				    my $nt_f1 = substr($nt_ov1, 0, ($between_pt*3-3)); my $nt_f2 = substr($nt_ov2, ($between_pt*3), (length($nt_ov2)-($between_pt*3)));
				    my $n_add; for (my $n=0; $n<$n_to_add; $n++) { $n_add = $n_add."N"; }
				    $nt_seq = $nt_y1o1.$nt_f1.$n_add.$nt_f2.$nt_y2o2;
				    				   
#				    print TEST1 "OLAPFRAGS $o1_frag $o2_frag ".($between_pt+1)." ".(length($ovseq2)-$between_pt);
				    $sequence = $y1_o1.$o1_frag."X".$o2_frag.$y2_o2; ### both midbits extend beyond halfway
#				    print TEST1 "$or1 OV A: $crossover $between_pt $sequence\n";    ### cut from hsp 1 to hsp 2 at approx midpt ###
#				    print TEST1 "NtestOVA $nt_f1 $n_add $nt_f2 dist $dist length_seq ".length($sequence)." length_nt ".length($nt_seq)." ovaalen ".(3*(length($o1_frag)+length($o2_frag)+1))." midntlen ".(length($nt_f1)+length($n_add)+length($nt_f2))."\n";
				    
				}
				if ($crossover < 0) {
				    my $o1_frag = substr($ovseq1, 0, ($halflen1-1));
                                    my $o2_frag = substr($ovseq2, ($halflen1), (length($ovseq2)-$halflen1));
                                    my $nt_f1 = substr($nt_ov1, 0, ($halflen1*3)-3); my $nt_f2 = substr($nt_ov2, ($halflen1), (length($nt_ov2)-($halflen1*3)));
                                    my $n_add; for (my $n=0; $n<$n_to_add; $n++) { $n_add = $n_add."N"; }
                                    $nt_seq = $nt_y1o1.$nt_f1.$n_add.$nt_f2.$nt_y2o2;


#                                    print TEST1 "OLAPFRAGS HALFCUT $o1_frag $o2_frag ".($halflen1-1)." ".(length($ovseq2)-$halflen1);
                                    $sequence = $y1_o1.$o1_frag."X".$o2_frag.$y2_o2; ### both midbits extend beyond halfway
#                                    print TEST1 "$or1 OV B: $halflen1 $halflen2 $sequence\n";    ### cut from hsp 1 to hsp 2 at approx midpt ###
#                                    print TEST1 "NtestOVB $nt_f1 $n_add $nt_f2 dist $dist length_seq ".length($sequence)." length_nt ".length($nt_seq)." ovaalen ".(3*(length($o1_frag)+length($o2_frag)+1))." midntlen ".(length($nt_f1)+length($n_add)+length($nt_f2))."\n";
                                }
				
			    }				
			} # else { print TEST1 "FAIL too much olap ".$pot_olap/$range."\n"; }
		    }
#		    print TEST1 "pretr $pre_seq_tr ntseq posttr $post_seq_tr\n";
                    $nt_sequence_tot = $pre_seq_tr.$nt_seq.$post_seq_tr;
		}  # else  { print TEST1 "FAIL $genepair[0] $genepair[1] $top 2 hsps not in right order hsm1 $hsm1 hsm2 $hsm2 hbig1 $hbig1 hbig2 $hbig2\n"; }
		my $to_front; my $to_back;
		if ((defined $sequence) && ($sequence ne "")) {
		    if ($or1 eq "-") {
			$to_front = 3*length($post_aa); $to_back = 3*length($pre_aa);
		    } else { $to_front = 3*length($pre_aa); $to_back = 3*length($post_aa); }	    
		    $sequence_tot = $pre_aa.$sequence.$post_aa; 
#		    print TEST1 "$or1 $pre_aa ($aa_1 $aa_2) $sequence $post_aa sequence tot: $sequence_tot length ".length($sequence_tot)."\n";
#		    print TEST1 "begin_before $begin end_before $end\n";
		    $begin = $begin-$to_front; $end = $end+$to_back;
		    
		    if ($or1 eq "-") { $coord_2hsp = "complement(".($begin+$seg_start)."_".($end+$seg_start).")"; } else { $coord_2hsp = ($begin+$seg_start)."_".($end+$seg_start); }      	
		    my $test_ntseq = substr($seq_chrom,($begin+$seg_start-1),(($end+$seg_start)-($begin+$seg_start)+1)); if ($or1 eq "-") { $test_ntseq = &rev_comp($test_ntseq); }
#		    print TEST2 "Z=2 $seg_start (int $int_start intstop $int_stop) (tofront $to_front) begin $begin end $end (to back $to_back)\n";
#		    print TEST1 "coord: $coord_2hsp range".((($end+$seg_start)-($begin+$seg_start))+1)." orf ".(3*length($sequence_tot))."\n";
		}
	    } else {# print TEST1 "FAIL $genepair[0] $genepair[1] $top distance between ORFS $dist ".($dist/3)."codons $dist_hsp\n"; 
		     my $seq_1 = substr($intgen_full, ($small1_A-1), ($big1_A-$small1_A+1)); ### NT Sequence of first HSP
		     my $seq_2 = substr($intgen_full, ($small2_A-1), ($big2_A-$small2_A+1)); ### NT Sequence of second HSP
		    # print FAIL "FAIL $genepair[0] $genepair[1] $top distance between ORFS $dist ".($dist/3)."codons $dist_hsp $seq_1 $seq_2\n"; 		   
		 }
	} else { # print TEST1 "FAIL $genepair[0] $genepair[1] $top $or1 $or2 total overlap? $total_olap\n";     
		 my $seq_1 = substr($intgen_full, ($small1_A-1), ($big1_A-$small1_A+1)); ### NT Sequence of first HSP
		 my $seq_2 = substr($intgen_full, ($small2_A-1), ($big2_A-$small2_A+1)); ### NT Sequence of second HSP
		# print FAIL "FAIL $genepair[0] $genepair[1] $top $or1 $or2 total overlap? $total_olap $seq_1 $seq_2\n"; 
	     }
    } # else  { print TEST1 "(2hsp) FAIL $genepair[0] $genepair[1] $top count_hsp $count_hsp, z2 $z2 z1? $z1 $len $median\n"; }   
    
    $intgen_full = $intgen_original;
    if ($z1 == 1) {
        my $aa; my $small; my $big;
        if ($stop[0]>$start[0]) { $or1="+"; $small= ($int_start-$seg_start)+$start[0]; $big=($int_start-$seg_start)+$stop[0]; } else { $or1 = "-";  $small=($int_start-$seg_start)+$stop[0]; $big=($int_start-$seg_start)+$start[0]; }
	$begin = $small; $end=$big;
#	print TEST1 "z1 = $z1 start $start[0] stop $stop[0] small $small big $big\n";
#	print TEST1 "at start begin $begin end $end\n";
        my $seq = substr($intgen_full, ($small-1), ($big-$small+1));
#	print TEST1 "$genepair[0] $genepair[1] $top hsplen $toplen small $small big $big\nntseq $seq\n";
	my $ptX=$small; my $ptY=$big;
	if ($or1 eq "-") {
	    my $prev_ptX = $small; my $prev_ptY = $big;
	    $intgen_full = &rev_comp($intgen_full);
            $seq=&rev_comp($seq);
	    $ptX = length($intgen_full)-$prev_ptY+1; $ptY = length($intgen_full)-$prev_ptX+1; 
	}
	my $pre_aa; my $post_aa; my $pre_valid=0;
#	print TEST1 "1hsp sequence V: $sequence\n";
	my $preseq = substr($intgen_full, 0, ($ptX-1)); my $postseq = substr($intgen_full, ($ptY), (length($intgen_full)-$ptY));
#	print TEST1 "$or1 ntpreseq: $preseq\n ntpostseq: $postseq\n";
	my @temp = split(//, $seq);
	my @temp_pre = split(//, $preseq); my @temp_post = split(//, $postseq);
	my $last_aa; my $pre_seq_tr; my $post_seq_tr;
	for (my $x1=0;$x1<$#temp;$x1=$x1+3){  ## aa seq of hsp
	    $aa=$aa.$translation{$temp[$x1].$temp[$x1+1].$temp[$x1+2]};
	    $last_aa = $translation{$temp[$x1].$temp[$x1+1].$temp[$x1+2]};
	}
	my $first_aa = $translation{$temp[0].$temp[1].$temp[2]};
	
	unless ($last_aa eq "*") {
	    for (my $x3=0;$x3<$#temp_post;$x3=$x3+3) {
		if ($translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]} eq "*") {
		    $post_aa = $post_aa.$translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]};
		    $post_seq_tr .= $temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2];
		    last;
		} else {
		    $post_aa = $post_aa.$translation{$temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2]};
		    $post_seq_tr .= $temp_post[$x3].$temp_post[$x3+1].$temp_post[$x3+2];
		}
	    }
	}
	for (my $x3=$#temp_pre-2; $x3>=0; $x3=$x3-3) {
	    if ($translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]} eq "*") {
		$pre_aa = $translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]}.$pre_aa;
		$pre_seq_tr = $temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2].$pre_seq_tr;
		last;
	    } else {
		$pre_aa = $translation{$temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2]}.$pre_aa;
		$pre_seq_tr = $temp_pre[$x3].$temp_pre[$x3+1].$temp_pre[$x3+2].$pre_seq_tr;
	    }
	}
	my $len_preaa=length($pre_aa); my $len_postaa=length($post_aa); my $len_prent=length($preseq); my $len_postnt=length($postseq);
#	print TEST1 "$or1 1hsp aa: $aa\npre_aa: $pre_aa\n post_aa: $post_aa lpreaa $len_preaa lpostaa $len_postaa lprent $len_prent lpostnt $len_postnt\n";
	$pre_aa=lc($pre_aa); $post_aa=lc($post_aa);
	my $to_front; my $to_back;
	if ($or1 eq "-") {
	    $to_front = 3*length($post_aa); $to_back = 3*length($pre_aa);
	} else { $to_front = 3*length($pre_aa); $to_back = 3*length($post_aa); }
	$sequence_tot_1hsp = $pre_aa.$aa.$post_aa;
	$nt_sequence_tot_1hsp = $pre_seq_tr.$seq.$post_seq_tr;

#	print TEST1 "begin_before $begin end_before $end\n";
	$begin = $begin-$to_front; $end = $end+$to_back;
#	print TEST1 "$or1 $sequence_tot_1hsp length ".length($sequence_tot_1hsp)."\n";		     	    
	if ($or1 eq "-") { $coord_1hsp = "complement(".($begin+$seg_start)."_".($end+$seg_start).")"; } else { $coord_1hsp = ($begin+$seg_start)."_".($end+$seg_start); }
	
#	print TEST1 "begin $begin end $end int_start $int_start\n";
#	print TEST1 "pretr $pre_seq_tr ntseq $seq posttr $post_seq_tr\n";
	my $test_ntseq = substr($seq_chrom,($begin+$seg_start-1),(($end+$seg_start)-($begin+$seg_start)+1)); if ($or1 eq "-") { $test_ntseq = &rev_comp($test_ntseq); }
	print TEST2 "Z=1 $seg_start (int $int_start) (tofront $to_front) $begin end $end (to back $to_back)\n";
	print TEST1 "coord: $coord_1hsp range".((($end+$seg_start)-($begin+$seg_start))+1)." orf ".(3*length($sequence_tot_1hsp))."\n";
	print TEST1 "ntseq2 $nt_sequence_tot ntseq1 $nt_sequence_tot_1hsp\n";
    }         
    print TEST2 "correct 2hsp inloop $correct_2hsp\n";
    return($sequence_tot, $sequence_tot_1hsp, $nt_sequence_tot, $nt_sequence_tot_1hsp, $coord_2hsp, $coord_1hsp, $len, $toplen, $hitlist, $correct_2hsp);
}
    


sub rev_comp {
    my $string = $_[0];
    my $revstring = reverse($string);
    $revstring =~ s/A/t/g;
    $revstring =~ s/T/a/g;
    $revstring =~ s/C/g/g;
    $revstring =~ s/G/c/g;
    my $revcomp = uc($revstring);
    return $revcomp;
}

exit;
