#!/usr/bin/perl

&learn_genetic_code;

#print "name of clustalW alignment file? ";
$clustal=$ARGV[0];
#print "filename of unaligned DNA? ";
$rawdna=$ARGV[1];
#print "filename for output aligned DNA? ";
$outfile=$ARGV[2];

open(CLUSTAL,$clustal)||die;
open(RAWDNA,$rawdna)||die;
open(OUTDNA,">$outfile")||die;

$trash=<CLUSTAL>.<CLUSTAL>.<CLUSTAL>;

while (<CLUSTAL>){
 chop;
 ($species,$seq)=split;
 $protein_alignment{$species} .= $seq;
}

while (<RAWDNA>){
 chop;
 if (/>/){
  ($name,$trash)=split;
  $name=substr($name,1);
 }else{
  $rawdna{$name} .= $_;
 }
}

foreach (sort keys %protein_alignment){
 $species=$_;
 next unless ($species =~ /\w/);
 $protein=$protein_alignment{$species};
 $rawdna=$rawdna{$species};
 $rawdna =~ s/[^A-Za-z]//g;
 $protein =~ s/[^A-Za-z\-]//g;
 print("species: ",$species," ",length($protein)," ",length($rawdna),"\n");

 $outdna="";
 $counter=0;
 for ($i=0; $i < length($protein); $i++){
  if (substr($protein,$i,1) eq "-"){
   $outdna .= "---";
  }else{
   $codon = substr($rawdna,$counter,3);
   $outdna .= $codon;
   $counter += 3;

   #check translation:
   $aa = substr($protein,$i,1);
   $codon =~ tr/A-Z/a-z/; 
   if ($aa{$codon} ne $aa){
    print "$codon translated as $aa in $species\n";
   }

  }   
 }
  $outdna =~ tr/a-z/A-Z/;
  print OUTDNA ">$species\n";
  for ($j=0; $j <= length($outdna); $j += 60){
   print OUTDNA (substr($outdna,$j,60),"\n");
  }
}


#######
sub learn_genetic_code{
$long="
ttt F Phe:tct S Ser:tat Y Tyr:tgt C Cys:
ttc F Phe:tcc S Ser:tac Y Tyr:tgc C Cys:
tta L Leu:tca S Ser:taa * ter:tga * ter:
ttg L Leu:tcg S Ser:tag * ter:tgg W Trp:
ctt L Leu:cct P Pro:cat H His:cgt R Arg:
ctc L Leu:ccc P Pro:cac H His:cgc R Arg:
cta L Leu:cca P Pro:caa Q Gln:cga R Arg:
ctg L Leu:ccg P Pro:cag Q Gln:cgg R Arg:
att I Ile:act T Thr:aat N Asn:agt S Ser:
atc I Ile:acc T Thr:aac N Asn:agc S Ser:
ata I Ile:aca T Thr:aaa K Lys:aga R Arg:
atg M Met:acg T Thr:aag K Lys:agg R Arg:
gtt V Val:gct A Ala:gat D Asp:ggt G Gly:
gtc V Val:gcc A Ala:gac D Asp:ggc G Gly:
gta V Val:gca A Ala:gaa E Glu:gga G Gly:
gtg V Val:gcg A Ala:gag E Glu:ggg G Gly:
";
@trios=split(/:/,$long);
foreach (@trios){
 ($codon,$one,$three)=split;
 $aa{$codon}=$one;
 $three{$codon}=$three;
}
$aa{"---"}="-";

#learn degeneracies of amino acids (one-letter-code) (useful for RSCU):
foreach (@codons){$degeneracy{$aa{$_}}++;}
}#return
