# usr/bin/perl -w
#author: Joel Klein
#description: Script that matches query proteins to open reading frames deduced from a bacterial genome. 

use warnings; 
use strict;
use Bio::Tools::Run::StandAloneBlastPlus;
use Bio::Factory::EMBOSS;
use Bio::SearchIO;
use Bio::DB::Fasta;

#some variables for input files
my $genbankfile = "inputsequence.gbk" ;
my $queryseq = "allknownprot.Fasta" ;

#extract the sequence from a genbank file
open EMBL, $genbankfile or die;
my $DNA = "";

my $knopje = 0 ;

while (<EMBL>){
    chomp;
    my $label = substr($_,0,6);
    if ($knopje == 1) {
      $DNA .= $_ ;
    }
    if ($label eq "ORIGIN") {
	$knopje = 1 ;
    }
    if ($label eq "//") {
	$knopje = 0 ;
    }
}
close EMBL;

$DNA =~ s/[^acgt]//ig;
open (SEQ, '>seq.fa');
 print SEQ "$DNA\n";
close (SEQ) ;

my $orffile = "orfs.fasta" ;

#find all the potential cds 
my $seqobj= "";
my $factory = new Bio::Factory::EMBOSS;
my $app = $factory->program("getorf");
my %input = ( -sequence => "seq.fa", -minsize => 200, -outseq => $orffile );
$app->run(\%input);
my $seqio = Bio::SeqIO->new( -file => $orffile );
$seqobj = $seqio;

#input subject seq
my $fac = Bio::Tools::Run::StandAloneBlastPlus->new(
 -db_data => $orffile,
 -create => 1
);

#go from the first aa seq to the next in a multiple fasta file query sequence
my $seqio_obj = Bio::SeqIO->new(-file => $queryseq, -format => "fasta" );

while(my $seq_obj = $seqio_obj->next_seq){
  my $seqdesc= $seq_obj->desc ;
  my $seqid= $seq_obj->id ;
  print "Searching ", $seqid, "\t", $seqdesc, "\n";

  #perform a blast search
     my $result = $fac->blastp( -query => $seq_obj,
                                -outfile => 'report.bls',
                                -method_args => [ '-evalue' => 1e-9, '-num_threads' => 2]);
  #print only significant hits
  my  $report_obj = new Bio::SearchIO(-format => 'blast',                                   
				    -file   => 'report.bls');   

  while( $result = $report_obj->next_result ) {     
      while(my $hit = $result->next_hit ) {       
	while(my $hsp = $hit->next_hsp ) {           
	  if ( $hsp->percent_identity > 60 && $hsp->frac_conserved > 0.85 ) {

	  print "HSP\tbitscore\t" .		$hsp->bits . "\n" ;
	  print "Hit\taccession\t"  .           $hit->accession . "\n";
	  print "Hit\tdescription\t"  .         $hit->description . "\n";
	  print "Hit\tlength\t" .               $hit->length . "\n";
	  print "HSP\tpercent identity\t" .     $hsp->percent_identity . "\n";
	  print "HSP\tfrac conserved\t" .    	$hsp->frac_conserved . "\n";
	  print "HSP\tevalue\t" . 		$hsp->evalue . "\n";
		    
	  my $pattern = $hit->name . "\n" ;
	  $pattern =~ tr/lcl|//d;

      # go trough orfs.fasta and extract the protein sequences from the significant hits and combine it with the protein fasta header	
	  foreach ($pattern) {
# 		  print $pattern ;
		  (my $file,my $id) = ($orffile,$pattern);
		  my $db = Bio::DB::Fasta->new($file);
		  my $seq = $db->seq($id);
		  print $seq,"\n";
	    }     
	  }       
	}     
      }   
  }
  #remove database files
  $fac->cleanup; 
}

exit;