#!/usr/bin/env perl
package Bio::MLST::Bin::GetSequenceType;

# ABSTRACT: Given an assembly find the MLST sequence type.
# PODNAME: get_sequence_type


BEGIN { unshift( @INC, '../lib' ) }
use lib "/software/pathogen/internal/prod/lib";
use Moose;
use Getopt::Long;
use Cwd;
use Bio::MLST::Check;
use Bio::MLST::Databases;
use Bio::MLST::Validate::Executable;
use Bio::MLST::SearchForFiles;
use Bio::MLST::CheckMultipleSpecies;

# Command-line options. Each variable stays undef unless the matching switch
# is supplied; defaults are filled in after parsing.
my ($species, $output_fasta_files, $output_directory, $output_phylip_files, $available_databases, $base_directory, $makeblastdb_exec, $blastn_exec, $spreadsheet_basename,$parallel_processes, $help);

# GetOptions returns false when it meets an unknown switch or a value of the
# wrong type (e.g. a non-integer for -d). Keep the result so that a bad
# command line stops the run instead of being silently ignored.
my $options_parsed = GetOptions(
            's|species=s'              => \$species,
            'o|output_directory=s'     => \$output_directory,
            'c|output_fasta_files'     => \$output_fasta_files,
            'y|output_phylip_files'    => \$output_phylip_files,
            'a|available_databases'    => \$available_databases,
            'b|mlst_base_directory=s'  => \$base_directory,
            'm|makeblastdb_exec=s'     => \$makeblastdb_exec,
            'n|blastn_exec=s'          => \$blastn_exec,
            'p|spreadsheet_basename=s' => \$spreadsheet_basename,
            'd|parallel_processes=i'   => \$parallel_processes,
            'h|help'                   => \$help,
);

# Print the usage text and stop when option parsing failed, when help was
# requested, or when there is nothing to do (no input files and no -a).
( $options_parsed && ((@ARGV > 0) || (defined($available_databases))) && ! $help ) or die <<USAGE;
Usage: $0 [options]

# Basic usage, sequence type result written to my_assembly.fa.st
get_sequence_type -s "Clostridium difficile" my_assembly.fa

# Multiple fasta files 
get_sequence_type -s "Clostridium difficile" myfasta.fa anotherfasta.fa yetanother.fa
# or
get_sequence_type -s "Clostridium difficile" *.fa

# Split into 8 parallel processes (much faster), default is 2
get_sequence_type -s "Clostridium difficile" -d 8 *.fa

# output a fasta file with the concatenated alleles and unknown sequences
get_sequence_type -s "Clostridium difficile" -c  my_assembly.fa 

# output a phylip file with the concatenated alleles and unknown sequences
get_sequence_type -s "Clostridium difficile" -y  my_assembly.fa

# Specify an output directory 
get_sequence_type  -s "Clostridium difficile" -o /path/to/results my_assembly.fa

# Match against multiple MLST databases
get_sequence_type -s "Clostridium botulinum, Clostridium difficile" my_assembly.fa

# Match against all MLST databases
get_sequence_type my_assembly.fa

# list all available MLST databases
get_sequence_type -a

# This help message
get_sequence_type -h

USAGE
;

# Location of the MLST databases: the -b option wins, then the MLST_DATABASES
# environment variable, then the built-in default path.
$base_directory = $base_directory
    || $ENV{MLST_DATABASES}
    || '/lustre/scratch108/pathogen/pathpipe/mlst';

# Several versions of BLAST+ may be installed, so prefer one specific build
# unless the user named an executable explicitly.
$makeblastdb_exec = '/software/pubseq/bin/ncbi-blast-2.2.25+/bin/makeblastdb' unless $makeblastdb_exec;
$blastn_exec      = '/software/pubseq/bin/ncbi-blast-2.2.25+/bin/blastn'      unless $blastn_exec;

# If the chosen executable is not reported as existing, fall back to the bare
# command name.
if ( !Bio::MLST::Validate::Executable->new()->does_executable_exist($makeblastdb_exec) )
{
    $makeblastdb_exec = 'makeblastdb';
}
if ( !Bio::MLST::Validate::Executable->new()->does_executable_exist($blastn_exec) )
{
    $blastn_exec = 'blastn';
}

# Remaining defaults for anything not given (or given as a false value).
$spreadsheet_basename = 'mlst_results' unless $spreadsheet_basename;
$output_directory     = getcwd()       unless $output_directory;
$output_fasta_files   = 0              unless $output_fasta_files;
$output_phylip_files  = 0              unless $output_phylip_files;
$parallel_processes   = 2              unless $parallel_processes;

# Build the list of species to check from the comma-separated -s argument.
# Each entry is looked up with Bio::MLST::SearchForFiles and every species it
# lists is collected; the hash removes duplicates and the result is sorted.
my @species_list = ();
if(defined $species)
{
    my %species_found;
    for my $species_in (split(/,/,$species))
    {
        # The documented form is "Species one, Species two", so trim the
        # whitespace around each name; otherwise the space after the comma
        # becomes part of the name handed to the search.
        $species_in =~ s/^\s+|\s+$//g;

        # Nothing to look up for an empty entry such as "a,,b" or "a, ,b".
        next if $species_in eq '';

        my $species_check = Bio::MLST::SearchForFiles->new(
            species_name => $species_in,
            base_directory => $base_directory,
            );
        for my $spec (@{$species_check->list_species})
        {
            $species_found{$spec} = 1;
        }
    }
    @species_list = sort keys %species_found;
}

# Constructor arguments that both checkers receive unchanged.
my %checker_args = (
    base_directory        => $base_directory,
    raw_input_fasta_files => \@ARGV,
    makeblastdb_exec      => $makeblastdb_exec,
    blastn_exec           => $blastn_exec,
    output_directory      => $output_directory,
    spreadsheet_basename  => $spreadsheet_basename,
    parallel_processes    => $parallel_processes,
    output_fasta_files    => $output_fasta_files,
    output_phylip_files   => $output_phylip_files,
);

if ( defined $available_databases )
{
    # -a: only list the available MLST databases.
    Bio::MLST::Databases->new( base_directory => $base_directory )->print_db_list;
}
elsif ( !defined($species) || @species_list > 1 )
{
    # No species given, or more than one species matched: check the input
    # against every species in the list.
    my $species_checker = Bio::MLST::CheckMultipleSpecies->new(
        %checker_args,               # fasta/phylip output flags: not supported here
        species            => \@species_list,
        verbose            => 0,     # databases searched and matches found
        report_all_mlst_db => 0,     # list databases searched in csv file
    );
    $species_checker->create_result_files;
}
else
{
    # A species was given and at most one species matched: run the check
    # with the species string exactly as the user supplied it.
    my $single_checker = Bio::MLST::Check->new(
        %checker_args,
        species => $species,
    );
    $single_checker->create_result_files;
}

__END__

=pod

=head1 NAME

get_sequence_type - Given an assembly find the MLST sequence type.

=head1 VERSION

version 1.123530

=head1 SYNOPSIS

Given FASTA files and a species regex, look up the relevant MLST database and output the sequence type to a file.
It requires NCBI BLAST+ to be available in your PATH.

   # Basic usage, sequence type result written to my_assembly.fa.st
   get_sequence_type -s "Clostridium difficile" my_assembly.fa
   
   # Multiple fasta files 
   get_sequence_type -s "Clostridium difficile" myfasta.fa anotherfasta.fa yetanother.fa
   # or
   get_sequence_type -s "Clostridium difficile" *.fa
   
   # Split into 8 parallel processes (much faster), default is 2
   get_sequence_type -s "Clostridium difficile" -d 8 *.fa
   
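   # output a fasta file with the concatenated alleles and unknown sequences
   get_sequence_type -s "Clostridium difficile" -c  my_assembly.fa 
   
   # output a phylip file with the concatenated alleles and unknown sequences
   get_sequence_type -s "Clostridium difficile" -y  my_assembly.fa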
   
   # Specify an output directory 
   get_sequence_type  -s "Clostridium difficile" -o /path/to/results my_assembly.fa
   
   # Match against multiple MLST databases
   get_sequence_type -s "Clostridium botulinum, Clostridium difficile" my_assembly.fa
   
   # Match against all MLST databases
   get_sequence_type my_assembly.fa
   
   # list all available MLST databases
   get_sequence_type -a
   
   # This help message
   get_sequence_type -h

=head1 AUTHOR

Andrew J. Page <ap13@sanger.ac.uk>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.

This is free software, licensed under:

  The GNU General Public License, Version 3, June 2007

=cut
