# Source code for GenomeBaser.genomebaser
#!/usr/bin/env python
"""
Genomebaser is a tool to manage complete genomes from the NCBI
"""
__title__ = 'GenomeBaser'
__version__ = '0.1.1'
__description__ = "GenomeBaser manages complete (bacterial) genomes from NCBI"
__author__ = 'Mitchell Stanton-Cook'
__author_email__ = 'm.stantoncook@gmail.com'
__url__ = 'http://github.com/mscook/GenomeBaser'
__license__ = 'ECL 2.0'
import os
import re
import glob
import sys
import subprocess
from Bio import SeqIO
import click
def check_for_deps():
    """
    Check that the 3rd party (non-python) dependencies are installed

    Each required executable is looked up with ``which``. The first one
    that cannot be resolved is reported on stdout and the program exits
    with a non-zero status so that calling shells/scripts can detect the
    failure.

    Requires:
        * rsync
        * prokka-genbank_to_fasta_db
        * cd-hit
        * makeblastdb

    :raises SystemExit: (status 1) when a required executable is missing
    """
    reqs = ["rsync", "prokka-genbank_to_fasta_db", "cd-hit", "makeblastdb"]
    for e in reqs:
        # universal_newlines makes the pipe return text rather than bytes,
        # so the comparison below works on Python 2 and Python 3 alike
        output = subprocess.Popen(["which", e],
                                  stdout=subprocess.PIPE,
                                  universal_newlines=True).communicate()[0]
        # An empty (or otherwise unexpected) answer means "not found"
        if os.path.basename(output.strip()) != e:
            print("Missing %s. Please install. Exiting." % (e))
            sys.exit(1)
def fetch_genomes(target_genus_species, db_base=None):
    """
    Use rsync to manage periodic updates

    The GenBank (.gbk) files of every NCBI Bacteria directory whose name
    starts with ``<Genus>_<species>_`` are synchronised into
    ``<db_base>/<Genus>_<species>``. The directory is created when it does
    not exist yet. A failing (or missing) rsync is reported on stderr but
    does not abort, so that an existing local copy can still be used.

    Examples::

        fetch_genomes("Escherichia coli")

        fetch_genomes("Klebsiella pneumoniae", "/home/me/dbs/")

    :param target_genus_species: the genus species as a string
                                 (space delimited)
    :param db_base: [optional] the directory that holds the per species
                    databases. Defaults to the current working directory
    :returns: the database location
    """
    base = os.getcwd() if db_base is None else db_base
    target_genus_species = target_genus_species.replace(" ", "_")
    db_loc = os.path.join(base, target_genus_species)
    if not os.path.isdir(db_loc):
        os.mkdir(db_loc)
    # Canonical absolute path: remains valid whatever the caller's cwd is
    db_loc = os.path.realpath(db_loc)
    # An argument list (no shell) hands the wildcard pattern to rsync
    # verbatim and keeps shell metacharacters in the user supplied
    # genus/species from being interpreted
    cmd = ["rsync", "-av",
           "ftp.ncbi.nlm.nih.gov::genomes/Bacteria/%s_*/*.gbk"
           % (target_genus_species),
           db_loc + os.sep]
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        sys.stderr.write("Could not run rsync: %s\n" % (err))
    else:
        if status != 0:
            sys.stderr.write("rsync exited with status %d\n" % (status))
    return db_loc
# Matches the CONTIG line that has to be removed from a GenBank file.
# NOTE(review): GenBank flat files appear to pad the keyword with several
# spaces ("CONTIG      join(...)") - any amount of whitespace is accepted.
_CONTIG_JOIN = re.compile(br'CONTIG\s+join')


def _drop_contig_join_lines(gbk_path, tmp_path):
    """
    Rewrite gbk_path in place without its ``CONTIG join`` lines

    The filtered copy is written to tmp_path first and then renamed over
    gbk_path. Bytes are copied untouched (binary mode).
    """
    with open(gbk_path, "rb") as src, open(tmp_path, "wb") as dst:
        for line in src:
            if _CONTIG_JOIN.search(line) is None:
                dst.write(line)
    os.rename(tmp_path, gbk_path)


def genbank_to_fasta(db_loc):
    """
    Converts GenBank to fasta while naming using the given in the DEFINITION

    Every regular ``*.gbk`` file in db_loc is stripped of its
    ``CONTIG join`` lines and each record in it is written to
    ``<DEFINITION>.fna`` (non word characters in the DEFINITION become
    underscores). A ``<DEFINITION>.gbk`` symlink pointing at the source
    GenBank file is created next to it. The current working directory is
    restored on return, also when an error occurs.

    Examples::

        genbank_to_fasta("/home/mscook/dbs/Klebsiella_pneumoniae")

    :param db_loc: the fullpath as a string to the database location (genus
                   species inclusive)

    :returns: a list of the output fasta files
    """
    fasta_files = []
    tmp_file = "tmp.gbk"
    working_dir = os.getcwd()
    os.chdir(db_loc)
    try:
        for inf in glob.glob("*.gbk"):
            # The DEFINITION named symlinks made by an earlier run match
            # the glob as well. Processing them would replace the link by
            # a copy and report every record twice. A stale temporary file
            # is not a genome either.
            if inf == tmp_file or os.path.islink(inf):
                continue
            _drop_contig_join_lines(inf, tmp_file)
            for seq_record in SeqIO.parse(inf, "genbank"):
                out_fa = re.sub(r'\W+', ' ', seq_record.description
                                ).replace(' ', '_')
                if out_fa.endswith('_'):
                    out_fa = out_fa[:-1]+".fna"
                else:
                    out_fa = out_fa+".fna"
                SeqIO.write(seq_record, out_fa, "fasta")
                fasta_files.append(out_fa)
                dest = out_fa.replace(".fna", ".gbk")
                if not os.path.lexists(dest):
                    os.symlink(inf, dest)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        os.chdir(working_dir)
    return fasta_files
def partition_genomes(db_loc, fasta_files):
    """
    Separate complete genomes from plasmids

    Files are classified on their name and symlinked (relative link to
    ``../<name>``) into ``<db_loc>/plasmid`` or ``<db_loc>/genome``. Both
    directories are created when missing and existing links are left
    alone, so the function can be re-run safely. Files that fit neither
    class are reported on stdout and skipped. The current working
    directory is never changed.

    ..warning:: this partitions on the complete_sequence (plasmid) vs
                complete_genome (genome) in filename assumption (in
                DEFINITION) line

    :param db_loc: the fullpath as a string to the database location (genus
                   species inclusive)
    :param fasta_files: a list of fasta files

    :returns: the elements of fasta_files that were classified as genomes
              (input order kept)
    """
    plasmid, genome = [], []
    for e in fasta_files:
        if "complete_sequence" in e:
            plasmid.append(e)
        elif "_genome" in e:
            # also covers the "complete_genome" spelling
            genome.append(e)
        else:
            print("Could not classify %s" % (e))
            print("Continuing...")
    for subdir, members in (("plasmid", plasmid), ("genome", genome)):
        target_dir = os.path.join(db_loc, subdir)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        for e in members:
            link = os.path.join(target_dir, e)
            # lexists: a dangling link from an earlier run counts as present
            if not os.path.lexists(link):
                os.symlink("../"+e, link)
    return genome
def _run_tool(cmd, cwd, stdout=None):
    """
    Run cmd (an argument list, no shell) inside cwd

    :returns: True on exit status 0. A tool that cannot be started or
              that fails is reported on stderr and gives False
    """
    try:
        status = subprocess.call(cmd, cwd=cwd, stdout=stdout)
    except OSError as err:
        sys.stderr.write("Could not run %s: %s\n" % (cmd[0], err))
        return False
    if status != 0:
        sys.stderr.write("%s exited with status %d\n" % (cmd[0], status))
        return False
    return True


def make_prokka(db_loc, genbank_files, target_genus_species):
    """
    Make a prokka database of the complete genomes

    Runs, in order: prokka-genbank_to_fasta_db (proteins of all genomes to
    ``prokka/<Genus>.faa``), cd-hit (clustering to ``prokka/<Genus>``) and
    makeblastdb (protein BLAST database of ``prokka/<Genus>``). The chain
    stops at the first tool that fails, as the later steps would only work
    on incomplete input. Nothing is run for an empty file list. The
    current working directory is never changed.

    :param db_loc: the fullpath as a string to the database location (genus
                   species inclusive)
    :param genbank_files: a list of GenBank files. Names ending in .fna
                          are accepted and mapped onto their .gbk
                          counterpart
    :param target_genus_species: the genus species as a string
                                 (space delimited)
    """
    if not genbank_files:
        sys.stderr.write("No genomes given. Skipping prokka database.\n")
        return
    target = target_genus_species.split(" ")[0]
    prokka_dir = os.path.join(db_loc, "prokka")
    if not os.path.exists(prokka_dir):
        os.mkdir(prokka_dir)
    # Swap the extension per file instead of on the whole command line
    gbks = [f[:-len(".fna")]+".gbk" if f.endswith(".fna") else f
            for f in genbank_files]
    prokka_cmd = ["prokka-genbank_to_fasta_db"]+gbks+["--idtag=locus_tag"]
    with open(os.path.join(prokka_dir, target+".faa"), "w") as faa:
        converted = _run_tool(prokka_cmd, db_loc, stdout=faa)
    if not converted:
        return
    cd_hit_cmd = ["cd-hit", "-i", target+".faa", "-o", target, "-T", "0",
                  "-M", "0", "-g", "1", "-s", "0.8", "-c", "0.9"]
    if not _run_tool(cd_hit_cmd, prokka_dir):
        return
    blast_cmd = ["makeblastdb", "-dbtype", "prot", "-in", target]
    _run_tool(blast_cmd, prokka_dir)
@click.command()
@click.option('--check_deps/--no-check_deps', default=True,
              help='Check that non-python dependencies exist')
@click.argument("genus")
@click.argument("species")
@click.argument('out_database_location', type=click.Path(exists=True))
def main(check_deps, genus, species, out_database_location):
    """
    GenomeBaser is a tool to manage complete (bacterial) genomes from the
    NCBI.

    Example usage:

        $ GenomeBaser.py Klebsiella pneumoniae ~/dbs

        $ # (wait a few months)...

        $ GenomeBaser.py Klebsiella pneumoniae ~/dbs

    By Mitchell Stanton-Cook (m.stantoncook@gmail.com)

    **More info at:** https://github.com/mscook/GenomeBaser
    """
    if check_deps:
        print("Checking for 3rd party dependencies")
        check_for_deps()
    # Upper-case the first letter of the genus. Slicing (not indexing)
    # keeps an empty argument from raising an IndexError
    genus = genus[:1].upper()+genus[1:]
    gs = genus+" "+species
    # Pipeline: download -> fasta per record -> genome/plasmid split ->
    # protein database of the genomes
    loc = fetch_genomes(gs, out_database_location)
    fas = genbank_to_fasta(loc)
    genbanks = partition_genomes(loc, fas)
    make_prokka(loc, genbanks, gs)
if __name__ == "__main__":
    # Only start the command line interface when run as a script, not
    # when the module is imported
    main()