Skip to content

Commit

Permalink
Added gzip support for alignment file.
Browse files Browse the repository at this point in the history
  • Loading branch information
bastiaanvonmeijenfeldt committed Jun 23, 2020
1 parent 1b8f510 commit 8f6aa5e
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 117 deletions.
4 changes: 2 additions & 2 deletions CAT_pack/about.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3

__author__ = 'F. A. Bastiaan von Meijenfeldt'
__version__ = '5.1'
__date__ = '22 June, 2020'
__version__ = '5.1.1'
__date__ = '23 June, 2020'
25 changes: 9 additions & 16 deletions CAT_pack/bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,12 @@ def parse_arguments():
action=shared.PathAction,
help=('Directory for temporary DIAMOND files (default: directory '
'to which output files are written).'))
specific.add_argument(
'--compress',
dest='compress',
required=False,
action='store_true',
help='Compress DIAMOND alignment file.')
specific.add_argument(
'--top',
dest='top',
Expand Down Expand Up @@ -398,7 +404,7 @@ def run():
args.bin_folder,
args.taxonomy_folder,
args.database_folder,
float(args.r),
int(args.r),
float(args.f),
args.log_file))
shared.give_user_feedback(message, args.log_file, args.quiet,
Expand Down Expand Up @@ -537,21 +543,8 @@ def run():
contig_names, contig2ORFs, args.log_file, args.quiet)

if 'align' in step_list:
shared.run_diamond(
args.path_to_diamond,
args.diamond_database,
args.proteins_fasta,
args.alignment_file,
args.nproc,
args.sensitive,
args.block_size,
args.index_chunks,
args.tmpdir,
args.top,
args.log_file,
args.quiet,
args.verbose)

shared.run_diamond(args)

(ORF2hits,
all_hits) = shared.parse_tabular_alignment(
args.alignment_file,
Expand Down
25 changes: 9 additions & 16 deletions CAT_pack/contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,12 @@ def parse_arguments():
action=shared.PathAction,
help=('Directory for temporary DIAMOND files (default: directory '
'to which output files are written).'))
specific.add_argument(
'--compress',
dest='compress',
required=False,
action='store_true',
help='Compress DIAMOND alignment file.')
specific.add_argument(
'--top',
dest='top',
Expand Down Expand Up @@ -311,7 +317,7 @@ def run():
args.contigs_fasta,
args.taxonomy_folder,
args.database_folder,
float(args.r),
int(args.r),
float(args.f),
args.log_file))
shared.give_user_feedback(message, args.log_file, args.quiet,
Expand Down Expand Up @@ -431,21 +437,8 @@ def run():
contig_names, contig2ORFs, args.log_file, args.quiet)

if 'align' in step_list:
shared.run_diamond(
args.path_to_diamond,
args.diamond_database,
args.proteins_fasta,
args.alignment_file,
args.nproc,
args.sensitive,
args.block_size,
args.index_chunks,
args.tmpdir,
args.top,
args.log_file,
args.quiet,
args.verbose)

shared.run_diamond(args)

(ORF2hits,
all_hits) = shared.parse_tabular_alignment(
args.alignment_file,
Expand Down
146 changes: 79 additions & 67 deletions CAT_pack/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import datetime
import decimal
import gzip
import os
import subprocess
import sys
Expand Down Expand Up @@ -216,24 +217,16 @@ def run_prodigal(
return


def run_diamond(
path_to_diamond,
diamond_database,
proteins_fasta,
alignment_file,
nproc,
sensitive,
block_size,
index_chunks,
tmpdir,
top,
log_file,
quiet,
verbose):
if not sensitive:
def run_diamond(args):
if args.sensitive:
mode = 'sensitive'
else:
mode = 'fast'

if args.compress:
compression = '1'
else:
mode = 'sensitive'
compression = '0'

message = (
'Homology search with DIAMOND is starting. Please be patient. Do '
Expand All @@ -246,47 +239,54 @@ def run_diamond(
'\t\t\t\tblock-size (billions of letters): {4}\n'
'\t\t\t\tindex-chunks: {5}\n'
'\t\t\t\ttmpdir: {6}\n'
'\t\t\t\ttop: {7}'.format(
proteins_fasta,
diamond_database,
'\t\t\t\tcompress: {7}\n'
'\t\t\t\ttop: {8}'.format(
args.proteins_fasta,
args.diamond_database,
mode,
nproc,
block_size,
index_chunks,
tmpdir,
top))
give_user_feedback(message, log_file, quiet)
args.nproc,
args.block_size,
args.index_chunks,
args.tmpdir,
compression,
args.top))
give_user_feedback(message, args.log_file, args.quiet)

try:
command = [
path_to_diamond,
args.path_to_diamond,
'blastp',
'-d', diamond_database,
'-q', proteins_fasta,
'--top', str(top),
'-d', args.diamond_database,
'-q', args.proteins_fasta,
'--top', str(args.top),
'--matrix', 'BLOSUM62',
'--evalue', '0.001',
'-o', alignment_file,
'-p', str(nproc),
'--block-size', str(block_size),
'--index-chunks', str(index_chunks),
'--tmpdir', tmpdir]

if not verbose:
'-o', args.alignment_file,
'-p', str(args.nproc),
'--block-size', str(args.block_size),
'--index-chunks', str(args.index_chunks),
'--tmpdir', args.tmpdir,
'--compress', compression]

if not args.verbose:
command += ['--quiet']

if sensitive:
if args.sensitive:
command += ['--sensitive']

subprocess.check_call(command)
except:
message = 'DIAMOND finished abnormally.'
give_user_feedback(message, log_file, quiet, error=True)
give_user_feedback(message, args.log_file, args.quiet, error=True)

sys.exit(1)

message = 'Homology search done! File {0} created.'.format(alignment_file)
give_user_feedback(message, log_file, quiet)
if args.compress:
setattr(args, 'alignment_file', '{0}.gz'.format(args.alignment_file))

message = 'Homology search done! File {0} created.'.format(
args.alignment_file)
give_user_feedback(message, args.log_file, args.quiet)

return

Expand Down Expand Up @@ -343,38 +343,50 @@ def parse_tabular_alignment(
message = 'Parsing alignment file {0}.'.format(alignment_file)
give_user_feedback(message, log_file, quiet)

compressed = False
if alignment_file.endswith('.gz'):
compressed = True

f1 = gzip.open(alignment_file, 'rb')
else:
f1 = open(alignment_file, 'r')

ORF2hits = {}
all_hits = set()

ORF = 'first ORF'
ORF_done = False
with open(alignment_file, 'r') as f1:
for line in f1:
if line.startswith(ORF) and ORF_done == True:
# The ORF has already surpassed its minimum allowed bit-score.
continue

line = line.rstrip().split('\t')

if not line[0] == ORF:
# A new ORF is reached.
ORF = line[0]
best_bitscore = decimal.Decimal(line[11])
ORF2hits[ORF] = []

ORF_done = False

bitscore = decimal.Decimal(line[11])

if bitscore >= one_minus_r * best_bitscore:
# The hit has a high enough bit-score to be included.
hit = line[1]

ORF2hits[ORF].append((hit, bitscore),)
all_hits.add(hit)
else:
# The hit is not included because its bit-score is too low.
ORF_done = True
for line in f1:
if compressed:
line = line.decode('utf-8')

if line.startswith(ORF) and ORF_done == True:
# The ORF has already surpassed its minimum allowed bit-score.
continue

line = line.rstrip().split('\t')

if not line[0] == ORF:
# A new ORF is reached.
ORF = line[0]
best_bitscore = decimal.Decimal(line[11])
ORF2hits[ORF] = []

ORF_done = False

bitscore = decimal.Decimal(line[11])

if bitscore >= one_minus_r * best_bitscore:
# The hit has a high enough bit-score to be included.
hit = line[1]

ORF2hits[ORF].append((hit, bitscore),)
all_hits.add(hit)
else:
# The hit is not included because its bit-score is too low.
ORF_done = True

f1.close()

return (ORF2hits, all_hits)

Expand Down
27 changes: 12 additions & 15 deletions CAT_pack/single_bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ def parse_arguments():
action=shared.PathAction,
help=('Directory for temporary DIAMOND files (default: directory '
'to which output files are written).'))
specific.add_argument(
'--compress',
dest='compress',
required=False,
action='store_true',
help='Compress DIAMOND alignment file.')
specific.add_argument(
'--top',
dest='top',
Expand Down Expand Up @@ -310,7 +316,7 @@ def run():
args.bin_fasta,
args.taxonomy_folder,
args.database_folder,
float(args.r),
int(args.r),
float(args.f),
args.log_file))
shared.give_user_feedback(message, args.log_file, args.quiet,
Expand Down Expand Up @@ -399,6 +405,10 @@ def run():
check.check_fasta(
args.proteins_fasta, args.log_file, args.quiet))

if 'align' in step_list:
errors.append(
check.check_top(args.top, args.r, args.log_file, args.quiet))

# Print all variables.
shared.print_variables(args, step_list)

Expand Down Expand Up @@ -429,20 +439,7 @@ def run():
contig_names, contig2ORFs, args.log_file, args.quiet)

if 'align' in step_list:
shared.run_diamond(
args.path_to_diamond,
args.diamond_database,
args.proteins_fasta,
args.alignment_file,
args.nproc,
args.sensitive,
args.block_size,
args.index_chunks,
args.tmpdir,
args.top,
args.log_file,
args.quiet,
args.verbose)
shared.run_diamond(args)

(ORF2hits,
all_hits) = shared.parse_tabular_alignment(
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## 5.1.1
CAT and BAT can now compress the DIAMOND alignment file, and import gzip compressed alignment files.

## 5.1
The code has been rewritten to prepare for future extensions. We have also added the `--verbose` flag.

Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CAT and BAT have been thoroughly tested on Linux systems, and should run on macO
No installation is required. You can run CAT and BAT by supplying the absolute path:

```
$ CAT_pack/CAT --help
$ ./CAT_pack/CAT --help
```

Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere:
Expand Down Expand Up @@ -86,6 +86,17 @@ The taxonomy folder and database folder created by CAT prepare are needed in sub

To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure if this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if formatting is incorrect.

### Getting help.
If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT and BAT.

```
$ CAT --help
$ CAT contigs --help
$ CAT summarise --help
```

## Usage
After you have got the database files on your system, you can run CAT to annotate your contig set:

Expand Down

0 comments on commit 8f6aa5e

Please sign in to comment.