Added gzip support for alignment file.

MGXlab · Jun 23, 2020 · 8f6aa5e · 8f6aa5e
1 parent 1b8f510
commit 8f6aa5e
Show file tree

Hide file tree

Showing 7 changed files with 126 additions and 117 deletions.
diff --git a/CAT_pack/about.py b/CAT_pack/about.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
 
 __author__ = 'F. A. Bastiaan von Meijenfeldt'
-__version__ = '5.1'
-__date__ = '22 June, 2020'
+__version__ = '5.1.1'
+__date__ = '23 June, 2020'
diff --git a/CAT_pack/bins.py b/CAT_pack/bins.py
@@ -224,6 +224,12 @@ def parse_arguments():
             action=shared.PathAction,
             help=('Directory for temporary DIAMOND files (default: directory '
                 'to which output files are written).'))
+    specific.add_argument(
+            '--compress',
+            dest='compress',
+            required=False,
+            action='store_true',
+            help='Compress DIAMOND alignment file.')
     specific.add_argument(
             '--top',
             dest='top',
@@ -398,7 +404,7 @@ def run():
                 args.bin_folder,
                 args.taxonomy_folder,
                 args.database_folder,
-                float(args.r),
+                int(args.r),
                 float(args.f),
                 args.log_file))
     shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -537,21 +543,8 @@ def run():
             contig_names, contig2ORFs, args.log_file, args.quiet)
 
     if 'align' in step_list:
-        shared.run_diamond(
-                args.path_to_diamond,
-                args.diamond_database,
-                args.proteins_fasta,
-                args.alignment_file,
-                args.nproc,
-                args.sensitive,
-                args.block_size,
-                args.index_chunks,
-                args.tmpdir,
-                args.top,
-                args.log_file,
-                args.quiet,
-                args.verbose)
-
+        shared.run_diamond(args)
+
     (ORF2hits,
             all_hits) = shared.parse_tabular_alignment(
                     args.alignment_file,

diff --git a/CAT_pack/contigs.py b/CAT_pack/contigs.py
@@ -213,6 +213,12 @@ def parse_arguments():
             action=shared.PathAction,
             help=('Directory for temporary DIAMOND files (default: directory '
                 'to which output files are written).'))
+    specific.add_argument(
+            '--compress',
+            dest='compress',
+            required=False,
+            action='store_true',
+            help='Compress DIAMOND alignment file.')
     specific.add_argument(
             '--top',
             dest='top',
@@ -311,7 +317,7 @@ def run():
                 args.contigs_fasta,
                 args.taxonomy_folder,
                 args.database_folder,
-                float(args.r),
+                int(args.r),
                 float(args.f),
                 args.log_file))
     shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -431,21 +437,8 @@ def run():
             contig_names, contig2ORFs, args.log_file, args.quiet)
 
     if 'align' in step_list:
-        shared.run_diamond(
-                args.path_to_diamond,
-                args.diamond_database,
-                args.proteins_fasta,
-                args.alignment_file,
-                args.nproc,
-                args.sensitive,
-                args.block_size,
-                args.index_chunks,
-                args.tmpdir,
-                args.top,
-                args.log_file,
-                args.quiet,
-                args.verbose)
-
+        shared.run_diamond(args)
+
     (ORF2hits,
             all_hits) = shared.parse_tabular_alignment(
                     args.alignment_file,

diff --git a/CAT_pack/shared.py b/CAT_pack/shared.py
@@ -3,6 +3,7 @@
 import argparse
 import datetime
 import decimal
+import gzip
 import os
 import subprocess
 import sys
@@ -216,24 +217,16 @@ def run_prodigal(
     return
 
 
-def run_diamond(
-        path_to_diamond,
-        diamond_database,
-        proteins_fasta,
-        alignment_file,
-        nproc,
-        sensitive,
-        block_size,
-        index_chunks,
-        tmpdir,
-        top,
-        log_file,
-        quiet,
-        verbose):
-    if not sensitive:
+def run_diamond(args):
+    if args.sensitive:
+        mode = 'sensitive'
+    else:
         mode = 'fast'
+
+    if args.compress:
+        compression = '1'
     else:
-        mode = 'sensitive'
+        compression = '0'
 
     message = (
             'Homology search with DIAMOND is starting. Please be patient. Do '
@@ -246,47 +239,54 @@ def run_diamond(
             '\t\t\t\tblock-size (billions of letters): {4}\n'
             '\t\t\t\tindex-chunks: {5}\n'
             '\t\t\t\ttmpdir: {6}\n'
-            '\t\t\t\ttop: {7}'.format(
-                proteins_fasta,
-                diamond_database,
+            '\t\t\t\tcompress: {7}\n'
+            '\t\t\t\ttop: {8}'.format(
+                args.proteins_fasta,
+                args.diamond_database,
                 mode,
-                nproc,
-                block_size,
-                index_chunks,
-                tmpdir,
-                top))
-    give_user_feedback(message, log_file, quiet)
+                args.nproc,
+                args.block_size,
+                args.index_chunks,
+                args.tmpdir,
+                compression,
+                args.top))
+    give_user_feedback(message, args.log_file, args.quiet)
 
     try:
         command = [
-                path_to_diamond,
+                args.path_to_diamond,
                 'blastp',
-                '-d', diamond_database,
-                '-q', proteins_fasta,
-                '--top', str(top),
+                '-d', args.diamond_database,
+                '-q', args.proteins_fasta,
+                '--top', str(args.top),
                 '--matrix', 'BLOSUM62',
                 '--evalue', '0.001',
-                '-o', alignment_file,
-                '-p', str(nproc),
-                '--block-size', str(block_size),
-                '--index-chunks', str(index_chunks),
-                '--tmpdir', tmpdir]
-
-        if not verbose:
+                '-o', args.alignment_file,
+                '-p', str(args.nproc),
+                '--block-size', str(args.block_size),
+                '--index-chunks', str(args.index_chunks),
+                '--tmpdir', args.tmpdir,
+                '--compress', compression]
+
+        if not args.verbose:
             command += ['--quiet']
 
-        if sensitive:
+        if args.sensitive:
             command += ['--sensitive']
 
         subprocess.check_call(command)
     except:
         message = 'DIAMOND finished abnormally.'
-        give_user_feedback(message, log_file, quiet, error=True)
+        give_user_feedback(message, args.log_file, args.quiet, error=True)
 
         sys.exit(1)
 
-    message = 'Homology search done! File {0} created.'.format(alignment_file)
-    give_user_feedback(message, log_file, quiet)
+    if args.compress:
+        setattr(args, 'alignment_file', '{0}.gz'.format(args.alignment_file))
+
+    message = 'Homology search done! File {0} created.'.format(
+            args.alignment_file)
+    give_user_feedback(message, args.log_file, args.quiet)
 
     return
 
@@ -343,38 +343,50 @@ def parse_tabular_alignment(
     message = 'Parsing alignment file {0}.'.format(alignment_file)
     give_user_feedback(message, log_file, quiet)
 
+    compressed = False
+    if alignment_file.endswith('.gz'):
+        compressed = True
+
+        f1 = gzip.open(alignment_file, 'rb')
+    else:
+        f1 = open(alignment_file, 'r')
+
     ORF2hits = {}
     all_hits = set()
 
     ORF = 'first ORF'
     ORF_done = False
-    with open(alignment_file, 'r') as f1:
-        for line in f1:
-            if line.startswith(ORF) and ORF_done == True:
-                # The ORF has already surpassed its minimum allowed bit-score.
-                continue
-
-            line = line.rstrip().split('\t')
-
-            if not line[0] == ORF:
-                # A new ORF is reached.
-                ORF = line[0]
-                best_bitscore = decimal.Decimal(line[11])
-                ORF2hits[ORF] = []
-
-                ORF_done = False
-
-            bitscore = decimal.Decimal(line[11])
-
-            if bitscore >= one_minus_r * best_bitscore:
-                # The hit has a high enough bit-score to be included.
-                hit = line[1]
-
-                ORF2hits[ORF].append((hit, bitscore),)
-                all_hits.add(hit)
-            else:
-                # The hit is not included because its bit-score is too low.
-                ORF_done = True
+    for line in f1:
+        if compressed:
+            line = line.decode('utf-8')
+
+        if line.startswith(ORF) and ORF_done == True:
+            # The ORF has already surpassed its minimum allowed bit-score.
+            continue
+
+        line = line.rstrip().split('\t')
+
+        if not line[0] == ORF:
+            # A new ORF is reached.
+            ORF = line[0]
+            best_bitscore = decimal.Decimal(line[11])
+            ORF2hits[ORF] = []
+
+            ORF_done = False
+
+        bitscore = decimal.Decimal(line[11])
+
+        if bitscore >= one_minus_r * best_bitscore:
+            # The hit has a high enough bit-score to be included.
+            hit = line[1]
+
+            ORF2hits[ORF].append((hit, bitscore),)
+            all_hits.add(hit)
+        else:
+            # The hit is not included because its bit-score is too low.
+            ORF_done = True
+
+    f1.close()
 
     return (ORF2hits, all_hits)
 

diff --git a/CAT_pack/single_bin.py b/CAT_pack/single_bin.py
@@ -212,6 +212,12 @@ def parse_arguments():
             action=shared.PathAction,
             help=('Directory for temporary DIAMOND files (default: directory '
                 'to which output files are written).'))
+    specific.add_argument(
+            '--compress',
+            dest='compress',
+            required=False,
+            action='store_true',
+            help='Compress DIAMOND alignment file.')
     specific.add_argument(
             '--top',
             dest='top',
@@ -310,7 +316,7 @@ def run():
                 args.bin_fasta,
                 args.taxonomy_folder,
                 args.database_folder,
-                float(args.r),
+                int(args.r),
                 float(args.f),
                 args.log_file))
     shared.give_user_feedback(message, args.log_file, args.quiet,
@@ -399,6 +405,10 @@ def run():
                 check.check_fasta(
                     args.proteins_fasta, args.log_file, args.quiet))
 
+    if 'align' in step_list:
+        errors.append(
+                check.check_top(args.top, args.r, args.log_file, args.quiet))
+
     # Print all variables.
     shared.print_variables(args, step_list)
 
@@ -429,20 +439,7 @@ def run():
             contig_names, contig2ORFs, args.log_file, args.quiet)
 
     if 'align' in step_list:
-        shared.run_diamond(
-                args.path_to_diamond,
-                args.diamond_database,
-                args.proteins_fasta,
-                args.alignment_file,
-                args.nproc,
-                args.sensitive,
-                args.block_size,
-                args.index_chunks,
-                args.tmpdir,
-                args.top,
-                args.log_file,
-                args.quiet,
-                args.verbose)
+        shared.run_diamond(args)
 
     (ORF2hits,
             all_hits) = shared.parse_tabular_alignment(

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## 5.1.1
+CAT and BAT can now compress the DIAMOND alignment file, and import gzip compressed alignment files.
+
 ## 5.1
 The code has been rewritten to prepare for future extensions. We have also added the `--verbose` flag.
 

diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ CAT and BAT have been thoroughly tested on Linux systems, and should run on macO
 No installation is required. You can run CAT and BAT by supplying the absolute path:
 
 ```
-$ CAT_pack/CAT --help
+$ ./CAT_pack/CAT --help
 ```
 
 Alternatively, if you add the files in the CAT\_pack directory to your `$PATH` variable, you can run CAT and BAT from anywhere:
@@ -86,6 +86,17 @@ The taxonomy folder and database folder created by CAT prepare are needed in sub
 
 To run CAT on a contig set, each header in the contig fasta file (the part after `>` and before the first space) needs to be unique. To run BAT on set of MAGs, each header in a MAG needs to be unique within that MAG. If you are unsure if this is the case, you can just run CAT or BAT, as the appropriate error messages are generated if formatting is incorrect.
 
+### Getting help.
+If you are unsure what options a program has, you can always add `--help` to a command. This is a great way to get you started with CAT and BAT.
+
+```
+$ CAT --help
+
+$ CAT contigs --help
+
+$ CAT summarise --help
+```
+
 ## Usage
 After you have got the database files on your system, you can run CAT to annotate your contig set: