Skip to content

Commit

Permalink
Merge pull request #45 from CDCgov/dev
Browse files Browse the repository at this point in the history
Update images, samplesheet updates, rehead BAM
  • Loading branch information
slsevilla authored Jun 20, 2024
2 parents 946ca75 + 2397f60 commit 51e4d64
Show file tree
Hide file tree
Showing 19 changed files with 765 additions and 257 deletions.
29 changes: 21 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
# Aquascope

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)

## This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance.
<p align="center">
<img src="https://github.com/CDCgov/aquascope/assets/20726305/62d44e22-a870-4a28-9d9d-d21b6e3c4ca0" alt="Aquascope_V2_50">
</p>

<h1 align="center">Aquascope</h1>
<div align="center">
<a href="https://www.nextflow.io/">
<img src="https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000" alt="Nextflow">
</a>
<a href="https://docs.conda.io/en/latest/">
<img src="http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda" alt="run with conda">
</a>
<a href="https://www.docker.com/">
<img src="https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker" alt="run with docker">
</a>
<a href="https://sylabs.io/docs/">
<img src="https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000" alt="run with singularity">
</a>
</div>

## Introduction
**CDCgov/aquascope** is a bioinformatics best-practice pipeline for early detection of SARS-COV variants of concern, sequenced through shotgun metagenomic sequencing, from wastewater.

This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance.

The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible.

## Pipeline summary
Expand Down
36 changes: 36 additions & 0 deletions bin/bam_to_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import os
import argparse

def parse_args(args=None):
    """Parse command-line options for the BAM-to-samplesheet generator.

    Parameters
    ----------
    args : list of str, optional
        Argument list to parse; defaults to ``sys.argv[1:]``.  Accepting an
        explicit list makes the function unit-testable and mirrors the
        ``parse_args(args=None)`` convention used by check_samplesheet.py.

    Returns
    -------
    argparse.Namespace
        With ``directory`` (path holding the BAM files) and ``output``
        (samplesheet path to write) attributes; both are required.
    """
    parser = argparse.ArgumentParser(
        description="Generate a samplesheet from a directory of BAM files for Freyja subworkflow",
        epilog="Usage: python bam_to_samplesheet.py --directory <PATH_TO_BAM_FILES> --output <OUTPUT_FILE>"
    )
    parser.add_argument("--directory", help="Directory containing BAM files.", required=True)
    parser.add_argument("--output", help="Output file for the samplesheet.", required=True)
    return parser.parse_args(args)

def extract_sample_name(bam_filename):
    """
    Extracts the sample name from the BAM filename assuming the sample name
    is the first component before the first ".".
    """
    return bam_filename.split(".")[0]

def generate_samplesheet(directory, output_file):
    """Write a two-column CSV samplesheet (``SNAME,BAMFILE``) listing every
    ``.bam`` file found directly inside *directory*.

    Sample names come from :func:`extract_sample_name`; BAM paths are made
    absolute.  Filenames are sorted so the output is deterministic —
    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order.
    """
    bam_files = sorted(f for f in os.listdir(directory) if f.endswith(".bam"))
    with open(output_file, "w") as fout:
        fout.write("SNAME,BAMFILE\n")
        for bam_file in bam_files:
            sample_name = extract_sample_name(bam_file)
            bam_path = os.path.abspath(os.path.join(directory, bam_file))
            fout.write(f"{sample_name},{bam_path}\n")

def main():
    """Script entry point: parse CLI options, then emit the samplesheet."""
    options = parse_args()
    generate_samplesheet(options.directory, options.output)


if __name__ == "__main__":
    main()
99 changes: 99 additions & 0 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3.9

import os
import sys
import argparse
import http.client
import urllib.parse

VALID_PLATFORMS = {"illumina", "nanopore", "iontorrent"}
EXPECTED_HEADERS = ["sample", "platform", "fastq_1", "fastq_2", "lr", "bam_file", "bedfile"]
MIN_COLS_REQUIRED = 3

def parse_args(args=None):
    """Build the CLI parser and evaluate *args* (defaults to ``sys.argv[1:]``).

    Returns an ``argparse.Namespace`` with the positional ``FILE_IN``
    (samplesheet to validate) and ``FILE_OUT`` (normalised copy to write).
    """
    cli = argparse.ArgumentParser(
        description="Reformat nf-core/aquascope samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    for name, help_text in (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    ):
        cli.add_argument(name, help=help_text)
    return cli.parse_args(args)

def validate_fastq(fastq_file, line):
    """Report problems with a FastQ path from samplesheet row *line*.

    Checks for embedded spaces and for a ``.fastq.gz``/``.fq.gz`` extension.
    Purely advisory: problems are printed (this script's validation
    convention) and nothing is raised or returned.
    """
    has_space = " " in fastq_file
    has_valid_ext = fastq_file.endswith((".fastq.gz", ".fq.gz"))
    if has_space:
        print(f"FastQ file '{fastq_file}' contains spaces! Line: {line}")
    if not has_valid_ext:
        print(f"FastQ file '{fastq_file}' does not have extension '.fastq.gz' or '.fq.gz'! Line: {line}")

def validate_bedfile(bedfile, line, platform):
    """Validate the BED file referenced by samplesheet row *line*.

    A local path must exist; an ``http(s)://`` URL must download with status
    200.  In both cases every record must have at least 6 tab-separated
    columns.  When no bedfile is given, that is only acceptable for the
    'iontorrent' platform.  All problems are reported by printing a message
    (this script's validation convention) — nothing is raised.
    """
    if bedfile:
        if bedfile.startswith(("http://", "https://")):
            _validate_remote_bedfile(bedfile, line)
        elif not os.path.isfile(bedfile):
            print(f"Bed file '{bedfile}' does not exist! Line: {line}")
        else:
            with open(bedfile, "r") as f:
                _check_bed_columns(f, bedfile, line)
    elif platform != "iontorrent":
        print(f"Bedfile is required for platforms other than IonTorrent if not provided. Line: {line}")


def _check_bed_columns(bed_lines, bedfile, line):
    """Print a message for every BED record with fewer than 6 columns."""
    for i, bed_line in enumerate(bed_lines):
        cols = bed_line.strip().split("\t")
        if len(cols) < 6:
            print(f"Bed file '{bedfile}' must have at least 6 columns! (Line {i+1}) Line: {line}")


def _validate_remote_bedfile(bedfile, line):
    """Download *bedfile* over HTTP(S) and column-check it.

    Fixes over the previous inline version: the connection gets a timeout,
    is always closed (finally), and network errors are reported instead of
    crashing the whole samplesheet check.
    """
    parsed_url = urllib.parse.urlparse(bedfile)
    conn_cls = http.client.HTTPConnection if parsed_url.scheme == "http" else http.client.HTTPSConnection
    conn = conn_cls(parsed_url.netloc, timeout=30)
    try:
        conn.request("GET", parsed_url.path)
        response = conn.getresponse()
        if response.status == 200:
            _check_bed_columns(response.read().decode('utf-8').splitlines(), bedfile, line)
        else:
            print(f"Failed to download bed file '{bedfile}': {response.status} Line: {line}")
    except OSError as exc:
        # DNS failure, refused connection, timeout, TLS error, ...
        print(f"Failed to download bed file '{bedfile}': {exc} Line: {line}")
    finally:
        conn.close()

def check_samplesheet(args):
    """Validate and reformat the pipeline samplesheet.

    Reads ``args.FILE_IN`` (CSV), checks the header against
    ``EXPECTED_HEADERS``, then per row checks the column count, the platform
    value, the platform-specific file columns, and the bedfile.  A
    normalised copy is written to ``args.FILE_OUT``.  Problems are reported
    by printing a message; rows with too few columns or an unknown platform
    are skipped (not written to the output).
    """
    file_in = args.FILE_IN
    file_out = args.FILE_OUT

    with open(file_in, "r") as fin, open(file_out, "w") as fout:
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(EXPECTED_HEADERS)] != EXPECTED_HEADERS:
            print(f"Invalid header! Expected {EXPECTED_HEADERS} but got {header}. Line: {','.join(header)}")

        fout.write(",".join(header) + "\n")

        # Data rows start at line 2 of the file (header is line 1).
        for i, line in enumerate(fin, start=2):
            cols = [x.strip().strip('"') for x in line.strip().split(",")]
            if len(cols) < MIN_COLS_REQUIRED:
                print(f"Invalid number of columns (minimum = {MIN_COLS_REQUIRED})! Line: {line}")
                continue

            sample, platform = cols[0], cols[1].lower()
            # Pad with None so rows missing the optional trailing columns
            # still unpack into exactly four values.
            fastq_1, fastq_2, lr, bam_file = (cols[2:6] + [None] * 4)[:4]
            bedfile = cols[6] if len(cols) > 6 else None  # Only if bedfile is specified

            # `platform` was already lower-cased above (the original called
            # .lower() a second time here).
            if platform not in VALID_PLATFORMS:
                print(f"Invalid platform '{platform}'! Line: {line}")
                continue
            if platform == "illumina":
                if fastq_1:
                    validate_fastq(fastq_1, line)
                if fastq_2:
                    validate_fastq(fastq_2, line)
            elif platform == "nanopore" and lr and not lr.endswith((".fastq.gz", ".fq.gz")):
                print(f"Nanopore requires FastQ file in 'lr' column! Line: {line}")
            elif platform == "iontorrent" and bam_file and not bam_file.endswith(".bam"):
                print(f"IonTorrent requires BAM file! Line: {line}")

            validate_bedfile(bedfile, line, platform)

            # Write the normalised row to the output file.
            output_line = ",".join([sample, platform, fastq_1 or '', fastq_2 or '', lr or '', bam_file or '', bedfile or ''])
            fout.write(output_line + "\n")

def main(args=None):
    """CLI entry point: parse *args* and run the samplesheet check."""
    parsed = parse_args(args)
    check_samplesheet(parsed)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions bin/reheaderbam.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ SNID=$(awk 'NR>5 {print $1; exit}' "$gff")
# Reheader the BAM file
samtools view -H "$bam" | awk -v OFS='\t' -v SNID="$SNID" '{ if ($1 == "@SQ" && sub(/^SN:.*/, "SN:"SNID, $2)) print }' | samtools reheader -P - "$bam" > "${bam%_sorted.bam}_reheadered.bam"

# Now index the reheadered BAM file
samtools index "${bam%_sorted.bam}_reheadered.bam"

# Clean up temporary files
if [ -f "${bam}_sorted.bam" ]; then
Expand Down
52 changes: 52 additions & 0 deletions conf/freyja_standalone.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Config file for defining DSL2 per module options and publishing paths
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Available keys to override module options:
ext.args = Additional arguments appended to command in module.
ext.args2 = Second set of arguments appended to command in module (multi-tool modules).
ext.args3 = Third set of arguments appended to command in module (multi-tool modules).
ext.prefix = File name prefix for output files.
----------------------------------------------------------------------------------------
*/

// Per-process publishing rules for the standalone Freyja entry point.
process {

// Default for any process not matched below: publish into a folder named
// after the final (lower-cased, pre-underscore) segment of the process
// name, and drop the versions.yml bookkeeping file from the output.
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]

// Variant calling: min base quality 20, annotation GFF3 and variant
// threshold taken from pipeline params; only tsv/csv outputs published.
withName: 'FREYJA_VARIANTS' {
ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" "
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/VarCalls" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

// Lineage demixing: coverage cutoff 10, confirmed lineages only.
withName: 'FREYJA_DEMIX' {
ext.args = '--covcut 10 --confirmedonly'
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/Demix" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

// Freyja barcode/lineage database refresh output.
withName: 'FREYJA_UPDATE' {
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/FREYJA_DB/" },
mode: params.publish_dir_mode,
]
}

// Aggregated QC report.
withName: 'MULTIQC' {
publishDir = [
path: { "${params.outdir}/MULTIQC" },
mode: params.publish_dir_mode
]
}
}
78 changes: 61 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]

withName: 'SAMPLESHEET_CHECK' {
publishDir = [
path: { "${params.outdir}/pipeline_info/" },
mode: params.publish_dir_mode,
]
}

withName: 'SAMTOOLS_FAIDX' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/"},
mode: params.publish_dir_mode,
]
}

withName: 'FASTQC_RAW_SHORT' {
ext.args = '--quiet'
publishDir = [
Expand Down Expand Up @@ -94,15 +108,6 @@ process {
]
}

withName: KRAKEN2_STD {
publishDir = [
path: { "${params.outdir}/kraken2_std" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.prefix = { "${meta.id}_std" }
}

withName: 'MINIMAP2_ALIGN_SHORT' {
publishDir = [
path: "${params.outdir}/ALIGNMENT/",
Expand All @@ -115,7 +120,7 @@ process {
publishDir = [
path: "${params.outdir}/ALIGNMENT/",
mode: "copy",
pattern: '*.{bam,bai}'
pattern: '*.{bam,.bai}'
]
}

Expand All @@ -127,11 +132,19 @@ process {
]
}

withName: 'REHEADER_BAM' {
publishDir = [
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: "copy",
pattern: '*.{bam,bai}'
]
}

withName: 'SAMTOOLS_AMPLICONCLIP' {
ext.args = '--hard-clip --both-ends --clipped --no-excluded --tolerance 10'
ext.prefix = { "${meta.id}.ampliconclip" }
publishDir = [
path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip"},
path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip_Unsorted"},
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -150,25 +163,49 @@ process {
withName: 'SAMTOOLS_SORT' {
ext.prefix = { "${meta.id}.trimmed.sorted" }
publishDir = [
path: { "${params.outdir}/Variants/Minimap2" },
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: params.publish_dir_mode,
pattern: "*.bam"
]
}

withName: 'SAMTOOLS_INDEX' {
publishDir = [
path: { "${params.outdir}/Variants/Minimap2" },
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: params.publish_dir_mode,
pattern: "*.bai"
]
}

withName: 'SAMTOOLS_STATS' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.stats"
]
}

withName: 'SAMTOOLS_IDXSTATS' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.idxstats"
]
}

withName: 'SAMTOOLS_FLAGSTAT' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.flagstat"
]
}

withName: 'IVAR_VARIANTS' {
ext.args = '-t 0.01 -q 20'
ext.args2 = '-aa -A -d 600000 -Q 20 -q 0 -B'
publishDir = [
path: { "${params.outdir}/Variants/IVAR/VarCalls" },
path: { "${params.outdir}/VARIANTS/IVAR/VarCalls" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -177,7 +214,7 @@ process {
withName: 'FREYJA_VARIANTS' {
ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" "
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/VarCalls" },
path: { "${params.outdir}/VARIANTS/FREYJA/VarCalls" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
Expand All @@ -186,15 +223,22 @@ process {
withName: 'FREYJA_DEMIX' {
ext.args = '--covcut 10 --confirmedonly'
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/Demix" },
path: { "${params.outdir}/VARIANTS/FREYJA/Demix" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

withName: 'FREYJA_UPDATE' {
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/" },
path: { "${params.outdir}/VARIANTS/FREYJA/" },
mode: params.publish_dir_mode,
]
}

withName: 'MULTIQC' {
publishDir = [
path: { "${params.outdir}/MULTIQC" },
mode: params.publish_dir_mode,
]
}
Expand Down
Loading

0 comments on commit 51e4d64

Please sign in to comment.