Skip to content

Commit

Permalink
Merge pull request #45 from CDCgov/dev
Browse files Browse the repository at this point in the history
Update images, samplesheet updates, rehead BAM
  • Loading branch information
slsevilla authored Jun 20, 2024
2 parents 946ca75 + 2397f60 commit 51e4d64
Show file tree
Hide file tree
Showing 19 changed files with 765 additions and 257 deletions.
29 changes: 21 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
# Aquascope

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)

## This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance.
<p align="center">
<img src="https://github.com/CDCgov/aquascope/assets/20726305/62d44e22-a870-4a28-9d9d-d21b6e3c4ca0" alt="Aquascope_V2_50">
</p>

<h1 align="center">Aquascope</h1>
<div align="center">
<a href="https://www.nextflow.io/">
<img src="https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000" alt="Nextflow">
</a>
<a href="https://docs.conda.io/en/latest/">
<img src="http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda" alt="run with conda">
</a>
<a href="https://www.docker.com/">
<img src="https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker" alt="run with docker">
</a>
<a href="https://sylabs.io/docs/">
<img src="https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000" alt="run with singularity">
</a>
</div>

## Introduction
**CDCgov/aquascope** is a bioinformatics best-practice pipeline for early detection of SARS-COV variants of concern, sequenced through shotgun metagenomic sequencing, from wastewater.

This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance.

The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible.

## Pipeline summary
Expand Down
36 changes: 36 additions & 0 deletions bin/bam_to_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import os
import argparse

def parse_args(args=None):
    """Parse command-line options for the BAM-to-samplesheet generator.

    Parameters
    ----------
    args : list of str, optional
        Argument list to parse; defaults to ``sys.argv[1:]``.  Accepting an
        explicit list makes the function unit-testable and mirrors the
        ``parse_args(args=None)`` convention used by check_samplesheet.py.

    Returns
    -------
    argparse.Namespace
        With ``directory`` (path holding the BAM files) and ``output``
        (samplesheet path to write) attributes; both are required.
    """
    parser = argparse.ArgumentParser(
        description="Generate a samplesheet from a directory of BAM files for Freyja subworkflow",
        epilog="Usage: python bam_to_samplesheet.py --directory <PATH_TO_BAM_FILES> --output <OUTPUT_FILE>"
    )
    parser.add_argument("--directory", help="Directory containing BAM files.", required=True)
    parser.add_argument("--output", help="Output file for the samplesheet.", required=True)
    return parser.parse_args(args)

def extract_sample_name(bam_filename):
    """
    Extracts the sample name from the BAM filename assuming the sample name
    is the first component before the first ".".
    """
    return bam_filename.split(".")[0]

def generate_samplesheet(directory, output_file):
    """Write a two-column CSV samplesheet (``SNAME,BAMFILE``) listing every
    ``.bam`` file found directly inside *directory*.

    Sample names come from :func:`extract_sample_name`; BAM paths are made
    absolute.  Filenames are sorted so the output is deterministic —
    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order.
    """
    bam_files = sorted(f for f in os.listdir(directory) if f.endswith(".bam"))
    with open(output_file, "w") as fout:
        fout.write("SNAME,BAMFILE\n")
        for bam_file in bam_files:
            sample_name = extract_sample_name(bam_file)
            bam_path = os.path.abspath(os.path.join(directory, bam_file))
            fout.write(f"{sample_name},{bam_path}\n")

def main():
    """Script entry point: parse CLI options, then emit the samplesheet."""
    options = parse_args()
    generate_samplesheet(options.directory, options.output)


if __name__ == "__main__":
    main()
99 changes: 99 additions & 0 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3.9

import os
import sys
import argparse
import http.client
import urllib.parse

VALID_PLATFORMS = {"illumina", "nanopore", "iontorrent"}
EXPECTED_HEADERS = ["sample", "platform", "fastq_1", "fastq_2", "lr", "bam_file", "bedfile"]
MIN_COLS_REQUIRED = 3

def parse_args(args=None):
    """Build the CLI parser and evaluate *args* (defaults to ``sys.argv[1:]``).

    Returns an ``argparse.Namespace`` with the positional ``FILE_IN``
    (samplesheet to validate) and ``FILE_OUT`` (normalised copy to write).
    """
    cli = argparse.ArgumentParser(
        description="Reformat nf-core/aquascope samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    for name, help_text in (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    ):
        cli.add_argument(name, help=help_text)
    return cli.parse_args(args)

def validate_fastq(fastq_file, line):
    """Report problems with a FastQ path from samplesheet row *line*.

    Checks for embedded spaces and for a ``.fastq.gz``/``.fq.gz`` extension.
    Purely advisory: problems are printed (this script's validation
    convention) and nothing is raised or returned.
    """
    has_space = " " in fastq_file
    has_valid_ext = fastq_file.endswith((".fastq.gz", ".fq.gz"))
    if has_space:
        print(f"FastQ file '{fastq_file}' contains spaces! Line: {line}")
    if not has_valid_ext:
        print(f"FastQ file '{fastq_file}' does not have extension '.fastq.gz' or '.fq.gz'! Line: {line}")

def validate_bedfile(bedfile, line, platform):
    """Validate the BED file referenced by samplesheet row *line*.

    A local path must exist; an ``http(s)://`` URL must download with status
    200.  In both cases every record must have at least 6 tab-separated
    columns.  When no bedfile is given, that is only acceptable for the
    'iontorrent' platform.  All problems are reported by printing a message
    (this script's validation convention) — nothing is raised.
    """
    if bedfile:
        if bedfile.startswith(("http://", "https://")):
            _validate_remote_bedfile(bedfile, line)
        elif not os.path.isfile(bedfile):
            print(f"Bed file '{bedfile}' does not exist! Line: {line}")
        else:
            with open(bedfile, "r") as f:
                _check_bed_columns(f, bedfile, line)
    elif platform != "iontorrent":
        print(f"Bedfile is required for platforms other than IonTorrent if not provided. Line: {line}")


def _check_bed_columns(bed_lines, bedfile, line):
    """Print a message for every BED record with fewer than 6 columns."""
    for i, bed_line in enumerate(bed_lines):
        cols = bed_line.strip().split("\t")
        if len(cols) < 6:
            print(f"Bed file '{bedfile}' must have at least 6 columns! (Line {i+1}) Line: {line}")


def _validate_remote_bedfile(bedfile, line):
    """Download *bedfile* over HTTP(S) and column-check it.

    Fixes over the previous inline version: the connection gets a timeout,
    is always closed (finally), and network errors are reported instead of
    crashing the whole samplesheet check.
    """
    parsed_url = urllib.parse.urlparse(bedfile)
    conn_cls = http.client.HTTPConnection if parsed_url.scheme == "http" else http.client.HTTPSConnection
    conn = conn_cls(parsed_url.netloc, timeout=30)
    try:
        conn.request("GET", parsed_url.path)
        response = conn.getresponse()
        if response.status == 200:
            _check_bed_columns(response.read().decode('utf-8').splitlines(), bedfile, line)
        else:
            print(f"Failed to download bed file '{bedfile}': {response.status} Line: {line}")
    except OSError as exc:
        # DNS failure, refused connection, timeout, TLS error, ...
        print(f"Failed to download bed file '{bedfile}': {exc} Line: {line}")
    finally:
        conn.close()

def check_samplesheet(args):
    """Validate and reformat the pipeline samplesheet.

    Reads ``args.FILE_IN`` (CSV), checks the header against
    ``EXPECTED_HEADERS``, then per row checks the column count, the platform
    value, the platform-specific file columns, and the bedfile.  A
    normalised copy is written to ``args.FILE_OUT``.  Problems are reported
    by printing a message; rows with too few columns or an unknown platform
    are skipped (not written to the output).
    """
    file_in = args.FILE_IN
    file_out = args.FILE_OUT

    with open(file_in, "r") as fin, open(file_out, "w") as fout:
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(EXPECTED_HEADERS)] != EXPECTED_HEADERS:
            print(f"Invalid header! Expected {EXPECTED_HEADERS} but got {header}. Line: {','.join(header)}")

        fout.write(",".join(header) + "\n")

        # Data rows start at line 2 of the file (header is line 1).
        for i, line in enumerate(fin, start=2):
            cols = [x.strip().strip('"') for x in line.strip().split(",")]
            if len(cols) < MIN_COLS_REQUIRED:
                print(f"Invalid number of columns (minimum = {MIN_COLS_REQUIRED})! Line: {line}")
                continue

            sample, platform = cols[0], cols[1].lower()
            # Pad with None so rows missing the optional trailing columns
            # still unpack into exactly four values.
            fastq_1, fastq_2, lr, bam_file = (cols[2:6] + [None] * 4)[:4]
            bedfile = cols[6] if len(cols) > 6 else None  # Only if bedfile is specified

            # `platform` was already lower-cased above (the original called
            # .lower() a second time here).
            if platform not in VALID_PLATFORMS:
                print(f"Invalid platform '{platform}'! Line: {line}")
                continue
            if platform == "illumina":
                if fastq_1:
                    validate_fastq(fastq_1, line)
                if fastq_2:
                    validate_fastq(fastq_2, line)
            elif platform == "nanopore" and lr and not lr.endswith((".fastq.gz", ".fq.gz")):
                print(f"Nanopore requires FastQ file in 'lr' column! Line: {line}")
            elif platform == "iontorrent" and bam_file and not bam_file.endswith(".bam"):
                print(f"IonTorrent requires BAM file! Line: {line}")

            validate_bedfile(bedfile, line, platform)

            # Write the normalised row to the output file.
            output_line = ",".join([sample, platform, fastq_1 or '', fastq_2 or '', lr or '', bam_file or '', bedfile or ''])
            fout.write(output_line + "\n")

def main(args=None):
    """CLI entry point: parse *args* and run the samplesheet check."""
    parsed = parse_args(args)
    check_samplesheet(parsed)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions bin/reheaderbam.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ SNID=$(awk 'NR>5 {print $1; exit}' "$gff")
# Reheader the BAM file
samtools view -H "$bam" | awk -v OFS='\t' -v SNID="$SNID" '{ if ($1 == "@SQ" && sub(/^SN:.*/, "SN:"SNID, $2)) print }' | samtools reheader -P - "$bam" > "${bam%_sorted.bam}_reheadered.bam"

# Now index the reheadered BAM file
samtools index "${bam%_sorted.bam}_reheadered.bam"

# Clean up temporary files
if [ -f "${bam}_sorted.bam" ]; then
Expand Down
52 changes: 52 additions & 0 deletions conf/freyja_standalone.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Config file for defining DSL2 per module options and publishing paths
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Available keys to override module options:
ext.args = Additional arguments appended to command in module.
ext.args2 = Second set of arguments appended to command in module (multi-tool modules).
ext.args3 = Third set of arguments appended to command in module (multi-tool modules).
ext.prefix = File name prefix for output files.
----------------------------------------------------------------------------------------
*/

// Per-process publishing rules for the standalone Freyja entry point.
process {

// Default for any process not matched below: publish into a folder named
// after the final (lower-cased, pre-underscore) segment of the process
// name, and drop the versions.yml bookkeeping file from the output.
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]

// Variant calling: min base quality 20, annotation GFF3 and variant
// threshold taken from pipeline params; only tsv/csv outputs published.
withName: 'FREYJA_VARIANTS' {
ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" "
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/VarCalls" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

// Lineage demixing: coverage cutoff 10, confirmed lineages only.
withName: 'FREYJA_DEMIX' {
ext.args = '--covcut 10 --confirmedonly'
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/Demix" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

// Freyja barcode/lineage database refresh output.
withName: 'FREYJA_UPDATE' {
publishDir = [
path: { "${params.outdir}/FREYJA_STANDALONE/FREYJA_DB/" },
mode: params.publish_dir_mode,
]
}

// Aggregated QC report.
withName: 'MULTIQC' {
publishDir = [
path: { "${params.outdir}/MULTIQC" },
mode: params.publish_dir_mode
]
}
}
78 changes: 61 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]

withName: 'SAMPLESHEET_CHECK' {
publishDir = [
path: { "${params.outdir}/pipeline_info/" },
mode: params.publish_dir_mode,
]
}

withName: 'SAMTOOLS_FAIDX' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/"},
mode: params.publish_dir_mode,
]
}

withName: 'FASTQC_RAW_SHORT' {
ext.args = '--quiet'
publishDir = [
Expand Down Expand Up @@ -94,15 +108,6 @@ process {
]
}

withName: KRAKEN2_STD {
publishDir = [
path: { "${params.outdir}/kraken2_std" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.prefix = { "${meta.id}_std" }
}

withName: 'MINIMAP2_ALIGN_SHORT' {
publishDir = [
path: "${params.outdir}/ALIGNMENT/",
Expand All @@ -115,7 +120,7 @@ process {
publishDir = [
path: "${params.outdir}/ALIGNMENT/",
mode: "copy",
pattern: '*.{bam,bai}'
pattern: '*.{bam,.bai}'
]
}

Expand All @@ -127,11 +132,19 @@ process {
]
}

withName: 'REHEADER_BAM' {
publishDir = [
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: "copy",
pattern: '*.{bam,bai}'
]
}

withName: 'SAMTOOLS_AMPLICONCLIP' {
ext.args = '--hard-clip --both-ends --clipped --no-excluded --tolerance 10'
ext.prefix = { "${meta.id}.ampliconclip" }
publishDir = [
path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip"},
path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip_Unsorted"},
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -150,25 +163,49 @@ process {
withName: 'SAMTOOLS_SORT' {
ext.prefix = { "${meta.id}.trimmed.sorted" }
publishDir = [
path: { "${params.outdir}/Variants/Minimap2" },
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: params.publish_dir_mode,
pattern: "*.bam"
]
}

withName: 'SAMTOOLS_INDEX' {
publishDir = [
path: { "${params.outdir}/Variants/Minimap2" },
path: { "${params.outdir}/TRIMMED_SORTED_BAM/" },
mode: params.publish_dir_mode,
pattern: "*.bai"
]
}

withName: 'SAMTOOLS_STATS' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.stats"
]
}

withName: 'SAMTOOLS_IDXSTATS' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.idxstats"
]
}

withName: 'SAMTOOLS_FLAGSTAT' {
publishDir = [
path: { "${params.outdir}/SAMTOOLS/" },
mode: params.publish_dir_mode,
pattern: "*.flagstat"
]
}

withName: 'IVAR_VARIANTS' {
ext.args = '-t 0.01 -q 20'
ext.args2 = '-aa -A -d 600000 -Q 20 -q 0 -B'
publishDir = [
path: { "${params.outdir}/Variants/IVAR/VarCalls" },
path: { "${params.outdir}/VARIANTS/IVAR/VarCalls" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -177,7 +214,7 @@ process {
withName: 'FREYJA_VARIANTS' {
ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" "
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/VarCalls" },
path: { "${params.outdir}/VARIANTS/FREYJA/VarCalls" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
Expand All @@ -186,15 +223,22 @@ process {
withName: 'FREYJA_DEMIX' {
ext.args = '--covcut 10 --confirmedonly'
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/Demix" },
path: { "${params.outdir}/VARIANTS/FREYJA/Demix" },
mode: params.publish_dir_mode,
pattern: "*.{tsv,csv}"
]
}

withName: 'FREYJA_UPDATE' {
publishDir = [
path: { "${params.outdir}/Variants/FREYJA/" },
path: { "${params.outdir}/VARIANTS/FREYJA/" },
mode: params.publish_dir_mode,
]
}

withName: 'MULTIQC' {
publishDir = [
path: { "${params.outdir}/MULTIQC" },
mode: params.publish_dir_mode,
]
}
Expand Down
Loading

0 comments on commit 51e4d64

Please sign in to comment.