From 77382d310d5dd5420988b3a3593d3c9a731ba481 Mon Sep 17 00:00:00 2001 From: Samantha Date: Thu, 6 Jun 2024 00:22:40 -0400 Subject: [PATCH 01/18] docs: add image to README fixes #43 --- README.md | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c44b794..7a33ab2 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,28 @@ -# Aquascope - -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) - -## This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance. +

+ Aquascope_V2_50 +

+ +

Aquascope

+
+ + Nextflow + + + run with conda + + + run with docker + + + run with singularity + +
## Introduction **CDCgov/aquascope** is a bioinformatics best-practice pipeline for early detection of SARS-COV variants of concern, sequenced throughshotgun metagenomic sequencing, from wastewater. +This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance. + The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. ## Pipeline summary From e6b65719a67ba5212ec7c1b26442a4615d822e18 Mon Sep 17 00:00:00 2001 From: Samantha Date: Thu, 6 Jun 2024 00:23:35 -0400 Subject: [PATCH 02/18] docs: add image to user-guides --- docs/index.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 9176b33..d6f87c8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,7 @@ +

+ Aquascope_V2_50 +

+ This project is a successor to the [C-WAP pipeline](https://github.com/CFSAN-Biostatistics/C-WAP) and is intended to process SARS-CoV-2 wastewater samples to determine relative variant abundance. -**Aquascope** is a bioinformatics best-practice analysis pipeline for Pipeline is for early detection of SARS-CoV-2 variants of concern via Targeted-amplicon metagenomic sequencing of wastewater. It is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! \ No newline at end of file +**Aquascope** is a bioinformatics best-practice analysis pipeline for Pipeline is for early detection of SARS-CoV-2 variants of concern via Targeted-amplicon metagenomic sequencing of wastewater. It is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! From bc649940bd8ec8e22efd060150c7c28d2391ea9f Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:23:32 -0400 Subject: [PATCH 03/18] Added indexing to reheader.bam --- bin/reheaderbam.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/reheaderbam.sh b/bin/reheaderbam.sh index c936f97..ca9e8b2 100755 --- a/bin/reheaderbam.sh +++ b/bin/reheaderbam.sh @@ -13,6 +13,8 @@ SNID=$(awk 'NR>5 {print $1; exit}' "$gff") # Reheader the BAM file samtools view -H "$bam" | awk -v OFS='\t' -v SNID="$SNID" '{ if ($1 == "@SQ" && sub(/^SN:.*/, "SN:"SNID, $2)) print }' | samtools reheader -P - "$bam" > "${bam%_sorted.bam}_reheadered.bam" +# Now index the reheadered BAM file +samtools index "${bam%_sorted.bam}_reheadered.bam" # Clean up temporary files if [ -f "${bam}_sorted.bam" ]; then From cc91aa94454824f6588da7a74bf6407b47e8003e Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:24:07 -0400 Subject: [PATCH 04/18] Added publish dir for reheaderbam and samtools --- conf/modules.config | 57 +++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5ebbe71..86d60ed 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -94,15 +94,6 @@ process { ] } - withName: KRAKEN2_STD { - publishDir = [ - path: { "${params.outdir}/kraken2_std" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - ext.prefix = { "${meta.id}_std" } - } - withName: 'MINIMAP2_ALIGN_SHORT' { publishDir = [ path: "${params.outdir}/ALIGNMENT/", @@ -115,7 +106,7 @@ process { publishDir = [ path: "${params.outdir}/ALIGNMENT/", mode: "copy", - pattern: '*.{bam,bai}' + pattern: '*.{bam,.bai}' ] } @@ -127,11 +118,19 @@ process { ] } + withName: 'REHEADER_BAM' { + publishDir = [ + path: { "${params.outdir}/TRIMMED_SORTED_BAM/" }, + mode: "copy", + pattern: '*.{bam,bai}' + ] + } + withName: 'SAMTOOLS_AMPLICONCLIP' { ext.args = '--hard-clip --both-ends --clipped --no-excluded --tolerance 10' ext.prefix = { "${meta.id}.ampliconclip" } publishDir = [ - path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip"}, + path: { "${params.outdir}/PRIMER_TRIMMING/AmpliconClip_Unsorted"}, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -150,7 +149,7 @@ process { withName: 'SAMTOOLS_SORT' { ext.prefix = { "${meta.id}.trimmed.sorted" } publishDir = [ - path: { "${params.outdir}/Variants/Minimap2" }, + path: { "${params.outdir}/TRIMMED_SORTED_BAM/" }, mode: params.publish_dir_mode, pattern: "*.bam" ] @@ -158,17 +157,41 @@ process { withName: 'SAMTOOLS_INDEX' { publishDir = [ - path: { "${params.outdir}/Variants/Minimap2" }, + path: { "${params.outdir}/TRIMMED_SORTED_BAM/" }, mode: params.publish_dir_mode, pattern: "*.bai" ] } + withName: 'SAMTOOLS_STATS' { + publishDir = [ + path: { "${params.outdir}/SAMTOOLS/" }, + mode: params.publish_dir_mode, + pattern: "*.stats" + ] + } + + withName: 'SAMTOOLS_IDXSTATS' { + publishDir = [ + path: { "${params.outdir}/SAMTOOLS/" }, + mode: params.publish_dir_mode, + pattern: "*.idxstats" + ] + } + + withName: 'SAMTOOLS_FLAGSTAT' { + publishDir = [ + path: { "${params.outdir}/SAMTOOLS/" }, + mode: params.publish_dir_mode, + pattern: "*.flagstat" + ] + } + withName: 'IVAR_VARIANTS' { ext.args = '-t 0.01 -q 20' ext.args2 = '-aa -A -d 600000 -Q 20 -q 0 -B' publishDir = [ - path: { "${params.outdir}/Variants/IVAR/VarCalls" }, + path: { "${params.outdir}/VARIANTS/IVAR/VarCalls" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -177,7 +200,7 @@ process { withName: 'FREYJA_VARIANTS' { ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" " publishDir = [ - path: { "${params.outdir}/Variants/FREYJA/VarCalls" }, + path: { "${params.outdir}/VARIANTS/FREYJA/VarCalls" }, mode: params.publish_dir_mode, pattern: "*.{tsv,csv}" ] @@ -186,7 +209,7 @@ process { withName: 'FREYJA_DEMIX' { ext.args = '--covcut 10 --confirmedonly' publishDir = [ - path: { "${params.outdir}/Variants/FREYJA/Demix" }, + path: { "${params.outdir}/VARIANTS/FREYJA/Demix" }, mode: params.publish_dir_mode, pattern: "*.{tsv,csv}" ] @@ -194,7 +217,7 @@ process { withName: 'FREYJA_UPDATE' { publishDir = [ - path: { "${params.outdir}/Variants/FREYJA/" }, + path: { "${params.outdir}/VARIANTS/FREYJA/" }, mode: params.publish_dir_mode, ] } From 2ce37093fdc9bf9762b9ab522d5a070febd68b5d Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:24:22 -0400 Subject: [PATCH 05/18] deleted sge config, redundant --- conf/sge.config | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100755 conf/sge.config diff --git a/conf/sge.config b/conf/sge.config deleted file mode 100755 index 368ae51..0000000 --- a/conf/sge.config +++ /dev/null @@ -1,13 +0,0 @@ -profiles { - sge { - process { - executor = 'sge' - penv = 'smp' - queue = 'all.q' - } - - executor { - queueSize = 24 - } - } -} From c3b778af722ea3b9ece5e51a2b54f404eeabf806 Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:24:41 -0400 Subject: [PATCH 06/18] modified module reheader_bam --- modules/local/reheader_bam.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/reheader_bam.nf b/modules/local/reheader_bam.nf index 6ee81c5..a4c767b 100755 --- a/modules/local/reheader_bam.nf +++ b/modules/local/reheader_bam.nf @@ -18,6 +18,7 @@ process REHEADER_BAM { // Output file (sorted BAM) output: tuple val(meta), path("*.bam"), emit: reheadered_bam + tuple val(meta), path("*.bai"), emit: reheadered_bai path "versions.yml", emit: versions // Script to run within the container From 2efae003ee0a2006e0e568bb2cbd343ef0675664 Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:25:04 -0400 Subject: [PATCH 07/18] removed kraken2 params and configured pipeline info and results directory --- nextflow.config | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1f8b4cd..c161dbc 100755 --- a/nextflow.config +++ b/nextflow.config @@ -23,10 +23,6 @@ freyja_barcodes = null freyja_lineages_meta = null varthresh = 0.01 - - // Kraken2 database parameter - kraken = false - kraken_db_std = null // Samtools Ampliconclip parameters save_cliprejects = false @@ -37,14 +33,14 @@ // MultiQC options multiqc_config = null - multiqc_title = 'AquaScope-SARS-CoV-2 Quality Report' + multiqc_title = 'Aquascope-SARS-CoV-2 Quality Report' multiqc_logo = null - max_multiqc_email_size = '25.MB' + max_multiqc_email_size = '30.MB' multiqc_methods_description = null // Boilerplate options - outdir = './results' - tracedir = "${params.outdir}/pipeline_info" + outdir = './SARS-COV-2-Results' + tracedir = "${params.outdir}/Pipeline_Info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -138,7 +134,6 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } - sge { includeConfig 'conf/sge.config' } } // Export these variables to prevent local Python/R libraries from conflicting with those in the container From 30cfb205e1368e056c14d3600afdcb81cade444d Mon Sep 17 00:00:00 2001 From: tzx6 Date: Thu, 13 Jun 2024 23:25:31 -0400 Subject: [PATCH 08/18] removed redundant or unused tools --- workflows/aquascope.nf | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/workflows/aquascope.nf b/workflows/aquascope.nf index 86043f4..cc04dee 100755 --- a/workflows/aquascope.nf +++ b/workflows/aquascope.nf @@ -52,10 +52,8 @@ include { FASTQC as FASTQC_RAW_SHORT } from '../modules/nf-core/modul include { NANOPLOT as NANOPLOT_RAW_LONG } from '../modules/nf-core/modules/nf-core/nanoplot/main' include { FASTP as FASTP_SHORT } from '../modules/nf-core/modules/nf-core/fastp/main' include { FASTP as FASTP_LONG } from '../modules/local/fastp/main' -//include { CHOPPER } from '../modules/nf-core/modules/nf-core/chopper/main' include { FASTQC as FASTQC_SHORT_TRIMMED } from '../modules/nf-core/modules/nf-core/fastqc/main' include { NANOPLOT as NANOPLOT_LONG_TRIMMED } from '../modules/nf-core/modules/nf-core/nanoplot/main' -include { KRAKEN2_KRAKEN2 as KRAKEN2_STD } from '../modules/nf-core/modules/nf-core/kraken2/kraken2/main' include { QUALIMAP_BAMQC } from '../modules/nf-core/modules/nf-core/qualimap/bamqc/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_SHORT} from '../modules/local/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_LONG } from '../modules/local/minimap2/align/main' @@ -138,18 +136,6 @@ workflow AQUASCOPE { ch_trimmed_reads_long ) - // KRAKEN2 to check for human and bacterial reads - ch_kraken2_multiqc = Channel.empty() - if (params.kraken != false) { - KRAKEN2_STD ( - ch_trimmed_reads_short, - params.kraken_db_std, - true, - true - ) - ch_kraken2_multiqc = KRAKEN2_STD.out.report - } - // MODULE: Align reads against reference genome ch_short_align_bam = Channel.empty() MINIMAP2_ALIGN_SHORT ( @@ -167,11 +153,13 @@ workflow AQUASCOPE { ch_long_align_bai = MINIMAP2_ALIGN_LONG.out.bai // Reheader and Sort the INPUT BAM from Ion-Torrent - ch_rehead_sorted_bam = Channel.empty() + ch_rehead_sorted_bam = Channel.empty() + ch_rehead_sorted_bai = Channel.empty() REHEADER_BAM ( ch_raw_bam, ch_gff ) ch_rehead_sorted_bam = REHEADER_BAM.out.reheadered_bam + ch_rehead_sorted_bai = REHEADER_BAM.out.reheadered_bai ch_versions = ch_versions.mix(REHEADER_BAM.out.versions) // Combine channels for further processing @@ -253,7 +241,6 @@ workflow AQUASCOPE { ch_multiqc_files = ch_multiqc_files.mix(FASTP_SHORT.out.json.collect{ it[1] }.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTP_LONG.out.json.collect{ it[1] }.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC_SHORT_TRIMMED.out.zip.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_kraken2_multiqc.collect{ it[1] }.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_qualimap_multiqc.collect{ it[1] }.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_ivar_stats.collect{ it[1] }.ifEmpty([])) From 7f947b478845f334474a76accd435ef993de487c Mon Sep 17 00:00:00 2001 From: tzx6 Date: Mon, 17 Jun 2024 21:58:44 -0400 Subject: [PATCH 09/18] feat:added a samplesheet validation script --- bin/check_samplesheet.py | 99 ++++++++++++++++++++++++++++++ modules/local/samplesheet_check.nf | 15 +++-- 2 files changed, 108 insertions(+), 6 deletions(-) create mode 100755 bin/check_samplesheet.py diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py new file mode 100755 index 0000000..07116c2 --- /dev/null +++ b/bin/check_samplesheet.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3.9 + +import os +import sys +import argparse +import http.client +import urllib.parse + +VALID_PLATFORMS = {"illumina", "nanopore", "iontorrent"} +EXPECTED_HEADERS = ["sample", "platform", "fastq_1", "fastq_2", "lr", "bam_file", "bedfile"] +MIN_COLS_REQUIRED = 3 + +def parse_args(args=None): + parser = argparse.ArgumentParser( + description="Reformat nf-core/aquascope samplesheet file and check its contents.", + epilog="Example usage: python check_samplesheet.py ", + ) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) + +def validate_fastq(fastq_file, line): + if " " in fastq_file: + print(f"FastQ file '{fastq_file}' contains spaces! Line: {line}") + if not fastq_file.endswith((".fastq.gz", ".fq.gz")): + print(f"FastQ file '{fastq_file}' does not have extension '.fastq.gz' or '.fq.gz'! Line: {line}") + +def validate_bedfile(bedfile, line, platform): + if bedfile: + if bedfile.startswith(("http://", "https://")): + parsed_url = urllib.parse.urlparse(bedfile) + conn = http.client.HTTPConnection(parsed_url.netloc) if parsed_url.scheme == "http" else http.client.HTTPSConnection(parsed_url.netloc) + conn.request("GET", parsed_url.path) + response = conn.getresponse() + if response.status == 200: + lines = response.read().decode('utf-8').splitlines() + for i, bed_line in enumerate(lines): + cols = bed_line.strip().split("\t") + if len(cols) < 6: + print(f"Bed file '{bedfile}' must have at least 6 columns! (Line {i+1}) Line: {line}") + else: + print(f"Failed to download bed file '{bedfile}': {response.status} Line: {line}") + else: + if not os.path.isfile(bedfile): + print(f"Bed file '{bedfile}' does not exist! Line: {line}") + else: + with open(bedfile, "r") as f: + for i, bed_line in enumerate(f): + cols = bed_line.strip().split("\t") + if len(cols) < 6: + print(f"Bed file '{bedfile}' must have at least 6 columns! (Line {i+1}) Line: {line}") + elif platform != "iontorrent": + print(f"Bedfile is required for platforms other than IonTorrent if not provided. Line: {line}") + +def check_samplesheet(args): + file_in = args.FILE_IN + file_out = args.FILE_OUT + + with open(file_in, "r") as fin, open(file_out, "w") as fout: + header = [x.strip('"') for x in fin.readline().strip().split(",")] + if header[: len(EXPECTED_HEADERS)] != EXPECTED_HEADERS: + print(f"Invalid header! Expected {EXPECTED_HEADERS} but got {header}. Line: {','.join(header)}") + + fout.write(",".join(header) + "\n") + + for i, line in enumerate(fin, start=2): + cols = [x.strip().strip('"') for x in line.strip().split(",")] + if len(cols) < MIN_COLS_REQUIRED: + print(f"Invalid number of columns (minimum = {MIN_COLS_REQUIRED})! Line: {line}") + continue + + sample, platform = cols[0], cols[1].lower() + fastq_1, fastq_2, lr, bam_file = (cols[2:6] + [None]*4)[:4] + bedfile = cols[6] if len(cols) > 6 else None # Only if bedfile is specified + + if platform.lower() not in VALID_PLATFORMS: + print(f"Invalid platform '{platform}'! Line: {line}") + continue + if platform == "illumina": + if fastq_1: + validate_fastq(fastq_1, line) + if fastq_2: + validate_fastq(fastq_2, line) + elif platform == "nanopore" and lr and not lr.endswith((".fastq.gz", ".fq.gz")): + print(f"Nanopore requires FastQ file in 'lr' column! Line: {line}") + elif platform == "iontorrent" and bam_file and not bam_file.endswith(".bam"): + print(f"IonTorrent requires BAM file! Line: {line}") + + validate_bedfile(bedfile, line, platform) + + # Write to the output file + output_line = ",".join([sample, platform, fastq_1 or '', fastq_2 or '', lr or '', bam_file or '', bedfile or '']) + fout.write(output_line + "\n") + +def main(args=None): + check_samplesheet(parse_args(args)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 872b268..c948caf 100755 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -2,10 +2,10 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" label 'process_single' - conda "conda-forge::python=3.8.3" + conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: path samplesheet @@ -13,19 +13,22 @@ process SAMPLESHEET_CHECK { output: path '*.csv' , emit: csv path "versions.yml", emit: versions + val true , emit: success when: task.ext.when == null || task.ext.when - script: // This script is bundled with the pipeline, in hseabolt/metaxplore/bin/ + script: """ check_samplesheet.py \\ $samplesheet \\ - samplesheet.valid.csv + samplesheet.valid.csv \\ + && echo "SUCCESS" > success.txt + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS """ -} +} \ No newline at end of file From 9e8ccd2c04cdb964ebc96180b3e80e835c6ebb9c Mon Sep 17 00:00:00 2001 From: tzx6 Date: Mon, 17 Jun 2024 21:59:08 -0400 Subject: [PATCH 10/18] added samplesheet check to modules.config --- conf/modules.config | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 86d60ed..2b075bc 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,20 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: 'SAMPLESHEET_CHECK' { + publishDir = [ + path: { "${params.outdir}/pipeline_info/" }, + mode: params.publish_dir_mode, + ] + } + + withName: 'SAMTOOLS_FAIDX' { + publishDir = [ + path: { "${params.outdir}/SAMTOOLS/"}, + mode: params.publish_dir_mode, + ] + } + withName: 'FASTQC_RAW_SHORT' { ext.args = '--quiet' publishDir = [ @@ -224,7 +238,7 @@ process { withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ - path: { "${params.outdir}/pipeline_info" }, + path: { "${params.outdir}/Pipeline_Info" }, mode: params.publish_dir_mode, pattern: '*_versions.yml' ] From ba6158fe354a055b1384a93cd1056eb6326c8cdd Mon Sep 17 00:00:00 2001 From: tzx6 Date: Mon, 17 Jun 2024 21:59:30 -0400 Subject: [PATCH 11/18] feat: added samplesheet validation to the workflow --- workflows/aquascope.nf | 336 +++++++++++++++++++---------------------- 1 file changed, 156 insertions(+), 180 deletions(-) diff --git a/workflows/aquascope.nf b/workflows/aquascope.nf index cc04dee..f7d7796 100755 --- a/workflows/aquascope.nf +++ b/workflows/aquascope.nf @@ -13,14 +13,11 @@ WorkflowAquascope.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.fasta, params.gff, params.freyja_barcodes, params.freyja_lineages_meta] +def checkPathParamList = [params.input, params.fasta, params.gff, params.freyja_barcodes, params.freyja_lineages_meta] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check mandatory parameters - if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - /* ======================================================================================== CONFIG FILES @@ -39,6 +36,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // // SUBWORKFLOWS // +include { SAMPLESHEET_CHECK } from '../modules/local/samplesheet_check.nf' include { INPUT_CHECK } from '../subworkflows/local/input_check.nf' include { TRIMMING as IVAR_TRIMMING_SORTING } from '../subworkflows/local/trimming.nf' include { ONT_TRIMMING } from '../subworkflows/local/ont_trimming.nf' @@ -77,8 +75,10 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.mu ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) -workflow AQUASCOPE { +nextflow.enable.dsl=2 +workflow AQUASCOPE { + // Initialize channels with known values ch_genome = params.fasta ? Channel.value(file(params.fasta)) : Channel.empty() ch_gff = params.gff ? Channel.value(file(params.gff)) : Channel.empty() @@ -87,179 +87,160 @@ workflow AQUASCOPE { ch_versions = Channel.empty() ch_genome_fai = Channel.empty() - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - INPUT_CHECK () - ch_short_reads = INPUT_CHECK.out.raw_short_reads - ch_long_reads = INPUT_CHECK.out.raw_long_reads - ch_raw_bam = INPUT_CHECK.out.raw_bam - - // MODULE: Create Fasta Index file using samtools faidx - SAMTOOLS_FAIDX ( - ch_genome - ) - ch_genome_fai = SAMTOOLS_FAIDX.out.fai - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) - - // MODULE: FastQC on raw data for initial quality checking for short reads - FASTQC_RAW_SHORT ( - ch_short_reads - ) - ch_versions = ch_versions.mix(FASTQC_RAW_SHORT.out.versions.first()) - - NANOPLOT_RAW_LONG ( - ch_long_reads - ) - ch_versions = ch_versions.mix(NANOPLOT_RAW_LONG.out.versions.first()) - - // MODULE: Run FastP for short reads - ch_trimmed_reads_short = Channel.empty() - FASTP_SHORT ( - ch_short_reads, [], false, false - ) - ch_trimmed_reads_short = ch_trimmed_reads_short.mix(FASTP_SHORT.out.reads) - ch_versions = ch_versions.mix(FASTP_SHORT.out.versions.first()) - - // MODULE: Run FastP for Long reads - ch_trimmed_reads_long = Channel.empty() - FASTP_LONG ( - ch_long_reads, [], false, false - ) - ch_trimmed_reads_long = ch_trimmed_reads_long.mix(FASTP_LONG.out.reads) - ch_versions = ch_versions.mix(FASTP_LONG.out.versions.first()) - - // Quality checking for trimmed reads - FASTQC_SHORT_TRIMMED ( - ch_trimmed_reads_short - ) - - NANOPLOT_LONG_TRIMMED ( - ch_trimmed_reads_long - ) - - // MODULE: Align reads against reference genome - ch_short_align_bam = Channel.empty() - MINIMAP2_ALIGN_SHORT ( - ch_trimmed_reads_short, ch_genome, true, false, false - ) - ch_short_align_bam = MINIMAP2_ALIGN_SHORT.out.bam - ch_short_align_bai = MINIMAP2_ALIGN_SHORT.out.bai - ch_versions = ch_versions.mix(MINIMAP2_ALIGN_SHORT.out.versions.first()) - - ch_long_align_bam = Channel.empty() - MINIMAP2_ALIGN_LONG ( - ch_trimmed_reads_long, ch_genome, true, false, false - ) - ch_long_align_bam = MINIMAP2_ALIGN_LONG.out.bam - ch_long_align_bai = MINIMAP2_ALIGN_LONG.out.bai - - // Reheader and Sort the INPUT BAM from Ion-Torrent - ch_rehead_sorted_bam = Channel.empty() - ch_rehead_sorted_bai = Channel.empty() - REHEADER_BAM ( - ch_raw_bam, ch_gff - ) - ch_rehead_sorted_bam = REHEADER_BAM.out.reheadered_bam - ch_rehead_sorted_bai = REHEADER_BAM.out.reheadered_bai - ch_versions = ch_versions.mix(REHEADER_BAM.out.versions) - - // Combine channels for further processing - ch_combined_bam = ch_short_align_bam.mix(ch_long_align_bam, ch_rehead_sorted_bam) - - // MODULE : QUALIMAP for post-alignment BAM QC - QUALIMAP_BAMQC ( - ch_combined_bam, - ch_gff - ) - ch_qualimap_multiqc = QUALIMAP_BAMQC.out.results - ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) - - // MODULE: RUN IVAR_TRIM_SORT - Illumina only - ch_ivar_sort_bam = Channel.empty() - ch_ivar_sort_log = Channel.empty() - IVAR_TRIMMING_SORTING( - ch_short_align_bam - ) - ch_ivar_sort_bam = IVAR_TRIMMING_SORTING.out.bam - ch_ivar_sort_log = IVAR_TRIMMING_SORTING.out.log_out - ch_ivar_stats = IVAR_TRIMMING_SORTING.out.stats - ch_ivar_bam = IVAR_TRIMMING_SORTING.out.ivar_bam - ch_versions = ch_versions.mix(IVAR_TRIMMING_SORTING.out.versions) - - // MODULE: RUN SAMTOOLS_AMPLICON_CLIP_SORT - ONT reads only - ch_amplicon_sort_bam = Channel.empty() - ONT_TRIMMING( - ch_long_align_bam, params.save_cliprejects, params.save_clipstats - ) - ch_amplicon_sort_bam = ONT_TRIMMING.out.bam - ch_amplicon_sort_bai = ONT_TRIMMING.out.bai - ch_versions = ch_versions.mix(ONT_TRIMMING.out.versions) - - // Combine sorted BAM files - ch_sorted_bam = ch_ivar_sort_bam.mix(ch_rehead_sorted_bam) - ch_sorted_mixedbam = ch_sorted_bam.mix(ch_amplicon_sort_bam) - - // MODULE: Identify variants with iVar - ch_ivar_vcf = Channel.empty() - IVAR_VARIANTS( - ch_sorted_bam, - ch_genome, // Assuming the reference and this are the same - ch_genome_fai, - ch_gff, - params.save_mpileup // default is false, change it to true in nextflow.config file - ) - ch_ivar_vcf = IVAR_VARIANTS.out.tsv - ch_versions = ch_versions.mix(IVAR_VARIANTS.out.versions.first()) - - // MODULE: RUN FREYJA_VARIANT_CALLING - ch_freyja_variants = Channel.empty() - ch_freyja_depths = Channel.empty() - ch_freyja_demix = Channel.empty() - ch_freyja_lineages = Channel.empty() - ch_freyja_summarized = Channel.empty() - FREYJA_VARIANT_CALLING( - ch_sorted_mixedbam, - ch_genome, - params.freyja_db_name, - params.freyja_barcodes, - params.freyja_lineages_meta - ) - ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants - ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths - ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix - ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) - - // Workflow reporting - workflow_summary = WorkflowAquascope.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowAquascope.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW_SHORT.out.zip.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SHORT.out.json.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_LONG.out.json.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_SHORT_TRIMMED.out.zip.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_qualimap_multiqc.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_ivar_stats.collect{ it[1] }.ifEmpty([])) - - // Run MultiQC - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + // Validate the Sample File + SAMPLESHEET_CHECK(ch_input) + + // Proceed only if SAMPLESHEET_CHECK is successful + if (SAMPLESHEET_CHECK.out.success) { + println "The samplesheet has been validated, the pipeline will start soon, check samplesheet_validated.csv for more details" + INPUT_CHECK() + + ch_short_reads = INPUT_CHECK.out.raw_short_reads + ch_long_reads = INPUT_CHECK.out.raw_long_reads + ch_raw_bam = INPUT_CHECK.out.raw_bam + + // MODULE: Create Fasta Index file using samtools faidx + SAMTOOLS_FAIDX(ch_genome) + ch_genome_fai = SAMTOOLS_FAIDX.out.fai + ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) + + // MODULE: FastQC on raw data for initial quality checking for short reads + FASTQC_RAW_SHORT(ch_short_reads) + ch_versions = ch_versions.mix(FASTQC_RAW_SHORT.out.versions.first()) + + NANOPLOT_RAW_LONG(ch_long_reads) + ch_versions = ch_versions.mix(NANOPLOT_RAW_LONG.out.versions.first()) + + // MODULE: Run FastP for short reads + ch_trimmed_reads_short = Channel.empty() + FASTP_SHORT(ch_short_reads, [], false, false) + ch_trimmed_reads_short = ch_trimmed_reads_short.mix(FASTP_SHORT.out.reads) + ch_versions = ch_versions.mix(FASTP_SHORT.out.versions.first()) + + // MODULE: Run FastP for Long reads + ch_trimmed_reads_long = Channel.empty() + FASTP_LONG(ch_long_reads, [], false, false) + ch_trimmed_reads_long = ch_trimmed_reads_long.mix(FASTP_LONG.out.reads) + ch_versions = ch_versions.mix(FASTP_LONG.out.versions.first()) + + // Quality checking for trimmed reads + FASTQC_SHORT_TRIMMED(ch_trimmed_reads_short) + + NANOPLOT_LONG_TRIMMED(ch_trimmed_reads_long) + + // MODULE: Align reads against reference genome + ch_short_align_bam = Channel.empty() + MINIMAP2_ALIGN_SHORT(ch_trimmed_reads_short, ch_genome, true, false, false) + ch_short_align_bam = MINIMAP2_ALIGN_SHORT.out.bam + ch_short_align_bai = MINIMAP2_ALIGN_SHORT.out.bai + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_SHORT.out.versions.first()) + + ch_long_align_bam = Channel.empty() + MINIMAP2_ALIGN_LONG(ch_trimmed_reads_long, ch_genome, true, false, false) + ch_long_align_bam = MINIMAP2_ALIGN_LONG.out.bam + ch_long_align_bai = MINIMAP2_ALIGN_LONG.out.bai + + // Reheader and Sort the INPUT BAM from Ion-Torrent + ch_rehead_sorted_bam = Channel.empty() + ch_rehead_sorted_bai = Channel.empty() + REHEADER_BAM(ch_raw_bam, ch_gff) + ch_rehead_sorted_bam = REHEADER_BAM.out.reheadered_bam + ch_rehead_sorted_bai = REHEADER_BAM.out.reheadered_bai + ch_versions = ch_versions.mix(REHEADER_BAM.out.versions) + + // Combine channels for further processing + ch_combined_bam = ch_short_align_bam.mix(ch_long_align_bam, ch_rehead_sorted_bam) + + // MODULE : QUALIMAP for post-alignment BAM QC + QUALIMAP_BAMQC(ch_combined_bam, ch_gff) + ch_qualimap_multiqc = QUALIMAP_BAMQC.out.results + ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) + + // MODULE: RUN IVAR_TRIM_SORT - Illumina only + ch_ivar_sort_bam = Channel.empty() + ch_ivar_sort_log = Channel.empty() + IVAR_TRIMMING_SORTING(ch_short_align_bam) + ch_ivar_sort_bam = IVAR_TRIMMING_SORTING.out.bam + ch_ivar_sort_log = IVAR_TRIMMING_SORTING.out.log_out + ch_ivar_stats = IVAR_TRIMMING_SORTING.out.stats + ch_ivar_bam = IVAR_TRIMMING_SORTING.out.ivar_bam + ch_versions = ch_versions.mix(IVAR_TRIMMING_SORTING.out.versions) + + // MODULE: RUN SAMTOOLS_AMPLICON_CLIP_SORT - ONT reads only + ch_amplicon_sort_bam = Channel.empty() + ONT_TRIMMING(ch_long_align_bam, params.save_cliprejects, params.save_clipstats) + ch_amplicon_sort_bam = ONT_TRIMMING.out.bam + ch_amplicon_sort_bai = ONT_TRIMMING.out.bai + ch_versions = ch_versions.mix(ONT_TRIMMING.out.versions) + + // Combine sorted BAM files + ch_sorted_bam = ch_ivar_sort_bam.mix(ch_rehead_sorted_bam) + ch_sorted_mixedbam = ch_sorted_bam.mix(ch_amplicon_sort_bam) + + // MODULE: Identify variants with iVar + ch_ivar_vcf = Channel.empty() + IVAR_VARIANTS( + ch_sorted_bam, + ch_genome, // Assuming the reference and this are the same + ch_genome_fai, + ch_gff, + params.save_mpileup // default is false, change it to true in nextflow.config file + ) + ch_ivar_vcf = IVAR_VARIANTS.out.tsv + ch_versions = ch_versions.mix(IVAR_VARIANTS.out.versions.first()) + + // MODULE: RUN FREYJA_VARIANT_CALLING + ch_freyja_variants = Channel.empty() + ch_freyja_depths = Channel.empty() + ch_freyja_demix = Channel.empty() + ch_freyja_lineages = Channel.empty() + ch_freyja_summarized = Channel.empty() + FREYJA_VARIANT_CALLING( + ch_sorted_mixedbam, + ch_genome, + params.freyja_db_name, + params.freyja_barcodes, + params.freyja_lineages_meta + ) + ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants + ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths + ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix + ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) + + // Workflow reporting + workflow_summary = WorkflowAquascope.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowAquascope.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + ch_methods_description = Channel.value(methods_description) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW_SHORT.out.zip.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SHORT.out.json.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_LONG.out.json.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_SHORT_TRIMMED.out.zip.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_qualimap_multiqc.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_ivar_stats.collect{ it[1] }.ifEmpty([])) + + // Run MultiQC + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + multiqc_report = MULTIQC.out.report.toList() + } else { + println "The samplesheet validation failed. Please check the input samplesheet and try again." + exit 1 + } } -// /* + // ======================================================================================== // COMPLETION EMAIL AND SUMMARY // ======================================================================================== -// */ - workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) @@ -268,9 +249,4 @@ workflow.onComplete { if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } -} -/* -======================================================================================== - THE END -======================================================================================== -*/ +} \ No newline at end of file From 3b47ac8c491b30b93822f674759f97ce3f3f0c4f Mon Sep 17 00:00:00 2001 From: tzx6 Date: Mon, 17 Jun 2024 21:59:56 -0400 Subject: [PATCH 12/18] feat: added samplesheet validation and processing for freyja subworkflow --- bin/bam_to_samplesheet.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100755 bin/bam_to_samplesheet.py diff --git a/bin/bam_to_samplesheet.py b/bin/bam_to_samplesheet.py new file mode 100755 index 0000000..c942666 --- /dev/null +++ b/bin/bam_to_samplesheet.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import os +import argparse + +def parse_args(): + parser = argparse.ArgumentParser( + description="Generate a samplesheet from a directory of BAM files for Freyja subworkflow", + epilog="Usage: python bam_to_samplesheet.py --directory --output " + ) + parser.add_argument("--directory", help="Directory containing BAM files.", required=True) + parser.add_argument("--output", help="Output file for the samplesheet.", required=True) + return parser.parse_args() + +def extract_sample_name(bam_filename): + """ + Extracts the sample name from the BAM filename assuming the sample name + is the first component before the first ".". + """ + return bam_filename.split(".")[0] + +def generate_samplesheet(directory, output_file): + bam_files = [f for f in os.listdir(directory) if f.endswith(".bam")] + with open(output_file, "w") as fout: + fout.write("SNAME,BAMFILE\n") + for bam_file in bam_files: + sample_name = extract_sample_name(bam_file) + bam_path = os.path.abspath(os.path.join(directory, bam_file)) + fout.write(f"{sample_name},{bam_path}\n") + +def main(): + args = parse_args() + generate_samplesheet(args.directory, args.output) + +if __name__ == "__main__": + main() \ No newline at end of file From b06277f55836795ed14c7b7381a4044318007cbc Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:19:47 -0400 Subject: [PATCH 13/18] feat: added freyja standalone subworkflow --- conf/freyja_standalone.config | 52 +++++++++++++++++++++++++++ conf/modules.config | 9 ++++- nextflow.config | 9 +++-- nextflow_schema.json | 4 ++- workflows/freyja_standalone.nf | 66 ++++++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 4 deletions(-) create mode 100755 conf/freyja_standalone.config create mode 100755 workflows/freyja_standalone.nf diff --git a/conf/freyja_standalone.config b/conf/freyja_standalone.config new file mode 100755 index 0000000..a00dbf2 --- /dev/null +++ b/conf/freyja_standalone.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + withName: 'FREYJA_VARIANTS' { + ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" " + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/VarCalls" }, + mode: params.publish_dir_mode, + pattern: "*.{tsv,csv}" + ] + } + + withName: 'FREYJA_DEMIX' { + ext.args = '--covcut 10 --confirmedonly' + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/Demix" }, + mode: params.publish_dir_mode, + pattern: "*.{tsv,csv}" + ] + } + + withName: 'FREYJA_UPDATE' { + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/FREYJA_DB/" }, + mode: params.publish_dir_mode, + ] + } + + withName: 'MULTIQC' { + publishDir = [ + path: { "${params.outdir}/MULTIQC" }, + mode: params.publish_dir_mode + ] + } +} diff --git a/conf/modules.config b/conf/modules.config index 2b075bc..0a889e1 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -236,9 +236,16 @@ process { ] } + withName: 'MULTIQC' { + publishDir = [ + path: { "${params.outdir}/MULTIQC" }, + mode: params.publish_dir_mode, + ] + } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ - path: { "${params.outdir}/Pipeline_Info" }, + path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, pattern: '*_versions.yml' ] diff --git a/nextflow.config b/nextflow.config index c161dbc..7c3ca42 100755 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,8 @@ // Input options input = null + workflow = 'aquascope' + // References fasta = "${projectDir}/assets/references/SARS-CoV-2.reference.fasta" gff = "${projectDir}/assets/references/SARS-CoV-2.reference.gff" @@ -84,6 +86,9 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } +if (params.workflow == 'freyja_standalone') { + includeConfig 'conf/freyja_standalone.config' +} profiles { debug { process.beforeScript = 'echo $HOSTNAME' } @@ -167,11 +172,11 @@ dag { manifest { name = 'CDCGov-Aquascope' author = 'Arun Boddapati, Hunter Seabolt, CDC SciComp' - homePage = 'https://github.com/CDCgov/aquascope' + homePage = 'https://github.com/CDCgov/aquascope, https://cdcgov.github.io/aquascope/' description = 'Pipeline is for early detection of SC2 variants of concern via shotgun metagenomic sequencing of wastewater' mainScript = 'main.nf' nextflowVersion = '!>=23.04.02' - version = '1.0dev' + version = '2.1.0' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 51ed366..8044c9e 100755 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,7 +11,9 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "input" + "input", + "fasta", + "gff" ], "properties": { "input": { diff --git a/workflows/freyja_standalone.nf b/workflows/freyja_standalone.nf new file mode 100755 index 0000000..172ba42 --- /dev/null +++ b/workflows/freyja_standalone.nf @@ -0,0 +1,66 @@ +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) + +// Validate input parameters +WorkflowAquascope.initialise(params, log) + +// TODO nf-core: Add all file path parameters for the pipeline to the list below +// Check input path parameters to see if they exist + +def checkPathParamList = [params.input, params.fasta] + +for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } + + +if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified! please specify samplesheet containing BAM files only! Run bam_to_samplesheet.py to generate bamfile samplesheet' } + +include { INPUT_BAM_CHECK } from '../modules/local/input_check_bam.nf' +include { FREYJA_VARIANT_CALLING } from '../subworkflows/local/bam_variant_demix_boot_freyja/main' +include { MULTIQC } from '../modules/nf-core/modules/nf-core/multiqc/main' + + + +def multiqc_report = [] +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + +workflow FREYJA_STANDALONE { + + ch_genome = params.fasta ? Channel.value(file(params.fasta)) : Channel.empty() + + INPUT_BAM_CHECK () + + ch_sorted_bam = INPUT_BAM_CHECK.out.bam_files + +// MODULE: RUN FREYJA_VARIANT_CALLING + ch_freyja_variants = Channel.empty() + ch_freyja_depths = Channel.empty() + ch_freyja_demix = Channel.empty() + ch_freyja_lineages = Channel.empty() + ch_freyja_summarized = Channel.empty() + ch_versions = Channel.empty() + FREYJA_VARIANT_CALLING( + ch_sorted_bam, + ch_genome, + params.freyja_db_name, + params.freyja_barcodes, + params.freyja_lineages_meta + ) + ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants + ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths + ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix + ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) + // Run MultiQC + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + multiqc_report = MULTIQC.out.report.toList() +} \ No newline at end of file From 089c48225c62115292bb0bc4b56e8bc11df9ecf5 Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:21:06 -0400 Subject: [PATCH 14/18] feat: added workflow as entry point --- workflows/aquascope.nf | 336 +++++++++++++++++++---------------------- 1 file changed, 155 insertions(+), 181 deletions(-) diff --git a/workflows/aquascope.nf b/workflows/aquascope.nf index cc04dee..4824ded 100755 --- a/workflows/aquascope.nf +++ b/workflows/aquascope.nf @@ -3,7 +3,6 @@ VALIDATE INPUTS ======================================================================================== */ - def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) @@ -13,14 +12,11 @@ WorkflowAquascope.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.fasta, params.gff, params.freyja_barcodes, params.freyja_lineages_meta] +def checkPathParamList = [params.input, params.fasta, params.gff, params.freyja_barcodes, params.freyja_lineages_meta] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check mandatory parameters - if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - /* ======================================================================================== CONFIG FILES @@ -39,6 +35,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // // SUBWORKFLOWS // +include { SAMPLESHEET_CHECK } from '../modules/local/samplesheet_check.nf' include { INPUT_CHECK } from '../subworkflows/local/input_check.nf' include { TRIMMING as IVAR_TRIMMING_SORTING } from '../subworkflows/local/trimming.nf' include { ONT_TRIMMING } from '../subworkflows/local/ont_trimming.nf' @@ -77,8 +74,9 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.mu ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) -workflow AQUASCOPE { +workflow AQUASCOPE { + // Initialize channels with known values ch_genome = params.fasta ? Channel.value(file(params.fasta)) : Channel.empty() ch_gff = params.gff ? Channel.value(file(params.gff)) : Channel.empty() @@ -87,179 +85,160 @@ workflow AQUASCOPE { ch_versions = Channel.empty() ch_genome_fai = Channel.empty() - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - INPUT_CHECK () - ch_short_reads = INPUT_CHECK.out.raw_short_reads - ch_long_reads = INPUT_CHECK.out.raw_long_reads - ch_raw_bam = INPUT_CHECK.out.raw_bam - - // MODULE: Create Fasta Index file using samtools faidx - SAMTOOLS_FAIDX ( - ch_genome - ) - ch_genome_fai = SAMTOOLS_FAIDX.out.fai - ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) - - // MODULE: FastQC on raw data for initial quality checking for short reads - FASTQC_RAW_SHORT ( - ch_short_reads - ) - ch_versions = ch_versions.mix(FASTQC_RAW_SHORT.out.versions.first()) - - NANOPLOT_RAW_LONG ( - ch_long_reads - ) - ch_versions = ch_versions.mix(NANOPLOT_RAW_LONG.out.versions.first()) - - // MODULE: Run FastP for short reads - ch_trimmed_reads_short = Channel.empty() - FASTP_SHORT ( - ch_short_reads, [], false, false - ) - ch_trimmed_reads_short = ch_trimmed_reads_short.mix(FASTP_SHORT.out.reads) - ch_versions = ch_versions.mix(FASTP_SHORT.out.versions.first()) - - // MODULE: Run FastP for Long reads - ch_trimmed_reads_long = Channel.empty() - FASTP_LONG ( - ch_long_reads, [], false, false - ) - ch_trimmed_reads_long = ch_trimmed_reads_long.mix(FASTP_LONG.out.reads) - ch_versions = ch_versions.mix(FASTP_LONG.out.versions.first()) - - // Quality checking for trimmed reads - FASTQC_SHORT_TRIMMED ( - ch_trimmed_reads_short - ) - - NANOPLOT_LONG_TRIMMED ( - ch_trimmed_reads_long - ) - - // MODULE: Align reads against reference genome - ch_short_align_bam = Channel.empty() - MINIMAP2_ALIGN_SHORT ( - ch_trimmed_reads_short, ch_genome, true, false, false - ) - ch_short_align_bam = MINIMAP2_ALIGN_SHORT.out.bam - ch_short_align_bai = MINIMAP2_ALIGN_SHORT.out.bai - ch_versions = ch_versions.mix(MINIMAP2_ALIGN_SHORT.out.versions.first()) - - ch_long_align_bam = Channel.empty() - MINIMAP2_ALIGN_LONG ( - ch_trimmed_reads_long, ch_genome, true, false, false - ) - ch_long_align_bam = MINIMAP2_ALIGN_LONG.out.bam - ch_long_align_bai = MINIMAP2_ALIGN_LONG.out.bai - - // Reheader and Sort the INPUT BAM from Ion-Torrent - ch_rehead_sorted_bam = Channel.empty() - ch_rehead_sorted_bai = Channel.empty() - REHEADER_BAM ( - ch_raw_bam, ch_gff - ) - ch_rehead_sorted_bam = REHEADER_BAM.out.reheadered_bam - ch_rehead_sorted_bai = REHEADER_BAM.out.reheadered_bai - ch_versions = ch_versions.mix(REHEADER_BAM.out.versions) - - // Combine channels for further processing - ch_combined_bam = ch_short_align_bam.mix(ch_long_align_bam, ch_rehead_sorted_bam) - - // MODULE : QUALIMAP for post-alignment BAM QC - QUALIMAP_BAMQC ( - ch_combined_bam, - ch_gff - ) - ch_qualimap_multiqc = QUALIMAP_BAMQC.out.results - ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) - - // MODULE: RUN IVAR_TRIM_SORT - Illumina only - ch_ivar_sort_bam = Channel.empty() - ch_ivar_sort_log = Channel.empty() - IVAR_TRIMMING_SORTING( - ch_short_align_bam - ) - ch_ivar_sort_bam = IVAR_TRIMMING_SORTING.out.bam - ch_ivar_sort_log = IVAR_TRIMMING_SORTING.out.log_out - ch_ivar_stats = IVAR_TRIMMING_SORTING.out.stats - ch_ivar_bam = IVAR_TRIMMING_SORTING.out.ivar_bam - ch_versions = ch_versions.mix(IVAR_TRIMMING_SORTING.out.versions) - - // MODULE: RUN SAMTOOLS_AMPLICON_CLIP_SORT - ONT reads only - ch_amplicon_sort_bam = Channel.empty() - ONT_TRIMMING( - ch_long_align_bam, params.save_cliprejects, params.save_clipstats - ) - ch_amplicon_sort_bam = ONT_TRIMMING.out.bam - ch_amplicon_sort_bai = ONT_TRIMMING.out.bai - ch_versions = ch_versions.mix(ONT_TRIMMING.out.versions) - - // Combine sorted BAM files - ch_sorted_bam = ch_ivar_sort_bam.mix(ch_rehead_sorted_bam) - ch_sorted_mixedbam = ch_sorted_bam.mix(ch_amplicon_sort_bam) - - // MODULE: Identify variants with iVar - ch_ivar_vcf = Channel.empty() - IVAR_VARIANTS( - ch_sorted_bam, - ch_genome, // Assuming the reference and this are the same - ch_genome_fai, - ch_gff, - params.save_mpileup // default is false, change it to true in nextflow.config file - ) - ch_ivar_vcf = IVAR_VARIANTS.out.tsv - ch_versions = ch_versions.mix(IVAR_VARIANTS.out.versions.first()) - - // MODULE: RUN FREYJA_VARIANT_CALLING - ch_freyja_variants = Channel.empty() - ch_freyja_depths = Channel.empty() - ch_freyja_demix = Channel.empty() - ch_freyja_lineages = Channel.empty() - ch_freyja_summarized = Channel.empty() - FREYJA_VARIANT_CALLING( - ch_sorted_mixedbam, - ch_genome, - params.freyja_db_name, - params.freyja_barcodes, - params.freyja_lineages_meta - ) - ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants - ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths - ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix - ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) - - // Workflow reporting - workflow_summary = WorkflowAquascope.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowAquascope.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW_SHORT.out.zip.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SHORT.out.json.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_LONG.out.json.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_SHORT_TRIMMED.out.zip.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_qualimap_multiqc.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_ivar_stats.collect{ it[1] }.ifEmpty([])) - - // Run MultiQC - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + // Validate the Sample File + SAMPLESHEET_CHECK(ch_input) + + // Proceed only if SAMPLESHEET_CHECK is successful + if (SAMPLESHEET_CHECK.out.success) { + println "The samplesheet has been validated, the pipeline will start soon, check samplesheet_validated.csv for more details" + INPUT_CHECK() + + ch_short_reads = INPUT_CHECK.out.raw_short_reads + ch_long_reads = INPUT_CHECK.out.raw_long_reads + ch_raw_bam = INPUT_CHECK.out.raw_bam + + // MODULE: Create Fasta Index file using samtools faidx + SAMTOOLS_FAIDX(ch_genome) + ch_genome_fai = SAMTOOLS_FAIDX.out.fai + ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) + + // MODULE: FastQC on raw data for initial quality checking for short reads + FASTQC_RAW_SHORT(ch_short_reads) + ch_versions = ch_versions.mix(FASTQC_RAW_SHORT.out.versions.first()) + + NANOPLOT_RAW_LONG(ch_long_reads) + ch_versions = ch_versions.mix(NANOPLOT_RAW_LONG.out.versions.first()) + + // MODULE: Run FastP for short reads + ch_trimmed_reads_short = Channel.empty() + FASTP_SHORT(ch_short_reads, [], false, false) + ch_trimmed_reads_short = ch_trimmed_reads_short.mix(FASTP_SHORT.out.reads) + ch_versions = ch_versions.mix(FASTP_SHORT.out.versions.first()) + + // MODULE: Run FastP for Long reads + ch_trimmed_reads_long = Channel.empty() + FASTP_LONG(ch_long_reads, [], false, false) + ch_trimmed_reads_long = ch_trimmed_reads_long.mix(FASTP_LONG.out.reads) + ch_versions = ch_versions.mix(FASTP_LONG.out.versions.first()) + + // Quality checking for trimmed reads + FASTQC_SHORT_TRIMMED(ch_trimmed_reads_short) + + NANOPLOT_LONG_TRIMMED(ch_trimmed_reads_long) + + // MODULE: Align reads against reference genome + ch_short_align_bam = Channel.empty() + MINIMAP2_ALIGN_SHORT(ch_trimmed_reads_short, ch_genome, true, false, false) + ch_short_align_bam = MINIMAP2_ALIGN_SHORT.out.bam + ch_short_align_bai = MINIMAP2_ALIGN_SHORT.out.bai + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_SHORT.out.versions.first()) + + ch_long_align_bam = Channel.empty() + MINIMAP2_ALIGN_LONG(ch_trimmed_reads_long, ch_genome, true, false, false) + ch_long_align_bam = MINIMAP2_ALIGN_LONG.out.bam + ch_long_align_bai = MINIMAP2_ALIGN_LONG.out.bai + + // Reheader and Sort the INPUT BAM from Ion-Torrent + ch_rehead_sorted_bam = Channel.empty() + ch_rehead_sorted_bai = Channel.empty() + REHEADER_BAM(ch_raw_bam, ch_gff) + ch_rehead_sorted_bam = REHEADER_BAM.out.reheadered_bam + ch_rehead_sorted_bai = REHEADER_BAM.out.reheadered_bai + ch_versions = ch_versions.mix(REHEADER_BAM.out.versions) + + // Combine channels for further processing + ch_combined_bam = ch_short_align_bam.mix(ch_long_align_bam, ch_rehead_sorted_bam) + + // MODULE : QUALIMAP for post-alignment BAM QC + QUALIMAP_BAMQC(ch_combined_bam, ch_gff) + ch_qualimap_multiqc = QUALIMAP_BAMQC.out.results + ch_versions = ch_versions.mix(QUALIMAP_BAMQC.out.versions.first()) + + // MODULE: RUN IVAR_TRIM_SORT - Illumina only + ch_ivar_sort_bam = Channel.empty() + ch_ivar_sort_log = Channel.empty() + IVAR_TRIMMING_SORTING(ch_short_align_bam) + ch_ivar_sort_bam = IVAR_TRIMMING_SORTING.out.bam + ch_ivar_sort_log = IVAR_TRIMMING_SORTING.out.log_out + ch_ivar_stats = IVAR_TRIMMING_SORTING.out.stats + ch_ivar_bam = IVAR_TRIMMING_SORTING.out.ivar_bam + ch_versions = ch_versions.mix(IVAR_TRIMMING_SORTING.out.versions) + + // MODULE: RUN SAMTOOLS_AMPLICON_CLIP_SORT - ONT reads only + ch_amplicon_sort_bam = Channel.empty() + ONT_TRIMMING(ch_long_align_bam, params.save_cliprejects, params.save_clipstats) + ch_amplicon_sort_bam = ONT_TRIMMING.out.bam + ch_amplicon_sort_bai = ONT_TRIMMING.out.bai + ch_versions = ch_versions.mix(ONT_TRIMMING.out.versions) + + // Combine sorted BAM files + ch_sorted_bam = ch_ivar_sort_bam.mix(ch_rehead_sorted_bam) + ch_sorted_mixedbam = ch_sorted_bam.mix(ch_amplicon_sort_bam) + + // MODULE: Identify variants with iVar + ch_ivar_vcf = Channel.empty() + IVAR_VARIANTS( + ch_sorted_bam, + ch_genome, // Assuming the reference and this are the same + ch_genome_fai, + ch_gff, + params.save_mpileup // default is false, change it to true in nextflow.config file + ) + ch_ivar_vcf = IVAR_VARIANTS.out.tsv + ch_versions = ch_versions.mix(IVAR_VARIANTS.out.versions.first()) + + // MODULE: RUN FREYJA_VARIANT_CALLING + ch_freyja_variants = Channel.empty() + ch_freyja_depths = Channel.empty() + ch_freyja_demix = Channel.empty() + ch_freyja_lineages = Channel.empty() + ch_freyja_summarized = Channel.empty() + FREYJA_VARIANT_CALLING( + ch_sorted_mixedbam, + ch_genome, + params.freyja_db_name, + params.freyja_barcodes, + params.freyja_lineages_meta + ) + ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants + ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths + ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix + ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) + + // Workflow reporting + workflow_summary = WorkflowAquascope.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowAquascope.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + ch_methods_description = Channel.value(methods_description) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW_SHORT.out.zip.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SHORT.out.json.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_LONG.out.json.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_SHORT_TRIMMED.out.zip.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_qualimap_multiqc.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_ivar_stats.collect{ it[1] }.ifEmpty([])) + + // Run MultiQC + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + multiqc_report = MULTIQC.out.report.toList() + } else { + println "The samplesheet validation failed. Please check the input samplesheet and try again." + exit 1 + } } -// /* + // ======================================================================================== // COMPLETION EMAIL AND SUMMARY // ======================================================================================== -// */ - workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) @@ -268,9 +247,4 @@ workflow.onComplete { if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } -} -/* -======================================================================================== - THE END -======================================================================================== -*/ +} \ No newline at end of file From 3185529d9e52d1a277da9b27ab84665a34cff685 Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:22:00 -0400 Subject: [PATCH 15/18] feat: added subworkflows with input_check_bam --- bin/bam_to_samplesheet.py | 36 ++++++++ main.nf | 28 ++++-- modules/local/input_check_bam.nf | 48 ++++++++++ subworkflows/local/freyja_subworkflow/main.nf | 83 +++++++++++++++++ .../local/freyja_subworkflow/meta.yml | 88 +++++++++++++++++++ 5 files changed, 275 insertions(+), 8 deletions(-) create mode 100755 bin/bam_to_samplesheet.py create mode 100755 modules/local/input_check_bam.nf create mode 100755 subworkflows/local/freyja_subworkflow/main.nf create mode 100755 subworkflows/local/freyja_subworkflow/meta.yml diff --git a/bin/bam_to_samplesheet.py b/bin/bam_to_samplesheet.py new file mode 100755 index 0000000..04db7e3 --- /dev/null +++ b/bin/bam_to_samplesheet.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import os +import argparse + +def parse_args(): + parser = argparse.ArgumentParser( + description="Generate a samplesheet from a directory of BAM files for Freyja subworkflow", + epilog="Usage: python bam_to_samplesheet.py --directory --output " + ) + parser.add_argument("--directory", help="Directory containing BAM files.", required=True) + parser.add_argument("--output", help="Output file for the samplesheet.", required=True) + return parser.parse_args() + +def extract_sample_name(bam_filename): + """ + Extracts the sample name from the BAM filename assuming the sample name + is the first component before the first ".". + """ + return bam_filename.split(".")[0] + +def generate_samplesheet(directory, output_file): + bam_files = [f for f in os.listdir(directory) if f.endswith(".bam")] + with open(output_file, "w") as fout: + fout.write("SNAME,BAMFILE\n") + for bam_file in bam_files: + sample_name = extract_sample_name(bam_file) + bam_path = os.path.abspath(os.path.join(directory, bam_file)) + fout.write(f"{sample_name},{bam_path}\n") + +def main(): + args = parse_args() + generate_samplesheet(args.directory, args.output) + +if __name__ == "__main__": + main() diff --git a/main.nf b/main.nf index dc64934..019f81a 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,7 @@ nextflow.enable.dsl = 2 ======================================================================================== */ +// Initialises the workflow and validates parameters WorkflowMain.initialise(workflow, params, log) /* @@ -21,27 +22,38 @@ WorkflowMain.initialise(workflow, params, log) ======================================================================================== */ -include { AQUASCOPE } from './workflows/aquascope' +// Include the workflows from their respective files +include { AQUASCOPE } from './workflows/aquascope' +include { FREYJA_STANDALONE } from './workflows/freyja_standalone' // // WORKFLOW: Run main nf-core/aquascope analysis pipeline // workflow NFCORE_AQUASCOPE { - AQUASCOPE () + AQUASCOPE() +} + +// +// WORKFLOW: Run Freyja standalone analysis +// +workflow NFCORE_FREYJA_STANDALONE { + FREYJA_STANDALONE() } /* ======================================================================================== - RUN ALL WORKFLOWS + RUN THE SELECTED WORKFLOW ======================================================================================== */ -// -// WORKFLOW: Execute a single named workflow for the pipeline -// See: https://github.com/nf-core/rnaseq/issues/619 -// workflow { - NFCORE_AQUASCOPE () + if (params.workflow == 'aquascope') { + NFCORE_AQUASCOPE() + } else if (params.workflow == 'freyja_standalone') { + NFCORE_FREYJA_STANDALONE() + } else { + error "Unknown workflow specified: ${params.workflow}. Valid options are 'aquascope' or 'freyja_standalone'." + } } /* diff --git a/modules/local/input_check_bam.nf b/modules/local/input_check_bam.nf new file mode 100755 index 0000000..fb905f3 --- /dev/null +++ b/modules/local/input_check_bam.nf @@ -0,0 +1,48 @@ +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + +workflow INPUT_BAM_CHECK { + main: + if (hasExtension(params.input, "csv")) { + ch_input_rows = Channel + .from(file(params.input)) + .splitCsv(header: true) + .map { row -> + if (row.size() == 2) { + def id = row.SNAME + def bam_file = row.BAMFILE ? file(row.BAMFILE, checkIfExists: true) : false + + if (!bam_file) { + exit 1, "Invalid input samplesheet: BAM file must be specified." + } + + if (!hasExtension(bam_file, ".bam")) { + exit 1, "Invalid input samplesheet: The bam_file column must end with .bam." + } + + return [id, bam_file] + } else { + exit 1, "Input samplesheet contains row with ${row.size()} column(s). Expects 2." + } + } + + ch_bam_files = ch_input_rows + .map { id, bam_file -> + def meta = [:] + meta.id = id + return [meta, bam_file] + } + } else { + exit 1, "Input file must be a CSV." + } + + // Ensure sample IDs are unique + ch_input_rows + .map { id, bam_file -> id } + .toList() + .map { ids -> if (ids.size() != ids.unique().size()) { exit 1, "ERROR: input samplesheet contains duplicated sample IDs!" } } + + emit: + bam_files = ch_bam_files +} diff --git a/subworkflows/local/freyja_subworkflow/main.nf b/subworkflows/local/freyja_subworkflow/main.nf new file mode 100755 index 0000000..1a4b51f --- /dev/null +++ b/subworkflows/local/freyja_subworkflow/main.nf @@ -0,0 +1,83 @@ + +include { FREYJA_VARIANTS } from '../../../modules/nf-core/modules/nf-core/freyja/variants/main' +include { FREYJA_UPDATE } from '../../../modules/nf-core/modules/nf-core/freyja/update/main' +include { FREYJA_DEMIX } from '../../../modules/nf-core/modules/nf-core/freyja/demix/main' +//include { FREYJA_BOOT } from '../../../modules/nf-core/modules/nf-core/freyja/boot/main' + +workflow FREYJA_VARIANT_CALLING { + + take: + bam // channel: [ val(meta), path(bam) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + //ch_val_repeats // value repeats + val_db_name // string db_name + ch_barcodes // channel: [ val(meta), path(barcodes)] + ch_lineages_meta // channel: [ val(meta), path(lineages_meta)] + + main: + + ch_versions = Channel.empty() + + // + // Variant calling + // + FREYJA_VARIANTS ( + ch_bam, + ch_fasta + ) + ch_freyja_variants = FREYJA_VARIANTS.out.variants + ch_freyja_depths = FREYJA_VARIANTS.out.depths + + ch_versions = ch_versions.mix(FREYJA_VARIANTS.out.versions.first()) + + // + // Update the database if none are given. + // + + if (!ch_barcodes || !ch_lineages_meta) { + FREYJA_UPDATE ( + val_db_name ) + + ch_barcodes = FREYJA_UPDATE.out.barcodes + ch_lineages_meta = FREYJA_UPDATE.out.lineages_meta + + ch_versions = ch_versions.mix(FREYJA_UPDATE.out.versions.first()) + } + + // + // demix and define minimum variant abundances + // + FREYJA_DEMIX ( + ch_freyja_variants, + ch_freyja_depths, + ch_barcodes, + ch_lineages_meta + ) + ch_freyja_demix = FREYJA_DEMIX.out.demix + ch_versions = ch_versions.mix(FREYJA_DEMIX.out.versions.first()) + + + // + // Perform bootstrapping to get more accurate estimates of abundancies + // + /*FREYJA_BOOT ( + ch_freyja_variants, + ch_freyja_depths, + ch_val_repeats, + ch_barcodes, + ch_lineages_meta + ) + ch_versions = ch_versions.mix(FREYJA_BOOT.out.versions.first()) +*/ + emit: + variants = FREYJA_VARIANTS.out.variants // channel: [ val(meta), path(variants_tsv) ] + depths = FREYJA_VARIANTS.out.depths // channel: [ val(meta), path(depths_tsv) ] + demix = FREYJA_DEMIX.out.demix // channel: [ val(meta), path(demix_tsv) ] + //lineages = FREYJA_BOOT.out.lineages // channel: [ val(meta), path(lineages_csv) ] + //summarized = FREYJA_BOOT.out.summarized // channel: [ val(meta), path(summarized_csv) ]] + barcodes = ch_barcodes // channel: [ val(meta), path(barcodes) ] + lineages_meta = ch_lineages_meta // channel: [ val(meta), path(lineages_meta) ] + versions = ch_versions // channel: [ path(versions.yml) ] + +} + diff --git a/subworkflows/local/freyja_subworkflow/meta.yml b/subworkflows/local/freyja_subworkflow/meta.yml new file mode 100755 index 0000000..d7a97a4 --- /dev/null +++ b/subworkflows/local/freyja_subworkflow/meta.yml @@ -0,0 +1,88 @@ +name: "bam_variant_demix_boot_freyja" +description: Recover relative lineage abundances from mixed SARS-CoV-2 samples from a sequencing dataset (BAM aligned to the Hu-1 reference) +keywords: + - bam + - variants + - cram + +modules: + - freyja/variants + - freyja/demix + - freyja/update + - freyja/boot + +input: + - ch_bam: + type: file + description: | + Structure: [ val(meta), path(bam) ] + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] and sorted BAM file + - ch_fasta: + type: file + description: | + Structure: [ val(meta), path(fasta) ] + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] and the fasta reference used for the sorted BAM file + - val_repeats: + type: value (int) + description: Number of bootstrap repeats to perform + - val_db_name: + type: value (string) + description: Name of the dir where UShER's files will be stored + - ch_barcodes: + type: file + description: | + Structure: path(barcodes) + File containing lineage defining barcodes + - ch_lineages_meta: + type: file + description: | + Structure: path(lineages_meta) + File containing lineage metadata that correspond to barcodes + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - variants: + type: file + description: | + Structure: [ val(meta), path(variants) ] + File containing identified variants in a gff-like format + - depths: + type: file + description: | + Structure: [ val(meta), path(variants) ] + File containing depth of the variants + - demix: + type: file + description: | + Structure: [ val(meta), path(demix) ] + a tsv file that includes the lineages present, their corresponding abundances, and summarization by constellation + - lineages: + type: file + description: | + Structure: [ val(meta), path(lineages) ] + a csv file that includes the lineages present and their corresponding abundances + - summarized: + type: file + description: | + Structure: [ val(meta), path(lineages) ] + a csv file that includes the lineages present but summarized by constellation and their corresponding abundances + - barcodes: + type: file + description: | + Structure: [ val(meta), path(barcodes) ] + a csv file that includes the lineages present but summarized by constellation and their corresponding abundances + - lineages_meta: + type: file + description: | + Structure: [ val(meta), path(lineages_meta) ] + a csv file that includes the lineages present but summarized by constellation and their corresponding abundances + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Joon-Klaps" From ddc36f10461337dd8b9d9dc6b8a74c77921833de Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:22:16 -0400 Subject: [PATCH 16/18] no change --- subworkflows/local/ont_trimming.nf | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 subworkflows/local/ont_trimming.nf diff --git a/subworkflows/local/ont_trimming.nf b/subworkflows/local/ont_trimming.nf old mode 100644 new mode 100755 From 3eb9d06c329c3ba9fc87a4edc10fee24bab344ed Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:19:47 -0400 Subject: [PATCH 17/18] feat: added freyja standalone subworkflow --- conf/freyja_standalone.config | 52 +++++++++++++++++++++++++++ conf/modules.config | 9 ++++- nextflow.config | 9 +++-- nextflow_schema.json | 4 ++- workflows/freyja_standalone.nf | 66 ++++++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 4 deletions(-) create mode 100755 conf/freyja_standalone.config create mode 100755 workflows/freyja_standalone.nf diff --git a/conf/freyja_standalone.config b/conf/freyja_standalone.config new file mode 100755 index 0000000..a00dbf2 --- /dev/null +++ b/conf/freyja_standalone.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + withName: 'FREYJA_VARIANTS' { + ext.args = "--minq 20 --annot \"${params.gff3}\" --varthresh \"${params.varthresh}\" " + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/VarCalls" }, + mode: params.publish_dir_mode, + pattern: "*.{tsv,csv}" + ] + } + + withName: 'FREYJA_DEMIX' { + ext.args = '--covcut 10 --confirmedonly' + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/Demix" }, + mode: params.publish_dir_mode, + pattern: "*.{tsv,csv}" + ] + } + + withName: 'FREYJA_UPDATE' { + publishDir = [ + path: { "${params.outdir}/FREYJA_STANDALONE/FREYJA_DB/" }, + mode: params.publish_dir_mode, + ] + } + + withName: 'MULTIQC' { + publishDir = [ + path: { "${params.outdir}/MULTIQC" }, + mode: params.publish_dir_mode + ] + } +} diff --git a/conf/modules.config b/conf/modules.config index 2b075bc..0a889e1 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -236,9 +236,16 @@ process { ] } + withName: 'MULTIQC' { + publishDir = [ + path: { "${params.outdir}/MULTIQC" }, + mode: params.publish_dir_mode, + ] + } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ - path: { "${params.outdir}/Pipeline_Info" }, + path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, pattern: '*_versions.yml' ] diff --git a/nextflow.config b/nextflow.config index c161dbc..7c3ca42 100755 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,8 @@ // Input options input = null + workflow = 'aquascope' + // References fasta = "${projectDir}/assets/references/SARS-CoV-2.reference.fasta" gff = "${projectDir}/assets/references/SARS-CoV-2.reference.gff" @@ -84,6 +86,9 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } +if (params.workflow == 'freyja_standalone') { + includeConfig 'conf/freyja_standalone.config' +} profiles { debug { process.beforeScript = 'echo $HOSTNAME' } @@ -167,11 +172,11 @@ dag { manifest { name = 'CDCGov-Aquascope' author = 'Arun Boddapati, Hunter Seabolt, CDC SciComp' - homePage = 'https://github.com/CDCgov/aquascope' + homePage = 'https://github.com/CDCgov/aquascope, https://cdcgov.github.io/aquascope/' description = 'Pipeline is for early detection of SC2 variants of concern via shotgun metagenomic sequencing of wastewater' mainScript = 'main.nf' nextflowVersion = '!>=23.04.02' - version = '1.0dev' + version = '2.1.0' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 51ed366..8044c9e 100755 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,7 +11,9 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "input" + "input", + "fasta", + "gff" ], "properties": { "input": { diff --git a/workflows/freyja_standalone.nf b/workflows/freyja_standalone.nf new file mode 100755 index 0000000..172ba42 --- /dev/null +++ b/workflows/freyja_standalone.nf @@ -0,0 +1,66 @@ +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) + +// Validate input parameters +WorkflowAquascope.initialise(params, log) + +// TODO nf-core: Add all file path parameters for the pipeline to the list below +// Check input path parameters to see if they exist + +def checkPathParamList = [params.input, params.fasta] + +for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } + + +if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified! please specify samplesheet containing BAM files only! Run bam_to_samplesheet.py to generate bamfile samplesheet' } + +include { INPUT_BAM_CHECK } from '../modules/local/input_check_bam.nf' +include { FREYJA_VARIANT_CALLING } from '../subworkflows/local/bam_variant_demix_boot_freyja/main' +include { MULTIQC } from '../modules/nf-core/modules/nf-core/multiqc/main' + + + +def multiqc_report = [] +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + +workflow FREYJA_STANDALONE { + + ch_genome = params.fasta ? Channel.value(file(params.fasta)) : Channel.empty() + + INPUT_BAM_CHECK () + + ch_sorted_bam = INPUT_BAM_CHECK.out.bam_files + +// MODULE: RUN FREYJA_VARIANT_CALLING + ch_freyja_variants = Channel.empty() + ch_freyja_depths = Channel.empty() + ch_freyja_demix = Channel.empty() + ch_freyja_lineages = Channel.empty() + ch_freyja_summarized = Channel.empty() + ch_versions = Channel.empty() + FREYJA_VARIANT_CALLING( + ch_sorted_bam, + ch_genome, + params.freyja_db_name, + params.freyja_barcodes, + params.freyja_lineages_meta + ) + ch_freyja_variants = FREYJA_VARIANT_CALLING.out.variants + ch_freyja_depths = FREYJA_VARIANT_CALLING.out.depths + ch_freyja_demix = FREYJA_VARIANT_CALLING.out.demix + ch_versions = ch_versions.mix(FREYJA_VARIANT_CALLING.out.versions) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_freyja_demix.collect{ it[1] }.ifEmpty([])) + // Run MultiQC + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + multiqc_report = MULTIQC.out.report.toList() +} \ No newline at end of file From 289ddc226c22165e4c2fc7efd917faa99210eceb Mon Sep 17 00:00:00 2001 From: tzx6 Date: Tue, 18 Jun 2024 00:26:23 -0400 Subject: [PATCH 18/18] rebase rebase aquascope wf --- workflows/aquascope.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/aquascope.nf b/workflows/aquascope.nf index f7d7796..6d945a5 100755 --- a/workflows/aquascope.nf +++ b/workflows/aquascope.nf @@ -3,7 +3,6 @@ VALIDATE INPUTS ======================================================================================== */ - def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) @@ -75,7 +74,10 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.mu ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +<<<<<<< HEAD nextflow.enable.dsl=2 +======= +>>>>>>> feat: added workflow as entry point workflow AQUASCOPE {