Merge pull request #441 from atrigila/nf-core_bowtie_align

Migrate to nf-core `bowtie align` in contaminant filter
nf-core · Sep 26, 2024 · 586ef69 · 586ef69
2 parents d86d425 + 1438a91
commit 586ef69
Show file tree

Hide file tree

Showing 21 changed files with 1,448 additions and 112 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [[#435]](https://github.com/nf-core/smrnaseq/pull/435) - Replace local instances of bowtie for nf-core [`bowtie2`](https://github.com/nf-core/smrnaseq/issues/434) and [`bowtie1`](https://github.com/nf-core/smrnaseq/issues/433) - Additionally adds a `bioawk` module that cleans fasta files.
 - [[#438]](https://github.com/nf-core/smrnaseq/pull/438) - Update [Mirtop to latest version](https://github.com/nf-core/smrnaseq/issues/437) - Process samples separately and join results with `CSVTK_JOIN`.
 - [[#439]](https://github.com/nf-core/smrnaseq/pull/439) - Fix [paired end samples processing](https://github.com/nf-core/smrnaseq/issues/415) - Fix paired end sample handling and add test profile.
+- [[#441]](https://github.com/nf-core/smrnaseq/pull/441) - Migrate [local contaminant bowtie to nf-core](https://github.com/nf-core/smrnaseq/issues/436) - Replace local processes with `BOWTIE2_ALIGN`.
 
 ## v2.3.1 - 2024-04-18 - Gray Zinc Dalmation Patch
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -242,6 +242,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/contaminant_filter/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
             mode: params.publish_dir_mode,
+            enabled: false,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
@@ -264,6 +265,62 @@ process {
         publishDir = [ enabled: false ]
     }
 
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:BOWTIE2_ALIGN.*' { // the trailing .* makes this a pattern: it covers every process in CONTAMINANT_FILTER whose name starts with BOWTIE2_ALIGN
+        ext.args = '--very-sensitive-local -k 1' // bowtie2 options: most sensitive local-alignment preset; -k 1 reports at most one alignment per read
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: the prefix is built from each task's own meta map as <contaminant>_<id>
+        publishDir = [ enabled: false ] // outputs of these alignment steps are never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_RRNA' { // options for the step that emits the "rRNA" count entry
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "rRNA" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "rRNA": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_TRNA' { // options for the step that emits the "tRNA" count entry
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "tRNA" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "tRNA": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_CDNA' { // options for the step that emits the "cDNA" count entry
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "cDNA" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "cDNA": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_NCRNA' { // options for the step that emits the "ncRNA" count entry; indented as a sibling of the other selectors, not nested in the previous one
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "ncRNA" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "ncRNA": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_PIRNA' { // options for the step that emits the "piRNA" count entry
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "piRNA" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "piRNA": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:STATS_GAWK_OTHER' { // options for the step that emits the "other" count entry
+        ext.prefix = {"${meta.contaminant}_${meta.id}"} // closure: <contaminant>_<id>, resolved per task from meta
+        ext.suffix = "stats" // presumably the output extension; FILTER_STATS later reads *.stats files - confirm against the module
+        ext.args2 = '\'BEGIN {tot=0} {if(NR==4 || NR==5){tot+=\$1}} END {print "\\"' + "other" + '\\": " tot}\'' // awk program: add field 1 of input lines 4 and 5, then print "other": <total>. NOTE(review): lines 4-5 are presumably the aligned-read rows of the bowtie2 summary - confirm
+        publishDir = [ enabled: false ] // this intermediate file is never published
+    }
+
+    withName: 'NFCORE_SMRNASEQ:CONTAMINANT_FILTER:FILTER_STATS' { // publishing rules for the final contaminant-filter step
+        publishDir = [
+            path: { "${params.outdir}/contaminant_filter/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, // subdirectory comes from the process name: last ':' segment, text before its first '_', lower-cased ("filter" here)
+            mode: params.publish_dir_mode,
+            enabled: params.save_intermediates, // publish only when the save_intermediates parameter is set
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // returning null skips versions.yml; every other file keeps its name
+        ]
+    }
+
     //
     // MIRNA_QUANT
     //

diff --git a/modules.json b/modules.json
@@ -21,6 +21,11 @@
                         "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
                         "installed_by": ["modules"]
                     },
+                    "bowtie2/align": {
+                        "branch": "master",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "installed_by": ["modules"]
+                    },
                     "bowtie2/build": {
                         "branch": "master",
                         "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",

diff --git a/modules/local/bowtie_map_contaminants.nf b/modules/local/bowtie_map_contaminants.nf
diff --git a/modules/local/filter_stats.nf b/modules/local/filter_stats.nf
@@ -1,30 +1,41 @@
 process FILTER_STATS {
     label 'process_medium'
+    tag "$meta.id"
 
     conda 'bowtie2=2.4.5'
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bowtie2:2.4.5--py39hd2f7db1_2' :
         'biocontainers/bowtie2:2.4.5--py39hd2f7db1_2' }"
 
     input:
-    tuple val(meta), path(reads)
-    path stats_files
+    tuple val(meta), path(reads), path (stats_files)
 
     output:
     path "*_mqc.yaml"                           , emit: stats
-    tuple val(meta), path('*.filtered.fastq.gz'), emit: reads
+    tuple val(meta), path('*.filtered.fastq.gz'), emit: reads, optional: true
     path "versions.yml"                         , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     """
-    readnumber=\$(wc -l ${reads} | awk '{ print \$1/4 }')
-    cat ./filtered.${meta.id}_*.stats | \\
+
+    if [[ ${reads} == *.gz ]]; then
+        readnumber=\$(zcat ${reads} | wc -l | awk '{ print \$1/4 }')
+    else
+        readnumber=\$(wc -l ${reads} | awk '{ print \$1/4 }')
+    fi
+
+    cat ./*${meta.id}*.stats | \\
     tr '\\n' ', ' | \\
     awk -v sample=${meta.id} -v readnumber=\$readnumber '{ print "id: \\"my_pca_section\\"\\nsection_name: \\"Contamination Filtering\\"\\ndescription: \\"This plot shows the amount of reads filtered by contaminant type.\\"\\nplot_type: \\"bargraph\\"\\npconfig:\\n  id: \\"contamination_filter_plot\\"\\n  title: \\"Contamination Plot\\"\\n  ylab: \\"Number of reads\\"\\ndata:\\n    "sample": {"\$0"\\"remaining reads\\": "readnumber"}" }' > ${meta.id}.contamination_mqc.yaml
-    gzip -c ${reads} > ${meta.id}.filtered.fastq.gz
+
+    if [[ ${reads} == *.gz ]]; then
+        cp ${reads} ${meta.id}.filtered.fastq.gz
+    else
+        gzip -c ${reads} > ${meta.id}.filtered.fastq.gz
+    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/nf-core/bowtie2/align/environment.yml b/modules/nf-core/bowtie2/align/environment.yml
diff --git a/modules/nf-core/bowtie2/align/main.nf b/modules/nf-core/bowtie2/align/main.nf