Merge pull request #133 from Joon-Klaps/prinseq-contig

Add read & contig decomplexification using prinseq++
Joon-Klaps · Jul 26, 2024 · 249d50a · 249d50a
2 parents 77a7269 + 8200635
commit 249d50a
Show file tree

Hide file tree

Showing 22 changed files with 638 additions and 126 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 - Include sspace for contig extension ([#123](https://github.com/Joon-Klaps/viralgenie/pull/123))
 - Include both krakenreport &nodes.dmp in taxonomy ([#128](https://github.com/Joon-Klaps/viralgenie/pull/128))
 - Update new variable mmseqs_cluster_mode default 0 ([#130](https://github.com/Joon-Klaps/viralgenie/pull/130))
+- Add read & contig decomplexification using prinseq++ ([#133](https://github.com/Joon-Klaps/viralgenie/pull/133))
 
 ### `Fixed`
 

diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@
 2. Performs optional read pre-processing
     - Adapter trimming([`fastp`](https://github.com/OpenGene/fastp), [`Trimmomatic`](https://github.com/usadellab/Trimmomatic))
     - Read UMI deduplication ([`HUMID`](https://humid.readthedocs.io/en/latest/usage.html))
-    - Low complexity and quality filtering ([`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/))
+    - Low complexity and quality filtering ([`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus))
     - Host-read removal ([`BowTie2`](http://bowtie-bio.sourceforge.net/bowtie2/))
 3. Metagenomic diversity mapping
     - Performs taxonomic classification and/or profiling using one or more of:
@@ -41,7 +41,7 @@
         - [`Kaiju`](https://kaiju.binf.ku.dk/)
     - Plotting Kraken2 and Kaiju ([`Krona`](https://hpc.nih.gov/apps/kronatools.html))
 4. Denovo assembly ([`SPAdes`](http://cab.spbu.ru/software/spades/), [`TRINITY`](https://github.com/trinityrnaseq/trinityrnaseq), [`megahit`](https://github.com/voutcn/megahit)), combine contigs.
-5. [Optional] extend the contigs with [sspace_basic](https://github.com/nsoranzo/sspace_basic)
+5. [Optional] extend the contigs with [sspace_basic](https://github.com/nsoranzo/sspace_basic) and filter with [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)
 6. Contig reference identification ([`blastn`](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch))
     -   Identify top 5 blast hits
     -   Merge blast hit and all contigs of a sample

diff --git a/conf/modules.config b/conf/modules.config
@@ -172,6 +172,27 @@ process {
         ]
     }
 
+    withName: PRINSEQ_READS { // prinseq++ settings for the read-level complexity filter
+        ext.args = [
+            "-out_gz", // passed verbatim to the tool; presumably requests gzip output, matching the '*.gz' pattern below - confirm against the prinseq++ docs
+        ].join(' ').trim()
+        publishDir = [
+            [
+                // Filtered reads: only published when the user asks for intermediate reads,
+                // or selects 'complexity' as the stage whose reads should be kept.
+                path: { "${params.outdir}/preprocessing/prinseq" },
+                mode: params.publish_dir_mode,
+                pattern: '*.gz',
+                // Fixed: `save_final_reads` must be read from `params`, like every other option here.
+                enabled: params.save_intermediate_reads || params.save_final_reads == 'complexity',
+                saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename }
+            ],
+            [
+                // Log files: no `enabled` guard, so these are always published.
+                path: { "${params.outdir}/preprocessing/prinseq/log" },
+                mode: params.publish_dir_mode,
+                pattern: '*.log',
+                saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename }
+            ],
+        ]
+    }
+
     withName: FASTQC_TRIM {
         ext.args = '--quiet'
         ext.prefix = { "${meta.id}_trim" }
@@ -448,14 +469,35 @@ process {
                     saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename }
                 ],
                 [
-                    path: { "${params.outdir}/assembly/assemblers/sspace_basic/logs" },
+                    path: { "${params.outdir}/assembly/assemblers/sspace_basic/log" },
                     mode: params.publish_dir_mode,
                     pattern: '*.txt',
                     saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename }
                 ],
             ]
         }
 
+        withName: PRINSEQ_CONTIG { // prinseq++ settings for the contig-level complexity filter
+            ext.args = [
+                "-out_format 1", // presumably selects FASTA output, matching the '*.fasta' pattern below - confirm against the prinseq++ docs
+                "-lc_dust .20", // NOTE(review): looks like a DUST low-complexity threshold of 0.20 - confirm semantics in the prinseq++ docs
+            ].join(' ').trim() // arguments are joined into a single space-separated string
+            publishDir = [
+                [ // filtered sequences: always published (no `enabled` guard)
+                    path: { "${params.outdir}/assembly/assemblers/prinseq/scaffolds" },
+                    mode: params.publish_dir_mode,
+                    pattern: '*.fasta',
+                    saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename } // prepend "<global_prefix>-" when either prefix param is set
+                ],
+                [ // log files of the filtering step
+                    path: { "${params.outdir}/assembly/assemblers/prinseq/log" },
+                    mode: params.publish_dir_mode,
+                    pattern: '*.log',
+                    saveAs: { filename -> params.prefix || params.global_prefix  ? "${params.global_prefix}-$filename" : filename } // same prefixing rule as above
+                ],
+            ]
+        }
+
         if (!params.skip_polishing){
 
             withName: BLAST_BLASTN{

diff --git a/docs/output.md b/docs/output.md
@@ -89,6 +89,21 @@ It is used in viralgenie for complexity filtering using different algorithms. Th
 
 By default viralgenie will only provide the log files of bbduk. The filtered reads can be saved by specifying `--save_intermediate_reads` or `--save_final_reads 'complexity'`.
 
+### prinseq++
+
+[`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)
+
+It is used in viralgenie for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
+
+???- abstract "Output files"
+
+    - `prinseq/`
+        - `log/<sample-id>.log`: log file containing filtering statistics
+        - `<sample-id>.fastq.gz`: resulting FASTQ file without low-complexity reads
+
+By default viralgenie will only provide the log files of prinseq. The filtered reads can be saved by specifying `--save_intermediate_reads` or `--save_final_reads 'complexity'`.
+
+
 
 ### Hostremoval-Kraken2
 
@@ -188,6 +203,17 @@ Finally, the results of the assemblers are combined and stored in the `tools_com
         - `scaffolds/<sample-id>.scaffolds.fasta`: Scaffolds generated by SSPACE Basic.
         - `log/<sample-id>.*.txt`: Various txt files containig log and summary information on the SSPACE Basic run.
 
+### prinseq++ - contigs
+
+[`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is used for complexity filtering of contigs.
+
+???- abstract "Output files"
+
+    - `prinseq/`
+        - `scaffolds/<sample-id>*.fasta`: Contigs/scaffolds that remain after prinseq++ low-complexity filtering.
+        - `log/<sample-id>*.log`: Log file containing the prinseq++ filtering statistics.
+
+
 ### BLAST
 
 [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) is a sequence comparison tool that can be used to compare a query sequence against a database of sequences. In viralgenie, BLAST is used to compare the contigs generated by the assemblers to a database of viral sequences.

diff --git a/docs/workflow/assembly_polishing.md b/docs/workflow/assembly_polishing.md
@@ -25,6 +25,8 @@ Three assemblers are used, [SPAdes](http://cab.spbu.ru/software/spades/), [Megah
 
 Contigs can be extended using [SSPACE Basic](https://github.com/nsoranzo/sspace_basic) with the `--skip_sspace_basic false` parameter. SSPACE is a tool for scaffolding contigs using paired-end reads. It is modified from SSAKE assembler and has the feature of extending contigs using reads that are unmappable in the contig assembly step.
 
+Low-complexity contigs can be filtered out using prinseq++ with the `--skip_contig_prinseq false` parameter. Complexity filtering is primarily a run-time optimisation step. Low-complexity sequences are defined as having commonly found stretches of nucleotides with limited information content (e.g. the dinucleotide repeat CACACACACA). Such sequences can produce a large number of high-scoring but biologically insignificant results in database searches. Removing these contigs therefore saves computational time and resources.
+
 ## Reference Matching
 The newly assembled contigs are compared to a reference sequence pool (--reference_pool) using a [BLASTn search](https://www.ncbi.nlm.nih.gov/books/NBK153387/). This process not only helps annotate the contigs but also assists in linking together sets of contigs that are distant within a single genome. Essentially, it aids in identifying contigs belonging to the same genomic segment and choosing the right reference for scaffolding purposes.
 

diff --git a/docs/workflow/preprocessing.md b/docs/workflow/preprocessing.md
@@ -56,9 +56,9 @@ Viralgenie supports both deduplication on a read level as well as a mapping leve
 
 Complexity filtering is primarily a run-time optimisation step. Low-complexity sequences are defined as having commonly found stretches of nucleotides with limited information content (e.g. the dinucleotide repeat CACACACACA). Such sequences can produce a large number of high-scoring but biologically insignificant results in database searches. Removing these reads therefore saves computational time and resources.
 
-Complexity filtering is done with [`Bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) which is part of [`BBtools`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) where the "duk" stands for Decontamination Using Kmers.
+Complexity filtering is done with [`Bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) which is part of [`BBtools`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) where the "duk" stands for Decontamination Using Kmers. Alternatively, complexity filtering can be done with [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus).
 
-> By default this step is skipped, if this step shouldn't be skipped specify `--skip_complexity_filtering false`.
+> By default this step is skipped; to enable it, specify `--skip_complexity_filtering false`. Select the tool used for complexity filtering with the `--decomplexifier` parameter: `bbduk` or `prinseq` (default).
 
 ## Host read-removal
 

diff --git a/modules.json b/modules.json
@@ -214,9 +214,8 @@
                     },
                     "megahit": {
                         "branch": "master",
-                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
-                        "installed_by": ["modules"],
-                        "patch": "modules/nf-core/megahit/megahit.diff"
+                        "git_sha": "f5efd4ff526a91f32f87c818943b4d7ce82f80cb",
+                        "installed_by": ["modules"]
                     },
                     "minimap2/align": {
                         "branch": "master",
@@ -281,6 +280,12 @@
                         "git_sha": "1943aa60f7490c3d6740e8872e6e69122ccc8087",
                         "installed_by": ["modules"]
                     },
+                    "prinseqplusplus": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/prinseqplusplus/prinseqplusplus.diff"
+                    },
                     "quast": {
                         "branch": "master",
                         "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",

diff --git a/modules/nf-core/megahit/environment.yml b/modules/nf-core/megahit/environment.yml
diff --git a/modules/nf-core/megahit/main.nf b/modules/nf-core/megahit/main.nf
diff --git a/modules/nf-core/megahit/megahit.diff b/modules/nf-core/megahit/megahit.diff