Skip to content

Commit

Permalink
Merge pull request #140 from Joon-Klaps/mash-screen-mqc-report
Browse files Browse the repository at this point in the history
Adding mash-screen output to result table
  • Loading branch information
Joon-Klaps authored Aug 7, 2024
2 parents 62c24a8 + 95ac620 commit 0b9a667
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
- Add Sspace indiv to each assembler seperatly ([#132](https://github.com/Joon-Klaps/viralgenie/pull/132))
- Add read & contig decomplexification using prinseq++ ([#133](https://github.com/Joon-Klaps/viralgenie/pull/133))
- Add option to filter contig clusters based on cumulative read coverage ([#138](https://github.com/Joon-Klaps/viralgenie/pull/138))
- Adding mash-screen output to result table ([#140](https://github.com/Joon-Klaps/viralgenie/pull/140))

### `Fixed`

Expand Down
44 changes: 44 additions & 0 deletions bin/custom_multiqc_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import argparse
import csv
import logging
import json
import os
import re
import sys
Expand Down Expand Up @@ -84,6 +85,14 @@ def file_choices(choices, fname):
type=lambda s: file_choices(("csv", "tsv", "yaml","yml"), s),
)

parser.add_argument(
"--screen_files",
metavar="MASH SCREEN FILES",
nargs="+",
help="Mash screen result of the module SELECT REFERENCE where top hit is outputted in json file format",
type=lambda s: file_choices(("json"), s),
)

parser.add_argument(
"--mapping_constrains",
metavar="MAPPING CONSTRAINS",
Expand Down Expand Up @@ -381,7 +390,31 @@ def read_dataframe_from_yaml(file, **kwargs):
df = pd.DataFrame(data)
return df

def read_dataframe_from_json(file, **kwargs):
"""
Read a dataframe from a JSON file.
Args:
file (str): The path to the file.
Returns:
pandas.DataFrame: The dataframe read from the file.
"""
with open(file, "r") as json_file:
try:
data = json.load(json_file, **kwargs)
except json.JSONDecodeError as e:
logger.warning("Error reading JSON file %s: %s", file, e)
return pd.DataFrame()

# Check if 'query' key exists
if 'filename' not in data:
# Get the filename without path and suffix
filename = os.path.splitext(os.path.basename(file))[0]
# Add new key-value pair
data['filename'] = filename + '_constrain'
df = pd.DataFrame([data])
return df
def read_file_to_dataframe(file, **kwargs):
"""
Read a dataframe from a file.
Expand All @@ -402,6 +435,8 @@ def read_file_to_dataframe(file, **kwargs):
df = read_dataframe_from_csv(file_path, **kwargs)
elif file_path.suffix in [".yaml", ".yml"]:
df = read_dataframe_from_yaml(file_path, **kwargs)
elif file_path.suffix in [".json"]:
df = read_dataframe_from_json(file_path, **kwargs)
else:
logger.error("The file format %s is not supported of file %s!", file_path.suffix, file_path)
sys.exit(2)
Expand Down Expand Up @@ -879,6 +914,13 @@ def main(argv=None):
blast_df, "blast", "query", blast_header, "summary_blast_mqc.tsv"
)

# MASH screen used for reference selection summarisation
screen_df = handle_tables(args.screen_files)
if not screen_df.empty:
screen_df = handle_dataframe( screen_df, "mash-screen", "filename")
# Make everything a string
screen_df = screen_df.astype(str)

# CLuster table - mmseqs easysearch summary - annotation section
annotation_df = handle_tables(args.annotation_files, header=None)
if not annotation_df.empty:
Expand Down Expand Up @@ -937,6 +979,7 @@ def main(argv=None):
logger.info("Joining dataframes")
multiqc_contigs_df = multiqc_contigs_df.join(checkv_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(quast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(screen_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(blast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(annotation_df, how="outer")

Expand Down Expand Up @@ -975,6 +1018,7 @@ def main(argv=None):
final_columns = (
["index", "sample name", "cluster", "step"]
+ [column for column in mqc_contigs_sel.columns if "annotation" in column]
+ [column for column in mqc_contigs_sel.columns if "mash-screen" in column]
+ [column for column in mqc_contigs_sel.columns if "blast" in column]
+ [column for column in mqc_contigs_sel.columns if "checkv" in column]
+ [column for column in mqc_contigs_sel.columns if "QC check" in column]
Expand Down
21 changes: 12 additions & 9 deletions modules/local/custom_multiqc_tables/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,21 @@ process CUSTOM_MULTIQC_TABLES {
path blast_files, stageAs: "?/blast/*"
path mapping_constrains
path anno_files, stageAs: "?/annotation/*"
path screen_files, stageAs: "?/screen/*"
path multiqc_dir
path comment_headers
path custom_table_headers

output:
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc, optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc, optional: true
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc , optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc , optional: true
path "versions.yml" , emit: versions

when:
Expand All @@ -42,6 +43,7 @@ process CUSTOM_MULTIQC_TABLES {
def blast_files = blast_files ? "--blast_files ${blast_files}" : ''
def annotation_files = anno_files ? "--annotation_files ${anno_files}" : ''
def mapping_constrains = mapping_constrains ? "--mapping_constrains ${mapping_constrains}" : ''
def screen_files = screen_files ? "--screen_files ${screen_files}" : ''
def multiqc_dir = multiqc_dir ? "--multiqc_dir ${multiqc_dir}" : ''
def comment_headers = comment_headers ? "--comment_dir ${comment_headers}" : ''
def custom_table_headers = custom_table_headers ? "--table_headers ${custom_table_headers}" : ''
Expand All @@ -56,6 +58,7 @@ process CUSTOM_MULTIQC_TABLES {
$blast_files \\
$mapping_constrains \\
$annotation_files \\
$screen_files\\
$comment_headers \\
$custom_table_headers \\
$multiqc_dir
Expand Down
5 changes: 4 additions & 1 deletion subworkflows/local/fastq_fasta_mash_screen.nf
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,11 @@ workflow FASTQ_FASTA_MASH_SCREEN {
return [meta + json, fasta, reads]
}

ch_json = SELECT_REFERENCE.out.fasta_reads.map{ meta, json, fasta, reads -> [meta, json]}

emit:
reference_fastq = reference_fastq // channel: [meta, fasta, reads]
reference_fastq = reference_fastq // channel: [meta, fasta, reads ]
json = ch_json // channel: [meta, json ]
versions = ch_versions // channel: [ versions.yml ]
}

4 changes: 4 additions & 0 deletions workflows/viralgenie.nf
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,8 @@ workflow VIRALGENIE {
}
.set{ch_consensus_results_reads}

ch_mash_screen = Channel.empty()

if (params.mapping_constrains && !params.skip_variant_calling ) {
// Importing samplesheet
Channel.fromSamplesheet('mapping_constrains')
Expand Down Expand Up @@ -412,6 +414,7 @@ workflow VIRALGENIE {
constrain_consensus_reads.multiFastaSelection
)
ch_versions = ch_versions.mix(FASTQ_FASTA_MASH_SCREEN.out.versions)
ch_mash_screen = FASTQ_FASTA_MASH_SCREEN.out.json.collect{it[1]}

// For QC we keep original sequence to compare to
ch_unaligned_raw_contigs = ch_unaligned_raw_contigs
Expand Down Expand Up @@ -487,6 +490,7 @@ workflow VIRALGENIE {
ch_blast_summary.ifEmpty([]),
ch_constrain_meta,
ch_annotation_summary.ifEmpty([]),
ch_mash_screen.ifEmpty([]),
multiqc_data,
ch_multiqc_comment_headers.ifEmpty([]),
ch_multiqc_custom_table_headers.ifEmpty([])
Expand Down

0 comments on commit 0b9a667

Please sign in to comment.