diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1440a712..c10d057e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,8 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 - Add read & contig decomplexification using prinseq++ ([#133](https://github.com/Joon-Klaps/viralgenie/pull/133))
 - Allow reference pool for specific samples ([#134](https://github.com/Joon-Klaps/viralgenie/pull/134))
 - Add option to filter contig clusters based on cumulative read coverage ([#138](https://github.com/Joon-Klaps/viralgenie/pull/138))
+- Add mash-screen output to the result table ([#140](https://github.com/Joon-Klaps/viralgenie/pull/140))
+- Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))

 ### `Fixed`

diff --git a/bin/blast_filter.py b/bin/blast_filter.py
index 20fccb57..508a3c46 100755
--- a/bin/blast_filter.py
+++ b/bin/blast_filter.py
@@ -177,6 +177,15 @@ def main(argv=None):
     """Coordinate argument parsing and program execution."""
     args = parse_args(argv)
     logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
+
+    if args.blast is None and args.contigs.is_file():
+        logger.warning("No blast input was provided, copying the contig file as-is.")
+        with open(args.contigs, "r") as contigs_file:
+            contig_content = contigs_file.read()
+        with open(f"{args.prefix}_withref.fa", "w") as f:
+            f.write(contig_content)
+        return 0
+
     if not args.blast.is_file():
         logger.error(f"The given input file {args.blast} was not found!")
         sys.exit(2)
@@ -193,6 +202,8 @@ def main(argv=None):

     write_hits(df_filter, args.contigs, args.references, args.prefix)

+    return 0
+

 if __name__ == "__main__":
     sys.exit(main())
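The new early-exit in `blast_filter.py` is functionally a plain file copy. As a minimal sketch of that fallback (a hypothetical helper, not code from the PR; it assumes only the `{prefix}_withref.fa` naming used in `main()`):

```python
from pathlib import Path
import shutil


def passthrough_contigs(contigs: Path, prefix: str) -> Path:
    """When there is no BLAST input, emit the contigs unchanged so the
    downstream reference-selection steps still receive a FASTA file."""
    out = Path(f"{prefix}_withref.fa")
    shutil.copyfile(contigs, out)  # same effect as the read-then-write in main()
    return out
```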
+ """ + with open(file, "r") as json_file: + try: + data = json.load(json_file, **kwargs) + except json.JSONDecodeError as e: + logger.warning("Error reading JSON file %s: %s", file, e) + return pd.DataFrame() + + # Check if 'query' key exists + if 'filename' not in data: + # Get the filename without path and suffix + filename = os.path.splitext(os.path.basename(file))[0] + # Add new key-value pair + data['filename'] = filename + '_constrain' + df = pd.DataFrame([data]) + return df def read_file_to_dataframe(file, **kwargs): """ Read a dataframe from a file. @@ -402,6 +445,8 @@ def read_file_to_dataframe(file, **kwargs): df = read_dataframe_from_csv(file_path, **kwargs) elif file_path.suffix in [".yaml", ".yml"]: df = read_dataframe_from_yaml(file_path, **kwargs) + elif file_path.suffix in [".json"]: + df = read_dataframe_from_json(file_path, **kwargs) else: logger.error("The file format %s is not supported of file %s!", file_path.suffix, file_path) sys.exit(2) @@ -879,6 +924,13 @@ def main(argv=None): blast_df, "blast", "query", blast_header, "summary_blast_mqc.tsv" ) + # MASH screen used for reference selection summarisation + screen_df = handle_tables(args.screen_files) + if not screen_df.empty: + screen_df = handle_dataframe( screen_df, "mash-screen", "filename") + # Make everything a string + screen_df = screen_df.astype(str) + # CLuster table - mmseqs easysearch summary - annotation section annotation_df = handle_tables(args.annotation_files, header=None) if not annotation_df.empty: @@ -937,6 +989,7 @@ def main(argv=None): logger.info("Joining dataframes") multiqc_contigs_df = multiqc_contigs_df.join(checkv_df, how="outer") multiqc_contigs_df = multiqc_contigs_df.join(quast_df, how="outer") + multiqc_contigs_df = multiqc_contigs_df.join(screen_df, how="outer") multiqc_contigs_df = multiqc_contigs_df.join(blast_df, how="outer") multiqc_contigs_df = multiqc_contigs_df.join(annotation_df, how="outer") @@ -975,6 +1028,7 @@ def main(argv=None): final_columns = ( ["index", "sample name", "cluster", "step"] + [column for column in mqc_contigs_sel.columns if "annotation" in column] + + [column for column in mqc_contigs_sel.columns if "mash-screen" in column] + [column for column in mqc_contigs_sel.columns if "blast" in column] + [column for column in mqc_contigs_sel.columns if "checkv" in column] + [column for column in mqc_contigs_sel.columns if "QC check" in column] diff --git a/conf/modules.config b/conf/modules.config index 9938c1d0..207e969f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -890,6 +890,7 @@ process { } withName: RENAME_FASTA_HEADER_SINGLETON { + ext.prefix = { "${meta.id}_singleton" } // DON'T CHANGE publishDir = [ enabled: false ] diff --git a/conf/tests/test_fail_mapped.config b/conf/tests/test_fail_mapped.config index bc92a29b..2b79ab99 100644 --- a/conf/tests/test_fail_mapped.config +++ b/conf/tests/test_fail_mapped.config @@ -35,7 +35,7 @@ params { mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains_fail.tsv" min_mapped_reads = 100 - intermediate_mapping_stats = true + intermediate_mapping_stats = true skip_checkv = true } diff --git a/modules/local/blast_filter/main.nf b/modules/local/blast_filter/main.nf index 572b51e3..a340355e 100644 --- a/modules/local/blast_filter/main.nf +++ b/modules/local/blast_filter/main.nf @@ -8,13 +8,14 @@ process BLAST_FILTER { 'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }" input: - tuple val(meta), path(blast), path(contigs) + tuple 
diff --git a/conf/modules.config b/conf/modules.config
index 9938c1d0..207e969f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -890,6 +890,7 @@ process {
     }

     withName: RENAME_FASTA_HEADER_SINGLETON {
+        ext.prefix = { "${meta.id}_singleton" } // DON'T CHANGE
         publishDir = [
             enabled: false
         ]

diff --git a/conf/tests/test_fail_mapped.config b/conf/tests/test_fail_mapped.config
index bc92a29b..2b79ab99 100644
--- a/conf/tests/test_fail_mapped.config
+++ b/conf/tests/test_fail_mapped.config
@@ -35,7 +35,7 @@ params {
     mapping_constrains          = "${projectDir}/assets/samplesheets/mapping_constrains_fail.tsv"
     min_mapped_reads            = 100

-    intermediate_mapping_stats = true
+    intermediate_mapping_stats  = true
     skip_checkv                 = true
 }

diff --git a/modules/local/blast_filter/main.nf b/modules/local/blast_filter/main.nf
index 572b51e3..a340355e 100644
--- a/modules/local/blast_filter/main.nf
+++ b/modules/local/blast_filter/main.nf
@@ -8,13 +8,14 @@ process BLAST_FILTER {
         'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }"

     input:
-    tuple val(meta), path(blast), path(contigs)
+    tuple val(meta), path(blast)
+    tuple val(meta), path(contigs)
     tuple val(meta2), path(db)

     output:
-    tuple val(meta), path("*.hits.txt")  , emit: hits
+    tuple val(meta), path("*.hits.txt")  , emit: hits, optional: true
     tuple val(meta), path("*.fa")        , emit: sequence
-    tuple val(meta), path("*.filter.tsv"), emit: filter
+    tuple val(meta), path("*.filter.tsv"), emit: filter, optional: true
     path "versions.yml"                  , emit: versions

     when:
@@ -23,10 +24,11 @@ process BLAST_FILTER {
     script:
     def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
+    def blast_command = blast ? "-i ${blast}" : ""
     """
     blast_filter.py \\
         $args \\
-        -i ${blast} \\
+        ${blast_command} \\
         -c ${contigs} \\
         -r ${db} \\
         -p ${prefix}

diff --git a/modules/local/custom_multiqc_tables/main.nf b/modules/local/custom_multiqc_tables/main.nf
index c0c643b7..38f47964 100644
--- a/modules/local/custom_multiqc_tables/main.nf
+++ b/modules/local/custom_multiqc_tables/main.nf
@@ -14,20 +14,21 @@ process CUSTOM_MULTIQC_TABLES {
     path blast_files, stageAs: "?/blast/*"
     path mapping_constrains
     path anno_files, stageAs: "?/annotation/*"
+    path screen_files, stageAs: "?/screen/*"
     path multiqc_dir
     path comment_headers
     path custom_table_headers

     output:
-    path("summary_clusters_mqc.tsv")          , emit: summary_clusters_mqc , optional: true
-    path("sample_metadata_mqc.tsv")           , emit: sample_metadata_mqc  , optional: true
-    path("contigs_overview_mqc.tsv")          , emit: contigs_overview_mqc , optional: true
-    path("summary_checkv_mqc.tsv")            , emit: summary_checkv_mqc   , optional: true
-    path("summary_quast_mqc.tsv")             , emit: summary_quast_mqc    , optional: true
-    path("summary_blast_mqc.tsv")             , emit: summary_blast_mqc    , optional: true
-    path("summary_anno_mqc.tsv")              , emit: summary_anno_mqc     , optional: true
-    path("mapping_constrains_mqc.tsv")        , emit: mapping_constrains_mqc, optional: true
-    path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc, optional: true
+    path("summary_clusters_mqc.tsv")          , emit: summary_clusters_mqc  , optional: true
+    path("sample_metadata_mqc.tsv")           , emit: sample_metadata_mqc   , optional: true
+    path("contigs_overview_mqc.tsv")          , emit: contigs_overview_mqc  , optional: true
+    path("summary_checkv_mqc.tsv")            , emit: summary_checkv_mqc    , optional: true
+    path("summary_quast_mqc.tsv")             , emit: summary_quast_mqc     , optional: true
+    path("summary_blast_mqc.tsv")             , emit: summary_blast_mqc     , optional: true
+    path("summary_anno_mqc.tsv")              , emit: summary_anno_mqc      , optional: true
+    path("mapping_constrains_mqc.tsv")        , emit: mapping_constrains_mqc, optional: true
+    path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc, optional: true
     path "versions.yml"                       , emit: versions

     when:
@@ -42,6 +43,7 @@ process CUSTOM_MULTIQC_TABLES {
     def blast_files          = blast_files ? "--blast_files ${blast_files}" : ''
     def annotation_files     = anno_files ? "--annotation_files ${anno_files}" : ''
     def mapping_constrains   = mapping_constrains ? "--mapping_constrains ${mapping_constrains}" : ''
+    def screen_files         = screen_files ? "--screen_files ${screen_files}" : ''
     def multiqc_dir          = multiqc_dir ? "--multiqc_dir ${multiqc_dir}" : ''
     def comment_headers      = comment_headers ? "--comment_dir ${comment_headers}" : ''
     def custom_table_headers = custom_table_headers ? "--table_headers ${custom_table_headers}" : ''
@@ -56,6 +58,7 @@ process CUSTOM_MULTIQC_TABLES {
         $blast_files \\
         $mapping_constrains \\
         $annotation_files \\
+        $screen_files \\
         $comment_headers \\
         $custom_table_headers \\
         $multiqc_dir
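The module now passes `-i` only when a hits file exists, so the script side must tolerate the flag being absent. A hypothetical reduced parser showing that contract (the real `parse_args` in `blast_filter.py` has more options; only the `args.blast is None` branch is taken from the PR):

```python
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--blast", type=Path, default=None)    # optional: absent when no BLAST hits
parser.add_argument("-c", "--contigs", type=Path, required=True)
parser.add_argument("-p", "--prefix", required=True)

# Invocation as BLAST_FILTER builds it when blast_command is empty:
args = parser.parse_args(["-c", "contigs.fa", "-p", "sample1"])
assert args.blast is None  # triggers the copy-through branch in main()
```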
"--table_headers ${custom_table_headers}" : '' @@ -56,6 +58,7 @@ process CUSTOM_MULTIQC_TABLES { $blast_files \\ $mapping_constrains \\ $annotation_files \\ + $screen_files\\ $comment_headers \\ $custom_table_headers \\ $multiqc_dir diff --git a/subworkflows/local/fasta_blast_refsel.nf b/subworkflows/local/fasta_blast_refsel.nf index 606d0a45..84553754 100644 --- a/subworkflows/local/fasta_blast_refsel.nf +++ b/subworkflows/local/fasta_blast_refsel.nf @@ -41,11 +41,17 @@ workflow FASTA_BLAST_REFSEL { // Filter out false positve hits that based on query length, alignment length, identity, e-score & bit-score ch_blast_txt .hits - .join(fasta, by:[0], remainder:false) - .set{ hits_contigs } + .join(fasta, by:[0], remainder:true) + .multiMap{ + meta, txt, fasta -> + hits : [meta, txt ? txt : []] + contigs : [meta, fasta] + } + .set{input_blast_filter} BLAST_FILTER ( - hits_contigs, + input_blast_filter.hits, + input_blast_filter.contigs, blast_db_fasta ) ch_versions = ch_versions.mix(BLAST_FILTER.out.versions.first()) diff --git a/subworkflows/local/fastq_assembly.nf b/subworkflows/local/fastq_assembly.nf index a6ee565f..0fc20138 100644 --- a/subworkflows/local/fastq_assembly.nf +++ b/subworkflows/local/fastq_assembly.nf @@ -37,7 +37,12 @@ workflow FASTQ_ASSEMBLY { ) ch_versions = ch_versions.mix(SPADES.out.versions.first()) - EXTEND_SPADES( reads, SPADES.out.scaffolds, "spades") + SPADES.out.scaffolds + .join(SPADES.out.contigs, remainder:true) + .map{meta, scaffold, contig -> [meta, scaffold ? scaffold : contig]} // sometimes no scaffold could be created if so take contig + .set{spades_consensus} + + EXTEND_SPADES( reads, spades_consensus, "spades") ch_scaffolds = ch_scaffolds.mix(EXTEND_SPADES.out.scaffolds) ch_coverages = ch_coverages.mix(EXTEND_SPADES.out.coverages) ch_versions = ch_versions.mix(EXTEND_SPADES.out.versions) diff --git a/subworkflows/local/fastq_fasta_mash_screen.nf b/subworkflows/local/fastq_fasta_mash_screen.nf index 59a169ce..56dc1583 100644 --- a/subworkflows/local/fastq_fasta_mash_screen.nf +++ b/subworkflows/local/fastq_fasta_mash_screen.nf @@ -62,8 +62,11 @@ workflow FASTQ_FASTA_MASH_SCREEN { return [meta + json, fasta, reads] } + ch_json = SELECT_REFERENCE.out.fasta_reads.map{ meta, json, fasta, reads -> [meta, json]} + emit: - reference_fastq = reference_fastq // channel: [meta, fasta, reads] + reference_fastq = reference_fastq // channel: [meta, fasta, reads ] + json = ch_json // channel: [meta, json ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/viralgenie.nf b/workflows/viralgenie.nf index c0fd0ddb..3e32a09e 100644 --- a/workflows/viralgenie.nf +++ b/workflows/viralgenie.nf @@ -361,6 +361,8 @@ workflow VIRALGENIE { } .set{ch_consensus_results_reads} + ch_mash_screen = Channel.empty() + if (params.mapping_constrains && !params.skip_variant_calling ) { // Importing samplesheet Channel.fromSamplesheet('mapping_constrains') @@ -410,6 +412,7 @@ workflow VIRALGENIE { constrain_consensus_reads.multiFastaSelection ) ch_versions = ch_versions.mix(FASTQ_FASTA_MASH_SCREEN.out.versions) + ch_mash_screen = FASTQ_FASTA_MASH_SCREEN.out.json.collect{it[1]} // For QC we keep original sequence to compare to ch_unaligned_raw_contigs = ch_unaligned_raw_contigs @@ -484,6 +487,7 @@ workflow VIRALGENIE { ch_blast_summary.ifEmpty([]), ch_constrain_meta, ch_annotation_summary.ifEmpty([]), + ch_mash_screen.ifEmpty([]), multiqc_data, ch_multiqc_comment_headers.ifEmpty([]), ch_multiqc_custom_table_headers.ifEmpty([])