Commit

Merge branch 'dev' into database-sample-specifc
Joon-Klaps authored Aug 22, 2024
2 parents f3861b3 + 680be05 commit 9f25774
Showing 11 changed files with 114 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
- Add read & contig decomplexification using prinseq++ ([#133](https://github.com/Joon-Klaps/viralgenie/pull/133))
- Allow reference pool for specific samples ([#134](https://github.com/Joon-Klaps/viralgenie/pull/134))
- Add option to filter contig clusters based on cumulative read coverage ([#138](https://github.com/Joon-Klaps/viralgenie/pull/138))
- Add mash-screen output to result table ([#140](https://github.com/Joon-Klaps/viralgenie/pull/140))
- Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))

### `Fixed`

11 changes: 11 additions & 0 deletions bin/blast_filter.py
@@ -177,6 +177,15 @@ def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")

    if args.blast is None and args.contigs.is_file():
        logger.warning("No blast input was provided, just copying the input file.")
        with open(args.contigs, "r") as contigs_file:
            contig_content = contigs_file.read()
        with open(f"{args.prefix}_withref.fa", "w") as f:
            f.write(contig_content)
        return 0

if not args.blast.is_file():
logger.error(f"The given input file {args.blast} was not found!")
sys.exit(2)
@@ -193,6 +202,8 @@ def main(argv=None):

write_hits(df_filter, args.contigs, args.references, args.prefix)

return 0


if __name__ == "__main__":
sys.exit(main())
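For reference, the new early-return path can be sketched in isolation as follows; this is a minimal self-contained sketch with a hypothetical helper and file names, not the pipeline script itself:

# Minimal sketch of the no-blast fallback: when no blast table is supplied but
# the contigs file exists, the contigs are copied verbatim to <prefix>_withref.fa
# and the program exits cleanly.
from pathlib import Path

def copy_contigs_fallback(blast, contigs: Path, prefix: str) -> int:
    if blast is None and contigs.is_file():
        Path(f"{prefix}_withref.fa").write_text(contigs.read_text())
        return 0
    return 1  # a caller would continue with the normal blast filtering

contigs = Path("sample1.contigs.fa")  # hypothetical input
contigs.write_text(">c1\nACGTACGT\n")
assert copy_contigs_fallback(None, contigs, "sample1") == 0
print(Path("sample1_withref.fa").read_text())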
62 changes: 58 additions & 4 deletions bin/custom_multiqc_tables.py
@@ -5,6 +5,7 @@
import argparse
import csv
import logging
import json
import os
import re
import sys
@@ -84,6 +85,14 @@ def file_choices(choices, fname):
type=lambda s: file_choices(("csv", "tsv", "yaml","yml"), s),
)

parser.add_argument(
"--screen_files",
metavar="MASH SCREEN FILES",
nargs="+",
help="Mash screen result of the module SELECT REFERENCE where top hit is outputted in json file format",
type=lambda s: file_choices(("json"), s),
)
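The trailing comma in ("json",) matters: without it the parentheses are plain grouping, so a bare string would reach file_choices, and, assuming file_choices tests membership with Python's "in" as the csv/tsv call above suggests, that would silently become a substring match. A quick demonstration:

# ("json") is just the string "json"; ("json",) is a one-element tuple.
ext = "jso"                  # a typo'd extension
print(ext in ("json"))       # True  - substring match against a string
print(ext in ("json",))      # False - proper membership test against a tuple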

parser.add_argument(
"--mapping_constrains",
metavar="MAPPING CONSTRAINS",
@@ -204,14 +213,24 @@ def concat_table_files(table_files, **kwargs):
Returns:
pandas.DataFrame: The concatenated dataframe.
"""
    try:
        valid_dfs = [
            read_file_to_dataframe(file, **kwargs)
            for file in table_files
            if check_file_exists(file)
        ]

        if not valid_dfs:
            logging.warning("No valid files found to concatenate: %s", table_files)
            return pd.DataFrame()

        df = pd.concat(valid_dfs)
        return df

    except ValueError as e:
        logging.warning("Error concatenating files %s: %s", table_files, e)
        return pd.DataFrame()
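The same guard pattern in a self-contained form, with plain print standing in for the script's logging (a sketch, not the pipeline code itself):

import pandas as pd

def safe_concat(dfs, source="<files>"):
    # No readable inputs at all: return an empty frame instead of crashing.
    if not dfs:
        print(f"Warning: no valid files found to concatenate: {source}")
        return pd.DataFrame()
    try:
        return pd.concat(dfs)
    except ValueError as e:
        # defensive catch, mirroring the script's except ValueError branch
        print(f"Warning concatenating {source}: {e}")
        return pd.DataFrame()

print(safe_concat([]).empty)                          # True
print(safe_concat([pd.DataFrame({"a": [1]})]).shape)  # (1, 1)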


def read_in_quast(table_files):
@@ -381,7 +400,31 @@ def read_dataframe_from_yaml(file, **kwargs):
df = pd.DataFrame(data)
return df

def read_dataframe_from_json(file, **kwargs):
    """
    Read a dataframe from a JSON file.

    Args:
        file (str): The path to the file.

    Returns:
        pandas.DataFrame: The dataframe read from the file.
    """
    with open(file, "r") as json_file:
        try:
            data = json.load(json_file, **kwargs)
        except json.JSONDecodeError as e:
            logger.warning("Error reading JSON file %s: %s", file, e)
            return pd.DataFrame()

    # Check if 'filename' key exists
    if 'filename' not in data:
        # Get the filename without path and suffix
        filename = os.path.splitext(os.path.basename(file))[0]
        # Add new key-value pair
        data['filename'] = filename + '_constrain'

    df = pd.DataFrame([data])
    return df

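A hypothetical round trip for the filename-injection logic above; the mash-screen fields shown are illustrative assumptions, not the real output schema:

import json, os
import pandas as pd

with open("sample1.json", "w") as fh:
    json.dump({"identity": "0.99", "shared-hashes": "950/1000"}, fh)

with open("sample1.json") as fh:
    data = json.load(fh)
if "filename" not in data:  # same injection as read_dataframe_from_json
    data["filename"] = os.path.splitext(os.path.basename("sample1.json"))[0] + "_constrain"

print(pd.DataFrame([data])["filename"][0])  # sample1_constrain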
def read_file_to_dataframe(file, **kwargs):
"""
Read a dataframe from a file.
@@ -402,6 +445,8 @@ def read_file_to_dataframe(file, **kwargs):
df = read_dataframe_from_csv(file_path, **kwargs)
elif file_path.suffix in [".yaml", ".yml"]:
df = read_dataframe_from_yaml(file_path, **kwargs)
elif file_path.suffix in [".json"]:
df = read_dataframe_from_json(file_path, **kwargs)
else:
logger.error("The file format %s is not supported of file %s!", file_path.suffix, file_path)
sys.exit(2)
@@ -879,6 +924,13 @@ def main(argv=None):
blast_df, "blast", "query", blast_header, "summary_blast_mqc.tsv"
)

# MASH screen used for reference selection summarisation
screen_df = handle_tables(args.screen_files)
if not screen_df.empty:
        screen_df = handle_dataframe(screen_df, "mash-screen", "filename")
# Make everything a string
screen_df = screen_df.astype(str)

    # Cluster table - mmseqs easysearch summary - annotation section
annotation_df = handle_tables(args.annotation_files, header=None)
if not annotation_df.empty:
@@ -937,6 +989,7 @@
logger.info("Joining dataframes")
multiqc_contigs_df = multiqc_contigs_df.join(checkv_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(quast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(screen_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(blast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(annotation_df, how="outer")

@@ -975,6 +1028,7 @@
final_columns = (
["index", "sample name", "cluster", "step"]
+ [column for column in mqc_contigs_sel.columns if "annotation" in column]
+ [column for column in mqc_contigs_sel.columns if "mash-screen" in column]
+ [column for column in mqc_contigs_sel.columns if "blast" in column]
+ [column for column in mqc_contigs_sel.columns if "checkv" in column]
+ [column for column in mqc_contigs_sel.columns if "QC check" in column]
1 change: 1 addition & 0 deletions conf/modules.config
@@ -890,6 +890,7 @@ process {
}

withName: RENAME_FASTA_HEADER_SINGLETON {
ext.prefix = { "${meta.id}_singleton" } // DON'T CHANGE
publishDir = [
enabled: false
]
2 changes: 1 addition & 1 deletion conf/tests/test_fail_mapped.config
@@ -35,7 +35,7 @@ params {
mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains_fail.tsv"

min_mapped_reads = 100
intermediate_mapping_stats = true
intermediate_mapping_stats = true
skip_checkv = true
}

10 changes: 6 additions & 4 deletions modules/local/blast_filter/main.nf
@@ -8,13 +8,14 @@ process BLAST_FILTER {
'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }"

input:
tuple val(meta), path(blast), path(contigs)
tuple val(meta), path(blast)
tuple val(meta), path(contigs)
tuple val(meta2), path(db)

output:
tuple val(meta), path("*.hits.txt") , emit: hits
tuple val(meta), path("*.hits.txt") , emit: hits, optional: true
tuple val(meta), path("*.fa") , emit: sequence
tuple val(meta), path("*.filter.tsv"), emit: filter
tuple val(meta), path("*.filter.tsv"), emit: filter, optional: true
path "versions.yml" , emit: versions

when:
@@ -23,10 +24,11 @@
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def blast_command = blast ? "-i ${blast}" : ""
"""
blast_filter.py \\
$args \\
-i ${blast} \\
${blast_command} \\
-c ${contigs} \\
-r ${db} \\
-p ${prefix}
21 changes: 12 additions & 9 deletions modules/local/custom_multiqc_tables/main.nf
@@ -14,20 +14,21 @@ process CUSTOM_MULTIQC_TABLES {
path blast_files, stageAs: "?/blast/*"
path mapping_constrains
path anno_files, stageAs: "?/annotation/*"
path screen_files, stageAs: "?/screen/*"
path multiqc_dir
path comment_headers
path custom_table_headers

output:
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc, optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc, optional: true
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc , optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc , optional: true
path "versions.yml" , emit: versions

when:
Expand All @@ -42,6 +43,7 @@ process CUSTOM_MULTIQC_TABLES {
def blast_files = blast_files ? "--blast_files ${blast_files}" : ''
def annotation_files = anno_files ? "--annotation_files ${anno_files}" : ''
def mapping_constrains = mapping_constrains ? "--mapping_constrains ${mapping_constrains}" : ''
def screen_files = screen_files ? "--screen_files ${screen_files}" : ''
def multiqc_dir = multiqc_dir ? "--multiqc_dir ${multiqc_dir}" : ''
def comment_headers = comment_headers ? "--comment_dir ${comment_headers}" : ''
def custom_table_headers = custom_table_headers ? "--table_headers ${custom_table_headers}" : ''
@@ -56,6 +58,7 @@
$blast_files \\
$mapping_constrains \\
$annotation_files \\
    $screen_files \\
$comment_headers \\
$custom_table_headers \\
$multiqc_dir
12 changes: 9 additions & 3 deletions subworkflows/local/fasta_blast_refsel.nf
@@ -41,11 +41,17 @@ workflow FASTA_BLAST_REFSEL {
    // Filter out false-positive hits based on query length, alignment length, identity, e-value & bit-score
ch_blast_txt
.hits
.join(fasta, by:[0], remainder:false)
.set{ hits_contigs }
.join(fasta, by:[0], remainder:true)
.multiMap{
meta, txt, fasta ->
hits : [meta, txt ? txt : []]
contigs : [meta, fasta]
}
.set{input_blast_filter}

BLAST_FILTER (
hits_contigs,
input_blast_filter.hits,
input_blast_filter.contigs,
blast_db_fasta
)
ch_versions = ch_versions.mix(BLAST_FILTER.out.versions.first())
7 changes: 6 additions & 1 deletion subworkflows/local/fastq_assembly.nf
@@ -37,7 +37,12 @@ workflow FASTQ_ASSEMBLY {
)
ch_versions = ch_versions.mix(SPADES.out.versions.first())

EXTEND_SPADES( reads, SPADES.out.scaffolds, "spades")
SPADES.out.scaffolds
.join(SPADES.out.contigs, remainder:true)
        .map{ meta, scaffold, contig -> [meta, scaffold ? scaffold : contig] } // sometimes no scaffold is created; if so, take the contigs
.set{spades_consensus}

EXTEND_SPADES( reads, spades_consensus, "spades")
ch_scaffolds = ch_scaffolds.mix(EXTEND_SPADES.out.scaffolds)
ch_coverages = ch_coverages.mix(EXTEND_SPADES.out.coverages)
ch_versions = ch_versions.mix(EXTEND_SPADES.out.versions)
5 changes: 4 additions & 1 deletion subworkflows/local/fastq_fasta_mash_screen.nf
@@ -62,8 +62,11 @@ workflow FASTQ_FASTA_MASH_SCREEN {
return [meta + json, fasta, reads]
}

ch_json = SELECT_REFERENCE.out.fasta_reads.map{ meta, json, fasta, reads -> [meta, json]}

emit:
reference_fastq = reference_fastq // channel: [meta, fasta, reads]
reference_fastq = reference_fastq // channel: [meta, fasta, reads ]
json = ch_json // channel: [meta, json ]
versions = ch_versions // channel: [ versions.yml ]
}

4 changes: 4 additions & 0 deletions workflows/viralgenie.nf
@@ -361,6 +361,8 @@ workflow VIRALGENIE {
}
.set{ch_consensus_results_reads}

ch_mash_screen = Channel.empty()

if (params.mapping_constrains && !params.skip_variant_calling ) {
// Importing samplesheet
Channel.fromSamplesheet('mapping_constrains')
@@ -410,6 +412,7 @@
constrain_consensus_reads.multiFastaSelection
)
ch_versions = ch_versions.mix(FASTQ_FASTA_MASH_SCREEN.out.versions)
ch_mash_screen = FASTQ_FASTA_MASH_SCREEN.out.json.collect{it[1]}

// For QC we keep original sequence to compare to
ch_unaligned_raw_contigs = ch_unaligned_raw_contigs
@@ -484,6 +487,7 @@
ch_blast_summary.ifEmpty([]),
ch_constrain_meta,
ch_annotation_summary.ifEmpty([]),
ch_mash_screen.ifEmpty([]),
multiqc_data,
ch_multiqc_comment_headers.ifEmpty([]),
ch_multiqc_custom_table_headers.ifEmpty([])
