Commit

Merge branch 'dev' into database-sample-specifc
Joon-Klaps authored Aug 22, 2024
2 parents f3861b3 + 680be05 commit 9f25774
Showing 11 changed files with 114 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
- Add read & contig decomplexification using prinseq++ ([#133](https://github.com/Joon-Klaps/viralgenie/pull/133))
- Allow reference pool for specific samples ([#134](https://github.com/Joon-Klaps/viralgenie/pull/134))
- Add option to filter contig clusters based on cumulative read coverage ([#138](https://github.com/Joon-Klaps/viralgenie/pull/138))
- Add mash-screen output to result table ([#140](https://github.com/Joon-Klaps/viralgenie/pull/140))
- Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))

### `Fixed`

11 changes: 11 additions & 0 deletions bin/blast_filter.py
@@ -177,6 +177,15 @@ def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")

    if args.blast is None and args.contigs.is_file():
        logger.warning("No blast input was provided, just copying the input file.")
        with open(args.contigs, "r") as contigs_file:
            contig_content = contigs_file.read()
        with open(f"{args.prefix}_withref.fa", "w") as f:
            f.write(contig_content)
        return 0

if not args.blast.is_file():
logger.error(f"The given input file {args.blast} was not found!")
sys.exit(2)
@@ -193,6 +202,8 @@ def main(argv=None):

write_hits(df_filter, args.contigs, args.references, args.prefix)

return 0


if __name__ == "__main__":
sys.exit(main())
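For reference, the new early-return path can be sketched in isolation as follows; this is a minimal self-contained sketch with a hypothetical helper and file names, not the pipeline script itself:

# Minimal sketch of the no-blast fallback: when no blast table is supplied but
# the contigs file exists, the contigs are copied verbatim to <prefix>_withref.fa
# and the program exits cleanly.
from pathlib import Path

def copy_contigs_fallback(blast, contigs: Path, prefix: str) -> int:
    if blast is None and contigs.is_file():
        Path(f"{prefix}_withref.fa").write_text(contigs.read_text())
        return 0
    return 1  # a caller would continue with the normal blast filtering

contigs = Path("sample1.contigs.fa")  # hypothetical input
contigs.write_text(">c1\nACGTACGT\n")
assert copy_contigs_fallback(None, contigs, "sample1") == 0
print(Path("sample1_withref.fa").read_text())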
62 changes: 58 additions & 4 deletions bin/custom_multiqc_tables.py
@@ -5,6 +5,7 @@
import argparse
import csv
import logging
import json
import os
import re
import sys
@@ -84,6 +85,14 @@ def file_choices(choices, fname):
type=lambda s: file_choices(("csv", "tsv", "yaml","yml"), s),
)

parser.add_argument(
"--screen_files",
metavar="MASH SCREEN FILES",
nargs="+",
help="Mash screen result of the module SELECT REFERENCE where top hit is outputted in json file format",
type=lambda s: file_choices(("json"), s),
)
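The trailing comma in ("json",) matters: without it the parentheses are plain grouping, so a bare string would reach file_choices, and, assuming file_choices tests membership with Python's "in" as the csv/tsv call above suggests, that would silently become a substring match. A quick demonstration:

# ("json") is just the string "json"; ("json",) is a one-element tuple.
ext = "jso"                  # a typo'd extension
print(ext in ("json"))       # True  - substring match against a string
print(ext in ("json",))      # False - proper membership test against a tuple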

parser.add_argument(
"--mapping_constrains",
metavar="MAPPING CONSTRAINS",
@@ -204,14 +213,24 @@ def concat_table_files(table_files, **kwargs):
Returns:
pandas.DataFrame: The concatenated dataframe.
"""
    try:
        valid_dfs = [
            read_file_to_dataframe(file, **kwargs)
            for file in table_files
            if check_file_exists(file)
        ]

        if not valid_dfs:
            logging.warning("No valid files found to concatenate: %s", table_files)
            return pd.DataFrame()

        df = pd.concat(valid_dfs)
        return df

    except ValueError as e:
        logging.warning("Error concatenating files %s: %s", table_files, e)
        return pd.DataFrame()
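The same guard pattern in a self-contained form, with plain print standing in for the script's logging (a sketch, not the pipeline code itself):

import pandas as pd

def safe_concat(dfs, source="<files>"):
    # No readable inputs at all: return an empty frame instead of crashing.
    if not dfs:
        print(f"Warning: no valid files found to concatenate: {source}")
        return pd.DataFrame()
    try:
        return pd.concat(dfs)
    except ValueError as e:
        # defensive catch, mirroring the script's except ValueError branch
        print(f"Warning concatenating {source}: {e}")
        return pd.DataFrame()

print(safe_concat([]).empty)                          # True
print(safe_concat([pd.DataFrame({"a": [1]})]).shape)  # (1, 1)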


def read_in_quast(table_files):
@@ -381,7 +400,31 @@ def read_dataframe_from_yaml(file, **kwargs):
df = pd.DataFrame(data)
return df

def read_dataframe_from_json(file, **kwargs):
    """
    Read a dataframe from a JSON file.

    Args:
        file (str): The path to the file.

    Returns:
        pandas.DataFrame: The dataframe read from the file.
    """
    with open(file, "r") as json_file:
        try:
            data = json.load(json_file, **kwargs)
        except json.JSONDecodeError as e:
            logger.warning("Error reading JSON file %s: %s", file, e)
            return pd.DataFrame()

    # Check if 'filename' key exists
    if 'filename' not in data:
        # Get the filename without path and suffix
        filename = os.path.splitext(os.path.basename(file))[0]
        # Add new key-value pair
        data['filename'] = filename + '_constrain'

    df = pd.DataFrame([data])
    return df

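A hypothetical round trip for the filename-injection logic above; the mash-screen fields shown are illustrative assumptions, not the real output schema:

import json, os
import pandas as pd

with open("sample1.json", "w") as fh:
    json.dump({"identity": "0.99", "shared-hashes": "950/1000"}, fh)

with open("sample1.json") as fh:
    data = json.load(fh)
if "filename" not in data:  # same injection as read_dataframe_from_json
    data["filename"] = os.path.splitext(os.path.basename("sample1.json"))[0] + "_constrain"

print(pd.DataFrame([data])["filename"][0])  # sample1_constrain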
def read_file_to_dataframe(file, **kwargs):
"""
Read a dataframe from a file.
@@ -402,6 +445,8 @@ def read_file_to_dataframe(file, **kwargs):
df = read_dataframe_from_csv(file_path, **kwargs)
elif file_path.suffix in [".yaml", ".yml"]:
df = read_dataframe_from_yaml(file_path, **kwargs)
elif file_path.suffix in [".json"]:
df = read_dataframe_from_json(file_path, **kwargs)
else:
logger.error("The file format %s is not supported of file %s!", file_path.suffix, file_path)
sys.exit(2)
@@ -879,6 +924,13 @@ def main(argv=None):
blast_df, "blast", "query", blast_header, "summary_blast_mqc.tsv"
)

# MASH screen used for reference selection summarisation
screen_df = handle_tables(args.screen_files)
if not screen_df.empty:
        screen_df = handle_dataframe(screen_df, "mash-screen", "filename")
# Make everything a string
screen_df = screen_df.astype(str)

    # Cluster table - mmseqs easysearch summary - annotation section
annotation_df = handle_tables(args.annotation_files, header=None)
if not annotation_df.empty:
@@ -937,6 +989,7 @@
logger.info("Joining dataframes")
multiqc_contigs_df = multiqc_contigs_df.join(checkv_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(quast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(screen_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(blast_df, how="outer")
multiqc_contigs_df = multiqc_contigs_df.join(annotation_df, how="outer")

@@ -975,6 +1028,7 @@
final_columns = (
["index", "sample name", "cluster", "step"]
+ [column for column in mqc_contigs_sel.columns if "annotation" in column]
+ [column for column in mqc_contigs_sel.columns if "mash-screen" in column]
+ [column for column in mqc_contigs_sel.columns if "blast" in column]
+ [column for column in mqc_contigs_sel.columns if "checkv" in column]
+ [column for column in mqc_contigs_sel.columns if "QC check" in column]
1 change: 1 addition & 0 deletions conf/modules.config
@@ -890,6 +890,7 @@ process {
}

withName: RENAME_FASTA_HEADER_SINGLETON {
ext.prefix = { "${meta.id}_singleton" } // DON'T CHANGE
publishDir = [
enabled: false
]
2 changes: 1 addition & 1 deletion conf/tests/test_fail_mapped.config
@@ -35,7 +35,7 @@ params {
mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains_fail.tsv"

min_mapped_reads = 100
intermediate_mapping_stats = true
intermediate_mapping_stats = true
skip_checkv = true
}

10 changes: 6 additions & 4 deletions modules/local/blast_filter/main.nf
@@ -8,13 +8,14 @@ process BLAST_FILTER {
'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }"

input:
tuple val(meta), path(blast), path(contigs)
tuple val(meta), path(blast)
tuple val(meta), path(contigs)
tuple val(meta2), path(db)

output:
tuple val(meta), path("*.hits.txt") , emit: hits
tuple val(meta), path("*.hits.txt") , emit: hits, optional: true
tuple val(meta), path("*.fa") , emit: sequence
tuple val(meta), path("*.filter.tsv"), emit: filter
tuple val(meta), path("*.filter.tsv"), emit: filter, optional: true
path "versions.yml" , emit: versions

when:
@@ -23,10 +24,11 @@
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def blast_command = blast ? "-i ${blast}" : ""
"""
blast_filter.py \\
$args \\
-i ${blast} \\
${blast_command} \\
-c ${contigs} \\
-r ${db} \\
-p ${prefix}
21 changes: 12 additions & 9 deletions modules/local/custom_multiqc_tables/main.nf
@@ -14,20 +14,21 @@ process CUSTOM_MULTIQC_TABLES {
path blast_files, stageAs: "?/blast/*"
path mapping_constrains
path anno_files, stageAs: "?/annotation/*"
path screen_files, stageAs: "?/screen/*"
path multiqc_dir
path comment_headers
path custom_table_headers

output:
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc, optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc, optional: true
path("summary_clusters_mqc.tsv") , emit: summary_clusters_mqc , optional: true
path("sample_metadata_mqc.tsv") , emit: sample_metadata_mqc , optional: true
path("contigs_overview_mqc.tsv") , emit: contigs_overview_mqc , optional: true
path("summary_checkv_mqc.tsv") , emit: summary_checkv_mqc , optional: true
path("summary_quast_mqc.tsv") , emit: summary_quast_mqc , optional: true
path("summary_blast_mqc.tsv") , emit: summary_blast_mqc , optional: true
path("summary_anno_mqc.tsv") , emit: summary_anno_mqc , optional: true
path("mapping_constrains_mqc.tsv") , emit: mapping_constrains_mqc , optional: true
path("mapping_constrains_summary_mqc.tsv"), emit: constrains_summary_mqc , optional: true
path "versions.yml" , emit: versions

when:
Expand All @@ -42,6 +43,7 @@ process CUSTOM_MULTIQC_TABLES {
def blast_files = blast_files ? "--blast_files ${blast_files}" : ''
def annotation_files = anno_files ? "--annotation_files ${anno_files}" : ''
def mapping_constrains = mapping_constrains ? "--mapping_constrains ${mapping_constrains}" : ''
def screen_files = screen_files ? "--screen_files ${screen_files}" : ''
def multiqc_dir = multiqc_dir ? "--multiqc_dir ${multiqc_dir}" : ''
def comment_headers = comment_headers ? "--comment_dir ${comment_headers}" : ''
def custom_table_headers = custom_table_headers ? "--table_headers ${custom_table_headers}" : ''
@@ -56,6 +58,7 @@
$blast_files \\
$mapping_constrains \\
$annotation_files \\
    $screen_files \\
$comment_headers \\
$custom_table_headers \\
$multiqc_dir
12 changes: 9 additions & 3 deletions subworkflows/local/fasta_blast_refsel.nf
@@ -41,11 +41,17 @@ workflow FASTA_BLAST_REFSEL {
    // Filter out false-positive hits based on query length, alignment length, identity, e-value & bit-score
ch_blast_txt
.hits
.join(fasta, by:[0], remainder:false)
.set{ hits_contigs }
.join(fasta, by:[0], remainder:true)
.multiMap{
meta, txt, fasta ->
hits : [meta, txt ? txt : []]
contigs : [meta, fasta]
}
.set{input_blast_filter}

BLAST_FILTER (
hits_contigs,
input_blast_filter.hits,
input_blast_filter.contigs,
blast_db_fasta
)
ch_versions = ch_versions.mix(BLAST_FILTER.out.versions.first())
7 changes: 6 additions & 1 deletion subworkflows/local/fastq_assembly.nf
@@ -37,7 +37,12 @@ workflow FASTQ_ASSEMBLY {
)
ch_versions = ch_versions.mix(SPADES.out.versions.first())

EXTEND_SPADES( reads, SPADES.out.scaffolds, "spades")
SPADES.out.scaffolds
.join(SPADES.out.contigs, remainder:true)
        .map{ meta, scaffold, contig -> [meta, scaffold ? scaffold : contig] } // sometimes no scaffold is created; if so, take the contigs
.set{spades_consensus}

EXTEND_SPADES( reads, spades_consensus, "spades")
ch_scaffolds = ch_scaffolds.mix(EXTEND_SPADES.out.scaffolds)
ch_coverages = ch_coverages.mix(EXTEND_SPADES.out.coverages)
ch_versions = ch_versions.mix(EXTEND_SPADES.out.versions)
5 changes: 4 additions & 1 deletion subworkflows/local/fastq_fasta_mash_screen.nf
@@ -62,8 +62,11 @@ workflow FASTQ_FASTA_MASH_SCREEN {
return [meta + json, fasta, reads]
}

ch_json = SELECT_REFERENCE.out.fasta_reads.map{ meta, json, fasta, reads -> [meta, json]}

emit:
reference_fastq = reference_fastq // channel: [meta, fasta, reads]
reference_fastq = reference_fastq // channel: [meta, fasta, reads ]
json = ch_json // channel: [meta, json ]
versions = ch_versions // channel: [ versions.yml ]
}

4 changes: 4 additions & 0 deletions workflows/viralgenie.nf
@@ -361,6 +361,8 @@ workflow VIRALGENIE {
}
.set{ch_consensus_results_reads}

ch_mash_screen = Channel.empty()

if (params.mapping_constrains && !params.skip_variant_calling ) {
// Importing samplesheet
Channel.fromSamplesheet('mapping_constrains')
@@ -410,6 +412,7 @@
constrain_consensus_reads.multiFastaSelection
)
ch_versions = ch_versions.mix(FASTQ_FASTA_MASH_SCREEN.out.versions)
ch_mash_screen = FASTQ_FASTA_MASH_SCREEN.out.json.collect{it[1]}

// For QC we keep original sequence to compare to
ch_unaligned_raw_contigs = ch_unaligned_raw_contigs
@@ -484,6 +487,7 @@
ch_blast_summary.ifEmpty([]),
ch_constrain_meta,
ch_annotation_summary.ifEmpty([]),
ch_mash_screen.ifEmpty([]),
multiqc_data,
ch_multiqc_comment_headers.ifEmpty([]),
ch_multiqc_custom_table_headers.ifEmpty([])
