Skip to content

Commit

Permalink
Merge pull request #125 from Joon-Klaps/124-fix-oom-for-long-contigs
Browse files Browse the repository at this point in the history
Fix OOM with longer contigs for lowcov_to_reference
  • Loading branch information
Joon-Klaps authored Jun 5, 2024
2 parents 84e37f2 + 32e0892 commit bdd43ba
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 64 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-

### `Fixed`

- OOM with longer contigs for lowcov_to_reference, uses more RAM now ([#125](https://github.com/Joon-Klaps/viralgenie/pull/125))

### `Parameters`
77 changes: 22 additions & 55 deletions bin/lowcov_to_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,38 +87,10 @@ def annotate_ambiguous(reference, consensus, regions, args):
ID = f"{args.prefix}"
DESCRIPTION = f"Hybrid construct of the {args.reference} and {args.consensus} sequences where regions with depth lower then {args.minimum_depth} have been replaced"

# If lengths are the same, then we can use the fast replacement ie no alignment, use mpileup positions
if len(reference) == len(consensus):
logger.info("> Lengths are the same, using fast replacement")
seq = fast_replacement(reference, consensus, regions)
else:
logger.info(
f"> Lengths are NOT the same {args.reference}: {len(reference)} vs. {args.consensus}: {len(consensus)}, using alignment replacement"
)
seq = alignment_replacement(reference, consensus, regions)
seq = alignment_replacement(reference, consensus, regions)

return SeqRecord(seq=Seq(seq), id=ID, description=DESCRIPTION)


def fast_replacement(reference_record, consensus_record, regions):
"""
Annotates ambiguous bases in the consensus sequence with the corresponding bases from the reference sequence.
Args:
reference_record (SeqRecord): The reference sequence record.
consensus_record (SeqRecord): The consensus sequence record.
regions (list): A list of indices representing the regions where annotation should be performed.
Returns:
string
"""
ref_seq = np.array(list(str(reference_record.seq)))
cons_seq = np.array(list(str(consensus_record.seq)))

cons_seq[regions] = ref_seq[regions]
return "".join(cons_seq)


def alignment_replacement(reference_record, consensus_record, regions):
"""
Replaces the aligned regions in the consensus sequence with the corresponding regions from the reference sequence.
Expand Down Expand Up @@ -146,12 +118,13 @@ def alignment_replacement(reference_record, consensus_record, regions):
target_locations = alignment.aligned[0] # Reference locations
query_locations = alignment.aligned[1] # Consensus locations

logger.debug(alignment.aligned)
logger.debug("ALIGNMENT: %s", alignment.aligned)

with open("alignment.txt", "w") as f:
f.write(str(alignment))

# Account for the gaps in the alignment, by updating the consensus indexes
# Needs to be double checked if the object is a tuple.
logger.info("> Finding target tuples")
indexes_differences = find_target_tuples_sorted(regions, target_locations)

Expand All @@ -167,49 +140,43 @@ def alignment_replacement(reference_record, consensus_record, regions):
return "".join(cons_seq)


def find_target_tuples_sorted(regions, target_tuple_list):
def find_target_tuples_sorted(regions, target_matrix):
"""
Finds the target tuples sorted based on the regions.
Args:
regions (list): A list of regions.
target_tuple_list (list): A list of target tuples.
regions (list): A sorted list of regions.
target_matrix (matrix): A n x 2 matrix of [[start1,end1], [start2,end2], ...] alignment positions of the target query
Returns:
list: A list of tuples containing the index of the target tuple and the difference between the region and the start of the target tuple.
"""
result = []

region_index = 0
target_tuple_index = None
difference = None

for start, end in target_tuple_list:
while region_index < len(regions) and regions[region_index] <= end:
region = regions[region_index]
for block_index, (start, end) in enumerate(target_matrix):
logger.debug("target_matrix[%s]: start: %s - end: %s", block_index, start, end)

if start <= region <= end:
# Iterate through regions while they are less than or equal to the end of the target tuple
for region in regions:
if region > end:
logger.error("Region %s is greater than the end of the target tuple %s", region, end)
logger.error("Please report this issue to the developers with your data from the current workdir.")
break
elif start <= region <= end:
# If the region falls within the start and end of the target tuple,
# calculate the index of the target tuple and the difference between the region and the start of the target tuple.
target_tuple_index = target_tuple_list.index((start, end))
# Store the alignment block's index & the difference between the target position and the start alignment block.
difference = region - start
result.append((target_tuple_index, difference))
region_index += 1
elif region < start:
# If the region is before the start of the target tuple, move to the next region.
region_index += 1
else:
# If the region is after the end of the target tuple, break the loop.
break
logger.debug("In alignment block (0-based) %s at postion %s (of that block), the consensus should be updated.", block_index, difference)
result.append((block_index, difference))
return result


def update_query_tuple_with_difference(query_tuple_list, results):
def update_query_tuple_with_difference(query_matrix, results):
updated_query_tuples = []

for target_tuple_index, difference in results:
if target_tuple_index is not None:
query_tuple = query_tuple_list[target_tuple_index]
for block_index, difference in results:
if block_index is not None:
query_tuple = query_matrix[block_index]
updated_query_tuple = query_tuple[0] + difference
updated_query_tuples.append(updated_query_tuple)

Expand Down
2 changes: 1 addition & 1 deletion modules/local/extract_cluster/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ channels:
- bioconda
- defaults
dependencies:
- conda-forge::biopython=1.78
- conda-forge::biopython=1.81
4 changes: 2 additions & 2 deletions modules/local/extract_cluster/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ process EXTRACT_CLUSTER {

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/biopython:1.78':
'biocontainers/biopython:1.78' }"
'https://depot.galaxyproject.org/singularity/biopython:1.81':
'biocontainers/biopython:1.81' }"

input:
tuple val(meta), path(clusters), path(seq)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/lowcov_to_reference/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ channels:
- bioconda
- defaults
dependencies:
- conda-forge::bioframe=0.4.1
- conda-forge::biopython=1.81
6 changes: 3 additions & 3 deletions modules/local/lowcov_to_reference/main.nf
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
process LOWCOV_TO_REFERENCE {
tag "$meta.id"
label 'process_single'
label 'process_medium'
errorStrategy { task.exitStatus == 4 ? 'ignore' : 'retry' } // can fail if mpileup empty

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0':
'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }"
'https://depot.galaxyproject.org/singularity/biopython:1.81':
'biocontainers/biopython:1.81' }"

input:
tuple val(meta), path(reference), path(consensus), path(mpileup)
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ params {
skip_quast = false
skip_blast_qc = false
skip_alignment_qc = false
annotation_db = "https://viralzone.expasy.org/resources/Virosaurus/2020_4/virosaurus90_vertebrate-20200330.fas.gz"
annotation_db = "ftp://ftp.expasy.org/databases/viralzone/2020_4/virosaurus90_vertebrate-20200330.fas.gz"
skip_annotation = false
mmseqs_searchtype = 4

Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@
},
"annotation_db": {
"type": "string",
"default": "https://viralzone.expasy.org/resources/Virosaurus/2020_4/virosaurus90_vertebrate-20200330.fas.gz",
"default": "ftp://ftp.expasy.org/databases/viralzone/2020_4/virosaurus90_vertebrate-20200330.fas.gz",
"fa_icon": "fas fa-database",
"description": "Database used for annotation of the cosensus constructs",
"help_text": "The metada fields are stored in the fasta comment as `key1:\"value1\"|key2:\"value2\"|...` see docs/databases.md for more information."
Expand Down

0 comments on commit bdd43ba

Please sign in to comment.