Merge pull request #125 from Joon-Klaps/124-fix-oom-for-long-contigs

Fix OOM with longer contigs for lowcov_to_reference
Joon-Klaps · Jun 5, 2024 · bdd43ba · bdd43ba
2 parents 84e37f2 + 32e0892
commit bdd43ba
Show file tree

Hide file tree

Showing 8 changed files with 33 additions and 64 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,4 +13,6 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 
 ### `Fixed`
 
+- OOM with longer contigs for lowcov_to_reference, uses more RAM now ([#125](https://github.com/Joon-Klaps/viralgenie/pull/125))
+
 ### `Parameters`
diff --git a/bin/lowcov_to_reference.py b/bin/lowcov_to_reference.py
@@ -87,38 +87,10 @@ def annotate_ambiguous(reference, consensus, regions, args):
     ID = f"{args.prefix}"
     DESCRIPTION = f"Hybrid construct of the {args.reference} and {args.consensus} sequences where regions with depth lower then {args.minimum_depth} have been replaced"
 
-    # If lengths are the same, then we can use the fast replacement ie no alignment, use mpileup positions
-    if len(reference) == len(consensus):
-        logger.info("> Lengths are the same, using fast replacement")
-        seq = fast_replacement(reference, consensus, regions)
-    else:
-        logger.info(
-            f"> Lengths are NOT the same  {args.reference}: {len(reference)} vs. {args.consensus}: {len(consensus)}, using alignment replacement"
-        )
-        seq = alignment_replacement(reference, consensus, regions)
+    seq = alignment_replacement(reference, consensus, regions)
 
     return SeqRecord(seq=Seq(seq), id=ID, description=DESCRIPTION)
 
-
-def fast_replacement(reference_record, consensus_record, regions):
-    """
-    Annotates ambiguous bases in the consensus sequence with the corresponding bases from the reference sequence.
-
-    Args:
-        reference_record (SeqRecord): The reference sequence record.
-        consensus_record (SeqRecord): The consensus sequence record.
-        regions (list): A list of indices representing the regions where annotation should be performed.
-
-    Returns:
-        string
-    """
-    ref_seq = np.array(list(str(reference_record.seq)))
-    cons_seq = np.array(list(str(consensus_record.seq)))
-
-    cons_seq[regions] = ref_seq[regions]
-    return "".join(cons_seq)
-
-
 def alignment_replacement(reference_record, consensus_record, regions):
     """
     Replaces the aligned regions in the consensus sequence with the corresponding regions from the reference sequence.
@@ -146,12 +118,13 @@ def alignment_replacement(reference_record, consensus_record, regions):
     target_locations = alignment.aligned[0] # Reference locations
     query_locations = alignment.aligned[1]  # Consensus locations
 
-    logger.debug(alignment.aligned)
+    logger.debug("ALIGNMENT: %s", alignment.aligned)
 
     with open("alignment.txt", "w") as f:
         f.write(str(alignment))
 
     # Account for the gaps in the alignment, by updating the consensus indexes
+    # Needs to be double checked if the object is a tuple.
     logger.info("> Finding target tuples")
     indexes_differences = find_target_tuples_sorted(regions, target_locations)
 
@@ -167,49 +140,43 @@ def alignment_replacement(reference_record, consensus_record, regions):
     return "".join(cons_seq)
 
 
-def find_target_tuples_sorted(regions, target_tuple_list):
+def find_target_tuples_sorted(regions, target_matrix):
     """
     Finds the target tuples sorted based on the regions.
 
     Args:
-        regions (list): A list of regions.
-        target_tuple_list (list): A list of target tuples.
+        regions (list): A sorted list of regions.
+        target_matrix (matrix): A n x 2 matrix of [[start1,end1], [start2,end2], ...] alignment positions of the target query
 
     Returns:
         list: A list of tuples containing the index of the target tuple and the difference between the region and the start of the target tuple.
     """
     result = []
 
-    region_index = 0
-    target_tuple_index = None
-    difference = None
-
-    for start, end in target_tuple_list:
-        while region_index < len(regions) and regions[region_index] <= end:
-            region = regions[region_index]
+    for block_index, (start, end) in enumerate(target_matrix):
+        logger.debug("target_matrix[%s]: start: %s - end: %s", block_index, start, end)
 
-            if start <= region <= end:
+        # Iterate through regions while they are less than or equal to the end of the target tuple
+        for region in regions:
+            if region > end:
+                logger.error("Region %s is greater than the end of the target tuple %s", region, end)
+                logger.error("Please report this issue to the developers with your data from the current workdir.")
+                break
+            elif start <= region <= end:
                 # If the region falls within the start and end of the target tuple,
-                # calculate the index of the target tuple and the difference between the region and the start of the target tuple.
-                target_tuple_index = target_tuple_list.index((start, end))
+                # Store the alignment block's index & the difference between the target position and the start alignment block.
                 difference = region - start
-                result.append((target_tuple_index, difference))
-                region_index += 1
-            elif region < start:
-                # If the region is before the start of the target tuple, move to the next region.
-                region_index += 1
-            else:
-                # If the region is after the end of the target tuple, break the loop.
-                break
+                logger.debug("In alignment block (0-based) %s at postion %s (of that block), the consensus should be updated.", block_index, difference)
+                result.append((block_index, difference))
     return result
 
 
-def update_query_tuple_with_difference(query_tuple_list, results):
+def update_query_tuple_with_difference(query_matrix, results):
     updated_query_tuples = []
 
-    for target_tuple_index, difference in results:
-        if target_tuple_index is not None:
-            query_tuple = query_tuple_list[target_tuple_index]
+    for block_index, difference in results:
+        if block_index is not None:
+            query_tuple = query_matrix[block_index]
             updated_query_tuple = query_tuple[0] + difference
             updated_query_tuples.append(updated_query_tuple)
 

diff --git a/modules/local/extract_cluster/environment.yml b/modules/local/extract_cluster/environment.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - conda-forge::biopython=1.78
+  - conda-forge::biopython=1.81
diff --git a/modules/local/extract_cluster/main.nf b/modules/local/extract_cluster/main.nf
@@ -4,8 +4,8 @@ process EXTRACT_CLUSTER {
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/biopython:1.78':
-        'biocontainers/biopython:1.78' }"
+        'https://depot.galaxyproject.org/singularity/biopython:1.81':
+        'biocontainers/biopython:1.81' }"
 
     input:
     tuple val(meta), path(clusters), path(seq)

diff --git a/modules/local/lowcov_to_reference/environment.yml b/modules/local/lowcov_to_reference/environment.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - conda-forge::bioframe=0.4.1
+  - conda-forge::biopython=1.81
diff --git a/modules/local/lowcov_to_reference/main.nf b/modules/local/lowcov_to_reference/main.nf
@@ -1,12 +1,12 @@
 process LOWCOV_TO_REFERENCE {
     tag "$meta.id"
-    label 'process_single'
+    label 'process_medium'
     errorStrategy { task.exitStatus == 4 ? 'ignore' : 'retry' } // can fail if mpileup empty
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0':
-        'biocontainers/mulled-v2-949aaaddebd054dc6bded102520daff6f0f93ce6:aa2a3707bfa0550fee316844baba7752eaab7802-0' }"
+        'https://depot.galaxyproject.org/singularity/biopython:1.81':
+        'biocontainers/biopython:1.81' }"
 
     input:
     tuple val(meta), path(reference), path(consensus), path(mpileup)

diff --git a/nextflow.config b/nextflow.config
@@ -143,7 +143,7 @@ params {
     skip_quast                  = false
     skip_blast_qc               = false
     skip_alignment_qc           = false
-    annotation_db               = "https://viralzone.expasy.org/resources/Virosaurus/2020_4/virosaurus90_vertebrate-20200330.fas.gz"
+    annotation_db               = "ftp://ftp.expasy.org/databases/viralzone/2020_4/virosaurus90_vertebrate-20200330.fas.gz"
     skip_annotation             = false
     mmseqs_searchtype           = 4
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -671,7 +671,7 @@
                 },
                 "annotation_db": {
                     "type": "string",
-                    "default": "https://viralzone.expasy.org/resources/Virosaurus/2020_4/virosaurus90_vertebrate-20200330.fas.gz",
+                    "default": "ftp://ftp.expasy.org/databases/viralzone/2020_4/virosaurus90_vertebrate-20200330.fas.gz",
                     "fa_icon": "fas fa-database",
                     "description": "Database used for annotation of the cosensus constructs",
                     "help_text": "The metada fields are stored in the fasta comment as `key1:\"value1\"|key2:\"value2\"|...` see docs/databases.md for more information."