From d7ea80226e83ec69829eb0f62bd44a773aca68b0 Mon Sep 17 00:00:00 2001
From: Mark Walker <markw@broadinstitute.org>
Date: Fri, 14 Jun 2024 13:13:01 -0400
Subject: [PATCH] Finish addressing comments

---
 .../process_posthoc_cpx_depth_regenotyping.py | 73 ++++++++++---------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py
index 6e2626ead..8b63e6073 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py
@@ -2,6 +2,7 @@
 
 import argparse
 import gzip
+import logging
 import sys
 
 from collections import Counter
@@ -320,6 +321,7 @@ def _count_del_dup_wt(variants_list, samples, control_cn):
 
 def clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict):
     visited_vids = set()  # Only visit each unique extended vid (w/ interval info)
+    cleaned_genotype_counts = dict()
     for query_record in genotype_counts:
         if query_record.merged_vid in visited_vids:
             continue
@@ -354,14 +356,19 @@ def clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tre
                         cnv_assessment = "TOO_SMALL"
                     else:
                         cnv_assessment = "WT"
-                    yield CleanedVariantIntervalRecord(interval_chrom, interval_start, interval_end,
-                                                       interval_type, matching_record.vid, matching_record.sv_type,
-                                                       matching_record.cpx_type, matching_record.carrier_del,
-                                                       matching_record.carrier_wt, matching_record.carrier_dup,
-                                                       matching_record.control_del, matching_record.control_wt,
-                                                       matching_record.control_dup,
-                                                       matching_record.diff_case_control_del_frac,
-                                                       matching_record.diff_case_control_dup_frac, cnv_assessment)
+                    record = CleanedVariantIntervalRecord(interval_chrom, interval_start, interval_end,
+                                                          interval_type, matching_record.vid, matching_record.sv_type,
+                                                          matching_record.cpx_type, matching_record.carrier_del,
+                                                          matching_record.carrier_wt, matching_record.carrier_dup,
+                                                          matching_record.control_del, matching_record.control_wt,
+                                                          matching_record.control_dup,
+                                                          matching_record.diff_case_control_del_frac,
+                                                          matching_record.diff_case_control_dup_frac, cnv_assessment)
+                    if record.vid not in cleaned_genotype_counts:
+                        cleaned_genotype_counts[record.vid] = [record]
+                    else:
+                        cleaned_genotype_counts[record.vid].append(record)
+    return cleaned_genotype_counts
 
 
 def parse_ends(vcf_path: Text) -> Dict[Text, int]:
@@ -398,20 +405,13 @@ def parse_ends(vcf_path: Text) -> Dict[Text, int]:
     return end_dict
 
 
-def read_vcf(vcf_path, cleaned_genotype_counts):
+def read_vcf(vcf_path, cleaned_genotype_counts_vids):
     end_dict = parse_ends(vcf_path)
-    vids = set([r.vid for r in cleaned_genotype_counts])
     with VariantFile(vcf_path) as f:
-        return {r.id: VcfRecord(r, end_dict) for r in f if r.id in vids}
+        return {r.id: VcfRecord(r, end_dict) for r in f if r.id in cleaned_genotype_counts_vids}
 
 
 def final_assessment(cleaned_genotype_counts, variants_to_reclassify):
-    # Group records by vid, ie dictionary of vid: list of records
-    genotype_counts_dict = dict()
-    for record in cleaned_genotype_counts:
-        if record.vid not in genotype_counts_dict:
-            genotype_counts_dict[record.vid] = list()
-        genotype_counts_dict[record.vid].append(record)
 
     def _subtract_interval(start1, end1, start2, end2):
         # Spanning case results in null (shouldn't ever do this in this context)
@@ -490,6 +490,7 @@ def _evaluate_ctx_ins(r, records_list, default_sv_type, default_cpx_type):
                            if has_reciprocal_overlap(r.source_chrom, r.source_start, r.source_end,
                                                      m.chrom, m.start, m.end, 0.95)]
         source_is_dup = any(m.cnv_assessment == "DUP" for m in source_overlaps)
+        # TODO: could overlap against r.sink_*
         sinks = [m for m in records_list
                  if not has_reciprocal_overlap(r.source_chrom, r.source_start, r.source_end,
                                                m.chrom, m.start, m.end, 0.95)]
@@ -606,7 +607,7 @@ def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type):
                     # Revise inv interval (subtracting del interval)
                     inv_start, inv_end = _subtract_interval(inv_start, inv_end, r.sink_start, r.sink_end)
                     if inv_start is None:
-                        raise ValueError(f"Snk interval {r.sink_start}-{r.sink_end} spans inversion interval "
+                        raise ValueError(f"Sink interval {r.sink_start}-{r.sink_end} spans inversion interval "
                                          f"{inv_start}-{inv_end} for record {r.vid}")
                     return VariantAssessment(
                         vid=r.vid,
@@ -731,7 +732,7 @@ def _evaluate_dup3_ins5(r, records_list, default_sv_type, default_cpx_type):
                         new_end=max(dup_start, dup_end, inv_start, inv_end, r.sink_start, r.sink_end)
                     )
                 else:
-                    # If sink is not clearly deleted (or too small), classify as dupINV (no del)
+                    # If sink is not clearly deleted (or too small), classify as INVdup (no del)
                     return VariantAssessment(
                         vid=r.vid,
                         modification="RECLASSIFY",
@@ -802,19 +803,11 @@ def _evaluate_ins_idel(r, records_list, default_sv_type, default_cpx_type):
             if len(sinks) > 0:
                 # TODO unstable ordering
                 sink_record = sinks[0]
+                # TODO Both sink_is_del and sink_is_not_del could both be true. The logic here needs to be cleaned up.
                 if sink_record.size() >= MIN_SIZE_IDEL:
                     # If insertion site size >= MINSIZEiDEL, check if insertion site is deleted
-                    sink_is_del = any(
-                        m.cnv_assessment == "DEL" and not has_reciprocal_overlap(
-                            r.sink_chrom, r.sink_start, r.sink_end,
-                            m.chrom, m.start, m.end, 0.95)
-                        for m in records_list
-                    )
-                    sink_is_not_del = any(
-                        m.cnv_assessment == "WT" and not has_reciprocal_overlap(
-                            r.sink_chrom, r.sink_start, r.sink_end, m.chrom, m.start, m.end, 0.95)
-                        for m in records_list
-                    )
+                    sink_is_del = any(m.cnv_assessment == "DEL" for m in sinks)
+                    sink_is_not_del = any(m.cnv_assessment == "WT" for m in sinks)
                 else:
                     sink_is_del = False
                     sink_is_not_del = False
@@ -829,7 +822,7 @@ def _evaluate_ins_idel(r, records_list, default_sv_type, default_cpx_type):
                         new_sv_type="CPX",
                         new_cpx_type="INS_iDEL",
                         new_cpx_intervals=f"DEL_{r.sink_string()}",
-                        new_svlen=None + r.source_size(),
+                        new_svlen=r.sink_size() + r.source_size(),
                         new_source=f"INS_{r.source_string()}",
                         new_start=None,
                         new_end=None
@@ -907,6 +900,7 @@ def _evaluate_bnd(r, records_list, default_sv_type, default_cpx_type):
         )
         if dup_confirmed:
             # If dup confirms, resolve as palindromic inverted duplication
+            # TODO Investigate why these are never being found
             new_cpx_type = "piDUP_FR" if default_cpx_type == "INVERSION_SINGLE_ENDER_" else "piDUP_RF"
             return VariantAssessment(
                 vid=r.vid,
@@ -952,8 +946,9 @@ def _evaluate_irrelevant(r, default_sv_type, default_cpx_type):
         )
 
     # SV and CPX class each is most common one
-    for vid, records in genotype_counts_dict.items():
+    for vid, records in cleaned_genotype_counts.items():
         if vid not in variants_to_reclassify:
+            logging.warning(f"Variant ID {vid} was not found in vcf, ignoring...")
             continue
         vcf_record = variants_to_reclassify[vid]
         sv_type = Counter([r.variant_sv_type for r in records]).most_common(1)[0][0]
@@ -1131,6 +1126,9 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
     parser.add_argument('--reclassification-table', type=str, help='Output reclassification table path', required=False)
     parser.add_argument('--chrx', type=str, help='Chromosome X contig name', default='chrX')
     parser.add_argument('--chry', type=str, help='Chromosome Y contig name', default='chrY')
+    parser.add_argument("-l", "--log-level", required=False, default="INFO",
+                        help="Specify level of logging information, ie. info, warning, error (not case-sensitive). "
+                             "Default: INFO")
     if len(argv) <= 1:
         parser.parse_args(["--help"])
         sys.exit(0)
@@ -1143,6 +1141,13 @@ def main(argv: Optional[List[Text]] = None):
         argv = sys.argv
     args = _parse_arguments(argv)
 
+    # Set logging level from -l input
+    log_level = args.log_level
+    numeric_level = getattr(logging, log_level.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError('Invalid log level: %s' % log_level)
+    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s: %(message)s')
+
     all_samples, male_samples, female_samples = parse_ped(args.ped)
     genotype_dict = parse_genotypes(args.genotypes)
     intervals_list_dict, genotype_counts = genotype_counts_per_variant(intervals_path=args.intervals,
@@ -1154,9 +1159,9 @@ def main(argv: Optional[List[Text]] = None):
                                                                        female_samples=female_samples)
     del genotype_dict
     genotype_counts_tree_dict = make_genotype_tree(genotype_counts)
-    cleaned_genotype_counts = list(clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict))
+    cleaned_genotype_counts = clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict)
     del genotype_counts, intervals_list_dict, genotype_counts_tree_dict
-    variants_to_reclassify = read_vcf(args.vcf, cleaned_genotype_counts)
+    variants_to_reclassify = read_vcf(args.vcf, cleaned_genotype_counts_vids=cleaned_genotype_counts.keys())
     assessment = list(final_assessment(cleaned_genotype_counts, variants_to_reclassify))
     del cleaned_genotype_counts, variants_to_reclassify
     if args.reclassification_table: