From 68015ed6d3141af338972656ec95495bba60bdc7 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 13 Nov 2023 14:36:27 -0500
Subject: [PATCH 01/15] add python file for SplitVariants task

---
 src/sv-pipeline/scripts/SplitVariants.py | 70 ++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 src/sv-pipeline/scripts/SplitVariants.py

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
new file mode 100644
index 000000000..85bbc0b01
--- /dev/null
+++ b/src/sv-pipeline/scripts/SplitVariants.py
@@ -0,0 +1,70 @@
+#!/bin/python
+
+import pandas as pd
+import csv
+import os
+import argparse
+def process_bed_file(input_bed, N, bca=True):
+    condition_prefixes = {
+        'gt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
+        'lt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
+        'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')},
+        'ins': {'condition': lambda line: bca and line[4] == 'INS'}
+    }
+
+    current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
+    current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+
+    with open(input_bed, 'r') as infile:
+        for line in infile:
+            line = line.strip().split('\t')
+
+            for prefix, conditions in condition_prefixes.items():
+                if conditions['condition'](line):
+                    current_lines[prefix].append('\t'.join(line))
+                    current_counts[prefix] += 1
+
+                    if current_counts[prefix] == N:
+                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
+                        output_file = f"{prefix}.{output_suffix}.bed"
+                        with open(output_file, 'w') as outfile:
+                            outfile.write('\n'.join(current_lines[prefix]))
+
+                        print(f"File {output_file} written.")
+                        current_lines[prefix] = []
+                        current_counts[prefix] = 0
+                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
+
+    # Handle remaining lines after the loop
+    for prefix, lines in current_lines.items():
+        if lines:
+            output_suffix = current_suffixes[prefix].rjust(6, 'a')
+            output_file = f"{prefix}.{output_suffix}.bed"
+            with open(output_file, 'w') as outfile:
+                outfile.write('\n'.join(lines))
+
+            print(f"File {output_file} written.")
+
+def increment_suffix(suffix):
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    if suffix == 'z' * 6:
+        return 'a' * 6
+    else:
+        index = alphabet.index(suffix[0])
+        next_char = alphabet[(index + 1) % 26]
+        return next_char + suffix[1:]
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bed", help="Path to input bed file")
+    parser.add_argument("--n_per_split", help="number of variants per file")
+    parser.add_argument("--bca", default="FALSE", help="")
+    args = parser.parse_args()
+    process_bed_file(args.bed, args.n_per_split, args.bca)
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    main()
+

From 8d7ca52d75a2e029bdace5c35defcc7581c778b7 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Tue, 14 Nov 2023 10:58:18 -0500
Subject: [PATCH 02/15] edited TasksGenotypeBatch.wdl command to call
 SplitVariants.py

---
 src/sv-pipeline/scripts/SplitVariants.py |  4 ++--
 wdl/TasksGenotypeBatch.wdl               | 20 +++++---------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
index 85bbc0b01..d1603754e 100644
--- a/src/sv-pipeline/scripts/SplitVariants.py
+++ b/src/sv-pipeline/scripts/SplitVariants.py
@@ -59,10 +59,10 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--bed", help="Path to input bed file")
-    parser.add_argument("--n_per_split", help="number of variants per file")
+    parser.add_argument("--n", help="number of variants per file")
     parser.add_argument("--bca", default="FALSE", help="")
     args = parser.parse_args()
-    process_bed_file(args.bed, args.n_per_split, args.bca)
+    process_bed_file(args.bed, args.n, args.bca)
 
 # Press the green button in the gutter to run the script.
 if __name__ == '__main__':
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index 1b853a056..56e8d6753 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -28,22 +28,12 @@ task SplitVariants {
     Array[File] ins_beds = glob("ins.*")
   }
   command <<<
-
     set -euo pipefail
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
-    if [ ~{generate_bca} == "true" ]; then
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
-    fi
+    svtk vcf2bed ~{vcf} bed_file.bed
+    python /opt/sv-pipeline/scripts/SplitVariants.py \
+    --bed bed_file.bed \
+    ~{"--n " + n_per_split} \
+    ~{"--bca " + generate_bca}
 
   >>>
   runtime {

From b4f84146a4f22434f06b997e79c1f1c70e4fdc17 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Fri, 17 Nov 2023 14:55:01 -0500
Subject: [PATCH 03/15] pass --bca as a bare flag in TasksGenotypeBatch.wdl
 and switch sv_pipeline_docker image

---
 inputs/values/dockers.json | 2 +-
 wdl/TasksGenotypeBatch.wdl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
index e8ab74bea..2b4de1013 100644
--- a/inputs/values/dockers.json
+++ b/inputs/values/dockers.json
@@ -13,7 +13,7 @@
   "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
   "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
   "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
-  "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
+  "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:8d7ca52",
   "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index 56e8d6753..b43321556 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -33,7 +33,7 @@ task SplitVariants {
     python /opt/sv-pipeline/scripts/SplitVariants.py \
     --bed bed_file.bed \
     ~{"--n " + n_per_split} \
-    ~{"--bca " + generate_bca}
+    ~{if generate_bca then "--bca" else ""}
 
   >>>
   runtime {

From d7ee19ecb0e63e230d5e4721906212d499e19d25 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 20 Nov 2023 14:53:52 -0500
Subject: [PATCH 04/15] changed docker to include correct tag for sv-pipeline

---
 inputs/values/dockers.json               | 2 +-
 src/sv-pipeline/scripts/SplitVariants.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
index 2b4de1013..d50c476ee 100644
--- a/inputs/values/dockers.json
+++ b/inputs/values/dockers.json
@@ -13,7 +13,7 @@
   "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
   "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
   "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
-  "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:8d7ca52",
+  "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
   "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
index d1603754e..b1ee784c9 100644
--- a/src/sv-pipeline/scripts/SplitVariants.py
+++ b/src/sv-pipeline/scripts/SplitVariants.py
@@ -66,5 +66,4 @@ def main():
 
 # Press the green button in the gutter to run the script.
 if __name__ == '__main__':
-    main()
-
+    main()
\ No newline at end of file

From 2b0f5da27ad2953f432a3688bd96bdb2fa4d7480 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 20 Nov 2023 15:10:52 -0500
Subject: [PATCH 05/15] reformatted python script to match GitHub flake8
 formatting specifications

---
 src/sv-pipeline/scripts/SplitVariants.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
index b1ee784c9..7565886b5 100644
--- a/src/sv-pipeline/scripts/SplitVariants.py
+++ b/src/sv-pipeline/scripts/SplitVariants.py
@@ -4,10 +4,14 @@
 import csv
 import os
 import argparse
+
+
 def process_bed_file(input_bed, N, bca=True):
     condition_prefixes = {
-        'gt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
-        'lt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
+        'gt5kb': {
+            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
+        'lt5kb': {
+            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
         'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')},
         'ins': {'condition': lambda line: bca and line[4] == 'INS'}
     }
@@ -46,6 +50,7 @@ def process_bed_file(input_bed, N, bca=True):
 
             print(f"File {output_file} written.")
 
+
 def increment_suffix(suffix):
     alphabet = 'abcdefghijklmnopqrstuvwxyz'
     if suffix == 'z' * 6:
@@ -55,6 +60,7 @@ def increment_suffix(suffix):
         next_char = alphabet[(index + 1) % 26]
         return next_char + suffix[1:]
 
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -64,6 +70,7 @@ def main():
     args = parser.parse_args()
     process_bed_file(args.bed, args.n, args.bca)
 
+
 # Press the green button in the gutter to run the script.
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()

From 23688002fb07d685d913026435063878014fb964 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 20 Nov 2023 15:15:18 -0500
Subject: [PATCH 06/15] removed unused imports (pandas, csv, os) from python
 script to satisfy the linter

---
 src/sv-pipeline/scripts/SplitVariants.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
index 7565886b5..4e87d3b03 100644
--- a/src/sv-pipeline/scripts/SplitVariants.py
+++ b/src/sv-pipeline/scripts/SplitVariants.py
@@ -1,8 +1,4 @@
 #!/bin/python
-
-import pandas as pd
-import csv
-import os
 import argparse
 
 

From c5be22da9fe6dddbb7518b6b89398f019ea7f58a Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 18 Dec 2023 15:07:14 -0500
Subject: [PATCH 07/15] made changes based on first review

---
 .../scripts/split_variants.py}                | 27 ++++++++++---------
 wdl/TasksGenotypeBatch.wdl                    |  2 +-
 2 files changed, 15 insertions(+), 14 deletions(-)
 rename src/sv-pipeline/{scripts/SplitVariants.py => 04_variant_resolution/scripts/split_variants.py} (68%)

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
similarity index 68%
rename from src/sv-pipeline/scripts/SplitVariants.py
rename to src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index 4e87d3b03..f9ccb3058 100644
--- a/src/sv-pipeline/scripts/SplitVariants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -1,15 +1,18 @@
 #!/bin/python
 import argparse
+import logging
 
-
-def process_bed_file(input_bed, N, bca=True):
+def process_bed_file(input_bed, n_per_split, bca=True):
+    SVTYPE_FIELD=4
+    END_POS=2
+    START_POS=1
     condition_prefixes = {
         'gt5kb': {
-            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)},
         'lt5kb': {
-            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
-        'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')},
-        'ins': {'condition': lambda line: bca and line[4] == 'INS'}
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) < 5000)},
+        'bca': {'condition': lambda line: bca and (line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
+        'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
     }
 
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
@@ -25,7 +28,7 @@ def process_bed_file(input_bed, N, bca=True):
                     current_lines[prefix].append('\t'.join(line))
                     current_counts[prefix] += 1
 
-                    if current_counts[prefix] == N:
+                    if current_counts[prefix] == n_per_split:
                         output_suffix = current_suffixes[prefix].rjust(6, 'a')
                         output_file = f"{prefix}.{output_suffix}.bed"
                         with open(output_file, 'w') as outfile:
@@ -44,7 +47,7 @@ def process_bed_file(input_bed, N, bca=True):
             with open(output_file, 'w') as outfile:
                 outfile.write('\n'.join(lines))
 
-            print(f"File {output_file} written.")
+            logging.info(f"File '{output_file}' written.")
 
 
 def increment_suffix(suffix):
@@ -60,13 +63,11 @@ def increment_suffix(suffix):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--bed", help="Path to input bed file")
-    parser.add_argument("--n", help="number of variants per file")
-    parser.add_argument("--bca", default="FALSE", help="")
+        "--bed", help="Path to input bed file", required=True)
+    parser.add_argument("--n", help="number of variants per file",required=True)
+    parser.add_argument("--bca", default=False, help="If there are ", action='store_true')
     args = parser.parse_args()
     process_bed_file(args.bed, args.n, args.bca)
 
-
-# Press the green button in the gutter to run the script.
 if __name__ == '__main__':
     main()
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index b43321556..80ac04376 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -30,7 +30,7 @@ task SplitVariants {
   command <<<
     set -euo pipefail
     svtk vcf2bed ~{vcf} bed_file.bed
-    python /opt/sv-pipeline/scripts/SplitVariants.py \
+    python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
     --bed bed_file.bed \
     ~{"--n " + n_per_split} \
     ~{if generate_bca then "--bca" else ""}

From 70e6a80fa73efd9596377b0f50eaea7ba981a3b3 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Mon, 18 Dec 2023 15:14:27 -0500
Subject: [PATCH 08/15] made edit to python script to lint correctly

---
 src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index f9ccb3058..b55d034fb 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -2,10 +2,12 @@
 import argparse
 import logging
 
+
 def process_bed_file(input_bed, n_per_split, bca=True):
     SVTYPE_FIELD=4
     END_POS=2
     START_POS=1
+
     condition_prefixes = {
         'gt5kb': {
             'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)},

From 0b68f62b8c53fbc5c42fd6f5440d344670c59db7 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Tue, 19 Dec 2023 10:34:25 -0500
Subject: [PATCH 09/15] made edit to python script to lint correctly, and added
 extra clarifying comments to code.

---
 .../scripts/split_variants.py                 | 36 +++++++++++++------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index b55d034fb..3915d41fc 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -3,40 +3,51 @@
 import logging
 
 
+# Function to process the bed file by checking for conditions
 def process_bed_file(input_bed, n_per_split, bca=True):
-    SVTYPE_FIELD=4
-    END_POS=2
-    START_POS=1
+    svtype_field = 4
+    end_pos = 2
+    start_pos = 1
 
+    # Dictionary to store the conditions to be checked with matching prefixes
     condition_prefixes = {
         'gt5kb': {
-            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)},
+            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (
+                    int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
         'lt5kb': {
-            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) < 5000)},
-        'bca': {'condition': lambda line: bca and (line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
-        'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
+            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (
+                    int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
+        'bca': {'condition': lambda curr_3: bca and (
+                curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')},
+        'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'}
     }
 
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
     current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
     current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
 
+    # Open the bed file and process
     with open(input_bed, 'r') as infile:
         for line in infile:
+            # process bed file line by line
             line = line.strip().split('\t')
 
+            # Checks which condition and prefix the current line matches and appends it to the corresponding
+            # array and increments the counter for that array
             for prefix, conditions in condition_prefixes.items():
                 if conditions['condition'](line):
                     current_lines[prefix].append('\t'.join(line))
                     current_counts[prefix] += 1
 
+                    # If the current array has the maximum allowed lines added to it create a new array
+                    # with the preceding suffix and write the current array to a file
                     if current_counts[prefix] == n_per_split:
                         output_suffix = current_suffixes[prefix].rjust(6, 'a')
                         output_file = f"{prefix}.{output_suffix}.bed"
                         with open(output_file, 'w') as outfile:
                             outfile.write('\n'.join(current_lines[prefix]))
 
-                        print(f"File {output_file} written.")
+                        logging.info(f"File '{output_file}' written.")
                         current_lines[prefix] = []
                         current_counts[prefix] = 0
                         current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
@@ -52,11 +63,15 @@ def process_bed_file(input_bed, n_per_split, bca=True):
             logging.info(f"File '{output_file}' written.")
 
 
+# Function to generate the pattern for suffixes
 def increment_suffix(suffix):
+    # define the alphabet and ending
     alphabet = 'abcdefghijklmnopqrstuvwxyz'
     if suffix == 'z' * 6:
-        return 'a' * 6
+        raise ValueError('All possible files generated.')
     else:
+        # if there are available suffixes, increment with appropriate number
+        # of padded zeroes
         index = alphabet.index(suffix[0])
         next_char = alphabet[(index + 1) % 26]
         return next_char + suffix[1:]
@@ -66,10 +81,11 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--bed", help="Path to input bed file", required=True)
-    parser.add_argument("--n", help="number of variants per file",required=True)
+    parser.add_argument("--n", help="number of variants per file", required=True)
     parser.add_argument("--bca", default=False, help="If there are ", action='store_true')
     args = parser.parse_args()
     process_bed_file(args.bed, args.n, args.bca)
 
+
 if __name__ == '__main__':
     main()

From e61c128b713008af2ea16fcc8180eb54e8cddcde Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Tue, 19 Dec 2023 10:38:50 -0500
Subject: [PATCH 10/15] rejoined wrapped gt5kb/lt5kb lambda conditions in
 python script onto single lines

---
 .../04_variant_resolution/scripts/split_variants.py         | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index 3915d41fc..b9c0c3005 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -12,11 +12,9 @@ def process_bed_file(input_bed, n_per_split, bca=True):
     # Dictionary to store the conditions to be checked with matching prefixes
     condition_prefixes = {
         'gt5kb': {
-            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (
-                    int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
+            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
         'lt5kb': {
-            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (
-                    int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
+            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
         'bca': {'condition': lambda curr_3: bca and (
                 curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')},
         'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'}

From f57b37c0b18b605de29205ca5ecf592738cc6668 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Wed, 20 Dec 2023 08:08:00 -0500
Subject: [PATCH 11/15] made edits based on second review.

---
 .../scripts/split_variants.py                 | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index b9c0c3005..27a6470fe 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -5,19 +5,19 @@
 
 # Function to process the bed file by checking for conditions
 def process_bed_file(input_bed, n_per_split, bca=True):
-    svtype_field = 4
-    end_pos = 2
-    start_pos = 1
+    SVTYPE_FIELD = 4
+    END_FIELD = 2
+    START_FIELD = 1
 
     # Dictionary to store the conditions to be checked with matching prefixes
     condition_prefixes = {
         'gt5kb': {
-            'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)},
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
         'lt5kb': {
-            'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)},
-        'bca': {'condition': lambda curr_3: bca and (
-                curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')},
-        'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'}
+            'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
+        'bca': {'condition': lambda line: bca and (
+                line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
+        'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
     }
 
     current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
@@ -68,8 +68,7 @@ def increment_suffix(suffix):
     if suffix == 'z' * 6:
         raise ValueError('All possible files generated.')
     else:
-        # if there are available suffixes, increment with appropriate number
-        # of padded zeroes
+        # if there are available suffixes, increment to next available suffix
         index = alphabet.index(suffix[0])
         next_char = alphabet[(index + 1) % 26]
         return next_char + suffix[1:]
@@ -80,8 +79,16 @@ def main():
     parser.add_argument(
         "--bed", help="Path to input bed file", required=True)
     parser.add_argument("--n", help="number of variants per file", required=True)
-    parser.add_argument("--bca", default=False, help="If there are ", action='store_true')
+    parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true')
+    parser.add_argument("--log-level", required=False, default="INFO",help="Specify level of logging information")
     args = parser.parse_args()
+
+    # Set logging level from --log-level input
+    log_level = args.log_level
+    numeric_level = getattr(logging, log_level.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError('Invalid log level: %s' % log_level)
+    logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
     process_bed_file(args.bed, args.n, args.bca)
 
 

From 32550f11b34a3e8d18fbd5a0e476eb92cd6d187c Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Wed, 20 Dec 2023 08:09:56 -0500
Subject: [PATCH 12/15] made edits based on second review: tidied
 add_argument call formatting

---
 .../04_variant_resolution/scripts/split_variants.py          | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index 27a6470fe..b1b898cfc 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -76,11 +76,10 @@ def increment_suffix(suffix):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bed", help="Path to input bed file", required=True)
+    parser.add_argument("--bed", help="Path to input bed file", required=True)
     parser.add_argument("--n", help="number of variants per file", required=True)
     parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true')
-    parser.add_argument("--log-level", required=False, default="INFO",help="Specify level of logging information")
+    parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
     args = parser.parse_args()
 
     # Set logging level from --log-level input

From c590e1e3b404638c80fbd6245183f6061a8bf889 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Wed, 20 Dec 2023 08:11:07 -0500
Subject: [PATCH 13/15] made edits based on second review: comment wording

---
 src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index b1b898cfc..118d8c83f 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -68,7 +68,7 @@ def increment_suffix(suffix):
     if suffix == 'z' * 6:
         raise ValueError('All possible files generated.')
     else:
-        # if there are available suffixes, increment to next available suffix
+        # if there are available suffixes increment to next available suffix
         index = alphabet.index(suffix[0])
         next_char = alphabet[(index + 1) % 26]
         return next_char + suffix[1:]

From a1bb4feac6f4d58370a27c53c727bb448b9b86d9 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Fri, 22 Dec 2023 08:31:36 -0600
Subject: [PATCH 14/15] made edits based on second review: restored release
 sv_pipeline_docker and indented WDL arguments

---
 inputs/values/dockers.json | 2 +-
 wdl/TasksGenotypeBatch.wdl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
index d50c476ee..e8ab74bea 100644
--- a/inputs/values/dockers.json
+++ b/inputs/values/dockers.json
@@ -13,7 +13,7 @@
   "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
   "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
   "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
-  "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
+  "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index 80ac04376..37a86993b 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -32,8 +32,8 @@ task SplitVariants {
     svtk vcf2bed ~{vcf} bed_file.bed
     python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
     --bed bed_file.bed \
-    ~{"--n " + n_per_split} \
-    ~{if generate_bca then "--bca" else ""}
+      ~{"--n " + n_per_split} \
+      ~{if generate_bca then "--bca" else ""}
 
   >>>
   runtime {

From 47c3da348f015ef46ead1d80d3097625a0962296 Mon Sep 17 00:00:00 2001
From: Kirtana Veeraraghavan <kveerara@broadinstitute.org>
Date: Fri, 5 Jan 2024 14:15:14 -0500
Subject: [PATCH 15/15] addressed changes in the last review; parse --n as
 int and carry suffix increments

---
 src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 23 ++++++++++++++---------
 wdl/TasksGenotypeBatch.wdl                                      |  2 +-
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
index 118d8c83f..34e36dba9 100644
--- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
+++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
@@ -65,20 +65,25 @@
 def increment_suffix(suffix):
     # define the alphabet and ending
     alphabet = 'abcdefghijklmnopqrstuvwxyz'
-    if suffix == 'z' * 6:
-        raise ValueError('All possible files generated.')
-    else:
-        # if there are available suffixes increment to next available suffix
-        index = alphabet.index(suffix[0])
-        next_char = alphabet[(index + 1) % 26]
-        return next_char + suffix[1:]
+    # Increment like an odometer from the right-most letter so that, after the
+    # caller left-pads with 'a', names follow aaaaaa ... aaaaaz, aaaaba, ...
+    letters = list(suffix)
+    for pos in reversed(range(len(letters))):
+        if letters[pos] != 'z':
+            letters[pos] = alphabet[alphabet.index(letters[pos]) + 1]
+            return ''.join(letters)
+        letters[pos] = 'a'  # roll over and carry into the next letter
+    # every letter was 'z': grow by one letter unless all 6-letter names are used
+    if len(letters) >= 6:
+        raise ValueError('All possible files generated.')
+    return 'b' + ''.join(letters)
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--bed", help="Path to input bed file", required=True)
-    parser.add_argument("--n", help="number of variants per file", required=True)
-    parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true')
+    parser.add_argument("--n", type=int, help="number of variants per file", required=True)
+    parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
     parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
     args = parser.parse_args()
 
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
index 37a86993b..4e0d44021 100644
--- a/wdl/TasksGenotypeBatch.wdl
+++ b/wdl/TasksGenotypeBatch.wdl
@@ -31,7 +31,7 @@ task SplitVariants {
     set -euo pipefail
     svtk vcf2bed ~{vcf} bed_file.bed
     python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
-    --bed bed_file.bed \
+      --bed bed_file.bed \
       ~{"--n " + n_per_split} \
       ~{if generate_bca then "--bca" else ""}