broadinstitute · kirtanav98 · Jan 5, 2024 · Nov 13, 2023 · Nov 14, 2023 · Nov 17, 2023
diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
@@ -13,7 +13,7 @@
   "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
   "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
   "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
-  "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
+  "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
   "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
   "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",

diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py
@@ -0,0 +1,72 @@
+#!/bin/python
+import argparse
+
+
+def process_bed_file(input_bed, N, bca=True):
+    condition_prefixes = {
+        'gt5kb': {
+            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
+        'lt5kb': {
+            'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
+        'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')},
+        'ins': {'condition': lambda line: bca and line[4] == 'INS'}
+    }
+
+    current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
+    current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
+    current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}
+
+    with open(input_bed, 'r') as infile:
+        for line in infile:
+            line = line.strip().split('\t')
+
+            for prefix, conditions in condition_prefixes.items():
+                if conditions['condition'](line):
+                    current_lines[prefix].append('\t'.join(line))
+                    current_counts[prefix] += 1
+
+                    if current_counts[prefix] == N:
+                        output_suffix = current_suffixes[prefix].rjust(6, 'a')
+                        output_file = f"{prefix}.{output_suffix}.bed"
+                        with open(output_file, 'w') as outfile:
+                            outfile.write('\n'.join(current_lines[prefix]))
+
+                        print(f"File {output_file} written.")
+                        current_lines[prefix] = []
+                        current_counts[prefix] = 0
+                        current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])
+
+    # Handle remaining lines after the loop
+    for prefix, lines in current_lines.items():
+        if lines:
+            output_suffix = current_suffixes[prefix].rjust(6, 'a')
+            output_file = f"{prefix}.{output_suffix}.bed"
+            with open(output_file, 'w') as outfile:
+                outfile.write('\n'.join(lines))
+
+            print(f"File {output_file} written.")
+
+
+def increment_suffix(suffix):
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    if suffix == 'z' * 6:
+        return 'a' * 6
+    else:
+        index = alphabet.index(suffix[0])
+        next_char = alphabet[(index + 1) % 26]
+        return next_char + suffix[1:]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bed", help="Path to input bed file")
+    parser.add_argument("--n", help="number of variants per file")
+    parser.add_argument("--bca", default="FALSE", help="")
+    args = parser.parse_args()
+    process_bed_file(args.bed, args.n, args.bca)
+
+
+# Press the green button in the gutter to run the script.
+if __name__ == '__main__':
+    main()
diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl
@@ -28,22 +28,12 @@ task SplitVariants {
     Array[File] ins_beds = glob("ins.*")
   }
   command <<<
-
     set -euo pipefail
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
-    svtk vcf2bed ~{vcf} stdout \
-      | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
-      | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
-    if [ ~{generate_bca} == "true" ]; then
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
-      svtk vcf2bed ~{vcf} stdout \
-        | awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
-        | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
-    fi
+    svtk vcf2bed ~{vcf} bed_file.bed
+    python /opt/sv-pipeline/scripts/SplitVariants.py \
+    --bed bed_file.bed \
+    ~{"--n " + n_per_split} \
+    ~{if generate_bca then "--bca" else ""}
 
   >>>
   runtime {