From 68015ed6d3141af338972656ec95495bba60bdc7 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 13 Nov 2023 14:36:27 -0500 Subject: [PATCH 01/15] add python file for SplitVariants task --- src/sv-pipeline/scripts/SplitVariants.py | 70 ++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 src/sv-pipeline/scripts/SplitVariants.py diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py new file mode 100644 index 000000000..85bbc0b01 --- /dev/null +++ b/src/sv-pipeline/scripts/SplitVariants.py @@ -0,0 +1,70 @@ +#!/bin/python + +import pandas as pd +import csv +import os +import argparse +def process_bed_file(input_bed, N, bca=True): + condition_prefixes = { + 'gt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)}, + 'lt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)}, + 'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')}, + 'ins': {'condition': lambda line: bca and line[4] == 'INS'} + } + + current_lines = {prefix: [] for prefix in condition_prefixes.keys()} + current_counts = {prefix: 0 for prefix in condition_prefixes.keys()} + current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()} + + with open(input_bed, 'r') as infile: + for line in infile: + line = line.strip().split('\t') + + for prefix, conditions in condition_prefixes.items(): + if conditions['condition'](line): + current_lines[prefix].append('\t'.join(line)) + current_counts[prefix] += 1 + + if current_counts[prefix] == N: + output_suffix = current_suffixes[prefix].rjust(6, 'a') + output_file = f"{prefix}.{output_suffix}.bed" + with open(output_file, 'w') as outfile: + outfile.write('\n'.join(current_lines[prefix])) + + print(f"File {output_file} written.") + current_lines[prefix] = [] + current_counts[prefix] = 0 + current_suffixes[prefix] = increment_suffix(current_suffixes[prefix]) + + # Handle remaining lines after the loop + for prefix, lines in current_lines.items(): + if lines: + output_suffix = current_suffixes[prefix].rjust(6, 'a') + output_file = f"{prefix}.{output_suffix}.bed" + with open(output_file, 'w') as outfile: + outfile.write('\n'.join(lines)) + + print(f"File {output_file} written.") + +def increment_suffix(suffix): + alphabet = 'abcdefghijklmnopqrstuvwxyz' + if suffix == 'z' * 6: + return 'a' * 6 + else: + index = alphabet.index(suffix[0]) + next_char = alphabet[(index + 1) % 26] + return next_char + suffix[1:] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--bed", help="Path to input bed file") + parser.add_argument("--n_per_split", help="number of variants per file") + parser.add_argument("--bca", default="FALSE", help="") + args = parser.parse_args() + process_bed_file(args.bed, args.n_per_split, args.bca) + +# Press the green button in the gutter to run the script. +if __name__ == '__main__': + main() + From 8d7ca52d75a2e029bdace5c35defcc7581c778b7 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Tue, 14 Nov 2023 10:58:18 -0500 Subject: [PATCH 02/15] edited TasksGenotype.wdl command to call SplitVariants.py --- src/sv-pipeline/scripts/SplitVariants.py | 4 ++-- wdl/TasksGenotypeBatch.wdl | 20 +++++--------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py index 85bbc0b01..d1603754e 100644 --- a/src/sv-pipeline/scripts/SplitVariants.py +++ b/src/sv-pipeline/scripts/SplitVariants.py @@ -59,10 +59,10 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bed", help="Path to input bed file") - parser.add_argument("--n_per_split", help="number of variants per file") + parser.add_argument("--n", help="number of variants per file") parser.add_argument("--bca", default="FALSE", help="") args = parser.parse_args() - process_bed_file(args.bed, args.n_per_split, args.bca) + process_bed_file(args.bed, args.n, args.bca) # Press the green button in the gutter to run the script. if __name__ == '__main__': diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index 1b853a056..56e8d6753 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -28,22 +28,12 @@ task SplitVariants { Array[File] ins_beds = glob("ins.*") } command <<< - set -euo pipefail - svtk vcf2bed ~{vcf} stdout \ - | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \ - | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb. - svtk vcf2bed ~{vcf} stdout \ - | awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \ - | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb. - if [ ~{generate_bca} == "true" ]; then - svtk vcf2bed ~{vcf} stdout \ - | awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \ - | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca. - svtk vcf2bed ~{vcf} stdout \ - | awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \ - | split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins. - fi + svtk vcf2bed ~{vcf} bed_file.bed + python /opt/sv-pipeline/scripts/SplitVariants.py \ + --bed bed_file.bed \ + ~{"--n " + n_per_split} \ + ~{"--bca " + generate_bca} >>> runtime { From b4f84146a4f22434f06b997e79c1f1c70e4fdc17 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Fri, 17 Nov 2023 14:55:01 -0500 Subject: [PATCH 03/15] changed command in TasksGenotypeBatch.wdl --- inputs/values/dockers.json | 2 +- wdl/TasksGenotypeBatch.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index e8ab74bea..2b4de1013 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,7 +13,7 @@ "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670", "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", + "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:8d7ca52", "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index 56e8d6753..b43321556 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -33,7 +33,7 @@ task SplitVariants { python /opt/sv-pipeline/scripts/SplitVariants.py \ --bed bed_file.bed \ ~{"--n " + n_per_split} \ - ~{"--bca " + generate_bca} + ~{if generate_bca then "--bca" else ""} >>> runtime { From d7ee19ecb0e63e230d5e4721906212d499e19d25 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 20 Nov 2023 14:53:52 -0500 Subject: [PATCH 04/15] changed docker to include correct tag for sv-pipeline --- inputs/values/dockers.json | 2 +- src/sv-pipeline/scripts/SplitVariants.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 2b4de1013..d50c476ee 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,7 +13,7 @@ "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670", "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", - "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:8d7ca52", + "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52", "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py index d1603754e..b1ee784c9 100644 --- a/src/sv-pipeline/scripts/SplitVariants.py +++ b/src/sv-pipeline/scripts/SplitVariants.py @@ -66,5 +66,4 @@ def main(): # Press the green button in the gutter to run the script. if __name__ == '__main__': - main() - + main() \ No newline at end of file From 2b0f5da27ad2953f432a3688bd96bdb2fa4d7480 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 20 Nov 2023 15:10:52 -0500 Subject: [PATCH 05/15] reformatted python script to match github lint8 formatting specifications --- src/sv-pipeline/scripts/SplitVariants.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py index b1ee784c9..7565886b5 100644 --- a/src/sv-pipeline/scripts/SplitVariants.py +++ b/src/sv-pipeline/scripts/SplitVariants.py @@ -4,10 +4,14 @@ import csv import os import argparse + + def process_bed_file(input_bed, N, bca=True): condition_prefixes = { - 'gt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)}, - 'lt5kb': {'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)}, + 'gt5kb': { + 'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)}, + 'lt5kb': { + 'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)}, 'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')}, 'ins': {'condition': lambda line: bca and line[4] == 'INS'} } @@ -46,6 +50,7 @@ def process_bed_file(input_bed, N, bca=True): print(f"File {output_file} written.") + def increment_suffix(suffix): alphabet = 'abcdefghijklmnopqrstuvwxyz' if suffix == 'z' * 6: @@ -55,6 +60,7 @@ def increment_suffix(suffix): next_char = alphabet[(index + 1) % 26] return next_char + suffix[1:] + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -64,6 +70,7 @@ def main(): args = parser.parse_args() process_bed_file(args.bed, args.n, args.bca) + # Press the green button in the gutter to run the script. if __name__ == '__main__': - main() \ No newline at end of file + main() From 23688002fb07d685d913026435063878014fb964 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 20 Nov 2023 15:15:18 -0500 Subject: [PATCH 06/15] reformatted python script to match github lint8 formatting specifications --- src/sv-pipeline/scripts/SplitVariants.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/scripts/SplitVariants.py index 7565886b5..4e87d3b03 100644 --- a/src/sv-pipeline/scripts/SplitVariants.py +++ b/src/sv-pipeline/scripts/SplitVariants.py @@ -1,8 +1,4 @@ #!/bin/python - -import pandas as pd -import csv -import os import argparse From c5be22da9fe6dddbb7518b6b89398f019ea7f58a Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 18 Dec 2023 15:07:14 -0500 Subject: [PATCH 07/15] made changes based on first review --- .../scripts/split_variants.py} | 27 ++++++++++--------- wdl/TasksGenotypeBatch.wdl | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) rename src/sv-pipeline/{scripts/SplitVariants.py => 04_variant_resolution/scripts/split_variants.py} (68%) diff --git a/src/sv-pipeline/scripts/SplitVariants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py similarity index 68% rename from src/sv-pipeline/scripts/SplitVariants.py rename to src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index 4e87d3b03..f9ccb3058 100644 --- a/src/sv-pipeline/scripts/SplitVariants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -1,15 +1,18 @@ #!/bin/python import argparse +import logging - -def process_bed_file(input_bed, N, bca=True): +def process_bed_file(input_bed, n_per_split, bca=True): + SVTYPE_FIELD=4 + END_POS=2 + START_POS=1 condition_prefixes = { 'gt5kb': { - 'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)}, + 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)}, 'lt5kb': { - 'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)}, - 'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')}, - 'ins': {'condition': lambda line: bca and line[4] == 'INS'} + 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) < 5000)}, + 'bca': {'condition': lambda line: bca and (line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')}, + 'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'} } current_lines = {prefix: [] for prefix in condition_prefixes.keys()} @@ -25,7 +28,7 @@ def process_bed_file(input_bed, N, bca=True): current_lines[prefix].append('\t'.join(line)) current_counts[prefix] += 1 - if current_counts[prefix] == N: + if current_counts[prefix] == n_per_split: output_suffix = current_suffixes[prefix].rjust(6, 'a') output_file = f"{prefix}.{output_suffix}.bed" with open(output_file, 'w') as outfile: @@ -44,7 +47,7 @@ def process_bed_file(input_bed, N, bca=True): with open(output_file, 'w') as outfile: outfile.write('\n'.join(lines)) - print(f"File {output_file} written.") + logging.info(f"File '{output_file}' written.") def increment_suffix(suffix): @@ -60,13 +63,11 @@ def increment_suffix(suffix): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--bed", help="Path to input bed file") - parser.add_argument("--n", help="number of variants per file") - parser.add_argument("--bca", default="FALSE", help="") + "--bed", help="Path to input bed file", required=True) + parser.add_argument("--n", help="number of variants per file",required=True) + parser.add_argument("--bca", default=False, help="If there are ", action='store_true') args = parser.parse_args() process_bed_file(args.bed, args.n, args.bca) - -# Press the green button in the gutter to run the script. if __name__ == '__main__': main() diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index b43321556..80ac04376 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -30,7 +30,7 @@ task SplitVariants { command <<< set -euo pipefail svtk vcf2bed ~{vcf} bed_file.bed - python /opt/sv-pipeline/scripts/SplitVariants.py \ + python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \ --bed bed_file.bed \ ~{"--n " + n_per_split} \ ~{if generate_bca then "--bca" else ""} From 70e6a80fa73efd9596377b0f50eaea7ba981a3b3 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Mon, 18 Dec 2023 15:14:27 -0500 Subject: [PATCH 08/15] made edit to python script to lint correctly --- src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index f9ccb3058..b55d034fb 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -2,10 +2,12 @@ import argparse import logging + def process_bed_file(input_bed, n_per_split, bca=True): SVTYPE_FIELD=4 END_POS=2 START_POS=1 + condition_prefixes = { 'gt5kb': { 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)}, From 0b68f62b8c53fbc5c42fd6f5440d344670c59db7 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Tue, 19 Dec 2023 10:34:25 -0500 Subject: [PATCH 09/15] made edit to python script to lint correctly, and added extra clarifying comments to code. --- .../scripts/split_variants.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index b55d034fb..3915d41fc 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -3,40 +3,51 @@ import logging +# Function to process the bed file by checking for conditions def process_bed_file(input_bed, n_per_split, bca=True): - SVTYPE_FIELD=4 - END_POS=2 - START_POS=1 + svtype_field = 4 + end_pos = 2 + start_pos = 1 + # Dictionary to store the conditions to be checked with matching prefixes condition_prefixes = { 'gt5kb': { - 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) >= 5000)}, + 'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and ( + int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)}, 'lt5kb': { - 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_POS]) - int(line[START_POS]) < 5000)}, - 'bca': {'condition': lambda line: bca and (line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')}, - 'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'} + 'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and ( + int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)}, + 'bca': {'condition': lambda curr_3: bca and ( + curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')}, + 'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'} } current_lines = {prefix: [] for prefix in condition_prefixes.keys()} current_counts = {prefix: 0 for prefix in condition_prefixes.keys()} current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()} + # Open the bed file and process with open(input_bed, 'r') as infile: for line in infile: + # process bed file line by line line = line.strip().split('\t') + # Checks which condition and prefix the current line matches and appends it to the corresponding + # array and increments the counter for that array for prefix, conditions in condition_prefixes.items(): if conditions['condition'](line): current_lines[prefix].append('\t'.join(line)) current_counts[prefix] += 1 + # If the current array has the maximum allowed lines added to it create a new array + # with the preceding suffix and write the current array to a file if current_counts[prefix] == n_per_split: output_suffix = current_suffixes[prefix].rjust(6, 'a') output_file = f"{prefix}.{output_suffix}.bed" with open(output_file, 'w') as outfile: outfile.write('\n'.join(current_lines[prefix])) - print(f"File {output_file} written.") + logging.info(f"File '{output_file}' written.") current_lines[prefix] = [] current_counts[prefix] = 0 current_suffixes[prefix] = increment_suffix(current_suffixes[prefix]) @@ -52,11 +63,15 @@ def process_bed_file(input_bed, n_per_split, bca=True): logging.info(f"File '{output_file}' written.") +# Function to generate the pattern for suffixes def increment_suffix(suffix): + # define the alphabet and ending alphabet = 'abcdefghijklmnopqrstuvwxyz' if suffix == 'z' * 6: - return 'a' * 6 + raise ValueError('All possible files generated.') else: + # if there are available suffixes, increment with appropriate number + # of padded zeroes index = alphabet.index(suffix[0]) next_char = alphabet[(index + 1) % 26] return next_char + suffix[1:] @@ -66,10 +81,11 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bed", help="Path to input bed file", required=True) - parser.add_argument("--n", help="number of variants per file",required=True) + parser.add_argument("--n", help="number of variants per file", required=True) parser.add_argument("--bca", default=False, help="If there are ", action='store_true') args = parser.parse_args() process_bed_file(args.bed, args.n, args.bca) + if __name__ == '__main__': main() From e61c128b713008af2ea16fcc8180eb54e8cddcde Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Tue, 19 Dec 2023 10:38:50 -0500 Subject: [PATCH 10/15] made edit to python script to lint correctly, and added extra clarifying comments to code. --- .../04_variant_resolution/scripts/split_variants.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index 3915d41fc..b9c0c3005 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -12,11 +12,9 @@ def process_bed_file(input_bed, n_per_split, bca=True): # Dictionary to store the conditions to be checked with matching prefixes condition_prefixes = { 'gt5kb': { - 'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and ( - int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)}, + 'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)}, 'lt5kb': { - 'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and ( - int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)}, + 'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)}, 'bca': {'condition': lambda curr_3: bca and ( curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')}, 'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'} From f57b37c0b18b605de29205ca5ecf592738cc6668 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Wed, 20 Dec 2023 08:08:00 -0500 Subject: [PATCH 11/15] made edits based on second review. --- .../scripts/split_variants.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index b9c0c3005..27a6470fe 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -5,19 +5,19 @@ # Function to process the bed file by checking for conditions def process_bed_file(input_bed, n_per_split, bca=True): - svtype_field = 4 - end_pos = 2 - start_pos = 1 + SVTYPE_FIELD = 4 + END_FIELD = 2 + START_FIELD = 1 # Dictionary to store the conditions to be checked with matching prefixes condition_prefixes = { 'gt5kb': { - 'condition': lambda curr_1: (curr_1[svtype_field] == 'DEL' or curr_1[svtype_field] == 'DUP') and (int(curr_1[end_pos]) - int(curr_1[start_pos]) >= 5000)}, + 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)}, 'lt5kb': { - 'condition': lambda curr_2: (curr_2[svtype_field] == 'DEL' or curr_2[svtype_field] == 'DUP') and (int(curr_2[end_pos]) - int(curr_2[start_pos]) < 5000)}, - 'bca': {'condition': lambda curr_3: bca and ( - curr_3[svtype_field] != 'DEL' and curr_3[svtype_field] != 'DUP' and curr_3[svtype_field] != 'INS')}, - 'ins': {'condition': lambda curr_4: bca and curr_4[svtype_field] == 'INS'} + 'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)}, + 'bca': {'condition': lambda line: bca and ( + line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')}, + 'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'} } current_lines = {prefix: [] for prefix in condition_prefixes.keys()} @@ -68,8 +68,7 @@ def increment_suffix(suffix): if suffix == 'z' * 6: raise ValueError('All possible files generated.') else: - # if there are available suffixes, increment with appropriate number - # of padded zeroes + # if there are available suffixes, increment to next available suffix index = alphabet.index(suffix[0]) next_char = alphabet[(index + 1) % 26] return next_char + suffix[1:] @@ -80,8 +79,16 @@ def main(): parser.add_argument( "--bed", help="Path to input bed file", required=True) parser.add_argument("--n", help="number of variants per file", required=True) - parser.add_argument("--bca", default=False, help="If there are ", action='store_true') + parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true') + parser.add_argument("--log-level", required=False, default="INFO",help="Specify level of logging information") args = parser.parse_args() + + # Set logging level from --log-level input + log_level = args.log_level + numeric_level = getattr(logging, log_level.upper(), None) + if not isinstance(numeric_level, int): + raise ValueError('Invalid log level: %s' % log_level) + logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s') process_bed_file(args.bed, args.n, args.bca) From 32550f11b34a3e8d18fbd5a0e476eb92cd6d187c Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Wed, 20 Dec 2023 08:09:56 -0500 Subject: [PATCH 12/15] made edits based on second review. --- .../04_variant_resolution/scripts/split_variants.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index 27a6470fe..b1b898cfc 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -76,11 +76,10 @@ def increment_suffix(suffix): def main(): parser = argparse.ArgumentParser() - parser.add_argument( - "--bed", help="Path to input bed file", required=True) + parser.add_argument("--bed", help="Path to input bed file", required=True) parser.add_argument("--n", help="number of variants per file", required=True) parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true') - parser.add_argument("--log-level", required=False, default="INFO",help="Specify level of logging information") + parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information") args = parser.parse_args() # Set logging level from --log-level input From c590e1e3b404638c80fbd6245183f6061a8bf889 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Wed, 20 Dec 2023 08:11:07 -0500 Subject: [PATCH 13/15] made edits based on second review. --- src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index b1b898cfc..118d8c83f 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -68,7 +68,7 @@ def increment_suffix(suffix): if suffix == 'z' * 6: raise ValueError('All possible files generated.') else: - # if there are available suffixes, increment to next available suffix + # if there are available suffixes increment to next available suffix index = alphabet.index(suffix[0]) next_char = alphabet[(index + 1) % 26] return next_char + suffix[1:] From a1bb4feac6f4d58370a27c53c727bb448b9b86d9 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Fri, 22 Dec 2023 08:31:36 -0600 Subject: [PATCH 14/15] made edits based on second review. --- inputs/values/dockers.json | 2 +- wdl/TasksGenotypeBatch.wdl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index d50c476ee..e8ab74bea 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,7 +13,7 @@ "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670", "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", - "sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3", diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index 80ac04376..37a86993b 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -32,8 +32,8 @@ task SplitVariants { svtk vcf2bed ~{vcf} bed_file.bed python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \ --bed bed_file.bed \ - ~{"--n " + n_per_split} \ - ~{if generate_bca then "--bca" else ""} + ~{"--n " + n_per_split} \ + ~{if generate_bca then "--bca" else ""} >>> runtime { From 47c3da348f015ef46ead1d80d3097625a0962296 Mon Sep 17 00:00:00 2001 From: Kirtana Veeraraghavan Date: Fri, 5 Jan 2024 14:15:14 -0500 Subject: [PATCH 15/15] addressed changes in the last review --- src/sv-pipeline/04_variant_resolution/scripts/split_variants.py | 2 +- wdl/TasksGenotypeBatch.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index 118d8c83f..34e36dba9 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -78,7 +78,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--bed", help="Path to input bed file", required=True) parser.add_argument("--n", help="number of variants per file", required=True) - parser.add_argument("--bca", default=False, help="If there are bcas to address set to True", action='store_true') + parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true') parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information") args = parser.parse_args() diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index 37a86993b..4e0d44021 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -31,7 +31,7 @@ task SplitVariants { set -euo pipefail svtk vcf2bed ~{vcf} bed_file.bed python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \ - --bed bed_file.bed \ + --bed bed_file.bed \ ~{"--n " + n_per_split} \ ~{if generate_bca then "--bca" else ""}