Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to call svtk only once #618

Merged
merged 15 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
"sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_docker": "us.gcr.io/talkowski-sv-gnomad/kveerara/sv-pipeline:kv_split_variants_8d7ca52",
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
"sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3",
Expand Down
72 changes: 72 additions & 0 deletions src/sv-pipeline/scripts/SplitVariants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/python
import argparse


def process_bed_file(input_bed, N, bca=True):
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
condition_prefixes = {
'gt5kb': {
'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) >= 5000)},
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
'lt5kb': {
'condition': lambda line: (line[4] == 'DEL' or line[4] == 'DUP') and (int(line[2]) - int(line[1]) < 5000)},
'bca': {'condition': lambda line: bca and (line[4] != 'DEL' and line[4] != 'DUP' and line[4] != 'INS')},
'ins': {'condition': lambda line: bca and line[4] == 'INS'}
}

current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}

with open(input_bed, 'r') as infile:
for line in infile:
line = line.strip().split('\t')

for prefix, conditions in condition_prefixes.items():
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
if conditions['condition'](line):
current_lines[prefix].append('\t'.join(line))
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
current_counts[prefix] += 1

if current_counts[prefix] == N:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(current_lines[prefix]))

print(f"File {output_file} written.")
current_lines[prefix] = []
current_counts[prefix] = 0
current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])

# Handle remaining lines after the loop
for prefix, lines in current_lines.items():
if lines:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(lines))

print(f"File {output_file} written.")
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved


def increment_suffix(suffix):
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
alphabet = 'abcdefghijklmnopqrstuvwxyz'
if suffix == 'z' * 6:
return 'a' * 6
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
else:
index = alphabet.index(suffix[0])
next_char = alphabet[(index + 1) % 26]
return next_char + suffix[1:]


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
"--bed", help="Path to input bed file")
parser.add_argument("--n", help="number of variants per file")
parser.add_argument("--bca", default="FALSE", help="")
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
args = parser.parse_args()
process_bed_file(args.bed, args.n, args.bca)


# Press the green button in the gutter to run the script.
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
if __name__ == '__main__':
main()
20 changes: 5 additions & 15 deletions wdl/TasksGenotypeBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,12 @@ task SplitVariants {
Array[File] ins_beds = glob("ins.*")
}
command <<<

set -euo pipefail
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
if [ ~{generate_bca} == "true" ]; then
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
fi
svtk vcf2bed ~{vcf} bed_file.bed
python /opt/sv-pipeline/scripts/SplitVariants.py \
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved
--bed bed_file.bed \
~{"--n " + n_per_split} \
~{if generate_bca then "--bca" else ""}
epiercehoffman marked this conversation as resolved.
Show resolved Hide resolved

>>>
runtime {
Expand Down
Loading