Skip to content

Commit

Permalink
Rewrite the SplitVariants task command in TasksGenotypeBatch.wdl to c…
Browse files Browse the repository at this point in the history
…all svtk only once (#618)

* add python file for SplitVariants task

* edited TasksGenotype.wdl command to call SplitVariants.py

* changed command in TasksGenotypeBatch.wdl

* changed docker to include correct tag for sv-pipeline

* reformatted python script to match github lint8 formatting specifications

* reformatted python script to match github lint8 formatting specifications

* made changes based on first review

* made edit to python script to lint correctly

* made edit to python script to lint correctly, and added extra clarifying comments to code.

* made edit to python script to lint correctly, and added extra clarifying comments to code.

* made edits based on second review.

* made edits based on second review.

* made edits based on second review.

* made edits based on second review.

* addressed changes in the last review
  • Loading branch information
kirtanav98 authored Jan 5, 2024
1 parent dc92e9f commit 310eb7b
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 15 deletions.
95 changes: 95 additions & 0 deletions src/sv-pipeline/04_variant_resolution/scripts/split_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/python
import argparse
import logging


# Function to process the bed file by checking for conditions
def process_bed_file(input_bed, n_per_split, bca=True):
SVTYPE_FIELD = 4
END_FIELD = 2
START_FIELD = 1

# Dictionary to store the conditions to be checked with matching prefixes
condition_prefixes = {
'gt5kb': {
'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)},
'lt5kb': {
'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)},
'bca': {'condition': lambda line: bca and (
line[SVTYPE_FIELD] != 'DEL' and line[SVTYPE_FIELD] != 'DUP' and line[SVTYPE_FIELD] != 'INS')},
'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD] == 'INS'}
}

current_lines = {prefix: [] for prefix in condition_prefixes.keys()}
current_counts = {prefix: 0 for prefix in condition_prefixes.keys()}
current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()}

# Open the bed file and process
with open(input_bed, 'r') as infile:
for line in infile:
# process bed file line by line
line = line.strip().split('\t')

# Checks which condition and prefix the current line matches and appends it to the corresponding
# array and increments the counter for that array
for prefix, conditions in condition_prefixes.items():
if conditions['condition'](line):
current_lines[prefix].append('\t'.join(line))
current_counts[prefix] += 1

# If the current array has the maximum allowed lines added to it create a new array
# with the preceding suffix and write the current array to a file
if current_counts[prefix] == n_per_split:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(current_lines[prefix]))

logging.info(f"File '{output_file}' written.")
current_lines[prefix] = []
current_counts[prefix] = 0
current_suffixes[prefix] = increment_suffix(current_suffixes[prefix])

# Handle remaining lines after the loop
for prefix, lines in current_lines.items():
if lines:
output_suffix = current_suffixes[prefix].rjust(6, 'a')
output_file = f"{prefix}.{output_suffix}.bed"
with open(output_file, 'w') as outfile:
outfile.write('\n'.join(lines))

logging.info(f"File '{output_file}' written.")


# Function to generate the pattern for suffixes
def increment_suffix(suffix):
# define the alphabet and ending
alphabet = 'abcdefghijklmnopqrstuvwxyz'
if suffix == 'z' * 6:
raise ValueError('All possible files generated.')
else:
# if there are available suffixes increment to next available suffix
index = alphabet.index(suffix[0])
next_char = alphabet[(index + 1) % 26]
return next_char + suffix[1:]


def main():
parser = argparse.ArgumentParser()
parser.add_argument("--bed", help="Path to input bed file", required=True)
parser.add_argument("--n", help="number of variants per file", required=True)
parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true')
parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information")
args = parser.parse_args()

# Set logging level from --log-level input
log_level = args.log_level
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % log_level)
logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
process_bed_file(args.bed, args.n, args.bca)


if __name__ == '__main__':
main()
20 changes: 5 additions & 15 deletions wdl/TasksGenotypeBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,12 @@ task SplitVariants {
Array[File] ins_beds = glob("ins.*")
}
command <<<

set -euo pipefail
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2>=5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - gt5kb.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '(($5=="DEL" || $5=="DUP") && $3-$2<5000) {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - lt5kb.
if [ ~{generate_bca} == "true" ]; then
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5!="DEL" && $5!="DUP" && $5!="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - bca.
svtk vcf2bed ~{vcf} stdout \
| awk -v OFS="\t" '($5=="INS") {print $1, $2, $3, $4, $6, $5}' \
| split --additional-suffix ".bed" -l ~{n_per_split} -a 6 - ins.
fi
svtk vcf2bed ~{vcf} bed_file.bed
python /opt/sv-pipeline/04_variant_resolution/scripts/split_variants.py \
--bed bed_file.bed \
~{"--n " + n_per_split} \
~{if generate_bca then "--bca" else ""}

>>>
runtime {
Expand Down

0 comments on commit 310eb7b

Please sign in to comment.