Update CombineBatches workflow #732

Draft · wants to merge 36 commits into base: main

Commits (36)
0a55c41
Add cluster_batch_format.py; clean up MCNV formatting
mwalker174 Jul 17, 2024
b97481d
Don't assign genotypes to CNVs in add_genotypes.py
mwalker174 Aug 2, 2024
667e14c
Set mixed_breakend_window to 1mb
mwalker174 Aug 23, 2024
724cd57
Fixes
mwalker174 Aug 26, 2024
6ebd243
Start implementing ContextAwareClustering
mwalker174 Aug 27, 2024
6a9eed6
Fix wdl
mwalker174 Aug 30, 2024
012647c
Update wdl and gatk docker
mwalker174 Aug 30, 2024
8d9de6f
Comment
mwalker174 Aug 30, 2024
b13b4b8
Set context overlap parameters
mwalker174 Aug 30, 2024
1c5caee
Update docker
mwalker174 Sep 3, 2024
ff36610
Fix size in ReformatGenotypedVcf
mwalker174 Sep 4, 2024
f1f11d5
Update to reformat_genotyped_vcf.py
mwalker174 Sep 4, 2024
7debe9f
Update legacy reformatter, fix GenotypeBatchMetrics wdl
mwalker174 Sep 5, 2024
406bfd4
Remove reformat step from GenotypeBatch
mwalker174 Sep 5, 2024
a1ac4f3
Use RD_CN if CN is unavailable for mCNVs in svtk vcf2bed
mwalker174 Sep 9, 2024
4b06c95
Add additional_args to svcluster and groupedsvcluster
mwalker174 Sep 11, 2024
88ee212
Add join step
mwalker174 Sep 11, 2024
f54d464
Update runtime attr
mwalker174 Sep 11, 2024
b18605a
Filter legacy records with invalid coords (needs testing)
mwalker174 Sep 16, 2024
9a7887b
Fix record dropping; add --fix-end to wdl call
mwalker174 Sep 19, 2024
f3f9430
Representative breakpoint summary strategy
mwalker174 Sep 19, 2024
9e807b3
Update gatk docker
mwalker174 Sep 19, 2024
be304c7
Integrate SR flags into VCF
mwalker174 Sep 23, 2024
b35bed5
Update dockers
mwalker174 Sep 25, 2024
72ae66f
Parse last column in SR flags lists
mwalker174 Sep 26, 2024
d5fd902
Gatk to svtk formatting
mwalker174 Sep 30, 2024
24ba4e1
Fix CNV strands and overlap breakpoint filter bothside pass file parsing
mwalker174 Sep 30, 2024
9529a3d
Breakpoint overlap filter now sorts by BOTHSIDES_SUPPORT status rathe…
mwalker174 Sep 30, 2024
9447cc0
Set empty FILTER statuses to PASS
mwalker174 Oct 2, 2024
3892993
Use safer get() methods instead of brackets for accessing FORMAT fields
mwalker174 Oct 2, 2024
db12058
Update dockers
mwalker174 Oct 3, 2024
84975e4
Delete unused VcfClusterSingleChromsome.wdl
mwalker174 Oct 3, 2024
f786f0e
Remove other unused wdls
mwalker174 Oct 3, 2024
7a20df4
Do not require CN or RD_CN to be defined for all samples for CNVs in …
mwalker174 Oct 3, 2024
03d2879
Fix multi-allelic ALTs and genotype parsing
mwalker174 Oct 4, 2024
e276958
Update dockers
mwalker174 Oct 4, 2024
6 changes: 6 additions & 0 deletions inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl
@@ -4,11 +4,17 @@
"CombineBatches.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }},
"CombineBatches.empty_file" : {{ reference_resources.empty_file | tojson }},

"CombineBatches.reference_fasta": {{ reference_resources.reference_fasta | tojson }},
"CombineBatches.reference_dict": {{ reference_resources.reference_dict | tojson }},
"CombineBatches.reference_fasta_fai": {{ reference_resources.reference_index | tojson }},

"CombineBatches.min_sr_background_fail_batches": 0.5,
"CombineBatches.gatk_docker": {{ dockers.gatk_docker | tojson }},
"CombineBatches.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
"CombineBatches.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},

"CombineBatches.cohort_name": {{ test_batch.name | tojson }},
"CombineBatches.ped_file": {{ test_batch.ped_file | tojson }},
"CombineBatches.batches": [
{{ test_batch.name | tojson }}
],
12 changes: 6 additions & 6 deletions inputs/values/dockers.json
@@ -1,19 +1,19 @@
{
"name": "dockers",
"cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-08-27-v0.29-beta-6b27c39f",
"cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-06-04-v0.28.5-beta-a8dfecba",
"condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
"gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT",
"gatk_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-sv-stratify-99422dc",
"gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
"genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135",
"linux_docker": "marketplace.gcr.io/google/ubuntu1804",
"manta_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/manta:2023-09-14-v0.28.3-beta-3f22f94d",
"melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:a85c92f",
"scramble_docker": "us.gcr.io/broad-dsde-methods/markw/scramble:mw-scramble-99af4c50",
"samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-08-27-v0.29-beta-6b27c39f",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-gatk-combine-batches-7a20df41",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-gatk-combine-batches-7a20df41",
"wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7",
"igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9",
"duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9",
@@ -28,5 +28,5 @@
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/markw/sv-utils:mw-train-genotype-filtering-a9479501",
"gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a",
"str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6",
"denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-09-25-v0.29-beta-f064b2d7"
"denovo": "us.gcr.io/broad-dsde-methods/markw/denovo:mw-gatk-combine-batches-7a20df41"
}
55 changes: 7 additions & 48 deletions src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py
@@ -12,25 +12,6 @@
import pysam


def make_multiallelic_alts(record, max_CN, is_bca=False):
"""
Add alts for CN states up to half of max observed total CN
"""

max_haplo_CN = int(np.ceil(max_CN / 2))

if is_bca:
alts = tuple(['<CN1>'] +
['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])
else:
alts = tuple(['<CN0>'] +
['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)])

stop = record.stop
record.alts = alts
record.stop = stop


def make_evidence_int(ev):
ev = ev.split(',')

@@ -59,16 +40,14 @@ def add_genotypes(record, genotypes, varGQ):
del record.format[fmt]

max_GT = genotypes['GT'].max()
is_bca = record.info['SVTYPE'] not in 'DEL DUP'.split()

if is_bca:
max_CN = max_GT
else:
max_CN = max_GT + 2

if max_GT > 2:
record.info['MULTIALLELIC'] = True
make_multiallelic_alts(record, max_CN, is_bca)
record.alts = ('<CNV>',)
if record.info['SVTYPE'] != 'DUP':
msg = 'Invalid SVTYPE {0} for multiallelic record {1}'
msg = msg.format(record.info['SVTYPE'], record.id)
raise Exception(msg)
record.info['SVTYPE'] = 'CNV'

cols = 'name sample GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split()
gt_matrix = genotypes.reset_index()[cols].to_numpy()
Expand All @@ -85,27 +64,7 @@ def add_genotypes(record, genotypes, varGQ):
raise Exception(msg)

if max_GT > 2:
if record.info['SVTYPE'] == 'DEL':
msg = 'Invalid SVTYPE {0} for multiallelic genotype in record {1}'
msg = msg.format(record.info['SVTYPE'], record.id)
raise Exception(msg)

if is_bca:
idx1 = int(np.floor(data[2] / 2))
idx2 = int(np.ceil(data[2] / 2))
else:
# split copy state roughly evenly between haplotypes
idx1 = int(np.floor((data[2] + 2) / 2))
idx2 = int(np.ceil((data[2] + 2) / 2))

# if copy state is 1, assign reference genotype
if idx1 == 1:
idx1 = 0
if idx2 == 1:
idx2 = 0

record.samples[sample]['GT'] = (idx1, idx2)

record.samples[sample]['GT'] = (None, None)
elif data[2] == 0:
record.samples[sample]['GT'] = (0, 0)
elif data[2] == 1:
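Taken together, the add_genotypes.py changes drop the old `<CN0>`/`<CNn>` allele expansion and instead rewrite multiallelic records as a single symbolic `<CNV>` allele with no-call genotypes. Below is a minimal sketch of that convention, assuming records come from `pysam.VariantFile`; the helper name is hypothetical and not part of the PR.

```python
# Hypothetical helper (not in the PR) summarizing the new multiallelic
# convention in add_genotypes.py: DUP records whose maximum GT copy state
# exceeds 2 become symbolic <CNV> records with no-call genotypes.
def convert_to_multiallelic_cnv(record, max_GT):
    """record: pysam.VariantRecord; max_GT: largest GT copy state observed."""
    if max_GT <= 2:
        return
    record.info['MULTIALLELIC'] = True
    record.alts = ('<CNV>',)
    if record.info['SVTYPE'] != 'DUP':
        raise Exception('Invalid SVTYPE {0} for multiallelic record {1}'
                        .format(record.info['SVTYPE'], record.id))
    record.info['SVTYPE'] = 'CNV'
    for sample in record.samples:
        # Copy state stays in RD_CN; GT is no longer haplotype-split
        record.samples[sample]['GT'] = (None, None)
```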
@@ -34,7 +34,7 @@ def __init__(self):
self.sample_list = []

def _update_rd_cn(self, variant, sample_indices):
self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices}
self.rd_cn[variant.id] = {s: variant.samples[s].get(VariantFormatTypes.RD_CN) for s in sample_indices}

@staticmethod
def get_wider(f):
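This one-line change follows a broader pattern in the PR: per-sample FORMAT fields are read with `.get()` so that records lacking RD_CN (now permitted for CNVs) yield `None` instead of raising `KeyError`. A small illustrative sketch, assuming pysam record objects; the helper names are not from the repository.

```python
# Illustrative only: .get() tolerates FORMAT fields that are absent from a
# record, whereas bracket indexing would raise KeyError.
def rd_cn_by_sample(variant, sample_indices):
    return {s: variant.samples[s].get('RD_CN') for s in sample_indices}


def high_copy_samples(variant, sample_indices, threshold=3):
    # Callers must skip None explicitly, as the updated scripts do
    cn_map = rd_cn_by_sample(variant, sample_indices)
    return [s for s, cn in cn_map.items() if cn is not None and cn > threshold]
```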
@@ -43,13 +43,13 @@ def modify_variants(dict_file_gz, vcf, multi_cnvs):
for sample_id in geno_normal_revise_dict[variant.id]:
o = variant.samples[sample_id]
o.update({"GT": (0, 1)})
o.update({"GQ": o["RD_GQ"]})
o.update({"GQ": o.get("RD_GQ")})

if variant.stop - variant.start >= 1000:
if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]:
is_del = variant.info[SVTYPE] == SVType.DEL
for k, v in variant.samples.items():
rd_cn = v[VariantFormatTypes.RD_CN]
rd_cn = v.get(VariantFormatTypes.RD_CN)
if rd_cn is None:
continue
if (is_del and rd_cn > 3) or \
@@ -93,10 +93,10 @@ def main():
record = revised_lines_by_id[record.id]
if record.info.get('SVTYPE', None) == 'DEL':
if abs(record.stop - record.pos) >= 1000:
sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1:
multi_del = True
gts = [record.samples[s]['GT'] for s in non_outlier_samples]
gts = [record.samples[s].get('GT') for s in non_outlier_samples]
if any(gt not in biallelic_gts for gt in gts):
gt5kb_del = True
if abs(record.stop - record.pos) >= 5000:
@@ -105,7 +105,7 @@

if record.info.get('SVTYPE', None) == 'DUP':
if abs(record.stop - record.pos) >= 1000:
sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples}
sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples}
if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1:
multi_dup = True
if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4:
@@ -114,7 +114,7 @@
(sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and
gt4_copystate) > vf_1:
multi_dup = True
gts = [record.samples[s]['GT'] for s in non_outlier_samples]
gts = [record.samples[s].get('GT') for s in non_outlier_samples]
if any(gt not in biallelic_gts for gt in gts):
gt5kb_dup = True
if abs(record.stop - record.pos) >= 5000:
@@ -123,36 +123,39 @@

if gt5kb_del:
for sample_obj in record.samples.itervalues():
if not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2):
if not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') >= 2):
sample_obj['GT'] = (0, 0)
elif not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1):
elif not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 1):
sample_obj['GT'] = (0, 1)
elif not sample_obj['GQ'] is None:
elif not sample_obj.get('GQ') is None:
sample_obj['GT'] = (1, 1) # RD_CN 0 DEL

if gt5kb_dup:
# Convert to bi-allelic
if '<CN0>' in record.alts:
record.alts = ('<DUP>',)
for sample_obj in record.samples.itervalues():
if not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2):
if not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') <= 2):
sample_obj['GT'] = (0, 0)
elif not sample_obj['GQ'] is None and \
(sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3):
elif not sample_obj.get('GQ') is None and \
(sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 3):
sample_obj['GT'] = (0, 1)
elif not sample_obj['GQ'] is None:
elif not sample_obj.get('GQ') is None:
sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP

if record.id in multi_geno_ids:
record.info['PESR_GT_OVERDISPERSION'] = True

if multi_del or multi_dup:
record.filter.add('MULTIALLELIC')
for j, sample in enumerate(record.samples):
for sample in record.samples:
record.samples[sample]['GT'] = None
record.samples[sample]['GQ'] = None
record.samples[sample]['CN'] = record.samples[sample]['RD_CN']
record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ']
record.samples[sample]['CN'] = record.samples[sample].get('RD_CN')
record.samples[sample]['CNQ'] = record.samples[sample].get('RD_GQ')

if len(record.filter) > 1 and 'PASS' in record.filter:
del record.filter['PASS']
@@ -164,7 +167,7 @@ def main():
if record.id in sexchr_revise:
for sample in record.samples:
if sample in male_samples:
cn = record.samples[sample]['RD_CN']
cn = record.samples[sample].get('RD_CN')
if cn is not None and int(cn) > 0:
cn = int(cn)
record.samples[sample]['RD_CN'] = cn - 1
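For reference, the GT reassignment applied above to biallelic DEL/DUP calls of at least 5 kb can be summarized as a mapping from RD_CN. The sketch below is a hypothetical condensation of the logic in the diff, assuming pysam sample objects and touching only samples with a defined GQ.

```python
# Condensed sketch (not the PR's code verbatim) of the >=5kb GT reassignment:
# genotypes are re-derived from RD_CN, using .get() so missing fields are safe.
def reassign_gt_from_rd_cn(record, is_del):
    for sample in record.samples:
        sample_obj = record.samples[sample]
        if sample_obj.get('GQ') is None:
            continue  # ungenotyped samples are left untouched
        rd_cn = sample_obj.get('RD_CN')
        if is_del:
            if rd_cn is not None and rd_cn >= 2:
                sample_obj['GT'] = (0, 0)
            elif rd_cn == 1:
                sample_obj['GT'] = (0, 1)
            else:
                sample_obj['GT'] = (1, 1)  # RD_CN 0 (or undefined) DEL
        else:
            if rd_cn is not None and rd_cn <= 2:
                sample_obj['GT'] = (0, 0)
            elif rd_cn == 3:
                sample_obj['GT'] = (0, 1)
            else:
                sample_obj['GT'] = (1, 1)  # RD_CN > 3 (or undefined) DUP
```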
@@ -53,7 +53,7 @@ def __init__(self, record):
else:
raise ValueError("Uninterpretable evidence: {}".format(ev))
if record.id in bothside_pass:
self.both_end_support = bothside_pass[record.id]
self.both_end_support = 1
else:
self.both_end_support = 0
self.sr_fail = record.id in background_fail
@@ -102,7 +102,7 @@ def __str__(self):
# Read bothside-pass/background-fail records
sys.stderr.write("Reading SR files...\n")
with open(BOTHSIDE_PASS_PATH) as f:
bothside_pass = {line.strip().split('\t')[-1]: float(line.strip().split('\t')[0]) for line in f}
bothside_pass = set([line.strip().split('\t')[-1] for line in f])

with open(BACKGROUND_FAIL_PATH) as f:
background_fail = set([line.strip().split('\t')[-1] for line in f])
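With this change the bothside-pass file is consumed as a set of record IDs (the last tab-delimited column), and both_end_support collapses to a 0/1 flag rather than the fraction previously read from the first column. A minimal sketch of the new parsing, with placeholder file paths:

```python
# Sketch of the updated SR-flag parsing: only record IDs are kept, and
# both-end support becomes a membership test.
def read_last_column_ids(path):
    with open(path) as f:
        return set(line.strip().split('\t')[-1] for line in f)


# Paths are placeholders for BOTHSIDE_PASS_PATH / BACKGROUND_FAIL_PATH
bothside_pass = read_last_column_ids('bothside_pass.txt')
background_fail = read_last_column_ids('background_fail.txt')


def both_end_support(record_id):
    return 1 if record_id in bothside_pass else 0
```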