From 73cf5ec5d29328b8e4f81ef245e76cd08ba60fbf Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 30 Jul 2024 09:52:42 -0400 Subject: [PATCH 1/6] Intersect target and truth intervals for P+S [VS-1460] --- .dockstore.yml | 1 + .../GvsCalculatePrecisionAndSensitivity.wdl | 54 ++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index ed23039d3d4..327cd8db314 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -279,6 +279,7 @@ workflows: branches: - master - ah_var_store + - vs_1460_intersect_target_truth_ps tags: - /.*/ - name: GvsQuickstartVcfIntegration diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index 6e1f6c58657..db3af3df44a 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -99,6 +99,17 @@ workflow GvsCalculatePrecisionAndSensitivity { gatk_docker = effective_gatk_docker, } + if (defined(target_interval_list)) { + call IntersectTargetIntervalListWithTruthBeds { + input: + truth_beds = truth_beds, + target_interval_list = target_interval_list, + gatk_docker = effective_gatk_docker, + } + } + + Array[File] effective_truth_beds = if (defined(target_interval_list)) then IntersectTargetIntervalListWithTruthBeds.intersected_truth_beds else truth_beds + scatter(i in range(length(sample_names))) { String sample_name = sample_names[i] String output_sample_basename = output_basename + "." + sample_name @@ -138,7 +149,7 @@ workflow GvsCalculatePrecisionAndSensitivity { input_vcf_index = BgzipAndTabix.output_vcf_index, truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], - truth_bed = truth_beds[i], + truth_bed = effective_truth_beds[i], vcf_eval_bed_file = vcf_eval_bed_file, chromosomes = chromosomes, output_basename = sample_name + "-bq_roc_filtered", @@ -153,7 +164,7 @@ workflow GvsCalculatePrecisionAndSensitivity { input_vcf_index = BgzipAndTabix.output_vcf_index, truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], - truth_bed = truth_beds[i], + truth_bed = effective_truth_beds[i], vcf_eval_bed_file = vcf_eval_bed_file, chromosomes = chromosomes, all_records = true, @@ -524,3 +535,42 @@ task CountInputVcfs { docker: basic_docker } } + + +task IntersectTargetIntervalListWithTruthBeds { + input { + Array[File] truth_beds + File target_interval_list + String gatk_docker + } + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + target_bed="~{target_interval_list}" + target_bed="${target_bed%%.interval_list}.bed" + + gatk IntervalListToBed -I ~{target_interval_list} -O "${target_bed}" --SORT + + intersected_beds="intersected_beds.txt" + touch $intersected_beds + + for truth_bed in ~{sep=' ' truth_beds} + do + intersected_truth_bed="${truth_bed%%.bed}" + intersected_truth_bed="${intersected_truth_bed}_intersected.bed" + + bedtools intersect -a "${target_bed}" -b ${truth_bed} > ${intersected_truth_bed} + + # Need to preserve the same order in the output intersected beds as in the input + echo ${intersected_truth_bed} >> ${intersected_beds} + done + >>> + runtime { + docker: gatk_docker + } + output { + Array[File] intersected_truth_beds = read_lines("intersected_beds.txt") + } +} From 4500e9ec4234df8e41138722b8342a309b154845 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 30 Jul 2024 09:59:13 -0400 Subject: [PATCH 2/6] oops --- .../variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index db3af3df44a..76b8051ae8f 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -103,12 +103,12 @@ workflow GvsCalculatePrecisionAndSensitivity { call IntersectTargetIntervalListWithTruthBeds { input: truth_beds = truth_beds, - target_interval_list = target_interval_list, + target_interval_list = select_first([target_interval_list]), gatk_docker = effective_gatk_docker, } } - Array[File] effective_truth_beds = if (defined(target_interval_list)) then IntersectTargetIntervalListWithTruthBeds.intersected_truth_beds else truth_beds + Array[File] effective_truth_beds = if (defined(target_interval_list)) then select_first([IntersectTargetIntervalListWithTruthBeds.intersected_truth_beds]) else truth_beds scatter(i in range(length(sample_names))) { String sample_name = sample_names[i] From 93b68cc971a4b8773bd40917dc27618f12d994e1 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 30 Jul 2024 18:28:55 -0400 Subject: [PATCH 3/6] try again --- .../GvsCalculatePrecisionAndSensitivity.wdl | 47 ++++++++----------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index 76b8051ae8f..d7c813979f9 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -99,17 +99,6 @@ workflow GvsCalculatePrecisionAndSensitivity { gatk_docker = effective_gatk_docker, } - if (defined(target_interval_list)) { - call IntersectTargetIntervalListWithTruthBeds { - input: - truth_beds = truth_beds, - target_interval_list = select_first([target_interval_list]), - gatk_docker = effective_gatk_docker, - } - } - - Array[File] effective_truth_beds = if (defined(target_interval_list)) then select_first([IntersectTargetIntervalListWithTruthBeds.intersected_truth_beds]) else truth_beds - scatter(i in range(length(sample_names))) { String sample_name = sample_names[i] String output_sample_basename = output_basename + "." + sample_name @@ -143,13 +132,24 @@ workflow GvsCalculatePrecisionAndSensitivity { gotc_imputation_docker = effective_gotc_imputation_docker, } + if (defined(target_interval_list)) { + call IntersectTargetIntervalListWithTruthBed { + input: + truth_bed = truth_beds[i], + target_interval_list = select_first([target_interval_list]), + gatk_docker = effective_gatk_docker, + } + } + + File effective_truth_bed = select_first([IntersectTargetIntervalListWithTruthBed.intersected_truth_bed, truth_beds[i]]) + call EvaluateVcf as EvaluateVcfFiltered { input: input_vcf = BgzipAndTabix.output_vcf, input_vcf_index = BgzipAndTabix.output_vcf_index, truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], - truth_bed = effective_truth_beds[i], + truth_bed = effective_truth_bed, vcf_eval_bed_file = vcf_eval_bed_file, chromosomes = chromosomes, output_basename = sample_name + "-bq_roc_filtered", @@ -164,7 +164,7 @@ workflow GvsCalculatePrecisionAndSensitivity { input_vcf_index = BgzipAndTabix.output_vcf_index, truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], - truth_bed = effective_truth_beds[i], + truth_bed = effective_truth_bed, vcf_eval_bed_file = vcf_eval_bed_file, chromosomes = chromosomes, all_records = true, @@ -537,9 +537,9 @@ task CountInputVcfs { } -task IntersectTargetIntervalListWithTruthBeds { +task IntersectTargetIntervalListWithTruthBed { input { - Array[File] truth_beds + File truth_bed File target_interval_list String gatk_docker } @@ -553,24 +553,15 @@ task IntersectTargetIntervalListWithTruthBeds { gatk IntervalListToBed -I ~{target_interval_list} -O "${target_bed}" --SORT - intersected_beds="intersected_beds.txt" - touch $intersected_beds - - for truth_bed in ~{sep=' ' truth_beds} - do - intersected_truth_bed="${truth_bed%%.bed}" - intersected_truth_bed="${intersected_truth_bed}_intersected.bed" - - bedtools intersect -a "${target_bed}" -b ${truth_bed} > ${intersected_truth_bed} + intersected_truth_bed="${truth_bed%%.bed}" + intersected_truth_bed="${intersected_truth_bed}_intersected.bed" - # Need to preserve the same order in the output intersected beds as in the input - echo ${intersected_truth_bed} >> ${intersected_beds} - done + bedtools intersect -a "${target_bed}" -b ${truth_bed} > ${intersected_truth_bed} >>> runtime { docker: gatk_docker } output { - Array[File] intersected_truth_beds = read_lines("intersected_beds.txt") + File intersected_truth_bed = glob("*_intersected.bed")[0] } } From 8a56b1b802164e0ca0e6b762f77a15259fcc5c2a Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Tue, 30 Jul 2024 20:13:22 -0400 Subject: [PATCH 4/6] oops --- scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index d7c813979f9..d65f6672f31 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -553,6 +553,7 @@ task IntersectTargetIntervalListWithTruthBed { gatk IntervalListToBed -I ~{target_interval_list} -O "${target_bed}" --SORT + truth_bed="~{truth_bed}" intersected_truth_bed="${truth_bed%%.bed}" intersected_truth_bed="${intersected_truth_bed}_intersected.bed" From 65b96830dba4e407549b70a711a5b072f2d32ba8 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 31 Jul 2024 05:49:39 -0400 Subject: [PATCH 5/6] fixups --- .../wdl/GvsCalculatePrecisionAndSensitivity.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index d65f6672f31..75c73645d97 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -548,12 +548,14 @@ task IntersectTargetIntervalListWithTruthBed { PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace - target_bed="~{target_interval_list}" + # `basename` so the output ends up in $PWD (/cromwell_root) and not wherever the inputs were localized. + # The outputs of these transformations need to be in a place where the `glob` expression will find them. + target_bed="$(basename ~{target_interval_list})" target_bed="${target_bed%%.interval_list}.bed" gatk IntervalListToBed -I ~{target_interval_list} -O "${target_bed}" --SORT - truth_bed="~{truth_bed}" + truth_bed="$(basename ~{truth_bed})" intersected_truth_bed="${truth_bed%%.bed}" intersected_truth_bed="${intersected_truth_bed}_intersected.bed" From 002729a81802e7f9707c95bb53afd5024c58ec10 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 31 Jul 2024 06:24:29 -0400 Subject: [PATCH 6/6] oops --- .../variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index 75c73645d97..cf5a2f4168b 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -555,8 +555,8 @@ task IntersectTargetIntervalListWithTruthBed { gatk IntervalListToBed -I ~{target_interval_list} -O "${target_bed}" --SORT - truth_bed="$(basename ~{truth_bed})" - intersected_truth_bed="${truth_bed%%.bed}" + truth_bed="~{truth_bed}" + intersected_truth_bed="$(basename ${truth_bed%%.bed})" intersected_truth_bed="${intersected_truth_bed}_intersected.bed" bedtools intersect -a "${target_bed}" -b ${truth_bed} > ${intersected_truth_bed}