VS-1113 Update VQSR and VETS naming #8948

Merged · 12 commits · Aug 16, 2024
3 changes: 3 additions & 0 deletions .dockstore.yml
@@ -289,6 +289,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1113_VQSR_VETS_That_Is_All
tags:
- /.*/
- name: GvsQuickstartHailIntegration
@@ -298,6 +299,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1113_VQSR_VETS_That_Is_All
tags:
- /.*/
- name: GvsQuickstartIntegration
@@ -308,6 +310,7 @@ workflows:
- master
- ah_var_store
- vs_1456_status_writes_bug
- gg_VS-1113_VQSR_VETS_That_Is_All
tags:
- /.*/
- name: GvsIngestTieout
6 changes: 3 additions & 3 deletions scripts/variantstore/repo/generate_git_filter_repo_command.sh
@@ -33,7 +33,7 @@ ah_var_store_branch_point() {
files_added_on_ah_var_store() {
# Look for files added to ah_var_store since the branch point. Note that these files were not necessarily *uniquely*
# added to ah_var_store and might represent cherry picks from master (e.g. scripts and build files for the migration
# from Travis to GitHub Actions, VQSR Lite work, etc.)
# from Travis to GitHub Actions, VETS work, etc.)
git diff "$(ah_var_store_branch_point)" $variants_branch --name-status | grep -E '^A' | cut -f 2-
}

@@ -46,8 +46,8 @@ files_added_on_ah_var_store_not_on_master() {

files_deleted_from_master() {
# This intentionally does not use `git diff` as is used in `files_added_on_ah_var_store` since that would only show
# files deleted from the branch point to the head of master. There are numerous files here (mostly related to VQSR
# Lite) where files added to master after the branch point were cherry picked onto ah_var_store and subsequently
# files deleted from the branch point to the head of master. There are numerous files here (mostly related to VETS)
# where files added to master after the branch point were cherry picked onto ah_var_store and subsequently
# deleted from master. This `git log` finds these while the `git diff` does not.
#
# https://waylonwalker.com/git-find-deleted-files/#git-log-diff-filter
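Note: the body of `files_deleted_from_master` is collapsed above. A minimal sketch of the `git log --diff-filter` approach the comment describes, assuming the `ah_var_store_branch_point` helper defined earlier in this script (the function name here is illustrative only):

```bash
# Sketch only: list files deleted on master since the ah_var_store branch point.
# Assumes the ah_var_store_branch_point helper from earlier in this script.
files_deleted_from_master_sketch() {
  git log "$(ah_var_store_branch_point)"..master --diff-filter=D --name-only --pretty=format: |
    sort -u | grep -v '^$'
}
```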
8 changes: 4 additions & 4 deletions scripts/variantstore/tieout/legacy_wdl/JointGenotyping.wdl
@@ -269,7 +269,7 @@ workflow JointGenotyping {
}

if (num_gvcfs <= snps_variant_recalibration_threshold) {
call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
call Tasks.SNPsVariantRecalibrator {
input:
sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
@@ -300,9 +300,9 @@ workflow JointGenotyping {
indels_recalibration = IndelsVariantRecalibrator.recalibration,
indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index,
indels_tranches = IndelsVariantRecalibrator.tranches,
snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]),
snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]),
snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]),
snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibrator.recalibration]),
snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibrator.recalibration_index]),
snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibrator.tranches]),
indel_filter_level = indel_filter_level,
snp_filter_level = snp_filter_level,
use_allele_specific_annotations = allele_specific_annotations,
22 changes: 11 additions & 11 deletions scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
@@ -119,7 +119,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
variants_docker = effective_variants_docker,
}

call IsVQSRLite {
call IsVETS {
input:
input_vcf = Add_AS_MAX_VQS_SCORE_ToVcf.output_vcf,
basic_docker = effective_basic_docker,
@@ -153,7 +153,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
vcf_eval_bed_file = vcf_eval_bed_file,
chromosomes = chromosomes,
output_basename = sample_name + "-bq_roc_filtered",
is_vqsr_lite = IsVQSRLite.is_vqsr_lite,
is_vets = IsVETS.is_vets,
ref_fasta = ref_fasta,
real_time_genomics_docker = effective_real_time_genomics_docker,
}
@@ -169,7 +169,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
chromosomes = chromosomes,
all_records = true,
output_basename = sample_name + "-bq_all",
is_vqsr_lite = IsVQSRLite.is_vqsr_lite,
is_vets = IsVETS.is_vets,
ref_fasta = ref_fasta,
real_time_genomics_docker = effective_real_time_genomics_docker,
}
@@ -320,13 +320,13 @@ task Add_AS_MAX_VQS_SCORE_ToVcf {
}
}

task IsVQSRLite {
task IsVETS {
input {
File input_vcf
String basic_docker
}

String is_vqsr_lite_file = "is_vqsr_lite_file.txt"
String is_vets_file = "is_vets_file.txt"

command <<<
# Prepend date, time and pwd to xtrace log entries.
@@ -337,9 +337,9 @@ task IsVQSRLite {
set +o errexit
grep -v '^#' ~{input_vcf} | grep CALIBRATION_SENSITIVITY > /dev/null
if [[ $? -eq 0 ]]; then
echo "true" > ~{is_vqsr_lite_file}
echo "true" > ~{is_vets_file}
else
echo "false" > ~{is_vqsr_lite_file}
echo "false" > ~{is_vets_file}
fi
set -o errexit
>>>
@@ -351,7 +351,7 @@ task IsVQSRLite {
preemptible: 3
}
output {
Boolean is_vqsr_lite = read_boolean(is_vqsr_lite_file)
Boolean is_vets = read_boolean(is_vets_file)
}
}
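Note: the renamed `IsVETS` task decides which filtering model produced the callset by checking for the `CALIBRATION_SENSITIVITY` annotation that VETS writes (VQSR writes VQSLOD instead). A standalone sketch of that check outside the WDL interpolation, assuming a plain-text VCF path as the first argument:

```bash
# Sketch: print "true" if the VCF appears to be VETS-scored, "false" for VQSR.
# Assumes an uncompressed VCF; use zcat first for .vcf.gz inputs.
input_vcf="$1"
if grep -v '^#' "${input_vcf}" | grep -q CALIBRATION_SENSITIVITY; then
  echo "true"
else
  echo "false"
fi
```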

@@ -406,15 +406,15 @@ task EvaluateVcf {

String output_basename

Boolean is_vqsr_lite
Boolean is_vets

String real_time_genomics_docker
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2 * size(ref_fasta, "GiB")) + 500
}

String max_score_field_tag = if (is_vqsr_lite == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD'
String max_score_field_tag = if (is_vets == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD'

command <<<
chromosomes=( ~{sep=' ' chromosomes} )
@@ -436,7 +436,7 @@ task EvaluateVcf {
~{if all_records then "--all-records" else ""} \
--roc-subset snp,indel \
--vcf-score-field=INFO.~{max_score_field_tag} \
~{if is_vqsr_lite then "--sort-order ascending" else "--sort-order descending"} \
~{if is_vets then "--sort-order ascending" else "--sort-order descending"} \
-t human_REF_SDF \
-b ~{truth_vcf} \
-e ~{truth_bed}\
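Note: the `EvaluateVcf` command is truncated above, but the scorer-dependent pieces are `--vcf-score-field` and `--sort-order`: VETS calibration sensitivity is better when lower (ascending), while VQSR's VQSLOD is better when higher (descending). A sketch of how those flags combine in an `rtg vcfeval` call; the file paths, SDF name, and output directory are placeholders:

```bash
# Sketch: score-field / sort-order selection mirroring the WDL conditionals above.
is_vets=true   # placeholder value
if [[ "${is_vets}" == "true" ]]; then
  score_field="MAX_CALIBRATION_SENSITIVITY"; sort_order="ascending"
else
  score_field="MAX_AS_VQSLOD"; sort_order="descending"
fi

rtg vcfeval \
  --roc-subset snp,indel \
  --vcf-score-field="INFO.${score_field}" \
  --sort-order "${sort_order}" \
  -t human_REF_SDF \
  -b truth.vcf.gz \
  -e truth.bed \
  -c filtered.vcf.gz \
  -o eval_output
```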
50 changes: 25 additions & 25 deletions scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -1,8 +1,8 @@
version 1.0

import "GvsUtils.wdl" as Utils
import "GvsVQSRClassic.wdl" as VQSRClassic
import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite
import "GvsVQSR.wdl" as VQSR
import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VETS

workflow GvsCreateFilterSet {
input {
@@ -21,16 +21,16 @@ workflow GvsCreateFilterSet {
String? git_hash
File? gatk_override

Boolean use_VQSR_lite = true
Boolean use_VETS = true

Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
Int? INDEL_VQSR_CLASSIC_mem_gb_override
Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6
Int? SNP_VQSR_CLASSIC_mem_gb_override
Int? INDEL_VQSR_max_gaussians_override = 4
Int? INDEL_VQSR_mem_gb_override
Int? SNP_VQSR_max_gaussians_override = 6
Int? SNP_VQSR_mem_gb_override

RuntimeAttributes? vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27}
RuntimeAttributes? vqsr_lite_train_runtime_attributes = {"command_mem_gb": 27}
RuntimeAttributes? vqsr_lite_score_runtime_attributes = {"command_mem_gb": 15}
RuntimeAttributes? vets_extract_runtime_attributes = {"command_mem_gb": 27}
RuntimeAttributes? vets_train_runtime_attributes = {"command_mem_gb": 27}
RuntimeAttributes? vets_score_runtime_attributes = {"command_mem_gb": 15}

File? training_python_script
File? scoring_python_script
@@ -127,10 +127,10 @@ workflow GvsCreateFilterSet {
gatk_docker = effective_gatk_docker,
}

# From this point, the paths diverge depending on whether they're using classic VQSR or VQSR-Lite
# The first branch here is VQSR-Lite, and the second is classic VQSR
if (use_VQSR_lite) {
call VQSRLite.JointVcfFiltering as JointVcfFiltering {
# From this point, the paths diverge depending on whether they're using VQSR or VETS
# The first branch here is VETS, and the second is VQSR
if (use_VETS) {
call VETS.JointVcfFiltering as JointVcfFiltering {
input:
input_vcfs = ExtractFilterTask.output_vcf,
input_vcf_idxs = ExtractFilterTask.output_vcf_index,
@@ -141,9 +141,9 @@
resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
extract_extra_args = "-L ${interval_list}",
score_extra_args = "-L ${interval_list}",
extract_runtime_attributes = vqsr_lite_extract_runtime_attributes,
train_runtime_attributes = vqsr_lite_train_runtime_attributes,
score_runtime_attributes = vqsr_lite_score_runtime_attributes,
extract_runtime_attributes = vets_extract_runtime_attributes,
train_runtime_attributes = vets_train_runtime_attributes,
score_runtime_attributes = vets_score_runtime_attributes,
gatk_docker = effective_gatk_docker,
gatk_override = gatk_override,
monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh",
@@ -198,12 +198,12 @@ workflow GvsCreateFilterSet {
indel_recal_file = CreateFilteredScoredINDELsVCF.output_vcf,
indel_recal_file_index = CreateFilteredScoredINDELsVCF.output_vcf_index,
project_id = project_id,
useClassic = false
useVQSR = false
}
}

if (!use_VQSR_lite) {
call VQSRClassic.JointVcfFiltering as VQSRClassic {
if (!use_VETS) {
call VQSR.JointVcfFiltering as VQSR {
input:
git_branch_or_tag = git_branch_or_tag,
git_hash = git_hash,
@@ -218,10 +218,10 @@
sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index,
sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf,
sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override,
INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override,
INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override,
SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override,
SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override,
gatk_docker = effective_gatk_docker,
gatk_override = gatk_override,
}
@@ -255,7 +255,7 @@ workflow GvsCreateFilterSet {
[CreateFilteredScoredSNPsVCF.monitoring_log],
[CreateFilteredScoredINDELsVCF.monitoring_log],
[PopulateFilterSetInfo.monitoring_log],
select_first([VQSRClassic.monitoring_logs, []]),
select_first([VQSR.monitoring_logs, []]),
[PopulateFilterSetSites.monitoring_log]
]
)
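Note: with these renames, callers pick the filtering model through the `use_VETS` input, and the VQSR-only overrides keep their `*_VQSR_*` names. A hypothetical Cromwell inputs fragment selecting the VQSR path might look like the following (fully-qualified input names assumed; all other required workflow inputs omitted):

```bash
# Sketch: write a minimal, hypothetical inputs fragment that turns off VETS.
cat > gvs_create_filter_set.inputs.json <<'EOF'
{
  "GvsCreateFilterSet.use_VETS": false,
  "GvsCreateFilterSet.SNP_VQSR_max_gaussians_override": 6,
  "GvsCreateFilterSet.INDEL_VQSR_max_gaussians_override": 4
}
EOF
```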
3 changes: 0 additions & 3 deletions scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
@@ -27,7 +27,6 @@ workflow GvsCreateVATfromVDS {
Int? split_intervals_disk_size_override
Int? split_intervals_mem_override
Int? split_intervals_scatter_count
Boolean use_classic_VQSR = false
Boolean use_reference_disk = true

String? cloud_sdk_docker
@@ -144,7 +143,6 @@ workflow GvsCreateVATfromVDS {
call GenerateSitesOnlyVcf {
input:
vds_path = select_first([vds_path]),
use_classic_VQSR = use_classic_VQSR,
workspace_project = effective_google_project,
hail_version = effective_hail_version,
hail_wheel = hail_wheel,
@@ -310,7 +308,6 @@ workflow GvsCreateVATfromVDS {
task GenerateSitesOnlyVcf {
input {
String vds_path
Boolean use_classic_VQSR
String workspace_project
String workspace_bucket
String region
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsCreateVDS.wdl
@@ -19,7 +19,7 @@ workflow GvsCreateVDS {
Int? cluster_max_age_minutes
Boolean leave_cluster_running_at_end = false
Float? master_memory_fraction
Boolean use_classic_VQSR = false
Boolean use_VQSR = false

String? git_branch_or_tag
String? hail_version
@@ -111,7 +111,7 @@ workflow GvsCreateVDS {
prefix = cluster_prefix,
vds_path = vds_destination_path,
avro_path = avro_path,
use_classic_VQSR = use_classic_VQSR,
use_VQSR = use_VQSR,
hail_version = effective_hail_version,
hail_wheel = hail_wheel,
hail_temp_path = hail_temp_path,
@@ -158,7 +158,7 @@ task CreateVds {
String prefix
String vds_path
String avro_path
Boolean use_classic_VQSR
Boolean use_VQSR
Boolean leave_cluster_running_at_end
File hail_gvs_import_script
File gvs_import_script
@@ -233,7 +233,7 @@ task CreateVds {
"temp-path": "${hail_temp_path}",
"avro-path": "~{avro_path}"
~{', "intermediate-resume-point": ' + intermediate_resume_point}
~{true=', "use-classic-vqsr": ""' false='' use_classic_VQSR}
~{true=', "use-vqsr": ""' false='' use_VQSR}
}
FIN

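Note: the renamed `use_VQSR` flag is rendered into the JSON config through WDL's `true=`/`false=` placeholder options, which emit the `"use-vqsr"` key only when the flag is true. A bash analogue of that rendering, with a placeholder flag value and bucket path:

```bash
# Sketch: emit the optional "use-vqsr" key only when use_VQSR is true.
use_VQSR=true   # placeholder value
extra_key=''
if [[ "${use_VQSR}" == "true" ]]; then
  extra_key=', "use-vqsr": ""'
fi
cat <<FIN
{
  "avro-path": "gs://example-bucket/avro"${extra_key}
}
FIN
```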
14 changes: 7 additions & 7 deletions scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -12,7 +12,7 @@ workflow GvsExtractAvroFilesForHail {
String dataset_name
String filter_set_name
String call_set_identifier
Boolean use_VQSR_lite = true
Boolean use_VETS = true
Int scatter_width = 10
String? basic_docker
String? cloud_sdk_docker
@@ -43,7 +43,7 @@ workflow GvsExtractAvroFilesForHail {
cloud_sdk_docker = effective_cloud_sdk_docker,
}

call Utils.IsVQSRLite {
call Utils.IsVETS {
input:
project_id = project_id,
fq_filter_set_info_table = "~{project_id}.~{dataset_name}.filter_set_info",
@@ -74,7 +74,7 @@ workflow GvsExtractAvroFilesForHail {
filter_set_name = filter_set_name,
avro_sibling = OutputPath.out,
call_set_identifier = call_set_identifier,
is_vqsr_lite = IsVQSRLite.is_vqsr_lite,
is_vets = IsVETS.is_vets,
variants_docker = effective_variants_docker,
}

@@ -203,7 +203,7 @@ task ExtractFromSampleInfoTable {

task ExtractFromFilterTables {
meta {
description: "Extracts from the tables: filter_set_sites, filter_set_info/filter_set_info_vqsr_lite, and filter_set_tranches (if using VQSR Classic)"
description: "Extracts from the tables: filter_set_sites, filter_set_info, and filter_set_tranches (if using VQSR)"
# Not dealing with caching for now as that would introduce a lot of complexity.
volatile: true
}
@@ -214,11 +214,11 @@ task ExtractFromFilterTables {
String filter_set_name
String avro_sibling
String call_set_identifier
Boolean is_vqsr_lite = true
Boolean is_vets = true
String variants_docker
}

String vqs_score_field = if (is_vqsr_lite == true) then 'calibration_sensitivity' else 'vqslod'
String vqs_score_field = if (is_vets == true) then 'calibration_sensitivity' else 'vqslod'

parameter_meta {
avro_sibling: "Cloud path to a file that will be the sibling to the 'avro' 'directory' under which output Avro files will be written."
@@ -249,7 +249,7 @@ task ExtractFromFilterTables {
ORDER BY location
" --call_set_identifier ~{call_set_identifier} --dataset_name ~{dataset_name} --table_name filter_set_sites --project_id=~{project_id}

if [ ~{is_vqsr_lite} = false ]; then
if [ ~{is_vets} = false ]; then
python3 /app/run_avro_query.py --sql "
EXPORT DATA OPTIONS(
uri='${avro_prefix}/vqsr_tranche_data/vqsr_tranche_data_*.avro', format='AVRO', compression='SNAPPY') AS
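Note: the `if [ ~{is_vets} = false ]` guard above exports `vqsr_tranche_data` only for VQSR callsets, since VETS produces no tranches; `vqs_score_field` likewise switches between `calibration_sensitivity` and `vqslod`. A trimmed sketch of that control flow, with `echo` standing in for the real `run_avro_query.py` invocations and their SQL:

```bash
# Sketch of the export control flow only; the EXPORT DATA SQL bodies are elided.
is_vets=true   # placeholder value

echo "export filter_set_sites (always)"

if [[ "${is_vets}" == "true" ]]; then
  vqs_score_field="calibration_sensitivity"
else
  vqs_score_field="vqslod"
fi
echo "export filter_set_info using score column: ${vqs_score_field}"

if [[ "${is_vets}" == "false" ]]; then
  # Tranches only exist for VQSR; VETS has no tranche concept.
  echo "export vqsr_tranche_data"
fi
```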