diff --git a/scripts/variantstore/repo/generate_git_filter_repo_command.sh b/scripts/variantstore/repo/generate_git_filter_repo_command.sh index 4953e2611d7..b7af88f6231 100755 --- a/scripts/variantstore/repo/generate_git_filter_repo_command.sh +++ b/scripts/variantstore/repo/generate_git_filter_repo_command.sh @@ -33,7 +33,7 @@ ah_var_store_branch_point() { files_added_on_ah_var_store() { # Look for files added to ah_var_store since the branch point. Note that these files were not necessarily *uniquely* # added to ah_var_store and might represent cherry picks from master (e.g. scripts and build files for the migration - # from Travis to GitHub Actions, VQSR Lite work, etc.) + # from Travis to GitHub Actions, VETS work, etc.) git diff "$(ah_var_store_branch_point)" $variants_branch --name-status | grep -E '^A' | cut -f 2- } @@ -46,8 +46,8 @@ files_added_on_ah_var_store_not_on_master() { files_deleted_from_master() { # This intentionally does not use `git diff` as is used in `files_added_on_ah_var_store` since that would only show - # files deleted from the branch point to the head of master. There are numerous files here (mostly related to VQSR - # Lite) where files added to master after the branch point were cherry picked onto ah_var_store and subsequently + # files deleted from the branch point to the head of master. There are numerous files here (mostly related to VETS) + # where files added to master after the branch point were cherry picked onto ah_var_store and subsequently # deleted from master. This `git log` finds these while the `git diff` does not. # # https://waylonwalker.com/git-find-deleted-files/#git-log-diff-filter diff --git a/scripts/variantstore/tieout/legacy_wdl/JointGenotyping.wdl b/scripts/variantstore/tieout/legacy_wdl/JointGenotyping.wdl index b40d84aaad4..f695d4bad17 100644 --- a/scripts/variantstore/tieout/legacy_wdl/JointGenotyping.wdl +++ b/scripts/variantstore/tieout/legacy_wdl/JointGenotyping.wdl @@ -269,7 +269,7 @@ workflow JointGenotyping { } if (num_gvcfs <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + call Tasks.SNPsVariantRecalibrator { input: sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, @@ -300,9 +300,9 @@ workflow JointGenotyping { indels_recalibration = IndelsVariantRecalibrator.recalibration, indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibrator.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else 
select_first([SNPsVariantRecalibrator.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibrator.tranches]), indel_filter_level = indel_filter_level, snp_filter_level = snp_filter_level, use_allele_specific_annotations = allele_specific_annotations, diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index cf5a2f4168b..8dfb3aef8f6 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -119,7 +119,7 @@ workflow GvsCalculatePrecisionAndSensitivity { variants_docker = effective_variants_docker, } - call IsVQSRLite { + call IsVETS { input: input_vcf = Add_AS_MAX_VQS_SCORE_ToVcf.output_vcf, basic_docker = effective_basic_docker, @@ -153,7 +153,7 @@ workflow GvsCalculatePrecisionAndSensitivity { vcf_eval_bed_file = vcf_eval_bed_file, chromosomes = chromosomes, output_basename = sample_name + "-bq_roc_filtered", - is_vqsr_lite = IsVQSRLite.is_vqsr_lite, + is_vets = IsVETS.is_vets, ref_fasta = ref_fasta, real_time_genomics_docker = effective_real_time_genomics_docker, } @@ -169,7 +169,7 @@ workflow GvsCalculatePrecisionAndSensitivity { chromosomes = chromosomes, all_records = true, output_basename = sample_name + "-bq_all", - is_vqsr_lite = IsVQSRLite.is_vqsr_lite, + is_vets = IsVETS.is_vets, ref_fasta = ref_fasta, real_time_genomics_docker = effective_real_time_genomics_docker, } @@ -320,13 +320,13 @@ task Add_AS_MAX_VQS_SCORE_ToVcf { } } -task IsVQSRLite { +task IsVETS { input { File input_vcf String basic_docker } - String is_vqsr_lite_file = "is_vqsr_lite_file.txt" + String is_vets_file = "is_vets_file.txt" command <<< # Prepend date, time and pwd to xtrace log entries. @@ -337,9 +337,9 @@ task IsVQSRLite { set +o errexit grep -v '^#' ~{input_vcf} | grep CALIBRATION_SENSITIVITY > /dev/null if [[ $? 
-eq 0 ]]; then - echo "true" > ~{is_vqsr_lite_file} + echo "true" > ~{is_vets_file} else - echo "false" > ~{is_vqsr_lite_file} + echo "false" > ~{is_vets_file} fi set -o errexit >>> @@ -351,7 +351,7 @@ task IsVQSRLite { preemptible: 3 } output { - Boolean is_vqsr_lite = read_boolean(is_vqsr_lite_file) + Boolean is_vets = read_boolean(is_vets_file) } } @@ -406,7 +406,7 @@ task EvaluateVcf { String output_basename - Boolean is_vqsr_lite + Boolean is_vets String real_time_genomics_docker Int cpu = 1 @@ -414,7 +414,7 @@ task EvaluateVcf { Int disk_size_gb = ceil(2 * size(ref_fasta, "GiB")) + 500 } - String max_score_field_tag = if (is_vqsr_lite == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD' + String max_score_field_tag = if (is_vets == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD' command <<< chromosomes=( ~{sep=' ' chromosomes} ) @@ -436,7 +436,7 @@ task EvaluateVcf { ~{if all_records then "--all-records" else ""} \ --roc-subset snp,indel \ --vcf-score-field=INFO.~{max_score_field_tag} \ - ~{if is_vqsr_lite then "--sort-order ascending" else "--sort-order descending"} \ + ~{if is_vets then "--sort-order ascending" else "--sort-order descending"} \ -t human_REF_SDF \ -b ~{truth_vcf} \ -e ~{truth_bed}\ diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 18a9f00b150..ecf5d3f5701 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -1,8 +1,8 @@ version 1.0 import "GvsUtils.wdl" as Utils -import "GvsVQSRClassic.wdl" as VQSRClassic -import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite +import "GvsVQSR.wdl" as VQSR +import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VETS workflow GvsCreateFilterSet { input { @@ -21,16 +21,16 @@ workflow GvsCreateFilterSet { String? git_hash File? gatk_override - Boolean use_VQSR_lite = true + Boolean use_VETS = true - Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 - Int? INDEL_VQSR_CLASSIC_mem_gb_override - Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 - Int? SNP_VQSR_CLASSIC_mem_gb_override + Int? INDEL_VQSR_max_gaussians_override = 4 + Int? INDEL_VQSR_mem_gb_override + Int? SNP_VQSR_max_gaussians_override = 6 + Int? SNP_VQSR_mem_gb_override - RuntimeAttributes? vqsr_lite_extract_runtime_attributes = {"command_mem_gb": 27} - RuntimeAttributes? vqsr_lite_train_runtime_attributes = {"command_mem_gb": 27} - RuntimeAttributes? vqsr_lite_score_runtime_attributes = {"command_mem_gb": 15} + RuntimeAttributes? vets_extract_runtime_attributes = {"command_mem_gb": 27} + RuntimeAttributes? vets_train_runtime_attributes = {"command_mem_gb": 27} + RuntimeAttributes? vets_score_runtime_attributes = {"command_mem_gb": 15} File? training_python_script File? 
scoring_python_script @@ -127,10 +127,10 @@ workflow GvsCreateFilterSet { gatk_docker = effective_gatk_docker, } - # From this point, the paths diverge depending on whether they're using classic VQSR or VQSR-Lite - # The first branch here is VQSR-Lite, and the second is classic VQSR - if (use_VQSR_lite) { - call VQSRLite.JointVcfFiltering as JointVcfFiltering { + # From this point, the paths diverge depending on whether they're using VQSR or VETS + # The first branch here is VETS, and the second is VQSR + if (use_VETS) { + call VETS.JointVcfFiltering as JointVcfFiltering { input: input_vcfs = ExtractFilterTask.output_vcf, input_vcf_idxs = ExtractFilterTask.output_vcf_index, @@ -141,9 +141,9 @@ workflow GvsCreateFilterSet { resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:axiom,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", extract_extra_args = "-L ${interval_list}", score_extra_args = "-L ${interval_list}", - extract_runtime_attributes = vqsr_lite_extract_runtime_attributes, - train_runtime_attributes = vqsr_lite_train_runtime_attributes, - score_runtime_attributes = vqsr_lite_score_runtime_attributes, + extract_runtime_attributes = vets_extract_runtime_attributes, + train_runtime_attributes = vets_train_runtime_attributes, + score_runtime_attributes = vets_score_runtime_attributes, gatk_docker = effective_gatk_docker, gatk_override = gatk_override, monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh", @@ -198,12 +198,12 @@ workflow GvsCreateFilterSet { indel_recal_file = CreateFilteredScoredINDELsVCF.output_vcf, indel_recal_file_index = CreateFilteredScoredINDELsVCF.output_vcf_index, project_id = project_id, - useClassic = false + useVQSR = false } } - if (!use_VQSR_lite) { - call VQSRClassic.JointVcfFiltering as VQSRClassic { + if (!use_VETS) { + call VQSR.JointVcfFiltering as VQSR { input: git_branch_or_tag = git_branch_or_tag, git_hash = git_hash, @@ -218,10 +218,10 @@ workflow GvsCreateFilterSet { sites_only_variant_filtered_vcf_idx = MergeVCFs.output_vcf_index, sites_only_variant_filtered_vcfs = ExtractFilterTask.output_vcf, sites_only_variant_filtered_vcf_idxs = ExtractFilterTask.output_vcf_index, - INDEL_VQSR_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, - INDEL_VQSR_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, - SNP_VQSR_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, - SNP_VQSR_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override, + INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, + INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, + SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, + SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override, gatk_docker = effective_gatk_docker, gatk_override = gatk_override, } @@ -255,7 +255,7 @@ workflow GvsCreateFilterSet { [CreateFilteredScoredSNPsVCF.monitoring_log], [CreateFilteredScoredINDELsVCF.monitoring_log], 
[PopulateFilterSetInfo.monitoring_log], - select_first([VQSRClassic.monitoring_logs, []]), + select_first([VQSR.monitoring_logs, []]), [PopulateFilterSetSites.monitoring_log] ] ) diff --git a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl index 355b0468617..9ddbe7094fc 100644 --- a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl +++ b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl @@ -27,7 +27,6 @@ workflow GvsCreateVATfromVDS { Int? split_intervals_disk_size_override Int? split_intervals_mem_override Int? split_intervals_scatter_count - Boolean use_classic_VQSR = false Boolean use_reference_disk = true String? cloud_sdk_docker @@ -144,7 +143,6 @@ workflow GvsCreateVATfromVDS { call GenerateSitesOnlyVcf { input: vds_path = select_first([vds_path]), - use_classic_VQSR = use_classic_VQSR, workspace_project = effective_google_project, hail_version = effective_hail_version, hail_wheel = hail_wheel, @@ -310,7 +308,6 @@ workflow GvsCreateVATfromVDS { task GenerateSitesOnlyVcf { input { String vds_path - Boolean use_classic_VQSR String workspace_project String workspace_bucket String region diff --git a/scripts/variantstore/wdl/GvsCreateVDS.wdl b/scripts/variantstore/wdl/GvsCreateVDS.wdl index b40f9f26918..5f8fdcb8d52 100644 --- a/scripts/variantstore/wdl/GvsCreateVDS.wdl +++ b/scripts/variantstore/wdl/GvsCreateVDS.wdl @@ -19,7 +19,7 @@ workflow GvsCreateVDS { Int? cluster_max_age_minutes Boolean leave_cluster_running_at_end = false Float? master_memory_fraction - Boolean use_classic_VQSR = false + Boolean use_VQSR = false String? git_branch_or_tag String? hail_version @@ -111,7 +111,7 @@ workflow GvsCreateVDS { prefix = cluster_prefix, vds_path = vds_destination_path, avro_path = avro_path, - use_classic_VQSR = use_classic_VQSR, + use_VQSR = use_VQSR, hail_version = effective_hail_version, hail_wheel = hail_wheel, hail_temp_path = hail_temp_path, @@ -158,7 +158,7 @@ task CreateVds { String prefix String vds_path String avro_path - Boolean use_classic_VQSR + Boolean use_VQSR Boolean leave_cluster_running_at_end File hail_gvs_import_script File gvs_import_script @@ -233,7 +233,7 @@ task CreateVds { "temp-path": "${hail_temp_path}", "avro-path": "~{avro_path}" ~{', "intermediate-resume-point": ' + intermediate_resume_point} - ~{true=', "use-classic-vqsr": ""' false='' use_classic_VQSR} + ~{true=', "use-vqsr": ""' false='' use_VQSR} } FIN diff --git a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl index 76e82011e5d..4c67e650bd2 100644 --- a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl +++ b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl @@ -12,7 +12,7 @@ workflow GvsExtractAvroFilesForHail { String dataset_name String filter_set_name String call_set_identifier - Boolean use_VQSR_lite = true + Boolean use_VETS = true Int scatter_width = 10 String? basic_docker String? 
cloud_sdk_docker @@ -43,7 +43,7 @@ workflow GvsExtractAvroFilesForHail { cloud_sdk_docker = effective_cloud_sdk_docker, } - call Utils.IsVQSRLite { + call Utils.IsVETS { input: project_id = project_id, fq_filter_set_info_table = "~{project_id}.~{dataset_name}.filter_set_info", @@ -74,7 +74,7 @@ workflow GvsExtractAvroFilesForHail { filter_set_name = filter_set_name, avro_sibling = OutputPath.out, call_set_identifier = call_set_identifier, - is_vqsr_lite = IsVQSRLite.is_vqsr_lite, + is_vets = IsVETS.is_vets, variants_docker = effective_variants_docker, } @@ -203,7 +203,7 @@ task ExtractFromSampleInfoTable { task ExtractFromFilterTables { meta { - description: "Extracts from the tables: filter_set_sites, filter_set_info/filter_set_info_vqsr_lite, and filter_set_tranches (if using VQSR Classic)" + description: "Extracts from the tables: filter_set_sites, filter_set_info, and filter_set_tranches (if using VQSR)" # Not dealing with caching for now as that would introduce a lot of complexity. volatile: true } @@ -214,11 +214,11 @@ task ExtractFromFilterTables { String filter_set_name String avro_sibling String call_set_identifier - Boolean is_vqsr_lite = true + Boolean is_vets = true String variants_docker } - String vqs_score_field = if (is_vqsr_lite == true) then 'calibration_sensitivity' else 'vqslod' + String vqs_score_field = if (is_vets == true) then 'calibration_sensitivity' else 'vqslod' parameter_meta { avro_sibling: "Cloud path to a file that will be the sibling to the 'avro' 'directory' under which output Avro files will be written." @@ -249,7 +249,7 @@ task ExtractFromFilterTables { ORDER BY location " --call_set_identifier ~{call_set_identifier} --dataset_name ~{dataset_name} --table_name filter_set_sites --project_id=~{project_id} - if [ ~{is_vqsr_lite} = false ]; then + if [ ~{is_vets} = false ]; then python3 /app/run_avro_query.py --sql " EXPORT DATA OPTIONS( uri='${avro_prefix}/vqsr_tranche_data/vqsr_tranche_data_*.avro', format='AVRO', compression='SNAPPY') AS diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl index 67b6a4a0477..cc07c0ef4e5 100644 --- a/scripts/variantstore/wdl/GvsExtractCallset.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl @@ -182,7 +182,7 @@ workflow GvsExtractCallset { cloud_sdk_docker = effective_cloud_sdk_docker, } - call Utils.IsVQSRLite { + call Utils.IsVETS { input: project_id = query_project, fq_filter_set_info_table = "~{fq_filter_set_info_table}", @@ -191,9 +191,9 @@ workflow GvsExtractCallset { } } - # If we're not using the VQSR filters, set it to Lite (really shouldn't matter one way or the other) + # If we're not using the VQSR filters, set it to VETS (really shouldn't matter one way or the other) # Otherwise use the auto-derived flag. 
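# Illustrative sketch, not part of this patch: Utils.IsVETS above keys the mode off
# which score column is populated in BigQuery. A minimal standalone query in the same
# spirit -- the filter_set_info table and column names come from this diff, while the
# project/dataset placeholders are hypothetical:
#   bq query --use_legacy_sql=false \
#     'SELECT COUNTIF(calibration_sensitivity IS NOT NULL) AS vets_rows,
#             COUNTIF(vqslod IS NOT NULL) AS vqsr_rows
#      FROM `my-project.my_dataset.filter_set_info`'
# Nonzero vets_rows indicates VETS scoring; nonzero vqsr_rows indicates VQSR.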
- Boolean use_VQSR_lite = select_first([IsVQSRLite.is_vqsr_lite, true]) + Boolean use_VETS = select_first([IsVETS.is_vets, true]) call Utils.GetBQTablesMaxLastModifiedTimestamp { input: @@ -223,7 +223,7 @@ workflow GvsExtractCallset { go = select_first([ValidateFilterSetName.done, true]), dataset_name = dataset_name, call_set_identifier = call_set_identifier, - use_VQSR_lite = use_VQSR_lite, + use_VETS = use_VETS, gatk_docker = effective_gatk_docker, gatk_override = gatk_override, reference = reference, @@ -241,7 +241,7 @@ workflow GvsExtractCallset { fq_filter_set_info_table = fq_filter_set_info_table, fq_filter_set_site_table = fq_filter_set_site_table, fq_ploidy_mapping_table = fq_ploidy_mapping_table, - fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table, + fq_filter_set_tranches_table = if (use_VETS) then none else fq_filter_set_tranches_table, filter_set_name = filter_set_name, drop_state = drop_state, output_file = vcf_filename + vcf_extension, @@ -309,7 +309,7 @@ task ExtractTask { String dataset_name String call_set_identifier - Boolean use_VQSR_lite + Boolean use_VETS File reference File reference_index @@ -382,7 +382,7 @@ task ExtractTask { if [ ~{do_not_filter_override} = true ]; then FILTERING_ARGS='' - elif [ ~{use_VQSR_lite} = false ]; then + elif [ ~{use_VETS} = false ]; then FILTERING_ARGS='--filter-set-info-table ~{fq_filter_set_info_table} --filter-set-site-table ~{fq_filter_set_site_table} --tranches-table ~{fq_filter_set_tranches_table} @@ -419,7 +419,7 @@ task ExtractTask { --project-id ~{read_project_id} \ ~{true='--emit-pls' false='' emit_pls} \ ~{true='--emit-ads' false='' emit_ads} \ - ~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \ + ~{true='' false='--use-vqsr-scoring' use_VETS} \ ~{true='--convert-filtered-genotypes-to-no-calls' false='' convert_filtered_genotypes_to_nocalls} \ ~{'--maximum-alternate-alleles ' + maximum_alternate_alleles} \ ${FILTERING_ARGS} \ diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index be223d201a3..2fe7bce4c62 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -184,7 +184,7 @@ workflow GvsExtractCallsetPgen { cloud_sdk_docker = effective_cloud_sdk_docker, } - call Utils.IsVQSRLite { + call Utils.IsVETS { input: project_id = query_project, fq_filter_set_info_table = "~{fq_filter_set_info_table}", @@ -193,9 +193,9 @@ workflow GvsExtractCallsetPgen { } } - # If we're not using the VQSR filters, set it to Lite (really shouldn't matter one way or the other) + # If we're not using the VQSR filters, set it to VETS (really shouldn't matter one way or the other) # Otherwise use the auto-derived flag. 
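# Illustrative sketch, not part of this patch: the three-way filtering branch used by
# ExtractTask above (and PgenExtractTask below) -- no filtering at all, VQSR with a
# tranches table, or VETS without one. The flags are those visible in this diff; the
# fully qualified table names are hypothetical placeholders:
#   if [ "$do_not_filter" = true ]; then
#     FILTERING_ARGS=''
#   elif [ "$use_vets" = false ]; then
#     FILTERING_ARGS='--filter-set-info-table P.D.filter_set_info --filter-set-site-table P.D.filter_set_sites --tranches-table P.D.filter_set_tranches'
#   else
#     FILTERING_ARGS='--filter-set-info-table P.D.filter_set_info --filter-set-site-table P.D.filter_set_sites'
#   fi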
- Boolean use_VQSR_lite = select_first([IsVQSRLite.is_vqsr_lite, true]) + Boolean use_VETS = select_first([IsVETS.is_vets, true]) call Utils.GetBQTablesMaxLastModifiedTimestamp { input: @@ -226,7 +226,7 @@ workflow GvsExtractCallsetPgen { max_alt_alleles = max_alt_alleles, lenient_ploidy_validation = lenient_ploidy_validation, preserve_phasing = preserve_phasing, - use_VQSR_lite = use_VQSR_lite, + use_VETS = use_VETS, gatk_docker = effective_gatk_docker, gatk_override = gatk_override, reference = reference, @@ -244,7 +244,7 @@ workflow GvsExtractCallsetPgen { do_not_filter_override = do_not_filter_override, fq_filter_set_info_table = fq_filter_set_info_table, fq_filter_set_site_table = fq_filter_set_site_table, - fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table, + fq_filter_set_tranches_table = if (use_VETS) then none else fq_filter_set_tranches_table, filter_set_name = filter_set_name, drop_state = drop_state, output_pgen_basename = pgen_basename, @@ -327,7 +327,7 @@ task PgenExtractTask { # If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes Boolean preserve_phasing = false - Boolean use_VQSR_lite + Boolean use_VETS File reference File reference_index @@ -394,7 +394,7 @@ task PgenExtractTask { if [ ~{do_not_filter_override} = true ]; then FILTERING_ARGS='' - elif [ ~{use_VQSR_lite} = false ]; then + elif [ ~{use_VETS} = false ]; then FILTERING_ARGS='--filter-set-info-table ~{fq_filter_set_info_table} --filter-set-site-table ~{fq_filter_set_site_table} --tranches-table ~{fq_filter_set_tranches_table} @@ -436,7 +436,7 @@ task PgenExtractTask { --project-id ~{read_project_id} \ ~{true='--emit-pls' false='' emit_pls} \ ~{true='--emit-ads' false='' emit_ads} \ - ~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \ + ~{true='' false='--use-vqsr-scoring' use_VETS} \ --convert-filtered-genotypes-to-no-calls \ ${FILTERING_ARGS} \ --dataset-id ~{dataset_name} \ diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index fb6ee176b57..9e740184f15 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -21,7 +21,7 @@ workflow GvsJointVariantCalling { Boolean bgzip_output_vcfs = false String drop_state = "FORTY" - Boolean use_classic_VQSR = false + Boolean use_VQSR = false Boolean use_compressed_references = false Boolean load_vet_and_ref_ranges = true Boolean load_vcf_headers = false @@ -61,10 +61,10 @@ workflow GvsJointVariantCalling { File? target_interval_list # Overrides to be passed to GvsCreateFilterSet - Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 - Int? INDEL_VQSR_CLASSIC_mem_gb_override - Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 - Int? SNP_VQSR_CLASSIC_mem_gb_override + Int? INDEL_VQSR_max_gaussians_override = 4 + Int? INDEL_VQSR_mem_gb_override + Int? SNP_VQSR_max_gaussians_override = 6 + Int? SNP_VQSR_mem_gb_override File? training_python_script File? 
scoring_python_script @@ -168,15 +168,15 @@ workflow GvsJointVariantCalling { project_id = project_id, call_set_identifier = call_set_identifier, filter_set_name = effective_filter_set_name, - use_VQSR_lite = !use_classic_VQSR, + use_VETS = !use_VQSR, interval_list = interval_list_to_use, variants_docker = effective_variants_docker, gatk_docker = effective_gatk_docker, gatk_override = gatk_override, - INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, - INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, - SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, - SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override, + INDEL_VQSR_max_gaussians_override = INDEL_VQSR_max_gaussians_override, + INDEL_VQSR_mem_gb_override = INDEL_VQSR_mem_gb_override, + SNP_VQSR_max_gaussians_override = SNP_VQSR_max_gaussians_override, + SNP_VQSR_mem_gb_override = SNP_VQSR_mem_gb_override, cloud_sdk_docker = effective_cloud_sdk_docker, training_python_script = training_python_script, scoring_python_script = scoring_python_script, diff --git a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl index 78e2c6b93fb..85a006e55dd 100644 --- a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl @@ -11,7 +11,7 @@ workflow GvsQuickstartHailIntegration { String? git_hash Boolean is_wgs File? interval_list - Boolean use_VQSR_lite = true + Boolean use_VETS = true Boolean use_compressed_references = false Boolean extract_do_not_filter_override String dataset_suffix = "hail" @@ -67,7 +67,7 @@ workflow GvsQuickstartHailIntegration { git_branch_or_tag = git_branch_or_tag, git_hash = git_hash, drop_state = "ZERO", - use_VQSR_lite = use_VQSR_lite, + use_VETS = use_VETS, use_compressed_references = use_compressed_references, extract_do_not_filter_override = extract_do_not_filter_override, load_vcf_headers = true, @@ -100,7 +100,7 @@ workflow GvsQuickstartHailIntegration { git_branch_or_tag = git_branch_or_tag, git_hash = git_hash, project_id = project_id, - use_VQSR_lite = use_VQSR_lite, + use_VETS = use_VETS, dataset_name = GvsQuickstartVcfIntegration.dataset_name, filter_set_name = GvsQuickstartVcfIntegration.filter_set_name, scatter_width = 10, @@ -114,7 +114,7 @@ workflow GvsQuickstartHailIntegration { input: git_branch_or_tag = git_branch_or_tag, hail_version = effective_hail_version, - use_classic_VQSR = !use_VQSR_lite, + use_VQSR = !use_VETS, avro_path = GvsExtractAvroFilesForHail.avro_prefix, vds_destination_path = GvsExtractAvroFilesForHail.vds_output_path, cluster_prefix = "vds-cluster", diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl index 91de5ae2ae3..f3491f0cabd 100644 --- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl @@ -34,7 +34,7 @@ workflow GvsQuickstartIntegration { File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list" String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else "" - File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-07-03/" + expected_subdir + File expected_output_prefix = 
"gs://gvs-internal-quickstart/integration/2024-08-13/" + expected_subdir # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { @@ -77,13 +77,13 @@ workflow GvsQuickstartIntegration { # though in practice likely they are the same. if (run_hail_integration) { # This test workflow is probably best representative of the AoU workflow. Parameters used here should be those used for AoU callsets - call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRLiteIntegration { + call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVETSIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = true, + use_VETS = true, extract_do_not_filter_override = false, - dataset_suffix = "lite_hail", + dataset_suffix = "vets_hail", use_default_dockers = use_default_dockers, gatk_override = if (use_default_dockers) then none else BuildGATKJar.jar, is_wgs = true, @@ -105,13 +105,13 @@ workflow GvsQuickstartIntegration { hail_version = effective_hail_version, maximum_alternate_alleles = maximum_alternate_alleles, } - call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRClassicIntegration { + call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = false, + use_VETS = false, extract_do_not_filter_override = false, - dataset_suffix = "classic_hail", + dataset_suffix = "vqsr_hail", use_default_dockers = use_default_dockers, gatk_override = if (use_default_dockers) then none else BuildGATKJar.jar, is_wgs = true, @@ -133,31 +133,31 @@ workflow GvsQuickstartIntegration { maximum_alternate_alleles = maximum_alternate_alleles, } - if (GvsQuickstartHailVQSRLiteIntegration.used_tighter_gcp_quotas) { - call Utils.TerminateWorkflow as HailVQSRLiteQuotaFail { + if (GvsQuickstartHailVETSIntegration.used_tighter_gcp_quotas) { + call Utils.TerminateWorkflow as HailVETSQuotaFail { input: - message = "GvsQuickstartHailVQSRLiteIntegration should not have used tighter GCP quotas but did!", + message = "GvsQuickstartHailVETSIntegration should not have used tighter GCP quotas but did!", basic_docker = effective_basic_docker, } } - if (GvsQuickstartHailVQSRClassicIntegration.used_tighter_gcp_quotas) { - call Utils.TerminateWorkflow as HailVQSRClassicQuotaFail { + if (GvsQuickstartHailVQSRIntegration.used_tighter_gcp_quotas) { + call Utils.TerminateWorkflow as HailVQSRQuotaFail { input: - message = "GvsQuickstartHailVQSRClassicIntegration should not have used tighter GCP quotas but did!", + message = "GvsQuickstartHailVQSRIntegration should not have used tighter GCP quotas but did!", basic_docker = effective_basic_docker, } } } if (run_vcf_integration) { - call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRLiteIntegration { + call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVETSIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = true, + use_VETS = true, extract_do_not_filter_override = false, - dataset_suffix = "lite_vcf", + dataset_suffix = "vets_vcf", use_default_dockers = use_default_dockers, gatk_override = if (use_default_dockers) then none else BuildGATKJar.jar, is_wgs = true, @@ -178,13 +178,13 @@ workflow GvsQuickstartIntegration { submission_id = GetToolVersions.submission_id, maximum_alternate_alleles = 
maximum_alternate_alleles, } - call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRClassicIntegration { + call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = false, + use_VETS = false, extract_do_not_filter_override = true, - dataset_suffix = "classic_vcf", + dataset_suffix = "vqsr_vcf", use_default_dockers = use_default_dockers, gatk_override = if (use_default_dockers) then none else BuildGATKJar.jar, is_wgs = true, @@ -206,18 +206,18 @@ workflow GvsQuickstartIntegration { maximum_alternate_alleles = maximum_alternate_alleles, } - if (QuickstartVcfVQSRClassicIntegration.used_tighter_gcp_quotas) { - call Utils.TerminateWorkflow as VcfVQSRClassicQuotaFail { + if (QuickstartVcfVQSRIntegration.used_tighter_gcp_quotas) { + call Utils.TerminateWorkflow as VcfVQSRQuotaFail { input: - message = "QuickstartVcfVQSRLiteIntegration should not have used tighter GCP quotas but did!", + message = "QuickstartVcfVQSRIntegration should not have used tighter GCP quotas but did!", basic_docker = effective_basic_docker, } } - if (QuickstartVcfVQSRLiteIntegration.used_tighter_gcp_quotas) { - call Utils.TerminateWorkflow as VcfVQSRLiteQuotaFail { + if (QuickstartVcfVETSIntegration.used_tighter_gcp_quotas) { + call Utils.TerminateWorkflow as VcfVETSQuotaFail { input: - message = "QuickstartVcfVQSRLiteIntegration should not have used tighter GCP quotas but did!", + message = "QuickstartVcfVETSIntegration should not have used tighter GCP quotas but did!", basic_docker = effective_basic_docker, } } @@ -228,7 +228,7 @@ workflow GvsQuickstartIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = true, + use_VETS = true, extract_do_not_filter_override = false, dataset_suffix = "exome", use_default_dockers = use_default_dockers, @@ -267,7 +267,7 @@ workflow GvsQuickstartIntegration { input: git_branch_or_tag = git_branch_or_tag, git_hash = GetToolVersions.git_hash, - use_VQSR_lite = true, + use_VETS = true, extract_do_not_filter_override = false, dataset_suffix = "bge", use_default_dockers = use_default_dockers, diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl index 55ed3229fa9..375229db061 100644 --- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl @@ -8,7 +8,7 @@ workflow GvsQuickstartVcfIntegration { String git_branch_or_tag String? 
git_hash
String expected_output_prefix
- Boolean use_VQSR_lite = true
+ Boolean use_VETS = true
Boolean extract_do_not_filter_override = true
Boolean use_compressed_references = false
Boolean load_vcf_headers = false
@@ -86,13 +86,13 @@ workflow GvsQuickstartVcfIntegration {
dataset_name = CreateDatasetForTest.dataset_name,
project_id = project_id,
gatk_override = if (use_default_dockers) then none else select_first([gatk_override, BuildGATKJar.jar]),
- use_classic_VQSR = !use_VQSR_lite,
+ use_VQSR = !use_VETS,
use_compressed_references = use_compressed_references,
load_vcf_headers = load_vcf_headers,
extract_output_file_base_name = "quickit",
filter_set_name = "quickit",
extract_table_prefix = "quickit",
- # optionally turn off filtering (VQSR Classic is not deterministic)
+ # optionally turn off filtering (VQSR is not deterministic)
# (and the initial version of this integration test does not allow for inexact matching of actual and expected results.)
extract_do_not_filter_override = extract_do_not_filter_override,
drop_state = drop_state,
@@ -118,8 +118,8 @@ workflow GvsQuickstartVcfIntegration {
extract_output_gcs_dir = extract_output_gcs_dir,
}
- # Only assert identical outputs if we did not filter (filtering is not deterministic) OR if we are using VQSR Lite (which is deterministic)
- if (extract_do_not_filter_override || use_VQSR_lite) {
+ # Only assert identical outputs if we did not filter (filtering is not deterministic) OR if we are using VETS (which is deterministic)
+ if (extract_do_not_filter_override || use_VETS) {
String expected_prefix = expected_output_prefix + dataset_suffix + "/"
call AssertIdenticalOutputs {
input:
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index cf46eb54962..e3915b7693c 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -68,13 +68,13 @@ task GetToolVersions {
String git_hash = read_string("git_hash.txt")
String hail_version = "0.2.126"
String basic_docker = "ubuntu:22.04"
- String cloud_sdk_docker = cloud_sdk_docker_decl  # Defined above as a declaration.
+ String cloud_sdk_docker = cloud_sdk_docker_decl # Defined above as a declaration.
# GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
# there are a handful of tasks that require the larger GNU libc-based `slim`. 
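# Illustrative sketch, not part of this patch: the "assert identical outputs" path in
# GvsQuickstartVcfIntegration above only runs for deterministic configurations (no
# filtering, or VETS). A manual spot check in the same spirit, with hypothetical paths:
#   gsutil cp "${expected_prefix}/quickit.vcf.gz" expected.vcf.gz
#   gsutil cp "${actual_prefix}/quickit.vcf.gz" actual.vcf.gz
#   cmp <(gunzip -c expected.vcf.gz) <(gunzip -c actual.vcf.gz) && echo identical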
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-08-13-alpine-fe21da898f54" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-08-15-alpine-254df9be288d" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" - String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-07-23-gatkbase-abbe96265d5f" + String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_08_08-gatkbase-7dc245ec27ce" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" @@ -857,7 +857,7 @@ task ValidateFilterSetName { } } -task IsVQSRLite { +task IsVETS { input { String project_id String fq_filter_set_info_table @@ -871,7 +871,7 @@ task IsVQSRLite { # add labels for DSP Cloud Cost Control Labeling and Reporting String bq_labels = "--label service:gvs --label team:variants --label managedby:gvs_utils" - String is_vqsr_lite_file = "is_vqsr_lite_file.txt" + String is_vets_file = "is_vets_file.txt" command <<< # Prepend date, time and pwd to xtrace log entries. @@ -887,27 +887,27 @@ task IsVQSRLite { AND calibration_sensitivity IS NOT NULL; EXCEPTION WHEN ERROR THEN SELECT "0" AS counted ; - END' | tail -1 > lite_count_file.txt - LITE_COUNT=`cat lite_count_file.txt` + END' | tail -1 > vets_count_file.txt + VETS_COUNT=`cat vets_count_file.txt` # bq query --max_rows check: ok one row bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'SELECT COUNT(1) FROM `~{fq_filter_set_info_table}` WHERE filter_set_name = "~{filter_set_name}" - AND vqslod IS NOT NULL' | tail -1 > classic_count_file.txt - CLASSIC_COUNT=`cat classic_count_file.txt` + AND vqslod IS NOT NULL' | tail -1 > vqsr_count_file.txt + VQSR_COUNT=`cat vqsr_count_file.txt` - if [[ $LITE_COUNT != "0" ]]; then - echo "Found $LITE_COUNT rows with calibration_sensitivity defined" - if [[ $CLASSIC_COUNT != "0" ]]; then - echo "Found $CLASSIC_COUNT rows with vqslod defined" + if [[ $VETS_COUNT != "0" ]]; then + echo "Found $VETS_COUNT rows with calibration_sensitivity defined" + if [[ $VQSR_COUNT != "0" ]]; then + echo "Found $VQSR_COUNT rows with vqslod defined" echo "ERROR - can't have both defined for a filter_set" exit 1 fi - echo "true" > ~{is_vqsr_lite_file} - elif [[ $CLASSIC_COUNT != "0" ]]; then - echo "Found $CLASSIC_COUNT rows with vqslod defined" - echo "false" > ~{is_vqsr_lite_file} + echo "true" > ~{is_vets_file} + elif [[ $VQSR_COUNT != "0" ]]; then + echo "Found $VQSR_COUNT rows with vqslod defined" + echo "false" > ~{is_vets_file} else echo "Found NO rows with either calibration_sensitivity or vqslod defined" exit 1 @@ -915,7 +915,7 @@ task IsVQSRLite { >>> output { - Boolean is_vqsr_lite = read_boolean(is_vqsr_lite_file) + Boolean is_vets = read_boolean(is_vets_file) } runtime { @@ -1222,15 +1222,15 @@ task SummarizeTaskMonitorLogs { } } -# Note - this task should probably live in GvsCreateFilterSet, but I moved it here when I was refactoring VQSR Classic out of +# Note - this task should probably live in GvsCreateFilterSet, but I moved it here when I was refactoring VQSR out of # GvsCreateFilterSet (in order to 
avoid a circular dependency) -# When VQSR Classic is removed, consider putting this task back in GvsCreateFilterSet +# When VQSR is removed entirely, consider putting this task back in GvsCreateFilterSet task PopulateFilterSetInfo { input { String filter_set_name String filter_schema String fq_filter_set_info_destination_table - Boolean useClassic = false + Boolean useVQSR = false File snp_recal_file File snp_recal_file_index @@ -1271,7 +1271,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode SNP \ - --classic ~{useClassic} \ + --use-vqsr ~{useVQSR} \ -V ~{snp_recal_file} \ -O ~{filter_set_name}.snps.recal.tsv @@ -1281,7 +1281,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode INDEL \ - --classic ~{useClassic} \ + --use-vqsr ~{useVQSR} \ -V ~{indel_recal_file} \ -O ~{filter_set_name}.indels.recal.tsv diff --git a/scripts/variantstore/wdl/GvsVQSRClassic.wdl b/scripts/variantstore/wdl/GvsVQSR.wdl similarity index 99% rename from scripts/variantstore/wdl/GvsVQSRClassic.wdl rename to scripts/variantstore/wdl/GvsVQSR.wdl index 5b090b23314..058c7319e1b 100644 --- a/scripts/variantstore/wdl/GvsVQSRClassic.wdl +++ b/scripts/variantstore/wdl/GvsVQSR.wdl @@ -33,7 +33,7 @@ workflow JointVcfFiltering { # This is the minimum number of samples where the SNP model will be created and applied in separate tasks # (SNPsVariantRecalibratorClassic vs. SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered) - # For VQSR classic this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least + # For VQSR this is done with 20k but the 10K Stroke Anderson dataset would not work unscattered (at least # with the default VM memory settings) so this was adjusted down to 5K. 
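# For reference: this threshold feeds the gate shown in JointGenotyping.wdl at the top
# of this diff -- callsets at or below it recalibrate SNPs in the single
# SNPsVariantRecalibrator task, while larger callsets split the work across
# SNPsVariantRecalibratorCreateModel and SNPsVariantRecalibratorScattered:
#   if (num_gvcfs <= snps_variant_recalibration_threshold) {
#     call Tasks.SNPsVariantRecalibrator { ... }
#   }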
Int snps_variant_recalibration_threshold = 5000 } @@ -214,7 +214,7 @@ workflow JointVcfFiltering { indel_recal_file = IndelsVariantRecalibrator.recalibration, indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, project_id = project_id, - useClassic = true + useVQSR = true } call PopulateFilterSetTranches { diff --git a/scripts/variantstore/wdl/extract/hail_gvs_import.py b/scripts/variantstore/wdl/extract/hail_gvs_import.py index b5151a936e8..9d0e930f643 100644 --- a/scripts/variantstore/wdl/extract/hail_gvs_import.py +++ b/scripts/variantstore/wdl/extract/hail_gvs_import.py @@ -14,7 +14,7 @@ gcs_re = re.compile("^gs://(?P[^/]+)/(?P.*)$") -def create_vds(argsfn, vds_path, references_path, temp_path, use_classic_vqsr, intermediate_resume_point): +def create_vds(argsfn, vds_path, references_path, temp_path, use_vqsr, intermediate_resume_point): import hail as hl import import_gvs from hail.utils.java import Env @@ -47,7 +47,7 @@ def create_vds(argsfn, vds_path, references_path, temp_path, use_classic_vqsr, i # partitions_per_sample=0.35, # check with Hail about how to tune this for your large callset # intermediate_resume_point=0, # if your first run fails, and you want to use the intermediate files that already exist, check in with Hail to find out what stage to resume on # skip_final_merge=false, # if you want to create your VDS in two steps (because of mem issues) this can be skipped until the final run - use_classic_vqsr=use_classic_vqsr, + use_vqsr=use_vqsr, intermediate_resume_point=intermediate_resume_point ) finally: @@ -145,8 +145,8 @@ def regular_handler(): required=True) parser.add_argument('--references-path', type=str, help='Path to references, only required for local files', required=False) - parser.add_argument("--use-classic-vqsr", action="store_true", - help="If set, expect that the input GVS Avro files were generated using VQSR Classic") + parser.add_argument("--use-vqsr", action="store_true", + help="If set, expect that the input GVS Avro files were generated using VQSR") parser.add_argument('--intermediate-resume-point', type=int, required=False, default=0, help='Intermediate VDS index at which to resume') @@ -155,7 +155,7 @@ def regular_handler(): # Remove trailing slashes if present. 
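# e.g. 'gs://bucket/avro/' -> 'gs://bucket/avro'; paths without a trailing slash pass through unchanged.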
avro_path, temp_path, vds_path = [p if not p.endswith('/') else p[:-1] for p in [args.avro_path, args.temp_path, args.vds_path]] - use_classic_vqsr = args.use_classic_vqsr + use_vqsr = args.use_vqsr is_gcs = [gcs_re.match(p) for p in [avro_path, temp_path, vds_path]] is_not_gcs = [not g for g in is_gcs] @@ -166,7 +166,7 @@ def regular_handler(): def arguments(key): return gcs_generate_avro_args(avro_bucket, avro_object_prefix, key) - create_vds(arguments, vds_path, 'gs://hail-common/references', temp_path, use_classic_vqsr, + create_vds(arguments, vds_path, 'gs://hail-common/references', temp_path, use_vqsr, args.intermediate_resume_point) elif all(is_not_gcs): @@ -179,7 +179,7 @@ def arguments(key): def arguments(key): return local_generate_avro_args(avro_path, key) - create_vds(arguments, vds_path, references_path, temp_path, use_classic_vqsr, + create_vds(arguments, vds_path, references_path, temp_path, use_vqsr, args.intermediate_resume_point) else: raise ValueError("Arguments appear to be some unsavory mix of GCS and local paths, all or nothing please.") diff --git a/scripts/variantstore/wdl/extract/import_gvs.py b/scripts/variantstore/wdl/extract/import_gvs.py index dfe5f218e37..069b18f7f19 100644 --- a/scripts/variantstore/wdl/extract/import_gvs.py +++ b/scripts/variantstore/wdl/extract/import_gvs.py @@ -22,7 +22,7 @@ intermediate_resume_point=int, skip_final_merge=bool, ref_block_max_length=int, - use_classic_vqsr=bool + use_vqsr=bool ) def import_gvs(refs: 'List[List[str]]', vets: 'List[List[str]]', @@ -39,7 +39,7 @@ def import_gvs(refs: 'List[List[str]]', intermediate_resume_point=0, skip_final_merge=False, ref_block_max_length: 'int' = 1000, - use_classic_vqsr=False + use_vqsr=False ): """Import a collection of Avro files exported from GVS. @@ -132,8 +132,8 @@ def import_gvs(refs: 'List[List[str]]', Skip final merge if true. ref_block_max_length : :class:`int` Maximum reference block length. 
- use_classic_vqsr : :class:`bool` - Expect input Avro files to have been generated from VQSR 'Classic' data + use_vqsr : :class:`bool` + Expect input Avro files to have been generated from VQSR (NOT VETS) data Script workflow: --------------- @@ -200,8 +200,8 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]): vqsr = vqsr.key_by('locus') vqsr.write(vqsr_path, overwrite=True) - if use_classic_vqsr: - info('vqsr_classic: Loading tranche data') + if use_vqsr: + info('vqsr: Loading tranche data') tranche = hl.import_avro(vqsr_tranche_data) n_samples = 0 @@ -339,7 +339,7 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]): vd = vd.annotate_rows(as_vqsr = hl.dict(vqsr.index(vd.locus, all_matches=True) .map(lambda record: (record.alt + vd.alleles[0][hl.len(record.ref):], record.drop('ref', 'alt'))))) - if use_classic_vqsr: + if use_vqsr: vd = vd.annotate_globals(tranche_data=tranche.collect(_localize=False), truth_sensitivity_snp_threshold=truth_sensitivity_snp_threshold, truth_sensitivity_indel_threshold=truth_sensitivity_indel_threshold) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java index 76e819ae761..a749d83a86a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java @@ -32,8 +32,6 @@ omitFromCommandLine = true ) public final class CreateFilteringFiles extends VariantWalker { - static final Logger logger = LogManager.getLogger(CreateVariantIngestFiles.class); - private SimpleXSVWriter writer; private final List HEADER = @@ -64,10 +62,10 @@ public final class CreateFilteringFiles extends VariantWalker { private String mode; @Argument( - fullName = "classic", - doc = "Whether or not this is using classic VQSR or the newer VQSR-Lite", + fullName = "use-vqsr", + doc = "Whether or not this is using VQSR or VETS", optional = true) - private Boolean usingOldVQSR = null; + private Boolean useVQSR = null; @Override public boolean requiresIntervals() { @@ -82,8 +80,8 @@ public void onTraversalStart() { throw new GATKException("Unable to initialize writer", ioe); } - if (usingOldVQSR == null) { // default to using the old, or "classic" VQSR if the user specifies nothing - usingOldVQSR = Boolean.TRUE; + if (useVQSR == null) { // default to using VQSR if the user specifies nothing + useVQSR = Boolean.TRUE; } writer.setHeaderLine(HEADER); @@ -110,7 +108,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, String culprit = ""; String trainingLabel = ""; String yng = ""; - if (!usingOldVQSR) { + if (!useVQSR) { calibration_sensitivity = variant.getAttributeAsString("CALIBRATION_SENSITIVITY",""); score = variant.getAttributeAsString("SCORE", ""); trainingLabel = variant.hasAttribute("training") ? "POSITIVE" : "";
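Reviewer note, not part of the patch: a minimal sketch of a CreateFilteringFiles
invocation under the renamed flag, assembled from the PopulateFilterSetInfo command
earlier in this diff. The `gatk` launcher usage and the concrete file names here are
assumptions, not taken from this PR:

  gatk CreateFilteringFiles \
    --ref-version 38 \
    --filter-set-name my_filter_set \
    -mode SNP \
    --use-vqsr false \
    -V my_filter_set.snps.recal.vcf.gz \
    -O my_filter_set.snps.recal.tsv

When --use-vqsr is omitted, the tool still defaults to true (VQSR), per the
onTraversalStart logic above.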