From 1288478da48947d76855722dcf261714e1dcc078 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Thu, 13 Jul 2023 16:09:21 -0400 Subject: [PATCH] Beta is the new Unified --- .dockstore.yml | 7 - .../wdl/GvsJointVariantCalling.wdl | 114 +++++++++---- .../wdl/GvsQuickstartVcfIntegration.wdl | 1 - scripts/variantstore/wdl/GvsUnified.wdl | 161 ------------------ 4 files changed, 80 insertions(+), 203 deletions(-) delete mode 100644 scripts/variantstore/wdl/GvsUnified.wdl diff --git a/.dockstore.yml b/.dockstore.yml index a744fa0b804..34ad49dff8e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -171,13 +171,6 @@ workflows: branches: - master - ah_var_store - - name: GvsUnified - subclass: WDL - primaryDescriptorPath: /scripts/variantstore/wdl/GvsUnified.wdl - filters: - branches: - - master - - ah_var_store - name: GvsJointVariantCalling subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsJointVariantCalling.wdl diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index 2b2dea596b2..23f7b6fcb3c 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -1,6 +1,10 @@ version 1.0 -import "GvsUnified.wdl" as GvsUnified +import "GvsBulkIngestGenomes.wdl" as BulkIngestGenomes +import "GvsPopulateAltAllele.wdl" as PopulateAltAllele +import "GvsCreateFilterSet.wdl" as CreateFilterSet +import "GvsPrepareRangesCallset.wdl" as PrepareRangesCallset +import "GvsExtractCallset.wdl" as ExtractCallset workflow GvsJointVariantCalling { input { @@ -29,6 +33,12 @@ workflow GvsJointVariantCalling { String extract_output_file_base_name = sub(call_set_identifier, "\\s+|\_+", "-") String extract_table_prefix = sub(call_set_identifier, "\\s+|\_+", "-") String filter_set_name = sub(call_set_identifier, "\\s+|\_+", "-") + + String query_project = project_id + String destination_project = project_id + String destination_dataset = dataset_name 
+ String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}" + + if (false) { Int extract_maxretries_override = "" Int extract_preemptible_override = "" @@ -48,53 +58,89 @@ workflow GvsJointVariantCalling { File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed" - call GvsUnified.GvsUnified { + + call BulkIngestGenomes.GvsBulkIngestGenomes as BulkIngestGenomes { + input: + dataset_name = dataset_name, + project_id = project_id, + gatk_override = gatk_override, + interval_list = interval_list, + drop_state = drop_state, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, + } + + call PopulateAltAllele.GvsPopulateAltAllele { input: call_set_identifier = call_set_identifier, + go = BulkIngestGenomes.done, dataset_name = dataset_name, project_id = project_id, + } + + call CreateFilterSet.GvsCreateFilterSet { + input: + go = GvsPopulateAltAllele.done, + dataset_name = dataset_name, + project_id = project_id, + call_set_identifier = call_set_identifier, filter_set_name = filter_set_name, - use_VQSR_lite = !use_classic_VQSR, - extract_output_gcs_dir = extract_output_gcs_dir, - destination_dataset = dataset_name, - destination_project = project_id, - extract_do_not_filter_override = extract_do_not_filter_override, - extract_maxretries_override = extract_maxretries_override, - extract_output_file_base_name = extract_output_file_base_name, - extract_preemptible_override = extract_preemptible_override, - extract_scatter_count = extract_scatter_count, - extract_table_prefix = extract_table_prefix, - fq_temp_table_dataset = "~{project_id}.~{dataset_name}", - gatk_override = gatk_override, + use_VQSR_lite = !use_classic_VQSR, interval_list = interval_list, - interval_weights_bed = interval_weights_bed, - load_data_batch_size = load_data_batch_size, - load_data_maxretries_override = 
load_data_maxretries_override, - load_data_preemptible_override = load_data_preemptible_override, - query_labels = query_labels, - query_project = project_id, - sample_names_to_extract = sample_names_to_extract, - split_intervals_disk_size_override = split_intervals_disk_size_override, - split_intervals_mem_override = split_intervals_mem_override, + gatk_override = gatk_override, INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override, + } + + call PrepareRangesCallset.GvsPrepareCallset { + input: + call_set_identifier = call_set_identifier, + go = GvsCreateFilterSet.done, + dataset_name = dataset_name, + project_id = project_id, + extract_table_prefix = extract_table_prefix, + query_project = query_project, + destination_project = destination_project, + destination_dataset = destination_dataset, + fq_temp_table_dataset = fq_temp_table_dataset, + query_labels = query_labels, + sample_names_to_extract = sample_names_to_extract, + } + + call ExtractCallset.GvsExtractCallset { + input: + go = GvsPrepareCallset.done, + dataset_name = dataset_name, + project_id = project_id, + call_set_identifier = call_set_identifier, + extract_table_prefix = extract_table_prefix, + filter_set_name = filter_set_name, + query_project = query_project, + scatter_count = extract_scatter_count, + interval_list = interval_list, + interval_weights_bed = interval_weights_bed, + gatk_override = gatk_override, + output_file_base_name = extract_output_file_base_name, + extract_maxretries_override = extract_maxretries_override, + extract_preemptible_override = extract_preemptible_override, + output_gcs_dir = extract_output_gcs_dir, + split_intervals_disk_size_override = split_intervals_disk_size_override, + split_intervals_mem_override = 
split_intervals_mem_override, + do_not_filter_override = extract_do_not_filter_override, drop_state = drop_state, - is_beta_user = tighter_gcp_quotas, - sample_id_column_name = sample_id_column_name, - vcf_files_column_name = vcf_files_column_name, - vcf_index_files_column_name = vcf_index_files_column_name, - sample_set_name = sample_set_name, } output { - Array[File] output_vcfs = GvsUnified.output_vcfs - Array[File] output_vcf_indexes = GvsUnified.output_vcf_indexes - Array[File] output_vcf_interval_files = GvsUnified.output_vcf_interval_files - Float total_vcfs_size_mb = GvsUnified.total_vcfs_size_mb - File? sample_name_list = GvsUnified.sample_name_list - File manifest = GvsUnified.manifest + Array[File] output_vcfs = GvsExtractCallset.output_vcfs + Array[File] output_vcf_indexes = GvsExtractCallset.output_vcf_indexes + Array[File] output_vcf_interval_files = GvsExtractCallset.output_vcf_interval_files + Float total_vcfs_size_mb = GvsExtractCallset.total_vcfs_size_mb + File? sample_name_list = GvsExtractCallset.sample_name_list + File manifest = GvsExtractCallset.manifest Boolean done = true } } diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl index 0174612b9df..d22b1eaeef7 100644 --- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl @@ -1,6 +1,5 @@ version 1.0 -import "GvsUnified.wdl" as Unified import "GvsUtils.wdl" as Utils import "GvsJointVariantCalling.wdl" as JointVariantCalling diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl deleted file mode 100644 index bef5f898daa..00000000000 --- a/scripts/variantstore/wdl/GvsUnified.wdl +++ /dev/null @@ -1,161 +0,0 @@ -version 1.0 - -import "GvsBulkIngestGenomes.wdl" as BulkIngestGenomes -import "GvsPopulateAltAllele.wdl" as PopulateAltAllele -import "GvsCreateFilterSet.wdl" as CreateFilterSet -import 
"GvsPrepareRangesCallset.wdl" as PrepareRangesCallset -import "GvsExtractCallset.wdl" as ExtractCallset - -workflow GvsUnified { - input { - # Begin GvsAssignIds - String dataset_name - String project_id - String call_set_identifier - - File? gatk_override - # End GvsAssignIds - - # Begin GvsImportGenomes - File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" - - # set to "NONE" to ingest all the reference data into GVS for VDS (instead of VCF) output - String drop_state = "NONE" - # for beta users, rate limit their ingest to stay below quotas - Boolean is_beta_user = false - - # The larger the `load_data_batch_size` the greater the probability of preemptions and non-retryable - # BigQuery errors so if specifying this adjust preemptible and maxretries accordingly. Or just take the defaults, - # those should work fine in most cases. - Int? load_data_batch_size - Int? load_data_preemptible_override - Int? load_data_maxretries_override - # End GvsImportGenomes - - # Begin GvsCreateFilterSet - String filter_set_name = call_set_identifier - Boolean use_VQSR_lite = true - - Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4 - Int? INDEL_VQSR_CLASSIC_mem_gb_override - Int? SNP_VQSR_CLASSIC_max_gaussians_override = 6 - Int? SNP_VQSR_CLASSIC_mem_gb_override - # End GvsCreateFilterSet - - # Begin GvsPrepareRangesCallset - String extract_table_prefix - - String query_project = project_id - String destination_project = project_id - String destination_dataset = dataset_name - String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}" - - Array[String]? query_labels - File? sample_names_to_extract - # End GvsPrepareRangesCallset - - # Begin GvsExtractCallset - Int? extract_scatter_count - - File interval_weights_bed = "gs://broad-public-datasets/gvs/weights/gvs_vet_weights_1kb.bed" - - String extract_output_file_base_name = sub(filter_set_name, " ", "-") - - Int? 
extract_maxretries_override - Int? extract_preemptible_override - String? extract_output_gcs_dir - Int? split_intervals_disk_size_override - Int? split_intervals_mem_override - Boolean extract_do_not_filter_override = false - # End GvsExtractCallset - String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest - String? vcf_files_column_name - String? vcf_index_files_column_name - String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time - } - - call BulkIngestGenomes.GvsBulkIngestGenomes as BulkIngestGenomes { - input: - dataset_name = dataset_name, - project_id = project_id, - gatk_override = gatk_override, - interval_list = interval_list, - drop_state = drop_state, - sample_id_column_name = sample_id_column_name, - vcf_files_column_name = vcf_files_column_name, - vcf_index_files_column_name = vcf_index_files_column_name, - sample_set_name = sample_set_name, - } - - call PopulateAltAllele.GvsPopulateAltAllele { - input: - call_set_identifier = call_set_identifier, - go = BulkIngestGenomes.done, - dataset_name = dataset_name, - project_id = project_id, - } - - call CreateFilterSet.GvsCreateFilterSet { - input: - go = GvsPopulateAltAllele.done, - dataset_name = dataset_name, - project_id = project_id, - call_set_identifier = call_set_identifier, - filter_set_name = filter_set_name, - use_VQSR_lite = use_VQSR_lite, - interval_list = interval_list, - gatk_override = gatk_override, - INDEL_VQSR_CLASSIC_max_gaussians_override = INDEL_VQSR_CLASSIC_max_gaussians_override, - INDEL_VQSR_CLASSIC_mem_gb_override = INDEL_VQSR_CLASSIC_mem_gb_override, - SNP_VQSR_CLASSIC_max_gaussians_override = SNP_VQSR_CLASSIC_max_gaussians_override, - SNP_VQSR_CLASSIC_mem_gb_override = SNP_VQSR_CLASSIC_mem_gb_override, - } - - call PrepareRangesCallset.GvsPrepareCallset { - input: - call_set_identifier = 
call_set_identifier, - go = GvsCreateFilterSet.done, - dataset_name = dataset_name, - project_id = project_id, - extract_table_prefix = extract_table_prefix, - query_project = query_project, - destination_project = destination_project, - destination_dataset = destination_dataset, - fq_temp_table_dataset = fq_temp_table_dataset, - query_labels = query_labels, - sample_names_to_extract = sample_names_to_extract, - } - - call ExtractCallset.GvsExtractCallset { - input: - go = GvsPrepareCallset.done, - dataset_name = dataset_name, - project_id = project_id, - call_set_identifier = call_set_identifier, - extract_table_prefix = extract_table_prefix, - filter_set_name = filter_set_name, - query_project = query_project, - scatter_count = extract_scatter_count, - interval_list = interval_list, - interval_weights_bed = interval_weights_bed, - gatk_override = gatk_override, - output_file_base_name = extract_output_file_base_name, - extract_maxretries_override = extract_maxretries_override, - extract_preemptible_override = extract_preemptible_override, - output_gcs_dir = extract_output_gcs_dir, - split_intervals_disk_size_override = split_intervals_disk_size_override, - split_intervals_mem_override = split_intervals_mem_override, - do_not_filter_override = extract_do_not_filter_override, - drop_state = drop_state, - } - - output { - Array[File] output_vcfs = GvsExtractCallset.output_vcfs - Array[File] output_vcf_indexes = GvsExtractCallset.output_vcf_indexes - Float total_vcfs_size_mb = GvsExtractCallset.total_vcfs_size_mb - Array[File] output_vcf_interval_files = GvsExtractCallset.output_vcf_interval_files - File? sample_name_list = GvsExtractCallset.sample_name_list - File manifest = GvsExtractCallset.manifest - Boolean done = true - } -}