From dcbe619366fcdf7615bff13dabf2b2144a5e4ff9 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Sat, 8 Jul 2023 13:34:00 -0400 Subject: [PATCH] expose bulk ingest optional parameters --- .../wdl/GvsJointVariantCalling.wdl | 9 +++++++++ .../wdl/GvsQuickstartHailIntegration.wdl | 8 ++++++++ .../wdl/GvsQuickstartIntegration.wdl | 20 +++++++++++++++++++ .../wdl/GvsQuickstartVcfIntegration.wdl | 8 ++++++++ scripts/variantstore/wdl/GvsUnified.wdl | 8 ++++++++ 5 files changed, 53 insertions(+) diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index 8e250e69eef..af34379f67f 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -13,6 +13,11 @@ workflow GvsJointVariantCalling { # Beta users have accounts with tighter quotas, and we must work around that Boolean tighter_gcp_quotas = true String branch_name = "ah_var_store" + # TODO should these all be exposed in this WDL? + String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest + String? vcf_files_column_name + String? vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time } # the call_set_identifier string is used to name many different things throughout this workflow (BQ tables, vcfs etc), @@ -77,6 +82,10 @@ workflow GvsJointVariantCalling { drop_state = drop_state, is_beta_user = tighter_gcp_quotas, branch_name = branch_name, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } output { diff --git a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl index 8cdd08fab5f..ab6c785ba8e 100644 --- a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl @@ -14,6 +14,10 @@ workflow GvsQuickstartHailIntegration { String dataset_suffix = "hail" String? gatk_override String expected_output_prefix + String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest + String? vcf_files_column_name + String? vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time } String project_id = "gvs-internal" @@ -28,6 +32,10 @@ workflow GvsQuickstartHailIntegration { gatk_override = gatk_override, interval_list = interval_list, expected_output_prefix = expected_output_prefix, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail { diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl index 43fe3355880..939b6be3325 100644 --- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl @@ -31,6 +31,10 @@ workflow GvsQuickstartIntegration { String branch_name Boolean run_vcf_integration = true Boolean run_hail_integration = true + String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest + String? vcf_files_column_name + String? vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time } File full_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" @@ -57,6 +61,10 @@ workflow GvsQuickstartIntegration { gatk_override = BuildGATKJar.jar, interval_list = FilterIntervalListChromosomes.out, expected_output_prefix = expected_output_prefix, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRClassicIntegration { input: @@ -68,6 +76,10 @@ workflow GvsQuickstartIntegration { gatk_override = BuildGATKJar.jar, interval_list = FilterIntervalListChromosomes.out, expected_output_prefix = expected_output_prefix, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } } @@ -81,6 +93,10 @@ workflow GvsQuickstartIntegration { gatk_override = BuildGATKJar.jar, interval_list = FilterIntervalListChromosomes.out, expected_output_prefix = expected_output_prefix, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRClassicIntegration { input: @@ -91,6 +107,10 @@ workflow GvsQuickstartIntegration { gatk_override = BuildGATKJar.jar, interval_list = FilterIntervalListChromosomes.out, expected_output_prefix = expected_output_prefix, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } } } diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl index 17bb55654c8..2bc16a3d30a 100644 --- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl @@ -16,6 +16,10 @@ workflow GvsQuickstartVcfIntegration { String drop_state = "FORTY" String dataset_suffix File? gatk_override + String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest + String? vcf_files_column_name + String? vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time } String project_id = "gvs-internal" @@ -49,6 +53,10 @@ workflow GvsQuickstartVcfIntegration { drop_state = drop_state, interval_list = interval_list, branch_name = branch_name, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } # Only assert identical outputs if we did not filter (filtering is not deterministic) OR if we are using VQSR Lite (which is deterministic) diff --git a/scripts/variantstore/wdl/GvsUnified.wdl b/scripts/variantstore/wdl/GvsUnified.wdl index 3258177489f..667b7661951 100644 --- a/scripts/variantstore/wdl/GvsUnified.wdl +++ b/scripts/variantstore/wdl/GvsUnified.wdl @@ -71,6 +71,10 @@ workflow GvsUnified { Boolean extract_do_not_filter_override = false # End GvsExtractCallset String branch_name + String? sample_id_column_name ## Note that a column WILL exist that is the _id from the table name. However, some users will want to specify an alternate column for the sample_name during ingest + String? vcf_files_column_name + String? vcf_index_files_column_name + String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time } call BulkIngestGenomes.GvsBulkIngestGenomes as BulkIngestGenomes { @@ -82,6 +86,10 @@ workflow GvsUnified { branch_name = branch_name, interval_list = interval_list, drop_state = drop_state, + sample_id_column_name = sample_id_column_name, + vcf_files_column_name = vcf_files_column_name, + vcf_index_files_column_name = vcf_index_files_column_name, + sample_set_name = sample_set_name, } call PopulateAltAllele.GvsPopulateAltAllele {