From ee06db9496f4671879b880858faa33a1e588481a Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 13 Sep 2024 10:20:45 -0400 Subject: [PATCH 1/3] adjustments based on first stab at Echo runs --- scripts/variantstore/docs/aou/AOU_DELIVERABLES.md | 1 + scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md index 2eb7bd56fad..65b25f7adfd 100644 --- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md +++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md @@ -172,6 +172,7 @@ You can take advantage of our existing sub-cohort WDL, `GvsExtractCohortFromSamp - Specify the same `call_set_identifier`, `dataset_name`, `project_id`, `extract_table_prefix`, and `interval_list` that were used in the `GvsPrepareRangesCallset` run documented above. - Specify the `interval_weights_bed` appropriate for the PGEN extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart. - Select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5 + - Set the `extract_maxretries_override` input to 5, `split_intervals_disk_size_override` to 1000, `scatter_count` to 25000, and `y_bed_weight_scaling` to 8 - `GvsExtractCallsetPgen` currently defaults to 100 alt alleles maximum, which means that any sites having more than that number of alt alleles will be dropped. - Be sure to set the `output_gcs_dir` to the proper path in the AoU delivery bucket so you don't need to copy the output files there yourself once the workflow has finished. 
- For `GvsExtractCallsetPgen` (which is called by `GvsExtractCallsetPgenMerged`), if one (or several) of the `PgenExtractTask` shards fail because of angry cloud, you can re-run the workflow with the exact same inputs with call caching turned on; the successful shards will cache and only the failed ones will re-run. diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index cdbfb69f669..65c34b138d0 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -140,7 +140,7 @@ workflow GvsExtractCallsetPgen { Int effective_split_intervals_disk_size_override = select_first([split_intervals_disk_size_override, if GetNumSamplesLoaded.num_samples < 100 then 50 # Quickstart - else 500]) + else 200]) Int effective_extract_memory_gib = if defined(extract_memory_override_gib) then select_first([extract_memory_override_gib]) else if effective_scatter_count <= 100 then 37 + extract_overhead_memory_override_gib From ec6d19e9c8e29bfadf2f303b61304cac8e821bce Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 13 Sep 2024 10:44:58 -0400 Subject: [PATCH 2/3] English --- scripts/variantstore/docs/aou/AOU_DELIVERABLES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md index 65b25f7adfd..b54de7ce43a 100644 --- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md +++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md @@ -172,7 +172,7 @@ You can take advantage of our existing sub-cohort WDL, `GvsExtractCohortFromSamp - Specify the same `call_set_identifier`, `dataset_name`, `project_id`, `extract_table_prefix`, and `interval_list` that were used in the `GvsPrepareRangesCallset` run documented above. - Specify the `interval_weights_bed` appropriate for the PGEN extraction run you are performing. 
`gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart. - Select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5 - - Set the `extract_maxretries_override` input to 5, `split_intervals_disk_size_override` to 1000, `scatter_count` to 25000, and `y_bed_weight_scaling` to 8 + - As a starting point, set the `extract_maxretries_override` input to 5, `split_intervals_disk_size_override` to 1000, `scatter_count` to 25000, and `y_bed_weight_scaling` to 8; you will likely have to adjust one or more of these values in subsequent attempts. - `GvsExtractCallsetPgen` currently defaults to 100 alt alleles maximum, which means that any sites having more than that number of alt alleles will be dropped. - Be sure to set the `output_gcs_dir` to the proper path in the AoU delivery bucket so you don't need to copy the output files there yourself once the workflow has finished. - For `GvsExtractCallsetPgen` (which is called by `GvsExtractCallsetPgenMerged`), if one (or several) of the `PgenExtractTask` shards fail because of angry cloud, you can re-run the workflow with the exact same inputs with call caching turned on; the successful shards will cache and only the failed ones will re-run. 
From f0421761591f1dbe9d9191f159a278c10f73a390 Mon Sep 17 00:00:00 2001 From: Rebecca Asch Date: Fri, 13 Sep 2024 11:15:19 -0400 Subject: [PATCH 3/3] adjust effective_extract_memory_gib sizes --- scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index 65c34b138d0..ebb73f38935 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -143,9 +143,9 @@ workflow GvsExtractCallsetPgen { else 200]) Int effective_extract_memory_gib = if defined(extract_memory_override_gib) then select_first([extract_memory_override_gib]) - else if effective_scatter_count <= 100 then 37 + extract_overhead_memory_override_gib - else if effective_scatter_count <= 500 then 17 + extract_overhead_memory_override_gib - else 9 + extract_overhead_memory_override_gib + else if effective_scatter_count <= 100 then 35 + extract_overhead_memory_override_gib + else if effective_scatter_count <= 500 then 15 + extract_overhead_memory_override_gib + else 5 + extract_overhead_memory_override_gib # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { File? none = ""