From e1b9afb0624c3736d7b3f5939fcfe87869ff2534 Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Wed, 5 Jun 2024 13:41:16 -0400 Subject: [PATCH 1/5] Add NonZeroReferenceLengthAlignmentReadFilter read filter to CollectSVEvidence (#686) --- wdl/CollectSVEvidence.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/CollectSVEvidence.wdl b/wdl/CollectSVEvidence.wdl index 244af9b02..512689cac 100644 --- a/wdl/CollectSVEvidence.wdl +++ b/wdl/CollectSVEvidence.wdl @@ -113,7 +113,8 @@ task RunCollectSVEvidence { --site-depth-min-mapq "~{site_depth_min_mapq}" \ --site-depth-min-baseq "~{site_depth_min_baseq}" \ ~{"-R " + reference_fasta} \ - ~{"-L " + primary_contigs_list} + ~{"-L " + primary_contigs_list} \ + --read-filter NonZeroReferenceLengthAlignmentReadFilter >>> runtime { From d96e4d6ebcd84709e9995b46c6d4d89511053e87 Mon Sep 17 00:00:00 2001 From: Snow Date: Tue, 11 Jun 2024 11:00:37 -0400 Subject: [PATCH 2/5] AF annotation script updates for gnomAD (#655) --- .../test/AnnotateVcf/AnnotateVcf.json.tmpl | 1 + inputs/values/ref_panel_1kg.json | 1 + .../05_annotation/scripts/compute_AFs.py | 191 +++++++++--------- 3 files changed, 95 insertions(+), 98 deletions(-) diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index c805588ae..2217b529a 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -7,6 +7,7 @@ "AnnotateVcf.external_af_ref_prefix" : {{ reference_resources.external_af_ref_bed_prefix | tojson }}, "AnnotateVcf.external_af_population" : {{ reference_resources.external_af_population | tojson }}, "AnnotateVcf.par_bed": {{ reference_resources.par_bed | tojson }}, + "AnnotateVcf.sample_pop_assignments": {{ test_batch.sample_pop_assignments | tojson }}, "AnnotateVcf.contig_list" : {{ reference_resources.primary_contigs_list | tojson }}, "AnnotateVcf.ped_file": {{ test_batch.ped_file | tojson }}, diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json index b3020d1f0..5cb0ccd82 100644 --- a/inputs/values/ref_panel_1kg.json +++ b/inputs/values/ref_panel_1kg.json @@ -2886,6 +2886,7 @@ "NA21133" ], "samples_list": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/samples_list.txt", + "sample_pop_assignments": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/populations.ref_panel_1kg.tsv", "scramble_vcfs": [ "gs://gatk-sv-ref-panel-1kg/outputs/mw-scramble/Scramble/HG00096.scramble.vcf.gz", "gs://gatk-sv-ref-panel-1kg/outputs/mw-scramble/Scramble/HG00129.scramble.vcf.gz", diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py index b805f70de..7ac67fcff 100755 --- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py +++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py @@ -55,13 +55,13 @@ def update_sex_freqs(record, pop=None): m_prefix = 'MALE' f_prefix = 'FEMALE' - m_an = record.info.get(m_prefix + '_AN', 0) - m_ac = sum(record.info.get(m_prefix + '_AC', 0)) - # m_af = sum(record.info.get(m_prefix + '_AF', 0)) + m_an = record.info.get('AN_' + m_prefix, 0) + m_ac = sum(record.info.get('AC_' + m_prefix, 0)) + # m_af = sum(record.info.get('AF_' + m_prefix, 0)) - f_an = record.info.get(f_prefix + '_AN', 0) - f_ac = sum(record.info.get(f_prefix + '_AC', 0)) - # f_af = sum(record.info.get(f_prefix + '_AF', 0)) + f_an = record.info.get('AN_' + f_prefix , 0) + f_ac = 
sum(record.info.get('AC_' + f_prefix , 0)) + # f_af = sum(record.info.get('AF_' + f_prefix , 0)) adj_an = m_an + f_an adj_ac = m_ac + f_ac @@ -75,9 +75,9 @@ def update_sex_freqs(record, pop=None): record.info['AC'] = (adj_ac, ) record.info['AF'] = (adj_af, ) else: - record.info[pop + '_AN'] = adj_an - record.info[pop + '_AC'] = (adj_ac, ) - record.info[pop + '_AF'] = (adj_af, ) + record.info['AN_' + pop ] = adj_an + record.info['AC_' + pop ] = (adj_ac, ) + record.info['AF_' + pop ] = (adj_af, ) return record @@ -137,7 +137,7 @@ def gather_allele_freqs(record, samples, males_set, females_set, parbt, pop_dict # Get POPMAX AF biallelic sites only if svu.is_biallelic(record): - AFs = [record.info['{0}_AF'.format(pop)][0] for pop in pops] + AFs = [record.info['AF_{0}'.format(pop)][0] for pop in pops] popmax = max(AFs) record.info['POPMAX_AF'] = popmax @@ -192,9 +192,9 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): AF = 0 # Add AN, AC, and AF to INFO field - record.info[(prefix + '_' if prefix else '') + 'AN'] = AN - record.info[(prefix + '_' if prefix else '') + 'AC'] = AC - record.info[(prefix + '_' if prefix else '') + 'AF'] = AF + record.info['AN' + ('_' + prefix if prefix else '')] = AN + record.info['AC' + ('_' + prefix if prefix else '')] = AC + record.info['AF' + ('_' + prefix if prefix else '')] = AF # Calculate genotype frequencies n_bi_genos = n_alt_count_0 + n_alt_count_1 + n_alt_count_2 @@ -210,27 +210,18 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): freq_hemialt = freq_het + freq_homalt # Add N_BI_GENOS, N_HOMREF, N_HET, N_HOMALT, FREQ_HOMREF, FREQ_HET, and FREQ_HOMALT to INFO field - record.info[(prefix + '_' if prefix else '') + - 'N_BI_GENOS'] = n_bi_genos + record.info['N_BI_GENOS' + ('_' + prefix if prefix else '')] = n_bi_genos if hemi: - record.info[(prefix + '_' if prefix else '') + - 'N_HEMIREF'] = n_alt_count_0 - record.info[(prefix + '_' if prefix else '') + - 'N_HEMIALT'] = n_gts_with_gt_0_alts - record.info[(prefix + '_' if prefix else '') + - 'FREQ_HEMIREF'] = freq_homref - record.info[(prefix + '_' if prefix else '') + - 'FREQ_HEMIALT'] = freq_hemialt - record.info[(prefix + '_' if prefix else '') + - 'N_HOMREF'] = n_alt_count_0 - record.info[(prefix + '_' if prefix else '') + 'N_HET'] = n_alt_count_1 - record.info[(prefix + '_' if prefix else '') + - 'N_HOMALT'] = n_alt_count_2 - record.info[(prefix + '_' if prefix else '') + - 'FREQ_HOMREF'] = freq_homref - record.info[(prefix + '_' if prefix else '') + 'FREQ_HET'] = freq_het - record.info[(prefix + '_' if prefix else '') + - 'FREQ_HOMALT'] = freq_homalt + record.info['N_HEMIREF' + ('_' + prefix if prefix else '')] = n_alt_count_0 + record.info['N_HEMIALT' + ('_' + prefix if prefix else '')] = n_gts_with_gt_0_alts + record.info['FREQ_HEMIREF' + ('_' + prefix if prefix else '')] = freq_homref + record.info['FREQ_HEMIALT' + ('_' + prefix if prefix else '')] = freq_hemialt + record.info['N_HOMREF' + ('_' + prefix if prefix else '')] = n_alt_count_0 + record.info['N_HET' + ('_' + prefix if prefix else '')] = n_alt_count_1 + record.info['N_HOMALT' + ('_' + prefix if prefix else '')] = n_alt_count_2 + record.info['FREQ_HOMREF' + ('_' + prefix if prefix else '')] = freq_homref + record.info['FREQ_HET' + ('_' + prefix if prefix else '')] = freq_het + record.info['FREQ_HOMALT' + ('_' + prefix if prefix else '')] = freq_homalt # Multiallelic sites should reference FORMAT:CN rather than GT # Compute CN_NUMBER, CN_NONREF_COUNT, CN_NONREF_FREQ, and CN_COUNT/CN_FREQ for each copy state @@ 
-244,6 +235,7 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): nonnull_CNs, nonref_CN_count, nonref_CN_freq = [0] * 3 CN_dist = (0, ) CN_freqs = (0, ) + CN_status = (0, ) else: # Count number of samples per CN and total CNs observed CN_counts = dict(Counter(CNs)) @@ -253,27 +245,23 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): max_CN = max([int(k) for k, v in CN_counts.items()]) CN_dist = [int(CN_counts.get(k, 0)) for k in range(max_CN + 1)] CN_freqs = [round(v / nonnull_CNs, 6) for v in CN_dist] + CN_status = [s for s in range(max_CN + 1)] # Get total non-reference CN counts and freq if hemi: ref_CN = 1 else: ref_CN = 2 - nonref_CN_count = sum([int(CN_counts.get(k, 0)) - for k in range(max_CN + 1) if k != ref_CN]) + nonref_CN_count = sum([int(CN_counts.get(k, 0)) for k in range(max_CN + 1) if k != ref_CN]) nonref_CN_freq = round(nonref_CN_count / nonnull_CNs, 6) # Add values to INFO field - record.info[(prefix + '_' if prefix else '') + - 'CN_NUMBER'] = nonnull_CNs - record.info[(prefix + '_' if prefix else '') + - 'CN_COUNT'] = tuple(CN_dist) - record.info[(prefix + '_' if prefix else '') + - 'CN_FREQ'] = tuple(CN_freqs) - record.info[(prefix + '_' if prefix else '') + - 'CN_NONREF_COUNT'] = nonref_CN_count - record.info[(prefix + '_' if prefix else '') + - 'CN_NONREF_FREQ'] = nonref_CN_freq + record.info['CN_NUMBER' + ('_' + prefix if prefix else '')] = nonnull_CNs + record.info['CN_COUNT' + ('_' + prefix if prefix else '')] = tuple(CN_dist) + record.info['CN_FREQ' + ('_' + prefix if prefix else '')] = tuple(CN_freqs) + record.info['CN_STATUS' + ('_' + prefix if prefix else '')] = tuple(CN_status) + record.info['CN_NONREF_COUNT' + ('_' + prefix if prefix else '')] = nonref_CN_count + record.info['CN_NONREF_FREQ' + ('_' + prefix if prefix else '')] = nonref_CN_freq return record @@ -360,6 +348,7 @@ def main(): '##INFO=', '##INFO=', '##INFO=', + '##INFO=', '##INFO=', '##INFO=', '##INFO=' @@ -367,44 +356,46 @@ def main(): if len(sexes) > 0: for sex in sexes: INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) + INFO_ADD.append( + '##INFO=' % (sex, sex)) if sex == 'MALE': INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) INFO_ADD.append( - '##INFO=' % (sex, sex)) + '##INFO=' % (sex, sex)) if len(parbt) > 0: INFO_ADD.append( '##INFO=') @@ -413,75 +404,79 @@ def main(): '##INFO=') for pop in pops: INFO_ADD.append( - '##INFO=' % 
(pop, pop)) + '##INFO=' % (pop, pop)) + INFO_ADD.append( + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) INFO_ADD.append( - '##INFO=' % (pop, pop)) + '##INFO=' % (pop, pop)) if len(sexes) > 0 and not args.no_combos: for sex in sexes: - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( + '_'.join((pop, sex)), '_'.join((pop, sex)), '_'.join((pop, sex)))) + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) if sex == 'MALE': - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) - INFO_ADD.append('##INFO=' % ( + INFO_ADD.append('##INFO=' % ( '_'.join((pop, sex)), ' '.join((pop, sex)))) for line in INFO_ADD: From 2b9af68bc471231909d700cd2a08347a73543e41 Mon Sep 17 00:00:00 2001 From: gatk-sv-bot <101641599+gatk-sv-bot@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:19:37 +0000 Subject: [PATCH 3/5] Update docker images list, 
triggered by d96e4d6e --- inputs/values/dockers.json | 6 +++--- inputs/values/dockers_azure.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index ba8b096f0..cf801252b 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-06-04-v0.28.5-beta-a8dfecba", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-06-04-v0.28.5-beta-a8dfecba", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-06-11-v0.28.5-beta-d96e4d6e", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-06-11-v0.28.5-beta-d96e4d6e", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/markw/sv-utils:mw-train-genotype-filtering-a9479501", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-06-04-v0.28.5-beta-a8dfecba" + "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-06-11-v0.28.5-beta-d96e4d6e" } \ No newline at end of file diff --git a/inputs/values/dockers_azure.json b/inputs/values/dockers_azure.json index 7a14ddba2..c01519bf4 100644 --- a/inputs/values/dockers_azure.json +++ b/inputs/values/dockers_azure.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7", "sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7", "sv_base_mini_docker": "vahid.azurecr.io/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-06-04-v0.28.5-beta-a8dfecba", - "sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-06-04-v0.28.5-beta-a8dfecba", + "sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-06-11-v0.28.5-beta-d96e4d6e", + "sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-06-11-v0.28.5-beta-d96e4d6e", "wham_docker": "vahid.azurecr.io/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7", "igv_docker": "vahid.azurecr.io/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "vahid.azurecr.io/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "vahid.azurecr.io/gatk-sv/sv-utils:2024-01-24-v0.28.4-beta-9debd6d7", "gq_recalibrator_docker": "vahid.azurecr.io/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "vahid.azurecr.io/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-06-04-v0.28.5-beta-a8dfecba" + "denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-06-11-v0.28.5-beta-d96e4d6e" } \ No newline at end of file From 48c0f765b00c78ad4ba510ea3bb19cc3e54d4059 Mon Sep 17 00:00:00 2001 
From: epiercehoffman Date: Tue, 11 Jun 2024 12:13:20 -0400 Subject: [PATCH 4/5] Deprecate single-batch Terra configs (#689) --- .../cohort_mode_workspace_dashboard.md.tmpl | 45 +++++++---------- .../AnnotateVcf.SingleBatch.json.tmpl | 21 -------- .../CleanVcf.SingleBatch.json.tmpl | 25 ---------- .../CombineBatches.SingleBatch.json.tmpl | 18 ------- .../GenotypeBatch.SingleBatch.json.tmpl | 26 ---------- ...otypeComplexVariants.SingleBatch.json.tmpl | 19 ------- .../MainVcfQc.SingleBatch.json.tmpl | 23 --------- .../MakeCohortVcf.SingleBatch.json.tmpl | 50 ------------------- .../RegenotypeCNVs.SingleBatch.json.tmpl | 22 -------- ...solveComplexVariants.SingleBatch.json.tmpl | 19 ------- 10 files changed, 18 insertions(+), 250 deletions(-) delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeBatch.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MainVcfQc.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RegenotypeCNVs.SingleBatch.json.tmpl delete mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ResolveComplexVariants.SingleBatch.json.tmpl diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 5d4faf9eb..7523f1ac1 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -16,7 +16,7 @@ The following inputs must be provided for each sample in the cohort, via the sam |Input Type|Input Name|Description| |---------|--------|--------------| |`String`|`sample_id`|Case sample identifier*| -|`File`|`bam_or_cram_file`|Path to the GCS location of the input CRAM or BAM file. If using BAM files, an index `.bai` file must either be present in the same directory, or the path must be provided with the input `bam_or_cram_index`.| +|`File`|`bam_or_cram_file`|Path to the GCS location of the input CRAM or BAM file. If using BAM files, an index `.bam.bai` file must either be present in the same directory, or the path must be provided with the input `bam_or_cram_index`. If using CRAM files, an index `.cram.crai` file must either be present in the same directory, or the path must be provided with the input `bam_or_cram_index`.| *See **Sample ID requirements** below for specifications. @@ -35,7 +35,7 @@ The following are the main pipeline outputs. 
For more information on the outputs |Output Type|Output Name|Description| |---------|--------|--------------| |`File`|`annotated_vcf`|Annotated SV VCF for the cohort***| -|`File`|`annotated_vcf_idx`|Index for `output_vcf`| +|`File`|`annotated_vcf_idx`|Index for `annotated_vcf`| |`File`|`sv_vcf_qc_output`|QC plots (bundled in a .tar.gz file)| ***Note that this VCF is not filtered @@ -54,15 +54,15 @@ The following workflows are included in this workspace, to be executed in this o 6. `06-GenerateBatchMetrics`: Per-batch variant filtering, metric generation 7. `07-FilterBatchSites`: Per-batch variant filtering and plot SV counts per sample per SV type to enable choice of IQR cutoff for outlier filtration in `08-FilterBatchSamples` 8. `08-FilterBatchSamples`: Per-batch outlier sample filtration -9. (Skip for a single batch) `09-MergeBatchSites`: Site merging of SVs discovered across batches, run on a cohort-level `sample_set_set` -10. `10-GenotypeBatch`: Per-batch genotyping of all sites in the cohort. Use `10-GenotypeBatch_SingleBatch` if you only have one batch. -11. `11-RegenotypeCNVs`: Cohort-level genotype refinement of some depth calls. Use `11-RegenotypeCNVs_SingleBatch` if you only have one batch. -12. `12-CombineBatches`: Cohort-level cross-batch integration and clustering. Use `12-CombineBatches_SingleBatch` if you only have one batch. -13. `13-ResolveComplexVariants`: Complex variant resolution. Use `13-ResolveComplexVariants_SingleBatch` if you only have one batch. -14. `14-GenotypeComplexVariants`: Complex variant re-genotyping. Use `14-GenotypeComplexVariants_SingleBatch` if you only have one batch. -15. `15-CleanVcf`: VCF cleanup. Use `15-CleanVcf_SingleBatch` if you only have one batch. -16. `16-MainVcfQc`: Generates VCF QC reports. Use `16-MainVcfQc_SingleBatch` if you only have one batch. -17. `17-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets. Use `17-AnnotateVcf_SingleBatch` if you only have one batch. +9. `09-MergeBatchSites`: Site merging of SVs discovered across batches, run on a cohort-level `sample_set_set` +10. `10-GenotypeBatch`: Per-batch genotyping of all sites in the cohort +11. `11-RegenotypeCNVs`: Cohort-level genotype refinement of some depth calls +12. `12-CombineBatches`: Cohort-level cross-batch integration and clustering +13. `13-ResolveComplexVariants`: Complex variant resolution +14. `14-GenotypeComplexVariants`: Complex variant re-genotyping +15. `15-CleanVcf`: VCF cleanup +16. `16-MainVcfQc`: Generates VCF QC reports +17. `17-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets Additional downstream modules, such as those for filtering and visualization, are under development. They are not included in this workspace at this time, but the source code can be found in the [GATK-SV GitHub repository](https://github.com/broadinstitute/gatk-sv). See **Downstream steps** towards the bottom of this page for more information. @@ -74,18 +74,17 @@ For detailed instructions on running the pipeline in Terra, see **Step-by-step i ### How many samples can I process at once? -#### Single-sample vs. single-batch vs. multi-batch mode +#### Single-sample vs. cohort mode -There are three modes for this pipeline according to the number of samples you need to process: +There are two modes for this pipeline according to the number of samples you need to process: 1. 
Single-sample mode (<100 samples): The cohort mode of this pipeline requires at least 100 samples, so for smaller sets of samples we recommend the single-sample version of this pipeline, which is available as a [featured Terra workspace](https://app.terra.bio/#workspaces/help-gatk/GATK-Structural-Variants-Single-Sample). -2. Single-batch mode (100-500 samples) -3. Cohort (multi-batch) mode (>200 samples): Batches should be 100-500 samples, so you may choose to divide your cohort into multiple batches if you have at least 200 samples. Refer to the [Batching](https://github.com/broadinstitute/gatk-sv#batching) section of the README for further information. +2. Cohort mode (>=100 samples): Batches should be 100-500 samples, so you may choose to divide your cohort into multiple batches if you have at least 200 samples. Refer to the [Batching](https://github.com/broadinstitute/gatk-sv#batching) section of the README for further information. #### What is the maximum number of samples the pipeline can handle? -In Terra, we have tested batch sizes of up to 500 samples and cohort sizes of up to 11,000 samples (and 40,000 samples with the final steps split by chromosome). On a separate cromwell server, we have tested the pipeline on cohorts of up to ~140,000 samples, but Terra's metadata handling will likely limit cohort sizes further. +In Terra, we have tested batch sizes of up to 500 samples and cohort sizes of up to 11,000 samples (and 98,000 samples with the final steps split by chromosome). On a separate cromwell server, we have tested the pipeline on cohorts of up to ~140,000 samples. ### Time and cost estimates @@ -144,7 +143,6 @@ To create batches (in the `sample_set` table), the easiest way is to upload a ta * Another option is to use the `fiss mop` API call to delete all files that do not appear in one of the Terra data tables (intermediate files). Always ensure that you are completely done with a step and you will not need to return before using this option, as it will break call-caching. See [this blog post](https://terra.bio/deleting-intermediate-workflow-outputs/) for more details. This can also be done [via the command line](https://github.com/broadinstitute/fiss/wiki/MOP:-reducing-your-cloud-storage-footprint). * If your workflow fails, check the job manager for the error message. Most issues can be resolved by increasing the memory or disk. Do not delete workflow log files until you are done troubleshooting. If call-caching is enabled, do not delete any files from the failed workflow until you have run it successfully. * To display run costs, see [this article](https://support.terra.bio/hc/en-us/articles/360037862771#h_01EX5ED53HAZ59M29DRCG24CXY) for one-time setup instructions for non-Broad users. -* If you only have one batch, you will need to skip `09-MergeBatchSites` and use the single-batch versions of all workflows after `10-GenotypeBatch`. #### 01-GatherSampleEvidence @@ -167,8 +165,8 @@ Read the full EvidenceQC documentation [here](https://github.com/broadinstitute/ #### 03-TrainGCNV Read the full TrainGCNV documentation [here](https://github.com/broadinstitute/gatk-sv#gcnv-training-1). -* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. * Before running this workflow, create the batches (~100-500 samples) you will use for the rest of the pipeline based on sample coverage, WGD score (from `02-EvidenceQC`), and PCR status. 
These will likely not be the same as the batches you used for `02-EvidenceQC`. +* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. If your `sample_set` contains fewer than 100 samples (not recommended), you will need to edit the `n_samples_subsample` parameter to be less than or equal to the number of samples. #### 04-GatherBatchEvidence @@ -192,26 +190,19 @@ These two workflows make up FilterBatch; they are subdivided in this workspace t #### 09-MergeBatchSites Read the full MergeBatchSites documentation [here](https://github.com/broadinstitute/gatk-sv#merge-batch-sites). -* If you only have one batch, skip this workflow. -* For a multi-batch cohort, `09-MergeBatchSites` is a cohort-level workflow, so it is run on a `sample_set_set` containing all of the batches in the cohort. You can create this `sample_set_set` while you are launching the `09-MergeBatchSites` workflow: click "Select Data", choose "Create new sample_set_set [...]", check all the batches to include (all of the ones used in `03-TrainGCNV` through `08-FilterBatchSamples`), and give it a name that follows the **Sample ID requirements**. +* `09-MergeBatchSites` is a cohort-level workflow, so it is run on a `sample_set_set` containing all of the batches in the cohort. You can create this `sample_set_set` while you are launching the `09-MergeBatchSites` workflow: click "Select Data", choose "Create new sample_set_set [...]", check all the batches to include (all of the ones used in `03-TrainGCNV` through `08-FilterBatchSamples`), and give it a name that follows the **Sample ID requirements**. creating a cohort sample_set_set -#### Single-batch vs. Multi-batch processing -* If you only have one batch (`sample_set`), you will be using the workflows with the suffix `_SingleBatch` from `10-GenotypeBatch_SingleBatch` to `17-AnnotateVcf_SingleBatch`. We recommend deleting the workflow versions without the `_SingleBatch` suffix from `10-GenotypeBatch` to `17-AnnotateVcf` now to avoid confusion. -* If you have multiple batches (`sample_set`s), you will be using the cohort-mode versions of the workflows from `10-GenotypeBatch` onwards, which do not have the suffix `_SingleBatch`. We recommend deleting the workflow versions with the `_SingleBatch` suffix now to avoid confusion. - #### 10-GenotypeBatch Read the full GenotypeBatch documentation [here](https://github.com/broadinstitute/gatk-sv#genotype-batch). * Use the same `sample_set` definitions you used for `03-TrainGCNV` through `08-FilterBatchSamples`. -* If you only have one batch, use the `10-GenotypeBatch_SingleBatch` version of the workflow. #### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-MainVcfQc, and 17-AnnotateVcf Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`, `MainVcfQc`), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. -* For a multi-batch cohort, use the same cohort `sample_set_set` you created and used for `09-MergeBatchSites`. -* If you only have one batch, use the `[...]_SingleBatch` version of the workflow. +* Use the same cohort `sample_set_set` you created and used for `09-MergeBatchSites`. 
#### Downstream steps diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl deleted file mode 100644 index 87b3da618..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ /dev/null @@ -1,21 +0,0 @@ -{ - "AnnotateVcf.vcf" : "${this.cleaned_vcf}", - - "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}", - "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}", - "AnnotateVcf.external_af_ref_bed" : "${workspace.external_af_ref_bed}", - "AnnotateVcf.external_af_ref_prefix" : "${workspace.external_af_ref_bed_prefix}", - "AnnotateVcf.external_af_population" : {{ reference_resources.external_af_population | tojson }}, - "AnnotateVcf.par_bed": "${workspace.par_bed}", - - "AnnotateVcf.contig_list" : "${workspace.primary_contigs_list}", - "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}", - "AnnotateVcf.sv_per_shard" : "5000", - - "AnnotateVcf.prefix" : "${this.sample_set_id}", - "AnnotateVcf.use_hail": "false", - - "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", - "AnnotateVcf.sv_base_mini_docker" : "${workspace.sv_base_mini_docker}", - "AnnotateVcf.sv_pipeline_docker" : "${workspace.sv_pipeline_docker}" -} \ No newline at end of file diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl deleted file mode 100644 index 3afa629da..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl +++ /dev/null @@ -1,25 +0,0 @@ -{ - "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", - "CleanVcf.allosome_fai": "${workspace.allosome_file}", - "CleanVcf.chr_x": "${workspace.chr_x}", - "CleanVcf.chr_y": "${workspace.chr_y}", - - "CleanVcf.max_shards_per_chrom_step1": 200, - "CleanVcf.min_records_per_shard_step1": 5000, - "CleanVcf.clean_vcf1b_records_per_shard": 10000, - "CleanVcf.samples_per_step2_shard": 100, - "CleanVcf.clean_vcf5_records_per_shard": 5000, - - "CleanVcf.primary_contigs_list": "${workspace.primary_contigs_list}", - - "CleanVcf.linux_docker": "${workspace.linux_docker}", - "CleanVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "CleanVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - - "CleanVcf.cohort_name": "${this.sample_set_id}", - "CleanVcf.ped_file": "${workspace.cohort_ped_file}", - "CleanVcf.complex_genotype_vcfs": "${this.complex_genotype_vcfs}", - "CleanVcf.complex_resolve_bothside_pass_lists": "${this.complex_resolve_bothside_pass_lists}", - "CleanVcf.complex_resolve_background_fail_lists": "${this.complex_resolve_background_fail_lists}" - -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.SingleBatch.json.tmpl deleted file mode 100644 index c49bc7f26..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.SingleBatch.json.tmpl +++ /dev/null @@ -1,18 +0,0 @@ -{ - "CombineBatches.contig_list": "${workspace.primary_contigs_fai}", - "CombineBatches.pe_exclude_list": "${workspace.pesr_exclude_list}", - "CombineBatches.depth_exclude_list": 
"${workspace.depth_exclude_list}", - "CombineBatches.empty_file" : "${workspace.empty_file}", - - "CombineBatches.min_sr_background_fail_batches": 0.5, - "CombineBatches.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "CombineBatches.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - - "CombineBatches.cohort_name": "${this.sample_set_id}", - "CombineBatches.batches": "${this.sample_set_id}", - "CombineBatches.pesr_vcfs": "${this.genotyped_pesr_vcf}", - "CombineBatches.depth_vcfs": "${this.regenotyped_depth_vcfs}", - "CombineBatches.raw_sr_bothside_pass_files": "${this.sr_bothside_pass}", - "CombineBatches.raw_sr_background_fail_files": "${this.sr_background_fail}" - -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeBatch.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeBatch.SingleBatch.json.tmpl deleted file mode 100644 index 8f0fb489f..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeBatch.SingleBatch.json.tmpl +++ /dev/null @@ -1,26 +0,0 @@ -{ - "GenotypeBatch.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - "GenotypeBatch.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "GenotypeBatch.linux_docker" : "${workspace.linux_docker}", - - "GenotypeBatch.n_RD_genotype_bins": "100000", - "GenotypeBatch.n_per_split": "5000", - "GenotypeBatch.pesr_exclude_list": "${workspace.pesr_exclude_list}", - "GenotypeBatch.seed_cutoffs": "${workspace.seed_cutoffs}", - "GenotypeBatch.reference_build": "${workspace.reference_build}", - "GenotypeBatch.ref_dict": "${workspace.reference_dict}", - - "GenotypeBatch.primary_contigs_list": "${workspace.primary_contigs_list}", - - "GenotypeBatch.batch": "${this.sample_set_id}", - "GenotypeBatch.rf_cutoffs": "${this.cutoffs}", - "GenotypeBatch.batch_depth_vcf": "${this.outlier_filtered_depth_vcf}", - "GenotypeBatch.batch_pesr_vcf": "${this.outlier_filtered_pesr_vcf}", - "GenotypeBatch.bin_exclude": "${workspace.bin_exclude}", - "GenotypeBatch.discfile": "${this.merged_PE}", - "GenotypeBatch.coveragefile": "${this.merged_bincov}", - "GenotypeBatch.splitfile": "${this.merged_SR}", - "GenotypeBatch.medianfile": "${this.median_cov}", - "GenotypeBatch.cohort_depth_vcf": "${this.outlier_filtered_depth_vcf}", - "GenotypeBatch.cohort_pesr_vcf": "${this.outlier_filtered_pesr_vcf}" -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl deleted file mode 100644 index ae598805f..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl +++ /dev/null @@ -1,19 +0,0 @@ -{ - "GenotypeComplexVariants.bin_exclude": "${workspace.bin_exclude}", - "GenotypeComplexVariants.contig_list": "${workspace.primary_contigs_fai}", - "GenotypeComplexVariants.ref_dict": "${workspace.reference_dict}", - - "GenotypeComplexVariants.linux_docker": "${workspace.linux_docker}", - "GenotypeComplexVariants.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "GenotypeComplexVariants.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - - "GenotypeComplexVariants.cohort_name": "${this.sample_set_id}", - "GenotypeComplexVariants.batches": "${this.sample_set_id}", - "GenotypeComplexVariants.depth_vcfs": "${this.regenotyped_depth_vcfs}", - 
"GenotypeComplexVariants.complex_resolve_vcfs": "${this.complex_resolve_vcfs}", - "GenotypeComplexVariants.complex_resolve_vcf_indexes": "${this.complex_resolve_vcf_indexes}", - "GenotypeComplexVariants.ped_file": "${workspace.cohort_ped_file}", - "GenotypeComplexVariants.bincov_files": "${this.merged_bincov}", - "GenotypeComplexVariants.depth_gt_rd_sep_files": "${this.trained_genotype_depth_depth_sepcutoff}", - "GenotypeComplexVariants.median_coverage_files": "${this.median_cov}" -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MainVcfQc.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MainVcfQc.SingleBatch.json.tmpl deleted file mode 100644 index 9b1b00e7f..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MainVcfQc.SingleBatch.json.tmpl +++ /dev/null @@ -1,23 +0,0 @@ -{ - "MainVcfQc.primary_contigs_fai": "${workspace.primary_contigs_fai}", - - "MainVcfQc.site_level_comparison_datasets": [ - {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.hgsv_byrska_bishop_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.thousand_genomes_site_level_benchmarking_dataset | tojson }} - ], - - "MainVcfQc.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "MainVcfQc.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - "MainVcfQc.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", - - "MainVcfQc.prefix": "${this.sample_set_id}", - "MainVcfQc.ped_file": "${workspace.cohort_ped_file}", - - "MainVcfQc.vcfs": "${this.cleaned_vcf}", - - "MainVcfQc.sv_per_shard": 2500, - "MainVcfQc.samples_per_shard": 600 - -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl deleted file mode 100644 index f8597223c..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl +++ /dev/null @@ -1,50 +0,0 @@ -{ - "MakeCohortVcf.bin_exclude": "${workspace.bin_exclude}", - "MakeCohortVcf.contig_list": "${workspace.primary_contigs_fai}", - "MakeCohortVcf.allosome_fai": "${workspace.allosome_file}", - "MakeCohortVcf.cytobands": "${workspace.cytobands}", - "MakeCohortVcf.mei_bed": "${workspace.mei_bed}", - "MakeCohortVcf.pe_exclude_list": "${workspace.pesr_exclude_list}", - "MakeCohortVcf.depth_exclude_list": "${workspace.depth_exclude_list}", - "MakeCohortVcf.empty_file" : "${workspace.empty_file}", - "MakeCohortVcf.ref_dict": "${workspace.reference_dict}", - - "MakeCohortVcf.site_level_comparison_datasets": [ - {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.hgsv_byrska_bishop_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.thousand_genomes_site_level_benchmarking_dataset | tojson }} - ], - - "MakeCohortVcf.min_sr_background_fail_batches": 0.5, - "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, - "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, - "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, - "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, - "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, - 
"MakeCohortVcf.random_seed": 0, - "MakeCohortVcf.max_shard_size_resolve": 500, - - "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", - "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", - - "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", - - "MakeCohortVcf.chr_x": "${workspace.chr_x}", - "MakeCohortVcf.chr_y": "${workspace.chr_y}", - - "MakeCohortVcf.cohort_name": "${this.sample_set_id}", - "MakeCohortVcf.batches": "${this.sample_set_id}", - "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", - "MakeCohortVcf.disc_files": "${this.merged_PE}", - "MakeCohortVcf.bincov_files": "${this.merged_bincov}", - "MakeCohortVcf.median_coverage_files": "${this.median_cov}", - "MakeCohortVcf.rf_cutoff_files": "${this.cutoffs}", - "MakeCohortVcf.pesr_vcfs": "${this.genotyped_pesr_vcf}", - "MakeCohortVcf.depth_vcfs": "${this.regenotyped_depth_vcfs}", - "MakeCohortVcf.depth_gt_rd_sep_files": "${this.trained_genotype_depth_depth_sepcutoff}", - "MakeCohortVcf.raw_sr_bothside_pass_files": "${this.sr_bothside_pass}", - "MakeCohortVcf.raw_sr_background_fail_files": "${this.sr_background_fail}" -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RegenotypeCNVs.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RegenotypeCNVs.SingleBatch.json.tmpl deleted file mode 100644 index cd3847583..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RegenotypeCNVs.SingleBatch.json.tmpl +++ /dev/null @@ -1,22 +0,0 @@ -{ - "RegenotypeCNVs.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - "RegenotypeCNVs.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "RegenotypeCNVs.n_RdTest_bins": "100000", - "RegenotypeCNVs.n_per_split": "5000", - - "RegenotypeCNVs.cohort": "${this.sample_set_id}", - "RegenotypeCNVs.contig_list": "${workspace.primary_contigs_list}", - "RegenotypeCNVs.regeno_coverage_medians": "${this.regeno_coverage_medians}", - - "RegenotypeCNVs.RD_depth_sepcutoffs": "${this.trained_genotype_depth_depth_sepcutoff}", - - "RegenotypeCNVs.cohort_depth_vcf": "${this.outlier_filtered_depth_vcf}", - - "RegenotypeCNVs.batch_depth_vcfs": "${this.outlier_filtered_depth_vcf}", - - "RegenotypeCNVs.depth_vcfs": "${this.genotyped_depth_vcf}", - "RegenotypeCNVs.coveragefiles": "${this.merged_bincov}", - "RegenotypeCNVs.coveragefile_idxs": "${this.merged_bincov_index}", - "RegenotypeCNVs.medianfiles": "${this.median_cov}", - "RegenotypeCNVs.batches": "${this.sample_set_id}" -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ResolveComplexVariants.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ResolveComplexVariants.SingleBatch.json.tmpl deleted file mode 100644 index 5a3c23109..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ResolveComplexVariants.SingleBatch.json.tmpl +++ /dev/null @@ -1,19 +0,0 @@ -{ - "ResolveComplexVariants.contig_list": "${workspace.primary_contigs_fai}", - "ResolveComplexVariants.cytobands": "${workspace.cytobands}", - "ResolveComplexVariants.mei_bed": "${workspace.mei_bed}", - "ResolveComplexVariants.pe_exclude_list": "${workspace.pesr_exclude_list}", - "ResolveComplexVariants.ref_dict": "${workspace.reference_dict}", - - 
"ResolveComplexVariants.max_shard_size" : 500, - "ResolveComplexVariants.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "ResolveComplexVariants.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - - "ResolveComplexVariants.cohort_name": "${this.sample_set_id}", - "ResolveComplexVariants.disc_files": "${this.merged_PE}", - "ResolveComplexVariants.rf_cutoff_files": "${this.cutoffs}", - "ResolveComplexVariants.cluster_vcfs": "${this.combined_vcfs}", - "ResolveComplexVariants.cluster_bothside_pass_lists": "${this.cluster_bothside_pass_lists}", - "ResolveComplexVariants.cluster_background_fail_lists": "${this.cluster_background_fail_lists}" - -} From 944337e5a317acf16041f57c25f4dfc671e43ec6 Mon Sep 17 00:00:00 2001 From: Vahid Date: Wed, 12 Jun 2024 11:28:50 -0400 Subject: [PATCH 5/5] Update docs on building and hosting Docker images (#640) * Update docusaurus to current latest version. * Simplify docker documentation. * Rewrite the manual section to include explicit steps to take. * Remove docker from getting started as it could be a bit too technical for "getting started". * Fix broken links. * Add more languages for code blocks. * Remove unused config. * Add a link to jump to next section. * Create VM with additional disk size, & more docker login steps added. * pin mermaid version. * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/automated.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/images.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/images.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/images.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md 
Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/index.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Update website/docs/advanced/docker/manual.md Co-authored-by: Mark Walker * Expand the list to better enumerate the features. * Add a tip. * increase disk size. --------- Co-authored-by: Mark Walker --- website/docs/advanced/docker/automated.md | 165 ++++++++ website/docs/advanced/docker/dependencies.md | 28 ++ .../docs/advanced/docker/deploy/automated.md | 204 ---------- .../advanced/docker/deploy/incremental.md | 104 ----- website/docs/advanced/docker/deploy/index.md | 30 -- website/docs/advanced/docker/deploy/manual.md | 313 --------------- website/docs/advanced/docker/images.md | 139 ++++--- website/docs/advanced/docker/index.md | 88 ++-- website/docs/advanced/docker/manual.md | 380 ++++++++++++++++++ website/docs/gs/docker.md | 41 -- website/package.json | 2 +- 11 files changed, 700 insertions(+), 794 deletions(-) create mode 100644 website/docs/advanced/docker/automated.md create mode 100644 website/docs/advanced/docker/dependencies.md delete mode 100644 website/docs/advanced/docker/deploy/automated.md delete mode 100644 website/docs/advanced/docker/deploy/incremental.md delete mode 100644 website/docs/advanced/docker/deploy/index.md delete mode 100644 website/docs/advanced/docker/deploy/manual.md create mode 100644 website/docs/advanced/docker/manual.md diff --git a/website/docs/advanced/docker/automated.md b/website/docs/advanced/docker/automated.md new file mode 100644 index 000000000..0ad392359 --- /dev/null +++ b/website/docs/advanced/docker/automated.md @@ -0,0 +1,165 @@ +--- +title: Automated Deployment +description: Build and Publish Images +sidebar_position: 2 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + +GATK-SV Docker images are automatically built, tested, and pushed to +container registries. An automated continuous +integration and continuous delivery (CI/CD) ensures the +images are built and tested consistently and reproducibly in standardized Linux virtual machines. + + +The automation pipeline runs on GitHub Actions and performs a regression +test as part of every pull request. When a pull request is merged, the automation +pipeline publishes images on the Google Container Registry (GCR) +and Azure Container Registry (ACR), and updates their references. + + +The latest Docker images are listed in the files below. +Detailed automated deployment is described in the following sections. + + + + + + [gatk-sv/inputs/values/dockers_azure.json](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers_azure.json) + + + + + + [gatk-sv/inputs/values/dockers.json](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers.json) + + + + + + +:::info +The detailed explanation of the automation workflow provided on this page +is intended for users who need to configure the CI/CD workflow on +their own fork of GATK-SV's GitHub repository to host Docker images on +their own container registries. 
+ + +If you only need the list of latest Docker images, you may refer to the above-listed files. +::: + + +## Workflow Layout + +The automation workflow is defined in +[`sv_pipeline.yml`](https://github.com/broadinstitute/gatk-sv/blob/main/.github/workflows/sv_pipeline_docker.yml) +and utilizes the +[`build_docker.py`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) +script for building and publishing Docker images. +The workflow consists of three +[_jobs_](https://docs.github.com/en/actions/learn-github-actions/workflow-syntax-for-github-actions#jobs) +discussed in the following sections: + +1. [Determine build arguments](#args) +2. [Regression testing](#build) (pull request and merge) +3. [Publishing Docker images](#publish) (merge only) + +### Determine Build Args {#args} +This job is responsible for determining the arguments to be used by the +`build_docker.py` script, specifically: + +- **Determining commit SHAs**: + Considering the large number of GATK-SV Docker images, + the workflow builds and publishes only the + Docker images affected by the changes introduced + in a pull request. + You may refer to [this page](/docs/advanced/docker/images#incremental) + on details regarding the incremental build strategy. + This job determines the commit SHAs of `HEAD` and `BASE` + commits. + +- **Compose image tag**: + The images are tagged with a consistent template as the following: + + ``` + [DATE]-[RELEASE_TAG]-[HEAD_SHA_8] + ``` + + - `[DATE]` is in `YYYY-MM-DD`, and is extracted + from the timestamp of the last commit on the branch associated + with the pull request. + - `RELEASE_TAG` is extracted from the + latest [pre-]release on GitHub. + - `HEAD_SHA_8` denotes the first eight characters + of the `HEAD` commit SHA. + + The following is an example of a tag generated + in this step: + + ``` + 2023-05-24-v0.27.3-beta-1796b665 + ``` + + +### Testing Docker Image Build {#build} + +This job is triggered when **a commit is pushed to the pull request branch.** +It serves the purpose of regression testing of the Docker images. +It builds Docker images according to the arguments determined in [`Determine Build Args`](#args). +If the Docker images are not successfully built, then the +job fails and all images are discarded. + + +### Publishing Docker Images {#publish} + +This job is triggered when **a pull request is merged or a commit is pushed to the `main` branch.** +Similar to the [`Test Images Build`](#build) job, +it builds Docker images. In addition, +this job also pushes the built images to the GCR and ACR +and updates their list. +The job fails if it cannot successfully run all the steps. +The publishing process is summarized below. + + +- **Login** + to container registries in order to push the built images. + The job obtains authorization to push to Google and Azure container registries + by assuming a Google service account and an Azure service principal, respectively. + The credentials required to assume these identities are defined as + [encrypted environment secrets](https://docs.github.com/en/actions/security-guides/encrypted-secrets). + + +- **Build and publish to ACR and GCR**: + Similar to the [build job](#build), this job builds Docker images + based on the list of changed files specified using the + `HEAD` and `BASE` commit SHA. It's important to note + that the images pushed to GCR and ACR are identical and only differ in their tags. 
+ +- **Update the list of published images**: + Once the newly built images are successfully pushed, + this job updates the JSON files containing the list of images (i.e., `dockers*.json`), + and commits and pushes the changes to the `main` branch. + To achieve this, we use a _bot_ account that performs the following actions: + + a. Login to git using the bot's Personal Access Token (PAT) + in order to authorize it to push to the `main` branch. + + b. Configure the Git installation in the GitHub Actions VMs using the _bot_'s credentials. + + c. Commit the changed files. The commit message references the + Git commit that triggered the [publish](#publish) job. + + d. Push the commit to the main branch. + + It is worth noting that GitHub recognizes that this push to the `main` branch is made from a GitHub + Actions environment, hence it does not trigger another [Publish](#publish) job, + avoiding infinite triggers of this job. diff --git a/website/docs/advanced/docker/dependencies.md b/website/docs/advanced/docker/dependencies.md new file mode 100644 index 000000000..e83a5a183 --- /dev/null +++ b/website/docs/advanced/docker/dependencies.md @@ -0,0 +1,28 @@ +--- +title: Image Dependencies +description: Docker images code and image dependencies +sidebar_position: 4 +--- + +## Docker Images List {#list} + +The table below lists the GATK-SV Docker images and their dependencies. + +| Image | Code Dependencies | Docker Dependencies | +|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------| +| `manta` |
  • `dockerfiles/manta/*`
| | +| `melt` |
  • `dockerfiles/melt/*`
|
  • `sv-base`
| +| `scramble` |
  • `dockerfiles/scramble/*`
| | + | `wham` |
  • `dockerfiles/wham/*`
|
  • `samtools-cloud`
| + | `str` |
  • `dockerfiles/str/*`
| | + | `sv-base-mini` |
  • `dockerfiles/sv-base-mini/*`
| | + | `samtools-cloud-virtual-env` |
  • `dockerfiles/samtools-cloud-virtual-env/*`
| | + | `samtools-cloud` |
  • `dockerfiles/samtools-cloud/*`
|
  • `sv-base-mini`
  • `samtools-cloud-virtual-env`
| + | `sv-base-virtual-env` |
  • `dockerfiles/sv-base-virtual-env/*`
| | + | `sv-base` |
  • `dockerfiles/sv-base/*`
|
  • `samtools-cloud`
  • `sv-base-virtual-env`
| + | `cnmops-virtual-env` |
  • `dockerfiles/cnmops-virtual-env/*`
|
  • `sv-base-virtual-env`
| + | `cnmops` |
  • `dockerfiles/cnmops/*`
|
  • `sv-base`
  • `cnmops-virtual-env`
| + | `sv-pipeline-virtual-env` |
  • `dockerfiles/sv-pipeline-virtual-env/*`
|
  • `sv-base-mini`
  • `sv-base-virtual-env`
  • `samtools-cloud-virtual-env`
| + | `sv-pipeline` |
  • `dockerfiles/sv-pipeline/*`
  • `src/RdTest/*`
  • `src/sv-pipeline/*`
  • `src/svqc/*`
  • `src/svtest/*`
  • `src/svtk/*`
  • `src/WGD/*`
|
  • `sv-base`
  • `sv-pipeline-virtual-env`
| + | `sv-utils-env` |
  • `dockerfiles/sv-utils-env/*`
|
  • `samtools-cloud-virtual-env`
| + | `sv-utils` |
  • `dockerfiles/sv-utils/*`
  • `src/sv_utils/src/*`
  • `src/sv_utils/setup.py`
|
  • `samtools-cloud`
  • `sv-utils-env`
| diff --git a/website/docs/advanced/docker/deploy/automated.md b/website/docs/advanced/docker/deploy/automated.md deleted file mode 100644 index 07ba1f3f9..000000000 --- a/website/docs/advanced/docker/deploy/automated.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -title: Automated Deployment -description: Build and Publish Images -sidebar_position: 2 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -In the GATK-SV pipeline, the Docker images undergo automated -processes for building, testing, and publishing as part of -the CI/CD workflow. These automated procedures guarantee -that all images are consistently and reproducibly built -within a standardized Linux VM environment -(specifically, GitHub Actions). -This ensures uniformity across all GATK-SV Docker images -and keeps them synchronized with the latest code-base. - - -The automated CI/CD pipeline also includes continuous -testing and regression identification during pull requests. -This proactive approach allows for the detection and -resolution of any issues related to image changes or content -before merging the pull request. -Consequently, it ensures the reliability and consistency -of the Docker images, simplifies the review process, -and maintains the high quality of the pipeline. - - -Additionally, the automated CI/CD workflow ensures that -the Docker images are correctly mirrored on multiple -container registries, specifically Azure Container Registry (ACR) -and Google Cloud Container Registry (GCR). -This redundancy guarantees availability and accessibility -of the images across different platforms. - - -Latest Docker images are listed in the files, -with detailed automated deployment descriptions in the following sections. - - - - - ```shell - gatk_sv_codebase/inputs/values/dockers_azure.json - ``` - - - - - ```shell - gatk_sv_codebase/inputs/values/dockers.json - ``` - - - - - -## Workflow Layout - -The CI/CD workflow for building, testing, and publishing GATK-SV Docker images -is defined in [`sv_pipeline.yml`](https://github.com/broadinstitute/gatk-sv/blob/main/.github/workflows/sv_pipeline_docker.yml). -The [`build_docker.py`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) -script is utilized for building and publishing the images. -When a pull request is issued against the repository, the images are built, -and upon merging the pull request, they are published to ACR and GCR. - - - -The workflow consists of three -[_jobs_](https://docs.github.com/en/actions/learn-github-actions/workflow-syntax-for-github-actions#jobs) -discussed in the following sections. - - -### Determine Build Args {#args} -This job is responsible for determining the arguments to be used by the -`build_docker.py` script, specifically: - -- **Determining commit SHAs**: - Considering the size and number of GATK-SV Docker images, - the workflow focuses on building and publishing only the - Docker images that are affected by the changes introduced - in a pull request (PR). - You may refer to [this page](/docs/advanced/docker/deploy/incremental) - on details regarding the incremental build strategy. - This job determines the commit SHAs of `HEAD` and `BASE` - commits. - -- **Compose image tag**: - GATK-SV Docker images are tagged with a consistent template - to simplify referencing and usage in the pipeline. - The tag composition step follows the following template. 
- - ``` - [DATE]-[RELEASE_TAG]-[HEAD_SHA_8] - ``` - where `[DATE]` represents the `YYYY-MM-DD` format extracted - from the timestamp of the last commit on the branch associated - with the pull request. `RELEASE_TAG` is extracted from the - latest [pre-]release on GitHub. - Additionally, `HEAD_SHA_8` denotes the first eight characters - of the `HEAD` commit SHA. The following is an example tag generated - in this step. - - ``` - 2023-05-24-v0.27.3-beta-1796b665 - ``` - - -### Testing Docker Image Build {#build} - -The `Test Images Build` job is triggered when a commit is pushed to -the pull request branch. It is responsible for -building the Docker images identified by the -[`Determine Build Args`](#args) -job. If the Docker image building process fails, -this job will also fail. The Docker images created -by this job are not published to GCR or ACR and -are discarded once the job is successfully completed. -This job primarily serves for testing purposes during -the review process, ensuring that the affected images -can be successfully built and that the changes introduced -in the pull request do not disrupt the Docker build process. - - -### Publishing Docker Images {#publish} - -The `Publish` job is triggered when a pull request -is merged or a commit is pushed to the `main` branch. -Similar to the [`Test Images Build`](#build) job, -it builds Docker images; however, in addition, -this job also pushes the built images to the GCR and ACR, -and updates the list of published images. Specifically, -this job runs the following steps. - - -- **Login to ACR**: - To authorize access to the Azure Container Registry (ACR), - this job logs in to Docker by assuming an Azure service principal. - The credentials required for the login are defined as - [encrypted environment secrets](https://docs.github.com/en/actions/security-guides/encrypted-secrets). - -- **Login to GCR**: - Similar to ACR, to authorize access to GCR, - this job assumes a Google Cloud Platform service account. - The secrets related to the service account are defined as - [encrypted environment secrets](https://docs.github.com/en/actions/security-guides/encrypted-secrets). - -- **Build and publish to ACR and GCR**: - Similar to the [build job](#build), this job builds Docker images - based on the list of changed files specified using the - `HEAD` and `BASE` commit SHA. Additionally, it pushes the - built images to both ACR and GCR. It's important to note - that the job doesn't rebuild separate images for each registry. - Instead, it labels a single image for both ACR and GCR, - resulting in an identical image with the same tag and Docker - image hash being pushed to both registries. - This job will fail if the build or push process encounters any issues. - -- **Update the list of published images**: - GATK-SV maintains two JSON files that store the latest Docker - images built and pushed to ACR and GCR. - These files require updates whenever a new image is successfully - built and published. The `build_docker` script handles the - update of the JSON files by adding the latest built and - published Docker images for ACR and GCR. - - However, it's important to note that the updated JSON - files reside in the GitHub Actions virtual machines, - and they are discarded once the GitHub Actions job is - completed successfully. To preserve these changes, - we need to commit them to the `main` branch from within the - GitHub Actions VM as part of the CI/CD process. - To achieve this, we utilize a dedicated _bot_ account. 
- The steps necessary to perform this are explained - in the following. - - - **Login to git using the bot's Personal Access Token (PAT)**: - This step is necessary to enable the _bot_ account to - commit the modified JSON files to the `main` branch - and to authorize the _bot_ to push the changes from - the GitHub Actions VM to the `main` branch using its credentials. - - - **Commit changes and push to the `main` branch**: - This step configures the Git installation in the - GitHub Actions VMs using the _bot_'s credentials. - It commits the modified JSON files, which contain - the latest built and pushed images. The commit message - references the Git commit that triggered the [publish](#publish) job, - providing improved tracking of changes in the Git history. - Finally, it pushes the commit to the main branch. - It's worth noting that Git is intelligent enough - to recognize that this push is made from a GitHub - Actions environment, preventing it from triggering - another publish job. This avoids the issue of - infinite triggers of the publish job. - diff --git a/website/docs/advanced/docker/deploy/incremental.md b/website/docs/advanced/docker/deploy/incremental.md deleted file mode 100644 index 95898bc0c..000000000 --- a/website/docs/advanced/docker/deploy/incremental.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: Incremental Publishing -description: Incremental Publishing Strategy -sidebar_position: 4 ---- - - -The hierarchical and modular organization of GATK-SV Docker -images offers a significant advantage: when updating the codebase, -not every Docker image is affected, minimizing the impact of changes. -This means that not all Docker images need to be rebuilt and -published with each pipeline modification. The -[`build_docker`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) -script efficiently tracks these changes and determines which -Docker images are impacted. Consequently, only the affected Docker -images are built, saving both storage space and build time. - - -This incremental and selective building and publishing -strategy is particularly beneficial considering the size and -build time of Docker images. By building and publishing -only the necessary images, we can save on storage space and -reduce the overall build time. -This page provides a detailed explanation of -this incremental and selective approach. - - -## Determining Modified Files - -The incremental build strategy relies on the determination -of modified files to identify which Docker images require rebuilding. -Using `git` history, the `build_docker` script automatically -infers the list of changed files. - - -To achieve this, the script compares two -[`git` commit SHAs](https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/about-commits): - -- `BASE_SHA`: the reference commit representing the base branch - (e.g., `broadinstitute/gatk-sv:main`), and; -- `HEAD_SHA`: the target commit representing the latest commit on the feature branch. - - -By analyzing the changes between these commits -the script identifies the impacted files and proceeds to -build the corresponding Docker images. - -During manual runs, the user provides the commit SHAs, -while in automated builds as part of CI/CD, -the commit SHAs are determined automatically. - -In CI/CD, the commit SHAs are determined as the following example. 
- -```mermaid -%%{init: { - 'logLevel': 'debug', - 'gitGraph': {'rotateCommitLabel': false}, - 'themeVariables': { 'commitLabelFontSize': '22px' } - } - }%% -gitGraph - commit id: "A" - commit id: "B" - branch feature - checkout feature - commit id: "X" - checkout main - commit id: "C" - checkout feature - commit id: "Y" - checkout main - commit id: "D" - checkout feature - commit id: "Z" - checkout main - merge feature id: "E" - commit id: "F" -``` - -In this example, `BASE_SHA=B`, `HEAD_SHA=Z`, and `E` is the merge commit. - - -## Identifying Images Requiring Rebuilding from Changed Files - -The `build_docker` script identifies the list of docker images -that need to be rebuilt based on two factors. - -1. Directly impacted images are determined by checking the -list of files each image depends on. If any of these files have -changed, the corresponding image needs rebuilding. - -2. Indirectly impacted images are identified based on -the hierarchical dependency between images. -If a base image is rebuilt, any dependent images built upon -it also require rebuilding. - -This two-step process ensures that all the affected images are correctly -identified for rebuilding. - - -A comprehensive mapping of files to their corresponding -Docker images, specifying which images need to be -rebuilt when their associated files are updated is given in -[this section](https://github.com/broadinstitute/gatk-sv/blob/e86d59962146ae1770c535a97c2774d825026957/scripts/docker/build_docker.py#L170-L245). diff --git a/website/docs/advanced/docker/deploy/index.md b/website/docs/advanced/docker/deploy/index.md deleted file mode 100644 index 33ab138b1..000000000 --- a/website/docs/advanced/docker/deploy/index.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Deploying Docker Images -description: Docker Concepts and Execution Overview -sidebar_position: 2 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -:::info -This section offers a comprehensive explanation of the process of -building, testing, and publishing Docker images. For details -regarding the images and their hierarchy, please refer to -[this page](/docs/advanced/docker/images). -::: - - -GATK-SV Docker image _deployment_ involves the essential steps of -_building_, _testing_, and _publishing_ to Docker container registries. -There are two deployment options available: fully automated and manual. -With the fully automated approach, GATK-SV Docker images are built -and published to Google Container Registry (GCR) and -Azure Container Registry (ACR) through continuous integration and -continuous delivery (CI/CD) after merging a pull request. -However, if you are working on extending or improving the -GATK-SV Docker images, you may need to build the images locally -for testing or store them on an alternative container registry. -This section provides comprehensive insights into the automatic -build process and a detailed guide on locally building the images -for development purposes. 
diff --git a/website/docs/advanced/docker/deploy/manual.md b/website/docs/advanced/docker/deploy/manual.md deleted file mode 100644 index 896985e44..000000000 --- a/website/docs/advanced/docker/deploy/manual.md +++ /dev/null @@ -1,313 +0,0 @@ ---- -title: Manual Deployment -description: Build and Publish Images -sidebar_position: 3 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - -If you are contributing to the GATK-SV codebase, specifically focusing on -enhancing tools, configuring dependencies in Dockerfiles, or modifying GATK-SV scripts -within the Docker images, it is important to build and test the Docker images locally. -This ensures that the images are successfully built and function as intended. - -The process of updating GATK-SV Docker images involves two steps: build and publish. - -- **Build**: Create Docker images from Dockerfiles and store them on your computer. - -- **Publish**: Upload the built Docker images to container registries -(e.g., Google Container registry, or Azure container registry) -to make them available for use in Terra or Cromwell. - -You may refer to [this page](/docs/advanced/docker/index.md) for detailed description of the process. -To streamline the process, we have developed a Python script -that automates the image building and publishing to your container registry. -This section provides guidelines on building and publishing the images using this script. - - -:::warning Linux Machine Required - -Only Linux machines (dedicated or virtual) are supported for building GATK-SV Docker images. -Images created on non-Linux machines may not work with Terra or Cromwell execution environment. -The instructions provided on this page assume you are using a Linux Ubuntu machine. -::: - - - -## Setup - -### Runtime environment {#runtime} - -Currently, GATK-SV Docker images can only be built on the `linux/amd64` platform, -which is a machine running Linux OS on x86-64 architecture. -Images build on Apple M1 (`linux/arm64`) are not currently supported. -You can use a local Linux machine or obtain a virtual machine from a cloud platform. - -You may follow the steps in the -[GCP](https://cloud.google.com/compute/docs/instances/create-start-instance#publicimage) -or [Azure](https://learn.microsoft.com/en-us/azure/virtual-machines/windows/quick-create-portal) -documentation to create a virtual machine (VM) on Google Cloud Platform (GCP) or Microsoft Azure respectively. -Make sure the VM is built using an Ubuntu image, has at least 8 GB RAM, and some additional -disk space (e.g., 50 GB should be sufficient). - -Building and publishing GATK-SV Docker images is time-consuming and can take around 1 hour. -Therefore, we recommend using a terminal multiplexer -(e.g., [tmux](https://github.com/tmux/tmux/wiki/Getting-Started); -[tmux cheat sheet](https://tmuxcheatsheet.com)) -when running on a VM to ensure the process continues even if you are disconnected from the VM. - -### Docker {#docker} - -[Install](https://docs.docker.com/engine/install/) Docker desktop -and login using `sudo docker login`. If utilizing GATK-SV Docker images -from a private container registry or intending to publish the resulting -images to a registry, ensure that you are logged in with credentials -that grant you access to the registry. - - - - - You may follow - [this documentation](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-authentication?tabs=azure-cli) - on setting up Docker authentication to an Azure container registry. 
- - - - You may follow - [this documentation](https://cloud.google.com/artifact-registry/docs/docker/authentication) - on setting up Docker authentication to a Google container registry. - - - - -### Checkout codebase {#checkout} - -Make sure you are on the `git` branch with the code you want to add -to the GATK-SV Docker images you are building. - -```shell -git fetch origin -git checkout origin/ -``` - -## Build and Publish Docker Images {#build} - -All the GATK-SV Dockerfiles are hosted under the directory -[`gatk-sv/dockerfiles/`](https://github.com/broadinstitute/gatk-sv/tree/main/dockerfiles). -While you can build the GATK-SV Docker images by following the standard -[Docker image build procedures](https://docs.docker.com/engine/reference/commandline/image_build/), -that can be challenging due to the nested hierarchy of GATK-SV Docker images. -To simplify the process, we have developed a utility script that streamlines the -Docker image build process -([`scripts/docker/build_docker.py`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py)). - -In the following, we will explain how to use the utility script for a simple use-case. -For more advanced and additional functionalities, please refer to the script's documentation, -which you may access it as the following. - -```shell -python scripts/docker/build_docker.py --help -``` - - -In its basic setup, you can use the following command to **build and publish** a GATK-SV Docker image. - -```shell -python scripts/docker/build_docker.py \ - --targets \ - --image-tag \ - --docker-repo -``` - -The arguments used are explained in the following. - -### Determine which images need to be rebuilt {#targets} - -You may follow either of the following practices to determine which images to rebuild. - -- **Automatic:** - The script can automatically determine which Docker images need a rebuild - based on a list of changed files and cross-referencing them with the - table in [this section](/docs/advanced/docker/images#list). - Specifically, it takes two git commit SHAs as input, uses `git diff` - to extract the list of changed files, and then cross-referencing them - with [this table](/docs/advanced/docker/images#list) to identify the Docker - images requiring rebuilding. Details can be found on [this page](/docs/advanced/docker/deploy/incremental.md). - To use this feature, commit the changes first, identify `BASE_SHA` and `HEAD_SHA` using `git log` or GitHub - (details on [this page](/docs/advanced/docker/deploy/incremental.md)), - and then call the script as follows. - - ```shell - python scripts/docker/build_docker.py \ - --base-git-commit BASE_SHA \ - --current-git-commit HEAD_SHA - ``` - -- **Manual: ** - You may refer to the table in [this section](/docs/advanced/docker/images#list) - to determine which Docker images to rebuild based on the changed files. - For instance, if you modified any of the files under the - [`gatk-sv/src/svtk/`](https://github.com/broadinstitute/gatk-sv/tree/main/src/svtk) - directory, you will need to rebuild the `sv-pipeline` Docker image. - You can set the list of images to rebuild using the `--targets` argument. - For instance: - - ```shell - python scripts/docker/build_docker.py \ - --targets sv-pipeline - ``` - - You may specify multiple images to rebuild by providing a list of their names. - For instance, the following command builds the `sv-pipeline` and the `str` Docker images. 
- - ```shell - python scripts/docker/build_docker.py \ - --targets sv-pipeline str - ``` - -Please note that `--targets` and `--base-git-commit --current-git-commit` -options are mutually exclusive. In other words, you can either manually specify -images to rebuild, or let the script determine them. -Combining or avoiding both options is not currently supported. - -:::info -Following the steps above, the script builds the specified Docker images -_and all the images derived from them_, ensuring proper propagation of changes through the pipeline. -If you want to build only the specified images, you would need to add the `--skip-dependent-images` flag. -::: - - -### Image tag {#tag} - -[Docker image tags](https://docs.docker.com/engine/reference/commandline/tag/) -are used to distinguish between different builds of the same image. -You can use any naming convention for your tags. -GATK-SV docker images use the following template for tags, -which you may want to adopt, in particular, if you plan to publish -your images on the GATK-SV container registries. - -``` -[Date]-[Release Tag]-[Head SHA 8] -``` - -where `[Date]` is `YYYY-MM-DD` extracted from the time stamp of the last -commit on the feature branch, `[Release Tag]` is extracted from the latest [pre-]release on GitHub, -and the `[Head SHA 8]` is the first eight letters of the SHA of the -last commit on the feature branch. - -For example: - -``` -2023-07-28-v0.28.1-beta-e70dfbd7 -``` - -For automatically composing image tags, you may follow the practices -used in [GATK-SV CI/CD](https://github.com/broadinstitute/gatk-sv/blob/286a87f3bcfc0b8c811ff789776dd0b135f582e9/.github/workflows/sv_pipeline_docker.yml#L85-L109). - - - -### Specify the container registry {#registry} -The built images are stored on your computer. If you are only developing -or testing locally, there is no need to push them to a container registry. -In this case you can avoid providing `--docker-repo `. - -You need to push the images to a container registry if you want to: - -- Use the updated Docker images for WDL testing or development; -- Store them on a container registry other than those maintained by the GATK-SV team. - -The script automatically pushes Docker images to a container registry. -To use this feature, you may follow these steps: - -1. Ensure you are logged into Docker with credentials granting -push access to the container registry. Please refer to the -[Docker](#docker) section for details. - - -2. Provide the `--docker-repo ` argument, -replacing `` with the name of your container registry. -For Google Container Registry (GCR) and Azure Container Registry (ACR), -the format is generally as follows. - - - - - Template: - - ```shell - .azurecr.io//: - ``` - - Example: - ```shell - python scripts/docker/build_docker.py \ - --targets sv-pipeline - --tag v1 - --docker-repo myregistry.azurecr.io/gatk-sv - ``` - - which results in creating the following image: - - ```shell - myregistry.azurecr.io/gatk-sv/sv-pipeline:v1 - ``` - - - - - Template: - - ```shell - //: - ``` - - Example: - ```shell - python scripts/docker/build_docker.py \ - --targets sv-pipeline - --tag v1 - --docker-repo us.gcr.io/my-repository/gatk-sv - ``` - - which results in creating the following image: - - ```shell - us.gcr.io/my-repository/gatk-sv/sv-pipeline:v1 - ``` - - - - -Please note that we are currently using GCR, but it has been migrated to Google Artifact Registry. - - - -## Post-build - -- GATK-SV docker images are mainly intended for use in WDLs. 
- Therefore, it's a good practice to test the newly updated - images in related WDLs. This ensures that the updated images function - as expected within specific workflows. - -- If you were using a Linux VM to build the Docker images, - ensure you either stop or delete the VM after building the images. - Stopping the VM won't delete the disk, and you'll continue to - incur disk usage charges. If you don't want to incur disk costs, - you can delete the VM along with all its associated resources. - Stopping is preferred over deleting if you intend to reuse the VM. diff --git a/website/docs/advanced/docker/images.md b/website/docs/advanced/docker/images.md index 8a8f084f7..f9f6fff81 100644 --- a/website/docs/advanced/docker/images.md +++ b/website/docs/advanced/docker/images.md @@ -10,21 +10,36 @@ import TabItem from '@theme/TabItem'; :::info This page provides a detailed explanation of Docker images and their hierarchy. For information on the process -of building these images, please refer to [this section](/docs/advanced/docker/deploy). +of building these images, please refer to the +[automated](/docs/advanced/docker/automated) or +[manual](/docs/advanced/docker/manual) builds sections. ::: -The tools, scripts, dependencies, and configurations utilized by the -GATK-SV pipeline, written in WDL, are organized into separate Docker -containers. This modular approach ensures that each container -contains only the necessary tools for its specific task, -resulting in smaller image sizes. This design choice simplifies -the definition of Dockerfiles and facilitates easier maintenance. -Moreover, the smaller image sizes contribute to reduced disk -usage and lower workflow execution costs. +GATK-SV organizes the tools, scripts, and their dependencies and configurations +into multiple Docker images. Each Docker image is built for a specific purpose, +and images have a hierarchical dependency. +This modular design has the following key advantages. +- It results in focused and more straightforward instructions in Dockerfiles, +facilitating their development, maintenance, and extensibility. -The figure below illustrates the relationships between the GATK-SV Docker images. +- It results in smaller Docker images, as each image contains only +the related tools and scripts. Smaller images reduce storage costs on container +registries and are transferred faster to virtual machines, resulting in shorter start-up. + +- The modular design reduces duplication in Dockerfiles and ensures configuration +consistency across different Docker images. + +- This architecture significantly lowers the maintenance cost as it +necessitates updating only the affected Docker images throughout the development +(discussed in details in the [following section](#incremental)). + + + +The following figure illustrates the hierarchical relationship between GATK-SV Docker images. +The arrows indicate the flow from a base to a derived image, where the derived image +extends or modifies the tools and configuration it inherits from the base image. ```mermaid @@ -45,78 +60,74 @@ flowchart TD ubuntu2204 --> scramble[Scramble] & manta[Manta] ``` -The image depicts the hierarchical relationship among GATK-SV -Docker images. Arrows indicate the flow from a base image -to a derived image. The base image, located at the arrow's -starting point, shares its content which is then expanded -upon and modified in the derived image. 
In simple terms, -the derived image inherits the same tools and configuration -as the base image, while incorporating additional settings and tools. - The list of the Docker images and their latest builds are available in [`dockers.json`](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers.json) and [`dockers_azure.json`](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers_azure.json) for images hosted on Google Container Registry (GCR) and Azure Container Registry (ACR), respectively. -## Docker Images List {#list} -The table below lists the GATK-SV Docker images and their dependencies. +## Incremental publishing {#incremental} -| Image | Code Dependencies | Docker Dependencies | -|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------| -| `manta` |
  • `dockerfiles/manta/*`
| | -| `melt` |
  • `dockerfiles/melt/*`
|
  • `sv-base`
| -| `scramble` |
  • `dockerfiles/scramble/*`
| | - | `wham` |
  • `dockerfiles/wham/*`
|
  • `samtools-cloud`
| - | `str` |
  • `dockerfiles/str/*`
| | - | `sv-base-mini` |
  • `dockerfiles/sv-base-mini/*`
| | - | `samtools-cloud-virtual-env` |
  • `dockerfiles/samtools-cloud-virtual-env/*`
| | - | `samtools-cloud` |
  • `dockerfiles/samtools-cloud/*`
|
  • `sv-base-mini`
  • `samtools-cloud-virtual-env`
| - | `sv-base-virtual-env` |
  • `dockerfiles/sv-base-virtual-env/*`
| | - | `sv-base` |
  • `dockerfiles/sv-base/*`
|
  • `samtools-cloud`
  • `sv-base-virtual-env`
| - | `cnmops-virtual-env` |
  • `dockerfiles/cnmops-virtual-env/*`
|
  • `sv-base-virtual-env`
| - | `cnmops` |
  • `dockerfiles/cnmops/*`
|
  • `sv-base`
  • `cnmops-virtual-env`
| - | `sv-pipeline-virtual-env` |
  • `dockerfiles/sv-pipeline-virtual-env/*`
|
  • `sv-base-mini`
  • `sv-base-virtual-env`
  • `samtools-cloud-virtual-env`
| - | `sv-pipeline` |
  • `dockerfiles/sv-pipeline/*`
  • `src/RdTest/*`
  • `src/sv-pipeline/*`
  • `src/svqc/*`
  • `src/svtest/*`
  • `src/svtk/*`
  • `src/WGD/*`
|
  • `sv-base`
  • `sv-pipeline-virtual-env`
| - | `sv-utils-env` |
  • `dockerfiles/sv-utils-env/*`
|
  • `samtools-cloud-virtual-env`
| - | `sv-utils` |
  • `dockerfiles/sv-utils/*`
  • `src/sv_utils/src/*`
  • `src/sv_utils/setup.py`
|
  • `samtools-cloud`
  • `sv-utils-env`
| +The hierarchical and modular architecture of GATK-SV Docker images has a significant advantage: +not every image is affected by every change to the codebase; +hence, not all Docker images need to be rebuilt and published with every pull request. +This strategy is particularly beneficial considering the build time and the size of Docker images. -## Advantages of Dividing Images by Functionality +This strategy is implemented in the build_docker.py script, and it has two main steps as follows. -The GATK-SV pipeline utilizes Docker images to encapsulate the necessary tools, -dependencies, and configurations. Instead of having a single monolithic image, -the pipeline is organized into multiple smaller images, each focusing on a specific task. -This approach offers several benefits. +### Determining modified files +The incremental build strategy relies on identifying the list of files changed between two +`git` commits and mapping it to the list of Docker images. The +[`build_docker`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) +extracts the list of changed files from the diff between two +[`git` commit SHAs](https://docs.github.com/en/pull-requests/committing-changes-to-your-project/creating-and-editing-commits/about-commits): -- **Modular and focused structure:** -Each image includes task-specific tools, simplifying the use and maintenance of -GATK-SV Docker images for users and developers, respectively. +- `BASE_SHA`: the reference commit (e.g., `HEAD` of the `main` branch); +- `HEAD_SHA`: the target commit (e.g., the latest commit on the feature branch). +The user provides these commit SHAs (or references the images specifically) +when building the images manually. +However, the automated CI/CD builds determine the commit SHAs automatically as the following example. -- **Reduced Docker image size:** -Using task-specific Docker images reduces sizes, requiring less storage space -in container registries. It also enables faster image transfer -when creating virtual machines for task execution. +```mermaid +%%{init: { + 'logLevel': 'debug', + 'gitGraph': {'rotateCommitLabel': false}, + 'themeVariables': { 'commitLabelFontSize': '22px' } + } + }%% +gitGraph + commit id: "A" + commit id: "B" + branch feature + checkout feature + commit id: "X" + checkout main + commit id: "C" + checkout feature + commit id: "Y" + checkout main + commit id: "D" + checkout feature + commit id: "Z" + checkout main + merge feature id: "E" + commit id: "F" +``` +In this example, `BASE_SHA=B`, `HEAD_SHA=Z`, and `E` is the merge commit. -- **Enhanced maintenance and extensibility:** -Maintainers can easily modify specific tools or configurations within -a single image without affecting others, improving maintainability and -facilitating seamless expansion by adding or replacing tools as required. +## Identifying Images Requiring Rebuilding from Changed Files -- **Consistency and efficiency:** -Building images on top of existing setups and tools promotes code -reuse and reduces duplication, ensuring consistent configurations -across pipeline stages. It simplifies dependency management by -allowing changes or updates at the appropriate level, cascading -down to dependent images. +The [`build_docker`](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) +script determines the list of docker images +that need to be rebuilt based on the following conditions. +1. 
It determines the list of directly impacted images by checking the +list of files each image depends on, and rebuilds the image if any of the files have changed. -In summary, splitting tools into smaller, task-specific -Docker images optimizes storage, execution, maintenance, and extensibility. -It enhances consistency, code reuse, and dependency management, -ensuring efficient and scalable pipeline execution. +2. It builds any image if its base image is rebuilt. diff --git a/website/docs/advanced/docker/index.md b/website/docs/advanced/docker/index.md index 6bee59397..b50ce9140 100644 --- a/website/docs/advanced/docker/index.md +++ b/website/docs/advanced/docker/index.md @@ -4,31 +4,49 @@ description: Docker Concepts and Execution Overview sidebar_position: 0 --- -To make the analysis process scalable, reproducible, and cost-efficient, -GATK-SV is designed as a cloud-native pipeline, -meaning it runs on virtual machines (VMs) hosted in the cloud. -These VMs are pre-configured with all the necessary tools, scripts, and settings -required to run the GATK-SV analysis reliably. +GATK-SV is a cloud-native pipeline, making it scalable and reproducible. +All of the tools, scripts, and settings required to run the pipeline are +packaged in multiple Docker images, which are built and hosted +on container registries and are ready to use in Terra workspaces. -To ensure that the analysis can be easily replicated and shared, -GATK-SV utilizes Docker technology. -Docker allows the tools and scripts, including all their dependencies and configurations, -to be packaged into a self-contained unit called a container. -This container can be deployed and run on different VMs in the cloud, -making the analysis process consistent and reproducible across multiple experiments or collaborations. +There are two options for building, testing, and publishing GATK-SV +docker images: fully automated and manual. +GATK-SV Docker images are maintained through the automated approach, +which is built into CI/CD and builds, tests, and publishes images to +Google Container Registry (GCR) and Azure Container Registry (ACR). +However, if you are working on extending or improving the GATK-SV Docker images, +you may need to build them locally +for testing or storing them on an alternative container registry. +In this section, we provide detailed guidelines on both approaches. +Specifically, this section covers the following topics: -Docker containers are built from Docker images, -which serve as the blueprints or templates for creating containers. -Dockerfiles are used to define the contents and behavior of a Docker image. -A Dockerfile is a text file that contains a series of instructions, -specifying the base image, adding dependencies, configuring settings, -and executing commands necessary to build the desired software environment within the container. +- [Docker primer](#docker-primer) +- [GATK-SV Docker images](./images) +- [Automatic deployment](./automated) +- [Manual deployment](./manual) -The following figure is a high-level illustration depicting the relationship -between Dockerfiles, Docker images, Docker containers, and Cloud VMs. +## Docker Primer + +Docker technology enables creating a reproducible environment for data analysis. +It enables defining an environment with all the tools, scripts, +and their dependencies installed and configured as needed to run a data analysis pipeline. 
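+
+As a minimal, illustrative sketch, the commands below pull a published GATK-SV image and
+start a container from it; the registry path and tag are placeholders borrowed from the
+examples elsewhere in this documentation (see `dockers.json` or `dockers_azure.json` for
+the images currently published).
+
+```shell
+# Download (pull) a pre-built GATK-SV image from a container registry.
+docker pull us.gcr.io/my-repository/gatk-sv/sv-pipeline:v1
+
+# Start a container from that image and open an interactive shell inside it.
+# Every container created from the same image provides an identical environment.
+docker run --rm -it us.gcr.io/my-repository/gatk-sv/sv-pipeline:v1 /bin/bash
+```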
+The following are the key components to define and run in this environment: + + +- **Dockerfile**; a text file with instructions on installing and configuring tools, + scripts, and their dependencies. It is mainly used to create reproducible Docker images. + +- **Docker image**; is a template generated from a Dockerfile and contains all + the tools and scripts installed and configured as defined in a Dockerfile. + + +- **Docker container**; is an isolated runtime environment created based on a Docker image, + which runs on a host machine (e.g., laptop or a virtual machine on the cloud) and can execute scripts. + +The following figure illustrates the relationship between Dockerfiles, Docker images, and Docker containers: ```mermaid @@ -59,24 +77,20 @@ flowchart LR ``` -The GATK-SV Docker setup is organized as follows: - - - **Dockerfile**: - These files define the instructions for building the necessary tools and - configurations required for the GATK-SV pipeline. - - **Docker Images**: Docker images are automatically built based on each Dockerfile. - These images are stored in both Azure Container Registry (ACR) and - Google Cloud Container Registry (GCR). The images serve as self-contained - packages that encapsulate all the tools needed for the GATK-SV pipeline. +Dockerfiles are text files, and GATK-SV stores them on +[GitHub](https://github.com/broadinstitute/gatk-sv/tree/main/dockerfiles) +for accessibility and version control. +Docker images are larger files (e.g., 1GiB) and should be hosted on container registries +accessible to runtime environments. GATK-SV stores images on Google Container Registry (GCR) +and Azure Container Registry (ACR) so they are accessible to the +workflow execution environment on the Terra platform. +Docker containers are ephemeral runtime environments, created on +virtual machines when the analysis starts, and are “purged” when the analysis finishes. - - **Docker Containers**: Cromwell, a workflow execution system, creates GATK-SV - Docker containers on virtual machines within the Google Cloud Platform (GCP). - These containers are instantiated based on the Docker images obtained - from GCR. The GATK-SV data analysis tasks are then executed within - these containers, providing a consistent and isolated environment. -In summary, the GATK-SV Docker setup involves multiple Dockerfiles defining -the build instructions, resulting in Docker images that are stored in ACR and GCR. -These images are used to create Docker containers on GCP virtual machines through Cromwell, -where the GATK-SV data analysis takes place. +:::tip Images hosted on ACR and GCR are identical +The GATK-SV images hosted on GCR and ACR are identical. +We maintain these mirrored repositories to enable running GATK-SV on Terra +with both GCP and Azure (WIP) backends. +::: \ No newline at end of file diff --git a/website/docs/advanced/docker/manual.md b/website/docs/advanced/docker/manual.md new file mode 100644 index 000000000..272a370d1 --- /dev/null +++ b/website/docs/advanced/docker/manual.md @@ -0,0 +1,380 @@ +--- +title: Manual Deployment +description: Build and Publish Images +sidebar_position: 3 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + +If you contribute to the GATK-SV codebase, we recommend you ensure that affected Docker images build successfully and function as intended. The process involves two steps: + +1. **Build**: Create Docker images from Dockerfiles. + +2. 
**Publish**: Upload the built Docker images to container registries +(e.g., Google or Azure container registries, GCR and ACR, respectively) +to make them available for use in Terra or Cromwell. +_You may skip this step unless you would like to host the images you built on your own container registry._ + +To streamline the process, we have developed a +[script](https://github.com/broadinstitute/gatk-sv/blob/main/scripts/docker/build_docker.py) +that automates both the build and publish steps. +This section provides guidelines on setting up the environment and running the +script with a minimal example. + + +:::danger x86 Linux Machine Required +Only Linux machines (dedicated or virtual) are supported for building GATK-SV Docker images. +In addition, images created on non-Intel processor architectures (e.g., Apple M1) may not function as intended, +even if the build process runs successfully. +::: + + +## Setup an Ubuntu VM + +This section outlines steps to follow in order to +create and connect to a Linux virtual machine (VM) +on a cloud service provider. +You may [skip to the next section](#checkout) if you are using a dedicated Linux machine +(e.g., a laptop running Ubuntu). + + +#### 1. Set environment variables + + + + ```bash + export PROJECT_ID="" + export ZONE_ID="" + + # Make sure no machine with the following name exist, + # and you follow VM naming conventions, e.g., all lower-case characters. + export INSTANCE_NAMES="" + ``` + + + + + +#### 2. Create an Ubuntu VM +You may [skip to the next step](#connect-to-vm) if you have already created a VM. + + + + ```bash + gcloud compute instances create $INSTANCE_NAMES \ + --project=$PROJECT_ID \ + --zone=$ZONE_ID \ + --machine-type=e2-standard-2 \ + --create-disk=auto-delete=yes,boot=yes,device-name=$INSTANCE_NAMES,image=projects/ubuntu-os-cloud/global/images/ubuntu-2310-mantic-amd64-v20240213,mode=rw,size=100 + ``` + Note that this command creates a VM with `100 GiB` disk size, + to accommodate for the disk space requirements of GATK-SV Docker images. + + You may follow the documentation on + [this page](https://cloud.google.com/compute/docs/instances/create-start-instance#publicimage) + for more details on creating a virtual machine on GCP. + + + +:::tip +The firewall rules of your institute may require you to be on-site or connected +to the institute's VPN before you can access the cloud resources billed to your institute. +::: + + +#### 3. Connect to the VM {#connect-to-vm} + + + + + ```bash + gcloud compute ssh $INSTANCE_NAMES --project $PROJECT_ID + ``` + Follow the on-screen prompts for authorizing access to `ssh` credentials. + +
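+
+  If the connection is refused immediately after creating the VM (see the
+  troubleshooting notes below), you may first want to confirm that the instance has
+  finished starting. Assuming the environment variables defined above, one way to
+  check its status is:
+
+  ```bash
+  # Print the instance status; wait until it reports "RUNNING" before connecting.
+  gcloud compute instances describe $INSTANCE_NAMES \
+    --project $PROJECT_ID \
+    --zone $ZONE_ID \
+    --format="value(status)"
+  ```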
+ Errors running this command +
+ If you are getting any of the following error messages when you try + to connect to the VM immediately after you have created it, + it may indicate that the VM is not ready yet, and you may need to + wait a few minutes before retrying. + + ```bash + ssh: connect to host [IP address] port 22: Connection refused + ``` + + ```bash + ERROR: (gcloud.compute.ssh) [/usr/bin/ssh] exited with return code [255]. + username@[IP address]: Permission denied (publickey). + ``` +
+
+
+
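+
+:::tip Long-running builds
+Building and publishing the images can take a while (on the order of an hour). You may
+therefore want to run the remaining steps inside a terminal multiplexer such as `tmux`
+so that the build keeps running if your SSH connection drops. A minimal sketch, assuming
+`tmux` is available through `apt`:
+
+```bash
+sudo apt-get -y install tmux   # skip if tmux is already installed
+tmux new -s gatk-sv-build      # start a named session and run the build steps inside it
+# Detach with Ctrl-b d; after reconnecting to the VM, reattach with:
+tmux attach -t gatk-sv-build
+```
+:::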
+ +#### 4. Install Docker {#docker} +You may [skip to the next step](#checkout) if you have already installed and configured Docker on this VM. + +1. Install pre-requisites + ```bash + sudo apt-get update && \ + sudo apt-get install ca-certificates curl && \ + sudo install -m 0755 -d /etc/apt/keyrings && \ + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc && \ + sudo chmod a+r /etc/apt/keyrings/docker.asc && \ + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + sudo apt-get update + ``` + +2. Install Docker + + ```bash + sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin && \ + sudo usermod -aG docker ${USER} && \ + newgrp docker + ``` + + You may follow [Docker documentation](https://docs.docker.com/engine/install/ubuntu/) + on details on installed Docker on Ubuntu. + + +3. Login to Docker + + + + - Run the following command on the VM. + ```bash + gcloud auth login + ``` + + - Follow the on-screen prompts, it will display a URL that you need to copy-paste it + on the browser of your computer (_not_ the VM). + + - Follow the prompts on your browser, and login with an account that will provide you + with access to the GCR repository. If you are planning on _publishing_ images you + build to GCR, you need to make sure you account has [sufficient access](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling#required_roles) + to GCR. + + - Configure Docker with your credentials. + + ```bash + gcloud auth configure-docker + ``` + + You may refer to [this page](https://cloud.google.com/artifact-registry/docs/docker/authentication) + for more details on configure Docker to access GCR. + + + + +## Checkout codebase {#checkout} + +1. Clone the repository or its fork that contains the branch with the changes +that you want to build the Docker images based-off. + + ```shell + git clone https://github.com/broadinstitute/gatk-sv && cd gatk-sv + ``` + +2. Checkout the branch containing your changes. + + ```shell + git checkout + ``` + +## Build and Publish Docker Images {#build} + +In its minimal setup, you may use the following command to **build and publish** GATK-SV Docker images. + +```shell +python3 scripts/docker/build_docker.py \ + --targets \ + --image-tag \ + --docker-repo +``` + +The arguments are explained in the following. + +- [`--targets`](#targets) +- [`--image-tag`](#tag) +- [`--docker-repo`](#registry) + +### `--targets` {#targets} + +You may follow either of the following approaches to determine which images to rebuild. + +- **Manual:** + You may refer to the table in [this section](./dependencies#list) + to determine which Docker images to rebuild based on the changed files. + For instance, if you modified any of the files under the + [`gatk-sv/src/svtk/`](https://github.com/broadinstitute/gatk-sv/tree/main/src/svtk) + directory, you will need to rebuild the `sv-pipeline` Docker image. + You can set the list of images to rebuild using the `--targets` argument. + For instance: + + ```shell + python scripts/docker/build_docker.py \ + --targets sv-pipeline + ``` + + You may specify multiple images to rebuild by providing a list of their names. + For instance, the following command builds the `sv-pipeline` and the `str` Docker images. 
+ + ```shell + python scripts/docker/build_docker.py \ + --targets sv-pipeline str + ``` + +- **Automatic (advanced):** + You may refer to [this page](./images#incremental) for details on this method. + Briefly, you may take the following steps. + + 1. `git commit` the changes. + 2. Identify `BASE_SHA` and `HEAD_SHA` using `git log` or GitHub. + You may use the following commands to get these SHAs. + + ```shell + export \ + HEAD_SHA=$(git log -1 --pretty=format:"%H") \ + BASE_SHA=$(git merge-base main $(git branch --show-current)) + ``` + Note that, you may need to [modify these commands](https://git-scm.com/docs/git-merge-base) if your branch has a complicated git history. + + 3. Run the script using `--base-git-commit` and `--current-git-commit` instead of `--targets`. + ```shell + python scripts/docker/build_docker.py \ + --base-git-commit \ + --current-git-commit + ``` + +Please note that `--targets` and `--base-git-commit --current-git-commit` +options are mutually exclusive. In other words, you can either manually specify +images to rebuild, or let the script determine them automatically using commit SHAs; +combining or avoiding both options is not currently supported. + +:::info +Following the steps above, the script builds the specified Docker images +_and all the images derived from them_. +You may add the `--skip-dependent-images` flag to build only the explicitly specified images. +::: + + +### `--image-tag` {#tag} + +You may use any naming convention for the Docker image +[tags](https://docs.docker.com/engine/reference/commandline/tag/). +GATK-SV Docker images are tagged using the following template +(you may refer to [this section](./automated#args) for details). + +``` +[Date]-[Release Tag]-[Head SHA 8] +``` + +For example: + +``` +--image-tag 2023-07-28-v0.28.1-beta-e70dfbd7 +``` + + +### ` --docker-repo` {#registry} + +If you are only testing GATK-SV Docker image build, +you may skip this section and avoid providing `--docker-repo `. +However, if you need to push image to container registries, +need images for WDL testing, or need to host the images on a container registry +other than those maintained by the GATK-SV team. + +The `build_docker.py` script automatically pushes Docker images to a container registry +when `--docker-repo ` is provided, replacing `` with the container registry you want to use. +When providing this argument, ensure that you are logged into Docker with +credentials granting push access to the registry, +You may configure and set the registry as the following. + + + + + - You may follow [these steps](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-get-started-portal?tabs=azure-cli) + if you have not configured a container registry. + - Once configured, you may set `` in the following template. + + ```shell + .azurecr.io// + ``` + + Example: + + ```shell + myregistry.azurecr.io/gatk-sv + ``` + + + + + - You may follow [these steps](https://cloud.google.com/artifact-registry/docs/repositories/create-repos) + if you have not configured a container registry. + - Once configured, you may set `` in the following template. + + ```shell + // + ``` + + Example: + ```shell + us.gcr.io/my-repository/gatk-sv + ``` + + + + + +## Post-build + +- GATK-SV docker images are mainly intended for use in WDLs. + Therefore, it's a good practice to run the related WDLs with + updated images to assert if the images function as expected. 
+ +- If you were using a Linux VM to build the Docker images, + ensure you either stop or delete the VM after building the images. + Stopping the VM won't delete the disk, and you may continue to + incur disk usage charges. If you plan on re-using the VM, + stopping is preferred as it preserves the configuration; + otherwise, you may delete the VM and all the associated resources + (attached disks in particular). diff --git a/website/docs/gs/docker.md b/website/docs/gs/docker.md index 6407569c5..e69de29bb 100644 --- a/website/docs/gs/docker.md +++ b/website/docs/gs/docker.md @@ -1,41 +0,0 @@ ---- -title: Docker Images -description: GATK-SV Docker Images -sidebar_position: 4 -slug: ./dockers ---- - - -To make the analysis process scalable, reproducible, and cost-efficient, -GATK-SV is designed as a cloud-native pipeline, -meaning it runs on virtual machines (VMs) in the cloud, -which are pre-configured with all the necessary tools, scripts, -and settings for reliable analysis. To easily replicate and share -the analysis, GATK-SV uses Docker technology. Docker packages the tools, -scripts, and their requirements into self-contained units called containers. -These containers can be deployed on different VMs in the cloud, -ensuring consistent and reproducible analysis for various experiments -and collaborations. - -The latest Docker image builds can be found in the following files. - - - -- [`dockers.json`](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers.json). - The list of images hosted on Google Container Registry (GCR). - You may use the Docker images listed in this file if you are running - the pipeline on Google Cloud Platform (GCP). - -- [`dockers_azure.json`](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers_azure.json). - The list of images hosted on Azure Container Registry (ACR). - You may use the Docker images listed in this file if you are - running the pipeline on Azure. - - -:::tip For developers and power users - -You may refer to [this section](/docs/advanced/docker/) for a detailed -description of the Docker images, including their design principles, -as well as guides on build and deploy them. -::: - \ No newline at end of file diff --git a/website/package.json b/website/package.json index 81a3c06de..fa32a8117 100644 --- a/website/package.json +++ b/website/package.json @@ -16,7 +16,7 @@ "dependencies": { "@docusaurus/core": "3.3.2", "@docusaurus/preset-classic": "3.3.2", - "@docusaurus/theme-mermaid": "^3.3.2", + "@docusaurus/theme-mermaid": "3.3.2", "@mdx-js/react": "^3.0.0", "clsx": "^2.0.0", "prism-react-renderer": "^2.3.0",