From 3619cf4b14d4c3091f6f481c797ea4e3a8436e1a Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 15 Dec 2022 14:45:49 -0500 Subject: [PATCH 01/26] carry over Xuefang's changes to AnnotateVcf from gnomAD v3 --- wdl/AnnotateExternalAF.wdl | 8 +- wdl/AnnotateExternalAFperContig.wdl | 92 ++++----- wdl/AnnotateVcf.wdl | 136 ++++++------- wdl/ChromosomeAlleleFrequencies.wdl | 106 +++++++--- wdl/HailMerge.wdl | 46 ++--- wdl/PruneAndAddVafs.wdl | 132 ++++++++----- wdl/ShardedAnnotateVcf.wdl | 290 ++++++++++++++++++++++++++++ wdl/TasksMakeCohortVcf.wdl | 18 +- 8 files changed, 604 insertions(+), 224 deletions(-) create mode 100755 wdl/ShardedAnnotateVcf.wdl diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl index b44c81a46..322a48a74 100644 --- a/wdl/AnnotateExternalAF.wdl +++ b/wdl/AnnotateExternalAF.wdl @@ -24,8 +24,8 @@ workflow AnnotateExternalAF { # overrides for local tasks RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_override_combine_vcfs - RuntimeAttr? runtime_override_split_vcf + RuntimeAttr? runtime_attr_combine_vcfs + RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_split_ref_bed RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? 
runtime_attr_bedtools_closest @@ -70,7 +70,7 @@ workflow AnnotateExternalAF { min_records_per_shard_step1 = min_records_per_shard_step1, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_override_split_vcf = runtime_override_split_vcf, + runtime_attr_split_vcf = runtime_attr_split_vcf, runtime_attr_modify_vcf = runtime_attr_modify_vcf, runtime_attr_select_matched_svs = runtime_attr_select_matched_svs, runtime_attr_bedtools_closest = runtime_attr_bedtools_closest @@ -84,7 +84,7 @@ workflow AnnotateExternalAF { naive = true, outfile_prefix = "~{prefix}.annotated", sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_override_combine_vcfs + runtime_attr_override = runtime_attr_combine_vcfs } output { diff --git a/wdl/AnnotateExternalAFperContig.wdl b/wdl/AnnotateExternalAFperContig.wdl index 18fd17a9f..cd0fc233b 100644 --- a/wdl/AnnotateExternalAFperContig.wdl +++ b/wdl/AnnotateExternalAFperContig.wdl @@ -33,8 +33,8 @@ workflow AnnotateExternalAFperContig { # overrides for local tasks RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_override_split_vcf - RuntimeAttr? runtime_override_combine_vcfs + RuntimeAttr? runtime_attr_split_vcf + RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_bedtools_closest RuntimeAttr? 
runtime_attr_select_matched_svs } @@ -143,7 +143,7 @@ workflow AnnotateExternalAFperContig { n_shards=max_shards_per_chrom_step1, min_vars_per_shard=min_records_per_shard_step1, sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf + runtime_attr_override=runtime_attr_split_vcf } @@ -169,7 +169,7 @@ workflow AnnotateExternalAFperContig { naive = true, outfile_prefix = "~{contig}.annotated.vcf", sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_override_combine_vcfs + runtime_attr_override = runtime_attr_combine_vcfs } output { @@ -195,16 +195,16 @@ task SplitBed { boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } String prefix = basename(bed, ".bed.gz") @@ -243,16 +243,16 @@ task SplitVcf { 
boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } String prefix = basename(vcf, ".vcf.gz") @@ -297,16 +297,16 @@ task BedtoolsClosest { boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, 
runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } command <<< @@ -341,16 +341,16 @@ task SelectMatchedSVs { boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, 
runtime_default.boot_disk_gb]) } String prefix = basename(input_bed, ".bed") @@ -386,16 +386,16 @@ task SelectMatchedINSs { boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } String prefix = basename(input_bed, ".bed") @@ -435,16 +435,16 @@ task ModifyVcf { boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: 
select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } String prefix = basename(vcf,'.vcf.gz') diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index b95e5c55c..1682e025b 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -1,16 +1,15 @@ version 1.0 -import "AnnotateFunctionalConsequences.wdl" as func -import "PruneAndAddVafs.wdl" as pav -import "AnnotateExternalAF.wdl" as eaf +import "Structs.wdl" +import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf workflow AnnotateVcf { input { - File vcf - File vcf_idx - File contig_list - String prefix + Array[File] vcf_list + Array[File] vcf_idx_list + Array[String] contig_list + Array[String] prefix_list File protein_coding_gtf File? noncoding_bed @@ -22,92 +21,97 @@ workflow AnnotateVcf { Int min_records_per_shard_step1 File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? prune_list # List of samples to be excluded from the output vcf + File sample_list # List of samples to be retained from the output vcf File? ped_file # Used for M/F AF calculations + File? par_bed + File? allosomes_list Int sv_per_shard File? ref_bed # File with external allele frequencies String? 
ref_prefix # prefix name for external AF call set (required if ref_bed set) Array[String]? population # populations to annotate external AF for (required if ref_bed set) - String sv_base_mini_docker + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_base_mini_docker String gatk_docker RuntimeAttr? runtime_attr_svannotate RuntimeAttr? runtime_attr_concat_vcfs - RuntimeAttr? runtime_attr_prune_vcf RuntimeAttr? runtime_attr_shard_vcf RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_override_combine_vcfs - RuntimeAttr? runtime_override_split_vcf + RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_split_ref_bed RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? runtime_attr_bedtools_closest RuntimeAttr? runtime_attr_select_matched_svs + RuntimeAttr? runtime_attr_concat_sharded_cluster + RuntimeAttr? runtime_attr_preconcat_sharded_cluster + RuntimeAttr? runtime_attr_hail_merge_sharded_cluster + RuntimeAttr? runtime_attr_fix_header_sharded_cluster + RuntimeAttr? 
runtime_attr_get_vcf_header_with_members_info_line } - call func.AnnotateFunctionalConsequences { - input: - vcf = vcf, - vcf_index = vcf_idx, - prefix = prefix, - protein_coding_gtf = protein_coding_gtf, - noncoding_bed = noncoding_bed, - promoter_window = promoter_window, - max_breakend_as_cnv_length = max_breakend_as_cnv_length, - additional_args = svannotate_additional_args, - gatk_docker = gatk_docker, - runtime_attr_svannotate = runtime_attr_svannotate - } - - call pav.PruneAndAddVafs as PruneAndAddVafs { - input: - vcf = AnnotateFunctionalConsequences.annotated_vcf, - vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, - prefix = prefix, - sample_pop_assignments = sample_pop_assignments, - prune_list = prune_list, - ped_file = ped_file, - sv_per_shard = sv_per_shard, - contig_list = contig_list, - sv_base_mini_docker = sv_base_mini_docker, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_prune_vcf = runtime_attr_prune_vcf, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, - runtime_attr_concat_vcfs = runtime_attr_concat_vcfs - } - - if (defined(ref_bed)) { - call eaf.AnnotateExternalAF as AnnotateExternalAF { + scatter (i in range(length(vcf_list))) { + call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{ input: - vcf = PruneAndAddVafs.output_vcf, - vcf_idx = PruneAndAddVafs.output_vcf_idx, - ref_bed = select_first([ref_bed]), - population = select_first([population]), - ref_prefix = select_first([ref_prefix]), - prefix = prefix, - contigs = read_lines(contig_list), + vcf = vcf_list[i], + vcf_idx = vcf_idx_list[i], + contig = contig_list[i], + prefix = prefix_list[i], + protein_coding_gtf = protein_coding_gtf, + noncoding_bed = noncoding_bed, + promoter_window = promoter_window, + svannotate_additional_args = svannotate_additional_args, + max_breakend_as_cnv_length = max_breakend_as_cnv_length, + 
max_shards_per_chrom_step1 = max_shards_per_chrom_step1, min_records_per_shard_step1 = min_records_per_shard_step1, - sv_base_mini_docker = sv_base_mini_docker, + sample_pop_assignments = sample_pop_assignments, + sample_list = sample_list, + ped_file = ped_file, + par_bed = par_bed, + sv_per_shard = sv_per_shard, + allosomes_list = allosomes_list, + + ref_bed = ref_bed, + ref_prefix = ref_prefix, + population = population, + + use_hail = use_hail, + gcs_project = gcs_project, + + gatk_docker = gatk_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_modify_vcf = runtime_attr_modify_vcf, - runtime_override_split_vcf = runtime_override_split_vcf, - runtime_override_combine_vcfs = runtime_override_combine_vcfs, - runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, - runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, - runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, - runtime_attr_select_matched_svs = runtime_attr_select_matched_svs + sv_base_mini_docker = sv_base_mini_docker, + sv_pipeline_hail_docker = sv_pipeline_hail_docker, + + runtime_attr_svannotate = runtime_attr_svannotate , + runtime_attr_concat_vcfs = runtime_attr_concat_vcfs , + runtime_attr_shard_vcf = runtime_attr_shard_vcf , + runtime_attr_compute_AFs = runtime_attr_compute_AFs , + runtime_attr_combine_vcfs = runtime_attr_combine_vcfs , + runtime_attr_modify_vcf = runtime_attr_modify_vcf , + runtime_attr_split_vcf = runtime_attr_split_vcf , + runtime_attr_split_ref_bed = runtime_attr_split_ref_bed , + runtime_attr_split_query_vcf = runtime_attr_split_query_vcf , + runtime_attr_bedtools_closest = runtime_attr_bedtools_closest , + runtime_attr_select_matched_svs = runtime_attr_select_matched_svs , + runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster , + runtime_attr_preconcat_sharded_cluster = runtime_attr_preconcat_sharded_cluster , + runtime_attr_hail_merge_sharded_cluster = 
runtime_attr_hail_merge_sharded_cluster , + runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster , + runtime_attr_get_vcf_header_with_members_info_line = runtime_attr_get_vcf_header_with_members_info_line } } output { - File output_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf]) - File output_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx]) + Array[File] output_vcf_list = ShardedAnnotateVcf.output_vcf + Array[File] output_vcf_idx_list = ShardedAnnotateVcf.output_vcf_idx } -} +} \ No newline at end of file diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl index 4e8ab434d..6e7397a2e 100644 --- a/wdl/ChromosomeAlleleFrequencies.wdl +++ b/wdl/ChromosomeAlleleFrequencies.wdl @@ -11,12 +11,14 @@ workflow ChromosomeAlleleFrequencies { File vcf File vcf_idx - Int sv_per_shard String contig String prefix File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample File? ped_file # Used for M/F AF calculations + File? par_bed + File? 
allosomes_list + String sv_pipeline_docker String sv_base_mini_docker @@ -27,43 +29,24 @@ workflow ChromosomeAlleleFrequencies { } # Tabix to chromosome of interest, and shard input VCF for stats collection - call ShardVcf { - input: - vcf = vcf, - vcf_idx = vcf_idx, - contig = contig, - sv_per_shard = sv_per_shard, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_shard_vcf - } - # Scatter over VCF shards - scatter ( shard in ShardVcf.shard_vcfs ) { - # Collect AF summary stats - call ComputeShardAlleleFrequencies { + call ComputeShardAFs { input: - vcf = shard, - prefix = "${prefix}.${contig}", + vcf = vcf, + prefix = "${prefix}.${contig}", sample_pop_assignments = sample_pop_assignments, - ped_file = ped_file, + ped_file = ped_file, + par_bed = par_bed, + allosomes_list = allosomes_list, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_compute_AFs - } } - # Merge shards into single VCF - call CombineShardedVcfs { - input: - vcfs = ComputeShardAlleleFrequencies.shard_wAFs, - prefix = "${prefix}.${contig}", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_combine_vcfs - } # Final output output { - File vcf_wAFs = CombineShardedVcfs.vcf_out - File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx + File vcf_wAFs = ComputeShardAFs.shard_wAFs + File vcf_wAFs_idx = ComputeShardAFs.shard_wAFs_idx } } @@ -142,7 +125,8 @@ task ComputeShardAlleleFrequencies { } output { - File shard_wAFs = "${prefix}.wAFs.vcf.gz" + File shard_wAFs = "~{prefix}.wAFs.vcf.gz" + File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" } command <<< @@ -164,6 +148,8 @@ task ComputeShardAlleleFrequencies { /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ | bgzip -c \ > "~{prefix}.wAFs.vcf.gz" + + tabix -p vcf ~{prefix}.wAFs.vcf.gz >>> @@ -187,6 +173,68 @@ task ComputeShardAlleleFrequencies { } } +task ComputeShardAFs { + input { + File vcf + String prefix + String 
sv_pipeline_docker + File? sample_pop_assignments + File? ped_file + File? par_bed + File? allosomes_list + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1.5, + disk_gb: ceil(20 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + optionals=" " + if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" + fi + if [ ~{default="SKIP" ped_file} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -f ~{ped_file}" + fi + if [ ~{default="SKIP" par_bed} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --par ~{par_bed}" + fi + if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" + fi + echo -e "OPTIONALS INTERPRETED AS: $optionals" + echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" + #Tabix chromosome of interest & compute AN, AC, and AF + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + + tabix -p vcf "~{prefix}.wAFs.vcf.gz" + >>> + + output { + File shard_wAFs = "~{prefix}.wAFs.vcf.gz" + File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, 
default_attr.max_retries]) + } +} + # Merge VCF shards task CombineShardedVcfs { diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl index 1d1ef7498..9571da1fa 100644 --- a/wdl/HailMerge.wdl +++ b/wdl/HailMerge.wdl @@ -12,9 +12,9 @@ workflow HailMerge { String sv_base_mini_docker String sv_pipeline_docker String sv_pipeline_hail_docker - RuntimeAttr? runtime_override_preconcat - RuntimeAttr? runtime_override_hail_merge - RuntimeAttr? runtime_override_fix_header + RuntimeAttr? runtime_attr_preconcat + RuntimeAttr? runtime_attr_hail_merge + RuntimeAttr? runtime_attr_fix_header } # Concatenate vcfs naively to prevent ClassTooLargeException in Hail @@ -26,27 +26,27 @@ workflow HailMerge { generate_index=false, outfile_prefix="~{prefix}.preconcat", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_preconcat + runtime_attr_override=runtime_attr_preconcat } } call HailMergeTask { input: vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])], prefix = prefix, gcs_project = select_first([gcs_project]), sv_pipeline_hail_docker=sv_pipeline_hail_docker, - runtime_attr_override=runtime_override_hail_merge + runtime_attr_override=runtime_attr_hail_merge } call FixHeader { input: merged_vcf = HailMergeTask.merged_vcf, example_vcf = vcfs[0], prefix = prefix + ".reheadered", reset_cnv_gts = select_first([reset_cnv_gts, false]), sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override=runtime_override_fix_header + runtime_attr_override=runtime_attr_fix_header } output { @@ -81,15 +81,15 @@ task HailMergeTask { max_retries: 1, boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: 
select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" - disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: select_first([runtime_attr.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_hail_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } command <<< @@ -155,15 +155,15 @@ task FixHeader { max_retries: 1, boot_disk_gb: 10 } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) runtime { - memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" - disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + memory: select_first([runtime_attr.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: 
select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } command <<< diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index 939744905..10178b7e5 100644 --- a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -13,73 +13,57 @@ workflow PruneAndAddVafs { File vcf File vcf_idx - File contig_list - Int sv_per_shard String prefix + String contig File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? prune_list # List of samples to be excluded from the output vcf File? ped_file # Used for M/F AF calculations + File? par_bed + File? allosomes_list + File sample_list # List of samples to be retained from the output vcf String sv_base_mini_docker String sv_pipeline_docker - RuntimeAttr? runtime_attr_prune_vcf RuntimeAttr? runtime_attr_shard_vcf RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_concat_vcfs + RuntimeAttr? 
runtime_attr_extract_subset_samples_from_vcf } - - Array[Array[String]] contigs = read_tsv(contig_list) - - # Iterate over chromosomes - scatter (contig in contigs) { - - # Prune VCF - call PruneVcf { - input: - - vcf = vcf, - vcf_idx = vcf_idx, - contig = contig[0], - prune_list = prune_list, - prefix = prefix, - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_prune_vcf - } - - # Compute AC, AN, and AF per population & sex combination - call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies { - input: - vcf = PruneVcf.pruned_vcf, - vcf_idx = PruneVcf.pruned_vcf_idx, - contig = contig[0], - sv_per_shard = sv_per_shard, - prefix = prefix, - sample_pop_assignments = sample_pop_assignments, - ped_file = ped_file, - sv_base_mini_docker = sv_base_mini_docker, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs - } + + # Prune VCF + call ExtractSubsetSamples { + input: + vcf = vcf, + vcf_idx = vcf_idx, + sample_list = sample_list, + midfix = prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf } - # Merge pruned VCFs with allele info - call MiniTasks.ConcatVcfs as ConcatVcfs{ + # Compute AC, AN, and AF per population & sex combination + call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies { input: - vcfs = ChromosomeAlleleFrequencies.vcf_wAFs, - vcfs_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx, - outfile_prefix = "${prefix}.pruned_wAFs", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_concat_vcfs + vcf = ExtractSubsetSamples.out_vcf, + vcf_idx = ExtractSubsetSamples.out_vcf_idx, + contig = contig, + prefix = prefix, + sample_pop_assignments = sample_pop_assignments, + ped_file = ped_file, + par_bed = par_bed, + allosomes_list = allosomes_list, + 
sv_base_mini_docker = sv_base_mini_docker, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_compute_AFs = runtime_attr_compute_AFs, + runtime_attr_combine_vcfs = runtime_attr_combine_vcfs } output { - File output_vcf = ConcatVcfs.concat_vcf - File output_vcf_idx = ConcatVcfs.concat_vcf_idx + File output_vcf = ChromosomeAlleleFrequencies.vcf_wAFs + File output_vcf_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx } } @@ -121,7 +105,7 @@ task PruneVcf { | fgrep -wf ~{prune_list} \ | cut -f1 | paste -s -d, ) zcat ~{contig}.vcf.gz \ - | cut --complement -f"$dropidx" \ + | cut --complement -f "$dropidx" \ | bgzip -c \ > "~{prefix}.~{contig}.pruned.vcf.gz" else @@ -152,3 +136,53 @@ task PruneVcf { docker: sv_base_mini_docker } } + +task ExtractSubsetSamples { + input { + File vcf + File vcf_idx + File sample_list + String midfix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + RuntimeAttr runtime_default = object { + mem_gb: 3, + disk_gb: ceil(base_disk_gb + (input_size * 2.0)), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String prefix = basename(vcf, '.vcf.gz') + command <<< + set -eu -o pipefail + + bcftools view -S 
~{sample_list} ~{vcf} \ + | bgzip > ~{prefix}.~{midfix}.vcf.gz + + tabix -p vcf ~{prefix}.~{midfix}.vcf.gz + + >>> + + output { + File out_vcf = "~{prefix}.~{midfix}.vcf.gz" + File out_vcf_idx = "~{prefix}.~{midfix}.vcf.gz.tbi" + } +} + diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl new file mode 100755 index 000000000..0ba0a2f45 --- /dev/null +++ b/wdl/ShardedAnnotateVcf.wdl @@ -0,0 +1,290 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge +import "AnnotateFunctionalConsequences.wdl" as func +import "PruneAndAddVafs.wdl" as pav +import "AnnotateExternalAF.wdl" as eaf + +workflow ShardedAnnotateVcf { + + input { + File vcf + File vcf_idx + String prefix + String contig + + File protein_coding_gtf + File? noncoding_bed + Int? promoter_window + Int? max_breakend_as_cnv_length + String? svannotate_additional_args + + Int max_shards_per_chrom_step1 + Int min_records_per_shard_step1 + + File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample + File sample_list + File? ped_file # Used for M/F AF calculations + File? par_bed + File? allosomes_list + Int sv_per_shard + + File? ref_bed # File with external allele frequencies + String? ref_prefix # prefix name for external AF call set (required if ref_bed set) + Array[String]? population # populations to annotate external AF for (required if ref_bed set) + + Boolean use_hail + String? gcs_project + + String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_base_mini_docker + String gatk_docker + + RuntimeAttr? runtime_attr_svannotate + RuntimeAttr? runtime_attr_concat_vcfs + RuntimeAttr? runtime_attr_shard_vcf + RuntimeAttr? runtime_attr_compute_AFs + RuntimeAttr? runtime_attr_combine_vcfs + RuntimeAttr? runtime_attr_modify_vcf + RuntimeAttr? runtime_attr_combine_vcfs + RuntimeAttr? runtime_attr_split_vcf + RuntimeAttr? runtime_attr_split_ref_bed + RuntimeAttr? 
runtime_attr_split_query_vcf + RuntimeAttr? runtime_attr_bedtools_closest + RuntimeAttr? runtime_attr_select_matched_svs + RuntimeAttr? runtime_attr_scatter_vcf + RuntimeAttr? runtime_attr_fix_ends_rescale_GQ + RuntimeAttr? runtime_attr_concat_sharded_cluster + RuntimeAttr? runtime_attr_preconcat_sharded_cluster + RuntimeAttr? runtime_attr_hail_merge_sharded_cluster + RuntimeAttr? runtime_attr_fix_header_sharded_cluster + RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line + } + + call MiniTasks.ScatterVcf{ + input: + vcf = vcf, + prefix = prefix, + records_per_shard = sv_per_shard, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_scatter_vcf + } + + scatter (i in range(length(ScatterVcf.shards))) { + + call FixEndsRescaleGQ { + input: + vcf = ScatterVcf.shards[i], + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_fix_ends_rescale_GQ + } + + call func.AnnotateFunctionalConsequences { + input: + vcf = FixEndsRescaleGQ.out, + vcf_index = FixEndsRescaleGQ.out_idx, + prefix = "~{prefix}.~{i}", + protein_coding_gtf = protein_coding_gtf, + noncoding_bed = noncoding_bed, + promoter_window = promoter_window, + max_breakend_as_cnv_length = max_breakend_as_cnv_length, + additional_args = svannotate_additional_args, + gatk_docker = gatk_docker, + runtime_attr_svannotate = runtime_attr_svannotate + } + + call pav.PruneAndAddVafs as PruneAndAddVafs { + input: + vcf = AnnotateFunctionalConsequences.annotated_vcf, + vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, + prefix = prefix, + contig = contig, + ped_file = ped_file, + par_bed = par_bed, + sample_list = sample_list, + allosomes_list = allosomes_list, + sample_pop_assignments = sample_pop_assignments, + + sv_base_mini_docker = sv_base_mini_docker, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_compute_AFs = runtime_attr_compute_AFs, + 
runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, + runtime_attr_concat_vcfs = runtime_attr_concat_vcfs + } + + if (defined(ref_bed)) { + call eaf.AnnotateExternalAF as AnnotateExternalAF { + input: + vcf = PruneAndAddVafs.output_vcf, + vcf_idx = PruneAndAddVafs.output_vcf_idx, + ref_bed = select_first([ref_bed]), + population = select_first([population]), + ref_prefix = select_first([ref_prefix]), + prefix = prefix, + contigs = [contig], + max_shards_per_chrom_step1 = max_shards_per_chrom_step1, + min_records_per_shard_step1 = min_records_per_shard_step1, + sv_base_mini_docker = sv_base_mini_docker, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_modify_vcf = runtime_attr_modify_vcf, + runtime_attr_split_vcf = runtime_attr_split_vcf, + runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, + runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, + runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, + runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, + runtime_attr_select_matched_svs = runtime_attr_select_matched_svs + } + } + + } + + #Array[File?] sharded_annotated_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf]) + #Array[File?] 
sharded_annotated_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx]) + Array[File] sharded_annotated_vcf = PruneAndAddVafs.output_vcf + Array[File] sharded_annotated_vcf_idx = PruneAndAddVafs.output_vcf_idx + + + if (length(sharded_annotated_vcf) == 0) { + call MiniTasks.GetVcfHeaderWithMembersInfoLine as GetVcfHeader_annotated { + input: + vcf_gz=vcf, + prefix="~{prefix}.annotated", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_vcf_header_with_members_info_line + } + } + + if (length(sharded_annotated_vcf) > 0) { + if (use_hail) { + call HailMerge.HailMerge as ConcatVcfsHail_annotated { + input: + vcfs=sharded_annotated_vcf, + prefix="~{prefix}.annotated", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster, + runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster, + runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster + } + } + + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatVcfs_annotated { + input: + vcfs=sharded_annotated_vcf, + vcfs_idx=sharded_annotated_vcf_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.annotatedd", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat_sharded_cluster + } + } + + } + + + output { + File output_vcf = select_first([GetVcfHeader_annotated.out, ConcatVcfs_annotated.concat_vcf, ConcatVcfsHail_annotated.merged_vcf]) + File output_vcf_idx = select_first([GetVcfHeader_annotated.out_idx, ConcatVcfs_annotated.concat_vcf_idx, ConcatVcfsHail_annotated.merged_vcf_index]) + } +} + + +#function to fix BND, CTX, CPX, INS that have END and END2 represent the breakpoint on the 2nd chromosome +#Note: this is a temp function for the first beta version of gnomad SV callset. 
It'll be revised and added as part of the manunal revise / clean up script +task FixEndsRescaleGQ { + input { + File vcf + String prefix + + Boolean? fix_ends + Boolean? rescale_gq + + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(10 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + String outfile = "~{prefix}.vcf.gz" + Boolean fix_ends_ = select_first([fix_ends, true]) + Boolean rescale_gq_ = select_first([rescale_gq, true]) + + output { + File out = "~{outfile}" + File out_idx = "~{outfile}.tbi" + } + command <<< + + set -euo pipefail + + python <>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 1cf9237d7..0fc76b120 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -959,19 +959,19 @@ task RenameVariantIds { task ScatterVcf { input { File vcf - File? vcf_index String prefix Int records_per_shard Int? threads = 1 - String? contig String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 5.0), + disk_gb: ceil(base_disk_gb + input_size * 5.0), cpu_cores: 2, preemptible_tries: 3, max_retries: 1, @@ -991,18 +991,20 @@ task ScatterVcf { command <<< set -euo pipefail # in case the file is empty create an empty shard - bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz" - bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig} + bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz + bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. --threads ~{threads} -n ~{records_per_shard} - ls "~{prefix}".*.vcf.gz | sort -k1,1V > vcfs.list + ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list i=0 - while read VCF; do + while read vcf; do shard_no=`printf %06d $i` - mv "$VCF" "~{prefix}.shard_${shard_no}.vcf.gz" + mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz + tabix -p vcf ~{prefix}.shard_${shard_no}.vcf.gz i=$((i+1)) done < vcfs.list >>> output { Array[File] shards = glob("~{prefix}.shard_*.vcf.gz") + Array[File] shards_idx = glob("~{prefix}.shard_*.vcf.gz.tbi") } } From e8562c96a04506ff11a0dbab4dc7ddbcf9edc66c Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 15 Dec 2022 19:12:01 -0500 Subject: [PATCH 02/26] remove gnomAD-specific END fix and make contigs list a file --- wdl/AnnotateVcf.wdl | 6 ++- wdl/ShardedAnnotateVcf.wdl | 106 ++----------------------------------- 2 files changed, 7 insertions(+), 105 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 1682e025b..501b6736a 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -8,7 +8,7 @@ workflow AnnotateVcf { input { Array[File] vcf_list Array[File] vcf_idx_list - Array[String] contig_list + File contig_list Array[String] prefix_list File protein_coding_gtf @@ -58,12 +58,14 @@ workflow AnnotateVcf { RuntimeAttr? 
runtime_attr_get_vcf_header_with_members_info_line } + Array[String] contigs = read_lines(contig_list) + scatter (i in range(length(vcf_list))) { call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{ input: vcf = vcf_list[i], vcf_idx = vcf_idx_list[i], - contig = contig_list[i], + contig = contigs[i], prefix = prefix_list[i], protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 0ba0a2f45..aa1ddea2e 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -64,7 +64,7 @@ workflow ShardedAnnotateVcf { RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line } - call MiniTasks.ScatterVcf{ + call MiniTasks.ScatterVcf { input: vcf = vcf, prefix = prefix, @@ -75,18 +75,10 @@ workflow ShardedAnnotateVcf { scatter (i in range(length(ScatterVcf.shards))) { - call FixEndsRescaleGQ { - input: - vcf = ScatterVcf.shards[i], - prefix = "~{prefix}.~{i}", - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_fix_ends_rescale_GQ - } - call func.AnnotateFunctionalConsequences { input: - vcf = FixEndsRescaleGQ.out, - vcf_index = FixEndsRescaleGQ.out_idx, + vcf = ScatterVcf.shards[i], + vcf_index = ScatterVcf.shards_idx[i], prefix = "~{prefix}.~{i}", protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, @@ -196,95 +188,3 @@ workflow ShardedAnnotateVcf { } } - -#function to fix BND, CTX, CPX, INS that have END and END2 represent the breakpoint on the 2nd chromosome -#Note: this is a temp function for the first beta version of gnomad SV callset. It'll be revised and added as part of the manunal revise / clean up script -task FixEndsRescaleGQ { - input { - File vcf - String prefix - - Boolean? fix_ends - Boolean? rescale_gq - - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: ceil(10 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - String outfile = "~{prefix}.vcf.gz" - Boolean fix_ends_ = select_first([fix_ends, true]) - Boolean rescale_gq_ = select_first([rescale_gq, true]) - - output { - File out = "~{outfile}" - File out_idx = "~{outfile}.tbi" - } - command <<< - - set -euo pipefail - - python <>> - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - From d46986f806649e990d846a667a640a0cec11694d Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 15 Dec 2022 19:15:53 -0500 Subject: [PATCH 03/26] make hail docker optional --- wdl/AnnotateVcf.wdl | 2 +- wdl/ShardedAnnotateVcf.wdl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 501b6736a..3555f2804 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -35,7 +35,7 @@ workflow AnnotateVcf { String? gcs_project String sv_pipeline_docker - String sv_pipeline_hail_docker + String? sv_pipeline_hail_docker String sv_base_mini_docker String gatk_docker diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index aa1ddea2e..b6878c399 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -39,7 +39,7 @@ workflow ShardedAnnotateVcf { String? 
gcs_project String sv_pipeline_docker - String sv_pipeline_hail_docker + String? sv_pipeline_hail_docker String sv_base_mini_docker String gatk_docker @@ -160,7 +160,7 @@ workflow ShardedAnnotateVcf { gcs_project=gcs_project, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - sv_pipeline_hail_docker=sv_pipeline_hail_docker, + sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster, runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster, runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster From 95fc0260bba9884fcd981f1e8139ff6c568fe4ef Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 15 Dec 2022 19:25:32 -0500 Subject: [PATCH 04/26] make prune vcf samples step & input optional --- wdl/AnnotateVcf.wdl | 4 +- wdl/PruneAndAddVafs.wdl | 93 ++++++-------------------------------- wdl/ShardedAnnotateVcf.wdl | 4 +- 3 files changed, 17 insertions(+), 84 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 3555f2804..0853bec7a 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -21,7 +21,7 @@ workflow AnnotateVcf { Int min_records_per_shard_step1 File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File sample_list # List of samples to be retained from the output vcf + File? sample_keep_list # List of samples to be retained from the output vcf File? ped_file # Used for M/F AF calculations File? par_bed File? 
allosomes_list @@ -76,7 +76,7 @@ workflow AnnotateVcf { max_shards_per_chrom_step1 = max_shards_per_chrom_step1, min_records_per_shard_step1 = min_records_per_shard_step1, sample_pop_assignments = sample_pop_assignments, - sample_list = sample_list, + sample_keep_list = sample_keep_list, ped_file = ped_file, par_bed = par_bed, sv_per_shard = sv_per_shard, diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index 10178b7e5..49ff1a93f 100644 --- a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -20,7 +20,7 @@ workflow PruneAndAddVafs { File? ped_file # Used for M/F AF calculations File? par_bed File? allosomes_list - File sample_list # List of samples to be retained from the output vcf + File? sample_keep_list # List of samples to be retained from the output vcf String sv_base_mini_docker String sv_pipeline_docker @@ -33,21 +33,23 @@ workflow PruneAndAddVafs { } # Prune VCF - call ExtractSubsetSamples { - input: - vcf = vcf, - vcf_idx = vcf_idx, - sample_list = sample_list, - midfix = prefix, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf + if (defined(sample_keep_list)) { + call ExtractSubsetSamples { + input: + vcf = vcf, + vcf_idx = vcf_idx, + sample_list = select_first([sample_keep_list]), + midfix = prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf + } } # Compute AC, AN, and AF per population & sex combination call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies { input: - vcf = ExtractSubsetSamples.out_vcf, - vcf_idx = ExtractSubsetSamples.out_vcf_idx, + vcf = select_first([ExtractSubsetSamples.out_vcf, vcf]), + vcf_idx = select_first([ExtractSubsetSamples.out_vcf_idx, vcf_idx]), contig = contig, prefix = prefix, sample_pop_assignments = sample_pop_assignments, @@ -67,75 +69,6 @@ workflow PruneAndAddVafs { } } -# Prune off samples from annotated VCF -task PruneVcf { - - input { - File 
vcf - File vcf_idx - String contig - String prefix - - File? prune_list - - String sv_base_mini_docker - - RuntimeAttr? runtime_attr_override - } - - output { - File pruned_vcf = "${prefix}.${contig}.pruned.vcf.gz" - File pruned_vcf_idx = "${prefix}.${contig}.pruned.vcf.gz.tbi" - } - - command <<< - - set -euo pipefail - - # Tabix chromosome of interest - tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz - - # Get column indexes corresponding to samples to drop, if any exist - if ~{defined(prune_list)}; then - dropidx=$( zcat ~{contig}.vcf.gz \ - | sed -n '1,500p' \ - | grep "^#CHROM" \ - | sed 's/\t/\n/g' \ - | awk -v OFS="\t" '{ print NR, $1 }' \ - | fgrep -wf ~{prune_list} \ - | cut -f1 | paste -s -d, ) - zcat ~{contig}.vcf.gz \ - | cut --complement -f "$dropidx" \ - | bgzip -c \ - > "~{prefix}.~{contig}.pruned.vcf.gz" - else - cp "~{contig}.vcf.gz" "~{prefix}.~{contig}.pruned.vcf.gz" - fi - - tabix -f "~{prefix}.~{contig}.pruned.vcf.gz" - - >>> - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 250, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: sv_base_mini_docker - } -} task ExtractSubsetSamples { input { diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index b6878c399..f8afd9997 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -25,7 
+25,7 @@ workflow ShardedAnnotateVcf { Int min_records_per_shard_step1 File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File sample_list + File? sample_keep_list File? ped_file # Used for M/F AF calculations File? par_bed File? allosomes_list @@ -97,7 +97,7 @@ workflow ShardedAnnotateVcf { contig = contig, ped_file = ped_file, par_bed = par_bed, - sample_list = sample_list, + sample_keep_list = sample_keep_list, allosomes_list = allosomes_list, sample_pop_assignments = sample_pop_assignments, From b0fa6fdcf4aed069ce47b8d6f4bd1fd86db8447c Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 6 Jan 2023 17:34:30 -0500 Subject: [PATCH 05/26] create tabix index if not in expected location in svannotate --- wdl/AnnotateFunctionalConsequences.wdl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/wdl/AnnotateFunctionalConsequences.wdl b/wdl/AnnotateFunctionalConsequences.wdl index 5df4f1969..c018c492b 100644 --- a/wdl/AnnotateFunctionalConsequences.wdl +++ b/wdl/AnnotateFunctionalConsequences.wdl @@ -75,9 +75,14 @@ task SVAnnotate { } command <<< - set -euo pipefail + set -euo pipefail - gatk --java-options "-Xmx~{java_mem_mb}m" SVAnnotate \ + # check index is in expected location. if not, tabix + if [ ! 
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVAnnotate \ -V ~{vcf} \ -O ~{outfile} \ --protein-coding-gtf ~{protein_coding_gtf} \ From 09b695bdc8fa4c1e07d849cbd3f0c7544dee6afc Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 26 Jan 2023 18:56:05 -0500 Subject: [PATCH 06/26] samples list input to compute_AFs.py --- src/sv-pipeline/05_annotation/scripts/compute_AFs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py index 31b5a0c0a..dbbd476d4 100755 --- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py +++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py @@ -294,6 +294,8 @@ def main(): 'sex-specific AFs).', default=None) parser.add_argument('--par', help='BED file of pseudoautosomal regions (used ' + 'for sex-specific AFs).', default=None) + parser.add_argument('--samples-list', help='List of samples to use for AF calculations', + default=None) parser.add_argument( 'fout', help='Output vcf. 
Also accepts "stdout" and "-".') args = parser.parse_args() @@ -305,7 +307,10 @@ def main(): vcf = pysam.VariantFile(args.vcf) # Get list of all samples in vcf - samples_list = list(vcf.header.samples) + if args.samples_list is None: + samples_list = list(vcf.header.samples) + else: + samples_list = [line.strip() for line in open(args.samples_list)] # Get lists of males and females parbt = pbt.BedTool('', from_string=True) From d053a58d12784be0d8fd7576c7a5d0a066fa48c2 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 26 Jan 2023 19:21:35 -0500 Subject: [PATCH 07/26] update WDLs and docker with samples list for compute afs --- src/sv-pipeline/05_annotation/scripts/compute_AFs.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py index dbbd476d4..31b5a0c0a 100755 --- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py +++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py @@ -294,8 +294,6 @@ def main(): 'sex-specific AFs).', default=None) parser.add_argument('--par', help='BED file of pseudoautosomal regions (used ' + 'for sex-specific AFs).', default=None) - parser.add_argument('--samples-list', help='List of samples to use for AF calculations', - default=None) parser.add_argument( 'fout', help='Output vcf. 
Also accepts "stdout" and "-".') args = parser.parse_args() @@ -307,10 +305,7 @@ def main(): vcf = pysam.VariantFile(args.vcf) # Get list of all samples in vcf - if args.samples_list is None: - samples_list = list(vcf.header.samples) - else: - samples_list = [line.strip() for line in open(args.samples_list)] + samples_list = list(vcf.header.samples) # Get lists of males and females parbt = pbt.BedTool('', from_string=True) From 1a583bea0ab0940db25f437206f66d706d88eccd Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Tue, 2 May 2023 16:49:34 -0400 Subject: [PATCH 08/26] keep external af annotation, update json templates --- .../AnnotateVcf.SingleBatch.json.tmpl | 7 ++++--- .../workflow_configurations/AnnotateVcf.json.tmpl | 7 ++++--- inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl | 7 ++++--- wdl/ShardedAnnotateVcf.wdl | 4 ++-- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl index 73724c902..a0c461b8c 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl @@ -1,6 +1,6 @@ { - "AnnotateVcf.vcf" : "${this.cleaned_vcf}", - "AnnotateVcf.vcf_idx" : "${this.cleaned_vcf_index}", + "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}", + "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}", "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}", "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}", @@ -15,7 +15,8 @@ "AnnotateVcf.max_shards_per_chrom_step1" : 200, "AnnotateVcf.min_records_per_shard_step1" : 5000, - "AnnotateVcf.prefix" : "${this.sample_set_id}", + "AnnotateVcf.prefix_list" : "${this.sample_set_id}", + "AnnotateVcf.use_hail": "false", 
"AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", "AnnotateVcf.sv_base_mini_docker" : "${workspace.sv_base_mini_docker}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl index adc72e9be..ca973e632 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl @@ -1,6 +1,6 @@ { - "AnnotateVcf.vcf" : "${this.cleaned_vcf}", - "AnnotateVcf.vcf_idx" : "${this.cleaned_vcf_index}", + "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}", + "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}", "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}", "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}", @@ -15,7 +15,8 @@ "AnnotateVcf.max_shards_per_chrom_step1" : 200, "AnnotateVcf.min_records_per_shard_step1" : 5000, - "AnnotateVcf.prefix" : "${this.sample_set_set_id}", + "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}", + "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", "AnnotateVcf.sv_base_mini_docker" : "${workspace.sv_base_mini_docker}", diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index 74b441f78..c030a94dc 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -1,6 +1,6 @@ { - "AnnotateVcf.vcf" : {{ test_batch.clean_vcf | tojson }}, - "AnnotateVcf.vcf_idx" : {{ test_batch.clean_vcf_index | tojson }}, + "AnnotateVcf.vcf_list" : [ {{ test_batch.clean_vcf | tojson }} ], + "AnnotateVcf.vcf_idx_list" : [{{ test_batch.clean_vcf_index | tojson }}], "AnnotateVcf.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }}, "AnnotateVcf.noncoding_bed" 
: {{ reference_resources.noncoding_bed | tojson }}, @@ -15,7 +15,8 @@ "AnnotateVcf.max_shards_per_chrom_step1" : 200, "AnnotateVcf.min_records_per_shard_step1" : 5000, - "AnnotateVcf.prefix" : {{ test_batch.name | tojson }}, + "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}], + "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }}, "AnnotateVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index f8afd9997..4065791d5 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -137,8 +137,8 @@ workflow ShardedAnnotateVcf { #Array[File?] sharded_annotated_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf]) #Array[File?] sharded_annotated_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx]) - Array[File] sharded_annotated_vcf = PruneAndAddVafs.output_vcf - Array[File] sharded_annotated_vcf_idx = PruneAndAddVafs.output_vcf_idx + Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf]) + Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) if (length(sharded_annotated_vcf) == 0) { From 3e65184904a9258b4b4f5a733862c83d4bf6478c Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 15:18:03 -0400 Subject: [PATCH 09/26] shard by contig if inputs are not already --- wdl/AnnotateVcf.wdl | 12 +++++++----- wdl/ShardedAnnotateVcf.wdl | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 0853bec7a..e16ec32d6 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -6,10 +6,11 @@ import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf workflow AnnotateVcf { input { - Array[File] vcf_list + Array[File] vcf_list # Must be 
either single full VCF (array of length 1) or array of VCFs sharded by contig. Index & prefix list inputs should match Array[File] vcf_idx_list File contig_list Array[String] prefix_list + Boolean sharded_by_contig # True if providing a vcf_list sharded by contig. False if providing a single full VCF File protein_coding_gtf File? noncoding_bed @@ -60,13 +61,14 @@ workflow AnnotateVcf { Array[String] contigs = read_lines(contig_list) - scatter (i in range(length(vcf_list))) { + scatter (i in range(length(contigs))) { + Int array_index = if (sharded_by_contig && length(vcf_list) > 1) then i else 0 call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{ input: - vcf = vcf_list[i], - vcf_idx = vcf_idx_list[i], + vcf = vcf_list[array_index], + vcf_idx = vcf_idx_list[array_index], contig = contigs[i], - prefix = prefix_list[i], + prefix = prefix_list[array_index], protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, promoter_window = promoter_window, diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 4065791d5..9a791f9be 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -69,6 +69,7 @@ workflow ShardedAnnotateVcf { vcf = vcf, prefix = prefix, records_per_shard = sv_per_shard, + contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_scatter_vcf } @@ -135,8 +136,6 @@ workflow ShardedAnnotateVcf { } - #Array[File?] sharded_annotated_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf]) - #Array[File?] 
sharded_annotated_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx]) Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf]) Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) From 087364dc68972b7375b23e9d364f2ce02846c73c Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 15:42:48 -0400 Subject: [PATCH 10/26] use latest ScatterVcf. also reverted sample list for compute AFs during rebase --- wdl/AnnotateFunctionalConsequences.wdl | 4 ++-- wdl/ShardedAnnotateVcf.wdl | 3 +-- wdl/TasksMakeCohortVcf.wdl | 18 ++++++++---------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/wdl/AnnotateFunctionalConsequences.wdl b/wdl/AnnotateFunctionalConsequences.wdl index c018c492b..2a08b5d09 100644 --- a/wdl/AnnotateFunctionalConsequences.wdl +++ b/wdl/AnnotateFunctionalConsequences.wdl @@ -5,7 +5,7 @@ import "Structs.wdl" workflow AnnotateFunctionalConsequences { input { File vcf - File vcf_index + File? vcf_index String prefix File protein_coding_gtf @@ -41,7 +41,7 @@ workflow AnnotateFunctionalConsequences { task SVAnnotate { input { File vcf - File vcf_index + File? 
vcf_index String prefix File protein_coding_gtf diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 9a791f9be..cce4aa451 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -79,8 +79,7 @@ workflow ShardedAnnotateVcf { call func.AnnotateFunctionalConsequences { input: vcf = ScatterVcf.shards[i], - vcf_index = ScatterVcf.shards_idx[i], - prefix = "~{prefix}.~{i}", + prefix = "~{prefix}.~{contig}.~{i}", protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, promoter_window = promoter_window, diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 0fc76b120..1cf9237d7 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -959,19 +959,19 @@ task RenameVariantIds { task ScatterVcf { input { File vcf + File? vcf_index String prefix Int records_per_shard Int? threads = 1 + String? contig String sv_pipeline_docker RuntimeAttr? runtime_attr_override } Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(base_disk_gb + input_size * 5.0), + disk_gb: ceil(10.0 + input_size * 5.0), cpu_cores: 2, preemptible_tries: 3, max_retries: 1, @@ -991,20 +991,18 @@ task ScatterVcf { command <<< set -euo pipefail # in case the file is empty create an empty shard - bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz - bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. --threads ~{threads} -n ~{records_per_shard} + bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz" + bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". 
--threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig} - ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list + ls "~{prefix}".*.vcf.gz | sort -k1,1V > vcfs.list i=0 - while read vcf; do + while read VCF; do shard_no=`printf %06d $i` - mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz - tabix -p vcf ~{prefix}.shard_${shard_no}.vcf.gz + mv "$VCF" "~{prefix}.shard_${shard_no}.vcf.gz" i=$((i+1)) done < vcfs.list >>> output { Array[File] shards = glob("~{prefix}.shard_*.vcf.gz") - Array[File] shards_idx = glob("~{prefix}.shard_*.vcf.gz.tbi") } } From 148319c5c563734cdedf861ff19c6a22e78b49dc Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 16:01:51 -0400 Subject: [PATCH 11/26] womtool validation --- .../AnnotateVcf.SingleBatch.json.tmpl | 1 + .../AnnotateVcf.json.tmpl | 1 + .../test/AnnotateVcf/AnnotateVcf.json.tmpl | 1 + wdl/AnnotateVcf.wdl | 2 +- wdl/GATKSVPipelineSingleSample.wdl | 20 ++++++++++--------- wdl/HailMerge.wdl | 18 ++++++++--------- wdl/ShardedAnnotateVcf.wdl | 6 +++--- 7 files changed, 27 insertions(+), 22 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl index a0c461b8c..701490993 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl @@ -16,6 +16,7 @@ "AnnotateVcf.min_records_per_shard_step1" : 5000, "AnnotateVcf.prefix_list" : "${this.sample_set_id}", + "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl 
index ca973e632..404abac9f 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl @@ -16,6 +16,7 @@ "AnnotateVcf.min_records_per_shard_step1" : 5000, "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}", + "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index c030a94dc..b23b074a1 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -16,6 +16,7 @@ "AnnotateVcf.min_records_per_shard_step1" : 5000, "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}], + "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }}, diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index e16ec32d6..ad7dd9661 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -6,7 +6,7 @@ import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf workflow AnnotateVcf { input { - Array[File] vcf_list # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Index & prefix list inputs should match + Array[File] vcf_list # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. 
Outputs will match Array[File] vcf_idx_list File contig_list Array[String] prefix_list diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index 464638389..d61231ee2 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -1389,9 +1389,9 @@ workflow GATKSVPipelineSingleSample { call annotate.AnnotateVcf { input: - vcf = FilterSample.out, - vcf_idx = FilterSample.out_idx, - prefix = batch, + vcf_list = [FilterSample.out], + vcf_idx_list = [FilterSample.out_idx], + prefix_list = [batch], contig_list = primary_contigs_list, protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, @@ -1400,6 +1400,8 @@ workflow GATKSVPipelineSingleSample { ref_bed = external_af_ref_bed, ref_prefix = external_af_ref_bed_prefix, population = external_af_population, + use_hail = false, + sharded_by_contig = false, sv_per_shard = annotation_sv_per_shard, max_shards_per_chrom_step1 = annotation_max_shards_per_chrom_step1, min_records_per_shard_step1 = annotation_min_records_per_shard_step1, @@ -1411,18 +1413,18 @@ workflow GATKSVPipelineSingleSample { call SingleSampleFiltering.VcfToBed as VcfToBed { input: - vcf = AnnotateVcf.output_vcf, + vcf = AnnotateVcf.output_vcf_list[0], prefix = batch, sv_pipeline_docker = sv_pipeline_docker } call SingleSampleFiltering.UpdateBreakendRepresentation { input: - vcf=AnnotateVcf.output_vcf, - vcf_idx=AnnotateVcf.output_vcf_idx, + vcf=AnnotateVcf.output_vcf_list[0], + vcf_idx=AnnotateVcf.output_vcf_idx_list[0], ref_fasta=reference_fasta, ref_fasta_idx=reference_index, - prefix=basename(AnnotateVcf.output_vcf, ".vcf.gz") + ".final_cleanup", + prefix=basename(AnnotateVcf.output_vcf_list[0], ".vcf.gz") + ".final_cleanup", sv_pipeline_docker=sv_pipeline_docker } @@ -1462,8 +1464,8 @@ workflow GATKSVPipelineSingleSample { # These files contain events reported in the internal VCF representation # They are less VCF-spec compliant but may be useful if components of the pipeline 
need to be re-run # on the output. - File pre_cleanup_vcf = AnnotateVcf.output_vcf - File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx + File pre_cleanup_vcf = AnnotateVcf.output_vcf_list[0] + File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx_list[0] File ploidy_matrix = select_first([GatherBatchEvidence.batch_ploidy_matrix]) File ploidy_plots = select_first([GatherBatchEvidence.batch_ploidy_plots]) diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl index 9571da1fa..31d0bd31c 100644 --- a/wdl/HailMerge.wdl +++ b/wdl/HailMerge.wdl @@ -12,9 +12,9 @@ workflow HailMerge { String sv_base_mini_docker String sv_pipeline_docker String sv_pipeline_hail_docker - RuntimeAttr? runtime_attr_preconcat - RuntimeAttr? runtime_attr_hail_merge - RuntimeAttr? runtime_attr_fix_header + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header } # Concatenate vcfs naively to prevent ClassTooLargeException in Hail @@ -26,27 +26,27 @@ workflow HailMerge { generate_index=false, outfile_prefix="~{prefix}.preconcat", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_preconcat + runtime_attr_override=runtime_override_preconcat } } - call HailMerge { + call HailMergeTask { input: vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])], prefix = prefix, gcs_project = select_first([gcs_project]), sv_pipeline_hail_docker=sv_pipeline_hail_docker, - runtime_attr_override=runtime_attr_hail_merge + runtime_attr_override=runtime_override_hail_merge } call FixHeader { input: - merged_vcf = HailMerge.merged_vcf, + merged_vcf = HailMergeTask.merged_vcf, example_vcf = vcfs[0], prefix = prefix + ".reheadered", reset_cnv_gts = select_first([reset_cnv_gts, false]), sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override=runtime_attr_fix_header + runtime_attr_override=runtime_override_fix_header } output { @@ -55,7 +55,7 @@ workflow HailMerge { } } -task HailMerge { +task HailMergeTask { input { 
Array[File] vcfs String prefix diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index cce4aa451..a5245099f 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -159,9 +159,9 @@ workflow ShardedAnnotateVcf { sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), - runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster, - runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster, - runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster + runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, + runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster } } From fc16f003e879c60f02755406021e189f70f2b9e4 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 16:24:41 -0400 Subject: [PATCH 12/26] cleanup --- wdl/AnnotateVcf.wdl | 34 +++-- wdl/ChromosomeAlleleFrequencies.wdl | 197 +--------------------------- wdl/PruneAndAddVafs.wdl | 86 +++--------- wdl/ShardedAnnotateVcf.wdl | 81 +++++------- 4 files changed, 72 insertions(+), 326 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index ad7dd9661..982ec7364 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -56,7 +56,6 @@ workflow AnnotateVcf { RuntimeAttr? runtime_attr_preconcat_sharded_cluster RuntimeAttr? runtime_attr_hail_merge_sharded_cluster RuntimeAttr? runtime_attr_fix_header_sharded_cluster - RuntimeAttr? 
runtime_attr_get_vcf_header_with_members_info_line } Array[String] contigs = read_lines(contig_list) @@ -96,23 +95,22 @@ workflow AnnotateVcf { sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_hail_docker = sv_pipeline_hail_docker, - runtime_attr_svannotate = runtime_attr_svannotate , - runtime_attr_concat_vcfs = runtime_attr_concat_vcfs , - runtime_attr_shard_vcf = runtime_attr_shard_vcf , - runtime_attr_compute_AFs = runtime_attr_compute_AFs , - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs , - runtime_attr_modify_vcf = runtime_attr_modify_vcf , - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs , - runtime_attr_split_vcf = runtime_attr_split_vcf , - runtime_attr_split_ref_bed = runtime_attr_split_ref_bed , - runtime_attr_split_query_vcf = runtime_attr_split_query_vcf , - runtime_attr_bedtools_closest = runtime_attr_bedtools_closest , - runtime_attr_select_matched_svs = runtime_attr_select_matched_svs , - runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster , - runtime_attr_preconcat_sharded_cluster = runtime_attr_preconcat_sharded_cluster , - runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster , - runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster , - runtime_attr_get_vcf_header_with_members_info_line = runtime_attr_get_vcf_header_with_members_info_line + runtime_attr_svannotate = runtime_attr_svannotate, + runtime_attr_concat_vcfs = runtime_attr_concat_vcfs, + runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_compute_AFs = runtime_attr_compute_AFs, + runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, + runtime_attr_modify_vcf = runtime_attr_modify_vcf, + runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, + runtime_attr_split_vcf = runtime_attr_split_vcf, + runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, + runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, + runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, + 
runtime_attr_select_matched_svs = runtime_attr_select_matched_svs, + runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster, + runtime_attr_preconcat_sharded_cluster = runtime_attr_preconcat_sharded_cluster, + runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster, + runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster } } diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl index 6e7397a2e..8eb422350 100644 --- a/wdl/ChromosomeAlleleFrequencies.wdl +++ b/wdl/ChromosomeAlleleFrequencies.wdl @@ -28,8 +28,6 @@ workflow ChromosomeAlleleFrequencies { RuntimeAttr? runtime_attr_combine_vcfs } - # Tabix to chromosome of interest, and shard input VCF for stats collection - # Scatter over VCF shards call ComputeShardAFs { input: vcf = vcf, @@ -50,128 +48,6 @@ workflow ChromosomeAlleleFrequencies { } } -# Shard VCF into fixed size chunks -task ShardVcf { - - input { - File vcf - File vcf_idx - Int sv_per_shard - String contig - - String sv_pipeline_docker - - RuntimeAttr? runtime_attr_override - } - - output { - Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") - } - - command <<< - - set -euo pipefail - - # Tabix chromosome of interest - tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz - - # Then shard VCF - /opt/sv-pipeline/scripts/shard_VCF.sh \ - ~{contig}.vcf.gz \ - ~{sv_per_shard} \ - "vcf.shard." - - # if there were no shards created just make an empty one - if [ ! 
-e vcf.shard.000000.vcf.gz ]; then - cp ~{contig}.vcf.gz vcf.shard.000000.vcf.gz - fi - >>> - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 250, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 0 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: sv_pipeline_docker - } -} - -# Subset a vcf to a single chromosome, and add global AF information (no subpop) -task ComputeShardAlleleFrequencies { - - input { - - File vcf - String prefix - - File? sample_pop_assignments - File? ped_file - - String sv_pipeline_docker - - RuntimeAttr? 
runtime_attr_override - } - - output { - File shard_wAFs = "~{prefix}.wAFs.vcf.gz" - File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" - } - - command <<< - - set -euo pipefail - - optionals=" " - if ~{defined(sample_pop_assignments)}; then - optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" - fi - - if ~{defined(ped_file)}; then - optionals="$( echo "$optionals" ) -f ~{ped_file}" - fi - - echo -e "OPTIONALS INTERPRETED AS: $optionals" - echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" - # Tabix chromosome of interest & compute AN, AC, and AF - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - - tabix -p vcf ~{prefix}.wAFs.vcf.gz - - >>> - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 20, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 0 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: sv_pipeline_docker - } -} task ComputeShardAFs { input { @@ -196,23 +72,11 @@ task ComputeShardAFs { command <<< set -euo pipefail - optionals=" " - if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" - fi - if [ ~{default="SKIP" ped_file} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -f ~{ped_file}" - fi - if [ ~{default="SKIP" par_bed} != "SKIP" ]; 
then - optionals="$( echo "$optionals" ) --par ~{par_bed}" - fi - if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then - optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" - fi - echo -e "OPTIONALS INTERPRETED AS: $optionals" - echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" - #Tabix chromosome of interest & compute AN, AC, and AF - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \ + ~{"-p " + sample_pop_assignments} \ + ~{"-f " + ped_file} \ + ~{"-par " + par_bed} \ + ~{"--allosomes-list " + allosomes_list} \ | bgzip -c \ > "~{prefix}.wAFs.vcf.gz" @@ -234,54 +98,3 @@ task ComputeShardAFs { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } - -# Merge VCF shards -task CombineShardedVcfs { - - input { - - Array[File] vcfs - String prefix - - String sv_base_mini_docker - - RuntimeAttr? 
runtime_attr_override - } - - - output { - File vcf_out = "${prefix}.wAFs.vcf.gz" - File vcf_out_idx = "${prefix}.wAFs.vcf.gz.tbi" - } - - command <<< - - set -euo pipefail - vcf-concat ~{sep=" " vcfs} \ - | vcf-sort \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz"; - tabix -p vcf "~{prefix}.wAFs.vcf.gz" - - >>> - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 0 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: sv_base_mini_docker - } -} diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index 49ff1a93f..bd685fe97 100644 --- a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -5,6 +5,7 @@ version 1.0 import "TasksMakeCohortVcf.wdl" as MiniTasks import "ChromosomeAlleleFrequencies.wdl" as calcAF +import "Utils.wdl" as util # Prune off samples in annotated VCF, add VAF annotation workflow PruneAndAddVafs { @@ -29,37 +30,36 @@ workflow PruneAndAddVafs { RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_concat_vcfs - RuntimeAttr? runtime_attr_extract_subset_samples_from_vcf + RuntimeAttr? 
runtime_attr_subset_vcf_by_samples_list } # Prune VCF if (defined(sample_keep_list)) { - call ExtractSubsetSamples { + call util.SubsetVcfBySamplesList { input: - vcf = vcf, - vcf_idx = vcf_idx, - sample_list = select_first([sample_keep_list]), - midfix = prefix, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf + vcf = vcf, + vcf_idx = vcf_idx, + list_of_samples = select_first([sample_keep_list]), + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_subset_vcf_by_samples_list } } # Compute AC, AN, and AF per population & sex combination call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies { input: - vcf = select_first([ExtractSubsetSamples.out_vcf, vcf]), - vcf_idx = select_first([ExtractSubsetSamples.out_vcf_idx, vcf_idx]), - contig = contig, - prefix = prefix, + vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]), + vcf_idx = select_first([SubsetVcfBySamplesList.vcf_subset_index, vcf_idx]), + contig = contig, + prefix = prefix, sample_pop_assignments = sample_pop_assignments, - ped_file = ped_file, - par_bed = par_bed, - allosomes_list = allosomes_list, - sv_base_mini_docker = sv_base_mini_docker, + ped_file = ped_file, + par_bed = par_bed, + allosomes_list = allosomes_list, + sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, + runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_compute_AFs = runtime_attr_compute_AFs, runtime_attr_combine_vcfs = runtime_attr_combine_vcfs } @@ -69,53 +69,3 @@ workflow PruneAndAddVafs { } } - -task ExtractSubsetSamples { - input { - File vcf - File vcf_idx - File sample_list - String midfix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: ceil(base_disk_gb + (input_size * 2.0)), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(vcf, '.vcf.gz') - command <<< - set -eu -o pipefail - - bcftools view -S ~{sample_list} ~{vcf} \ - | bgzip > ~{prefix}.~{midfix}.vcf.gz - - tabix -p vcf ~{prefix}.~{midfix}.vcf.gz - - >>> - - output { - File out_vcf = "~{prefix}.~{midfix}.vcf.gz" - File out_vcf_idx = "~{prefix}.~{midfix}.vcf.gz.tbi" - } -} - diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index a5245099f..46abafe4d 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -61,7 +61,6 @@ workflow ShardedAnnotateVcf { RuntimeAttr? runtime_attr_preconcat_sharded_cluster RuntimeAttr? runtime_attr_hail_merge_sharded_cluster RuntimeAttr? runtime_attr_fix_header_sharded_cluster - RuntimeAttr? 
runtime_attr_get_vcf_header_with_members_info_line } call MiniTasks.ScatterVcf { @@ -91,28 +90,28 @@ workflow ShardedAnnotateVcf { call pav.PruneAndAddVafs as PruneAndAddVafs { input: - vcf = AnnotateFunctionalConsequences.annotated_vcf, - vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, - prefix = prefix, - contig = contig, - ped_file = ped_file, - par_bed = par_bed, - sample_keep_list = sample_keep_list, - allosomes_list = allosomes_list, + vcf = AnnotateFunctionalConsequences.annotated_vcf, + vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, + prefix = prefix, + contig = contig, + ped_file = ped_file, + par_bed = par_bed, + sample_keep_list = sample_keep_list, + allosomes_list = allosomes_list, sample_pop_assignments = sample_pop_assignments, - sv_base_mini_docker = sv_base_mini_docker, + sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, + runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_compute_AFs = runtime_attr_compute_AFs, runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, - runtime_attr_concat_vcfs = runtime_attr_concat_vcfs + runtime_attr_concat_vcfs = runtime_attr_concat_vcfs } if (defined(ref_bed)) { call eaf.AnnotateExternalAF as AnnotateExternalAF { input: - vcf = PruneAndAddVafs.output_vcf, + vcf = PruneAndAddVafs.output_vcf, vcf_idx = PruneAndAddVafs.output_vcf_idx, ref_bed = select_first([ref_bed]), population = select_first([population]), @@ -138,51 +137,37 @@ workflow ShardedAnnotateVcf { Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf]) Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) - - if (length(sharded_annotated_vcf) == 0) { - call MiniTasks.GetVcfHeaderWithMembersInfoLine as GetVcfHeader_annotated { 
+ if (use_hail) { + call HailMerge.HailMerge { input: - vcf_gz=vcf, + vcfs=sharded_annotated_vcf, prefix="~{prefix}.annotated", + gcs_project=gcs_project, sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_get_vcf_header_with_members_info_line + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), + runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, + runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster } } - if (length(sharded_annotated_vcf) > 0) { - if (use_hail) { - call HailMerge.HailMerge as ConcatVcfsHail_annotated { - input: - vcfs=sharded_annotated_vcf, - prefix="~{prefix}.annotated", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), - runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, - runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, - runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster - } - } - - if (!use_hail) { - call MiniTasks.ConcatVcfs as ConcatVcfs_annotated { - input: - vcfs=sharded_annotated_vcf, - vcfs_idx=sharded_annotated_vcf_idx, - allow_overlaps=true, - outfile_prefix="~{prefix}.annotatedd", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_concat_sharded_cluster - } + if (!use_hail) { + call MiniTasks.ConcatVcfs { + input: + vcfs=sharded_annotated_vcf, + vcfs_idx=sharded_annotated_vcf_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.annotatedd", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat_sharded_cluster } - } output { - File output_vcf = select_first([GetVcfHeader_annotated.out, ConcatVcfs_annotated.concat_vcf, ConcatVcfsHail_annotated.merged_vcf]) - File output_vcf_idx = 
select_first([GetVcfHeader_annotated.out_idx, ConcatVcfs_annotated.concat_vcf_idx, ConcatVcfsHail_annotated.merged_vcf_index]) + File output_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File output_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) } } From f87ea28f6c0c5334e50337d3108936f91b3ba28d Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 16:43:00 -0400 Subject: [PATCH 13/26] more cleanup --- wdl/AnnotateVcf.wdl | 4 +- wdl/ChromosomeAlleleFrequencies.wdl | 100 ---------------------------- wdl/PruneAndAddVafs.wdl | 70 ++++++++++++++----- wdl/ShardedAnnotateVcf.wdl | 13 ++-- 4 files changed, 62 insertions(+), 125 deletions(-) delete mode 100644 wdl/ChromosomeAlleleFrequencies.wdl diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 982ec7364..49039c27a 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -43,6 +43,7 @@ workflow AnnotateVcf { RuntimeAttr? runtime_attr_svannotate RuntimeAttr? runtime_attr_concat_vcfs RuntimeAttr? runtime_attr_shard_vcf + RuntimeAttr? runtime_attr_subset_vcf_by_samples_list RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? 
runtime_attr_modify_vcf @@ -96,8 +97,7 @@ workflow AnnotateVcf { sv_pipeline_hail_docker = sv_pipeline_hail_docker, runtime_attr_svannotate = runtime_attr_svannotate, - runtime_attr_concat_vcfs = runtime_attr_concat_vcfs, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, runtime_attr_compute_AFs = runtime_attr_compute_AFs, runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, runtime_attr_modify_vcf = runtime_attr_modify_vcf, diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl deleted file mode 100644 index 8eb422350..000000000 --- a/wdl/ChromosomeAlleleFrequencies.wdl +++ /dev/null @@ -1,100 +0,0 @@ -# Helper workflow to calculate basic AF statistics for a single chromosome on an input VCF - -version 1.0 - -import "Structs.wdl" - -# Add VAF annotation -workflow ChromosomeAlleleFrequencies { - - input { - - File vcf - File vcf_idx - String contig - String prefix - - File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? ped_file # Used for M/F AF calculations - File? par_bed - File? allosomes_list - - - String sv_pipeline_docker - String sv_base_mini_docker - - RuntimeAttr? runtime_attr_shard_vcf - RuntimeAttr? runtime_attr_compute_AFs - RuntimeAttr? runtime_attr_combine_vcfs - } - - call ComputeShardAFs { - input: - vcf = vcf, - prefix = "${prefix}.${contig}", - sample_pop_assignments = sample_pop_assignments, - ped_file = ped_file, - par_bed = par_bed, - allosomes_list = allosomes_list, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_compute_AFs - } - - - # Final output - output { - File vcf_wAFs = ComputeShardAFs.shard_wAFs - File vcf_wAFs_idx = ComputeShardAFs.shard_wAFs_idx - } -} - - -task ComputeShardAFs { - input { - File vcf - String prefix - String sv_pipeline_docker - File? sample_pop_assignments - File? ped_file - File? par_bed - File? 
allosomes_list - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 1.5, - disk_gb: ceil(20 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - set -euo pipefail - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \ - ~{"-p " + sample_pop_assignments} \ - ~{"-f " + ped_file} \ - ~{"-par " + par_bed} \ - ~{"--allosomes-list " + allosomes_list} \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - - tabix -p vcf "~{prefix}.wAFs.vcf.gz" - >>> - - output { - File shard_wAFs = "~{prefix}.wAFs.vcf.gz" - File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index bd685fe97..f01b65c3f 100644 --- a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -4,7 +4,6 @@ version 1.0 import "TasksMakeCohortVcf.wdl" as MiniTasks -import "ChromosomeAlleleFrequencies.wdl" as calcAF import "Utils.wdl" as util # Prune off samples in annotated VCF, add VAF annotation @@ -12,8 +11,8 @@ workflow PruneAndAddVafs { input { - File vcf - File vcf_idx + File vcf + File vcf_idx String prefix String contig @@ -26,10 +25,7 @@ workflow PruneAndAddVafs { String sv_base_mini_docker String sv_pipeline_docker - RuntimeAttr? runtime_attr_shard_vcf RuntimeAttr? runtime_attr_compute_AFs - RuntimeAttr? 
runtime_attr_combine_vcfs - RuntimeAttr? runtime_attr_concat_vcfs RuntimeAttr? runtime_attr_subset_vcf_by_samples_list } @@ -46,26 +42,70 @@ workflow PruneAndAddVafs { } # Compute AC, AN, and AF per population & sex combination - call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies { + call ComputeShardAFs { input: vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]), - vcf_idx = select_first([SubsetVcfBySamplesList.vcf_subset_index, vcf_idx]), - contig = contig, prefix = prefix, sample_pop_assignments = sample_pop_assignments, ped_file = ped_file, par_bed = par_bed, allosomes_list = allosomes_list, - sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs + runtime_attr_override = runtime_attr_compute_AFs } output { - File output_vcf = ChromosomeAlleleFrequencies.vcf_wAFs - File output_vcf_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx + File output_vcf = ComputeShardAFs.shard_wAFs + File output_vcf_idx = ComputeShardAFs.shard_wAFs_idx } } +task ComputeShardAFs { + input { + File vcf + String prefix + File? sample_pop_assignments + File? ped_file + File? par_bed + File? allosomes_list + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1.5, + disk_gb: ceil(20 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \ + ~{"-p " + sample_pop_assignments} \ + ~{"-f " + ped_file} \ + ~{"-par " + par_bed} \ + ~{"--allosomes-list " + allosomes_list} \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + + tabix -p vcf "~{prefix}.wAFs.vcf.gz" + >>> + + output { + File shard_wAFs = "~{prefix}.wAFs.vcf.gz" + File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 46abafe4d..3fdd1849c 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -44,9 +44,8 @@ workflow ShardedAnnotateVcf { String gatk_docker RuntimeAttr? runtime_attr_svannotate - RuntimeAttr? runtime_attr_concat_vcfs - RuntimeAttr? runtime_attr_shard_vcf RuntimeAttr? runtime_attr_compute_AFs + RuntimeAttr? runtime_attr_subset_vcf_by_samples_list RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_modify_vcf RuntimeAttr? 
runtime_attr_combine_vcfs @@ -92,7 +91,7 @@ workflow ShardedAnnotateVcf { input: vcf = AnnotateFunctionalConsequences.annotated_vcf, vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, - prefix = prefix, + prefix = "~{prefix}.~{contig}.~{i}", contig = contig, ped_file = ped_file, par_bed = par_bed, @@ -102,10 +101,8 @@ workflow ShardedAnnotateVcf { sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_shard_vcf = runtime_attr_shard_vcf, + runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, runtime_attr_compute_AFs = runtime_attr_compute_AFs, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, - runtime_attr_concat_vcfs = runtime_attr_concat_vcfs } if (defined(ref_bed)) { @@ -116,7 +113,7 @@ workflow ShardedAnnotateVcf { ref_bed = select_first([ref_bed]), population = select_first([population]), ref_prefix = select_first([ref_prefix]), - prefix = prefix, + prefix = "~{prefix}.~{contig}.~{i}", contigs = [contig], max_shards_per_chrom_step1 = max_shards_per_chrom_step1, min_records_per_shard_step1 = min_records_per_shard_step1, @@ -158,7 +155,7 @@ workflow ShardedAnnotateVcf { vcfs=sharded_annotated_vcf, vcfs_idx=sharded_annotated_vcf_idx, allow_overlaps=true, - outfile_prefix="~{prefix}.annotatedd", + outfile_prefix="~{prefix}.annotated", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_attr_concat_sharded_cluster } From 6e51b6d2b087e47e0e1403a57b37a3727e75e502 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 16:44:12 -0400 Subject: [PATCH 14/26] whitespace --- wdl/PruneAndAddVafs.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index f01b65c3f..e06263aeb 100644 --- a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -72,7 +72,7 @@ task ComputeShardAFs { RuntimeAttr? 
runtime_attr_override } RuntimeAttr default_attr = object { - cpu_cores: 1, + cpu_cores: 1, mem_gb: 1.5, disk_gb: ceil(20 + size(vcf, "GB") * 2), boot_disk_gb: 10, @@ -98,7 +98,7 @@ task ComputeShardAFs { File shard_wAFs = "~{prefix}.wAFs.vcf.gz" File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" } - + runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" From b92591b75d5055cb71f2e415ebf84587f637936b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 8 May 2023 17:28:56 -0400 Subject: [PATCH 15/26] merge output to same level as input --- wdl/AnnotateVcf.wdl | 45 ++++++++++++++++++++++++++++++++++---- wdl/ShardedAnnotateVcf.wdl | 36 ++---------------------------- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 49039c27a..67db5e183 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -2,6 +2,8 @@ version 1.0 import "Structs.wdl" import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf +import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge workflow AnnotateVcf { @@ -62,8 +64,8 @@ workflow AnnotateVcf { Array[String] contigs = read_lines(contig_list) scatter (i in range(length(contigs))) { - Int array_index = if (sharded_by_contig && length(vcf_list) > 1) then i else 0 - call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{ + Int array_index = if (sharded_by_contig) then i else 0 + call sharded_annotate_vcf.ShardedAnnotateVcf { input: vcf = vcf_list[array_index], vcf_idx = vcf_idx_list[array_index], @@ -114,8 +116,43 @@ workflow AnnotateVcf { } } + # Concat VCFs to the contig level or fully depending on format of input + # ShardedAnnotateVcf.sharded_annotated_vcf is is an Array[Array[File]] with one inner Array[File] of shards per contig + Array[Array[File]] vcfs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf 
else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf)] + Array[Array[File]] vcf_idxs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf_idx else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)] + if (use_hail) { + scatter (i in range(length(vcfs_for_concatenation))) { + call HailMerge.HailMerge { + input: + vcfs=vcfs_for_concatenation[i], + prefix="~{prefix_list[i]}.annotated", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), + runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, + runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster + } + } + } + + if (!use_hail) { + scatter (i in range(length(vcfs_for_concatenation))) { + call MiniTasks.ConcatVcfs { + input: + vcfs=vcfs_for_concatenation[i], + vcfs_idx=vcf_idxs_for_concatenation[i], + allow_overlaps=true, + outfile_prefix="~{prefix_list[i]}.annotated", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat_sharded_cluster + } + } + } + output { - Array[File] output_vcf_list = ShardedAnnotateVcf.output_vcf - Array[File] output_vcf_idx_list = ShardedAnnotateVcf.output_vcf_idx + Array[File] output_vcf_list = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + Array[File] output_vcf_idx_list = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) } } \ No newline at end of file diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 3fdd1849c..71fbf22ac 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -128,43 +128,11 @@ workflow ShardedAnnotateVcf { runtime_attr_select_matched_svs = runtime_attr_select_matched_svs } } - - } - - Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), 
PruneAndAddVafs.output_vcf]) - Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) - - if (use_hail) { - call HailMerge.HailMerge { - input: - vcfs=sharded_annotated_vcf, - prefix="~{prefix}.annotated", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), - runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, - runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, - runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster - } } - if (!use_hail) { - call MiniTasks.ConcatVcfs { - input: - vcfs=sharded_annotated_vcf, - vcfs_idx=sharded_annotated_vcf_idx, - allow_overlaps=true, - outfile_prefix="~{prefix}.annotated", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_concat_sharded_cluster - } - } - - output { - File output_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) - File output_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) + Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf]) + Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) } } From 6f4f350847aaf5c7cd177c42ef36dbffd963fb34 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 6 Jul 2023 16:51:08 -0400 Subject: [PATCH 16/26] pass index to scatter --- wdl/ShardedAnnotateVcf.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 71fbf22ac..984fd169a 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -65,6 +65,7 @@ workflow ShardedAnnotateVcf { call MiniTasks.ScatterVcf { input: vcf = vcf, + vcf_index = vcf_idx, prefix = prefix, 
records_per_shard = sv_per_shard, contig = contig, From 28a0a935d6e1da5d96f61c72cfe3ff3ecc8bbb11 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 6 Jul 2023 17:24:05 -0400 Subject: [PATCH 17/26] annotate ext af per shard --- .../AnnotateVcf.SingleBatch.json.tmpl | 2 - .../AnnotateVcf.json.tmpl | 4 +- ...TKSVPipelineSingleSample.no_melt.json.tmpl | 2 - .../test/AnnotateVcf/AnnotateVcf.json.tmpl | 2 - .../GATKSVPipelineSingleSample.json.tmpl | 2 - ...TKSVPipelineSingleSample.no_melt.json.tmpl | 2 - wdl/AnnotateExternalAF.wdl | 10 +- ...tig.wdl => AnnotateExternalAFPerShard.wdl} | 134 +++++++----------- wdl/AnnotateVcf.wdl | 11 -- wdl/GATKSVPipelineSingleSample.wdl | 4 - wdl/ShardedAnnotateVcf.wdl | 36 ++--- 11 files changed, 77 insertions(+), 132 deletions(-) rename wdl/{AnnotateExternalAFperContig.wdl => AnnotateExternalAFPerShard.wdl} (81%) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl index 701490993..81b4c20e9 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl @@ -12,8 +12,6 @@ "AnnotateVcf.contig_list" : "${workspace.primary_contigs_list}", "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}", "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.max_shards_per_chrom_step1" : 200, - "AnnotateVcf.min_records_per_shard_step1" : 5000, "AnnotateVcf.prefix_list" : "${this.sample_set_id}", "AnnotateVcf.sharded_by_contig": "false", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl index 404abac9f..5d378d6c0 100644 --- 
a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl @@ -12,9 +12,7 @@ "AnnotateVcf.contig_list" : "${workspace.primary_contigs_list}", "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}", "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.max_shards_per_chrom_step1" : 200, - "AnnotateVcf.min_records_per_shard_step1" : 5000, - + "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}", "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl index 1f07adddc..565512d85 100644 --- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl @@ -102,8 +102,6 @@ "GATKSVPipelineSingleSample.external_af_population" : {{ reference_resources.external_af_population | tojson }}, "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000", - "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200, - "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 5000, "GATKSVPipelineSingleSample.ref_samples_list" : "${workspace.ref_panel_samples_list}", "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : "${workspace.ref_panel_std_manta_vcf_tar}", diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index b23b074a1..150ae0136 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -12,8 +12,6 @@ "AnnotateVcf.contig_list" : {{ reference_resources.primary_contigs_list | tojson }}, "AnnotateVcf.ped_file": {{ 
test_batch.ped_file | tojson }}, "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.max_shards_per_chrom_step1" : 200, - "AnnotateVcf.min_records_per_shard_step1" : 5000, "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}], "AnnotateVcf.sharded_by_contig": "false", diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl index 91d9fff1e..c60129a43 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl @@ -102,8 +102,6 @@ "GATKSVPipelineSingleSample.external_af_population" : {{ reference_resources.external_af_population | tojson }}, "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000", - "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200, - "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 5000, "GATKSVPipelineSingleSample.ref_samples_list" : {{ ref_panel.samples_list | tojson }}, "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : {{ ref_panel.std_manta_vcf_tar | tojson }}, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl index 1c894fb63..a265b5444 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl @@ -104,8 +104,6 @@ "GATKSVPipelineSingleSample.external_af_population" : {{ reference_resources.external_af_population | tojson }}, "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000", - "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200, - "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 
5000, "GATKSVPipelineSingleSample.ref_samples_list" : {{ ref_panel.samples_list | tojson }}, "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : {{ ref_panel.std_manta_vcf_tar | tojson }}, diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl index 322a48a74..4e98af36e 100644 --- a/wdl/AnnotateExternalAF.wdl +++ b/wdl/AnnotateExternalAF.wdl @@ -4,7 +4,7 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -import "AnnotateExternalAFperContig.wdl" as AnnotateExternalAFperContig +import "AnnotateExternalAFperShard.wdl" as AnnotateExternalAFperShard workflow AnnotateExternalAF { input { @@ -32,7 +32,7 @@ workflow AnnotateExternalAF { RuntimeAttr? runtime_attr_select_matched_svs } - call SplitBed as split_ref_bed { + call SplitRefBed as split_ref_bed { input: bed = ref_bed, sv_base_mini_docker = sv_base_mini_docker, @@ -45,10 +45,8 @@ workflow AnnotateExternalAF { runtime_attr_override = runtime_attr_split_query_vcf } - Array[String] svtype_list = ["DEL","DUP","INS","INV_CPX","BND_CTX"] - scatter ( contig in contigs ) { - call AnnotateExternalAFperContig.AnnotateExternalAFperContig as AnnotateExternalAFperContig{ + call AnnotateExternalAFperShard.AnnotateExternalAFperShard { input: vcf = vcf, vcf_idx = vcf_idx, @@ -94,7 +92,7 @@ workflow AnnotateExternalAF { } -task SplitBed { +task SplitRefBed { input { File bed String sv_base_mini_docker diff --git a/wdl/AnnotateExternalAFperContig.wdl b/wdl/AnnotateExternalAFPerShard.wdl similarity index 81% rename from wdl/AnnotateExternalAFperContig.wdl rename to wdl/AnnotateExternalAFPerShard.wdl index cd0fc233b..c7b00ba3b 100644 --- a/wdl/AnnotateExternalAFperContig.wdl +++ b/wdl/AnnotateExternalAFPerShard.wdl @@ -5,16 +5,11 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -workflow AnnotateExternalAFperContig { +workflow AnnotateExternalAFPerShard { input { File vcf File vcf_idx - File ref_bed - File split_query_vcf_del - File split_query_vcf_dup - File 
split_query_vcf_ins - File split_query_vcf_inv - File split_query_vcf_bnd + String prefix File split_ref_bed_del File split_ref_bed_dup File split_ref_bed_ins @@ -22,69 +17,66 @@ workflow AnnotateExternalAFperContig { File split_ref_bed_bnd Array[String] population - String contig String ref_prefix - Int max_shards_per_chrom_step1 - Int min_records_per_shard_step1 - String sv_base_mini_docker String sv_pipeline_docker # overrides for local tasks RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_attr_split_vcf - RuntimeAttr? runtime_attr_combine_vcfs + RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? runtime_attr_bedtools_closest RuntimeAttr? runtime_attr_select_matched_svs } + call SplitQueryVcf { + input: + vcf = vcf, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_split_query_vcf + } + call BedtoolsClosest as compare_del { input: - bed_a = split_query_vcf_del, + bed_a = SplitQueryVcf.del, bed_b = split_ref_bed_del, svtype = "del", - contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_bedtools_closest } call BedtoolsClosest as compare_dup { input: - bed_a = split_query_vcf_dup, + bed_a = SplitQueryVcf.dup, bed_b = split_ref_bed_dup, svtype = "dup", - contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_bedtools_closest } call BedtoolsClosest as compare_ins { input: - bed_a = split_query_vcf_ins, + bed_a = SplitQueryVcf.ins, bed_b = split_ref_bed_ins, svtype = "ins", - contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_bedtools_closest } call BedtoolsClosest as compare_inv { input: - bed_a = split_query_vcf_inv, + bed_a = SplitQueryVcf.inv, bed_b = split_ref_bed_inv, svtype = "inv", - contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_bedtools_closest } call BedtoolsClosest as compare_bnd { input: - bed_a = split_query_vcf_bnd, + bed_a = 
SplitQueryVcf.bnd, bed_b = split_ref_bed_bnd, svtype = "bnd", - contig = contig, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_bedtools_closest } @@ -133,55 +125,32 @@ workflow AnnotateExternalAFperContig { sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_select_matched_svs } - - call MiniTasks.SplitVcf as SplitVcf { - input: - vcf = vcf, - vcf_idx = vcf_idx, - contig=contig, - prefix="~{contig}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_split_vcf - } - - - scatter (vcf_shard in SplitVcf.vcf_shards) { - call ModifyVcf { - input: - labeled_del = calcu_del.output_comp, - labeled_dup = calcu_dup.output_comp, - labeled_ins = calcu_ins.output_comp, - labeled_inv = calcu_inv.output_comp, - labeled_bnd = calcu_bnd.output_comp, - vcf = vcf_shard, - ref_prefix = ref_prefix, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_modify_vcf - } - } - - call MiniTasks.ConcatVcfs as CombineVcfStep1 { - input: - vcfs = ModifyVcf.annotated_vcf, - vcfs_idx = ModifyVcf.annotated_vcf_tbi, - naive = true, - outfile_prefix = "~{contig}.annotated.vcf", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_combine_vcfs + + call ModifyVcf { + input: + labeled_del = calcu_del.output_comp, + labeled_dup = calcu_dup.output_comp, + labeled_ins = calcu_ins.output_comp, + labeled_inv = calcu_inv.output_comp, + labeled_bnd = calcu_bnd.output_comp, + vcf = vcf, + prefix = prefix, + ref_prefix = ref_prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_modify_vcf } output { - File annotated_vcf = CombineVcfStep1.concat_vcf - File annotated_vcf_tbi = CombineVcfStep1.concat_vcf_idx + File annotated_vcf = ModifyVcf.annotated_vcf + File annotated_vcf_tbi = ModifyVcf.annotated_vcf_tbi } } -task SplitBed { +task 
SplitRefBed { input { File bed + String contig String sv_base_mini_docker RuntimeAttr? runtime_attr_override } @@ -210,24 +179,26 @@ task SplitBed { String prefix = basename(bed, ".bed.gz") command <<< + set -eu zcat ~{bed} | head -1 > header - cat header <(zcat ~{bed} | awk '{if ($6=="DEL") print}') > ~{prefix}.DEL.bed - cat header <(zcat ~{bed} | awk '{if ($6=="DUP") print}') > ~{prefix}.DUP.bed - cat header <(zcat ~{bed} | awk '{if ($6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK" ) print}') > ~{prefix}.INS.bed - cat header <(zcat ~{bed} | awk '{if ($6=="INV" || $6=="CPX") print}' ) > ~{prefix}.INV_CPX.bed - cat header <(zcat ~{bed} | awk '{if ($6=="BND" || $6=="CTX") print}' ) > ~{prefix}.BND_CTX.bed + set -o pipefail + cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="DEL") print}') > ~{prefix}.~{contig}.DEL.bed + cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="DUP") print}') > ~{prefix}.~{contig}.DUP.bed + cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK" ) print}') > ~{prefix}.~{contig}.INS.bed + cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="INV" || $6=="CPX") print}' ) > ~{prefix}.~{contig}.INV_CPX.bed + cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="BND" || $6=="CTX") print}' ) > ~{prefix}.~{contig}.BND_CTX.bed >>> output { - File del = "~{prefix}.DEL.bed" - File dup = "~{prefix}.DUP.bed" - File ins = "~{prefix}.INS.bed" - File inv = "~{prefix}.INV_CPX.bed" - File bnd = "~{prefix}.BND_CTX.bed" + File del = "~{prefix}.~{contig}.DEL.bed" + File dup = "~{prefix}.~{contig}.DUP.bed" + File ins = "~{prefix}.~{contig}.INS.bed" + File inv = "~{prefix}.~{contig}.INV_CPX.bed" + File bnd = "~{prefix}.~{contig}.BND_CTX.bed" } } -task SplitVcf { +task SplitQueryVcf { input { 
File vcf String sv_pipeline_docker @@ -258,9 +229,12 @@ task SplitVcf { String prefix = basename(vcf, ".vcf.gz") command <<< + set -euo pipefail svtk vcf2bed -i SVTYPE -i SVLEN ~{vcf} tmp.bed cut -f1-4,7-8 tmp.bed > ~{prefix}.bed + set +o pipefail head -1 ~{prefix}.bed > header + set -o pipefail cat header <(awk '{if ($5=="DEL") print}' ~{prefix}.bed )> ~{prefix}.DEL.bed cat header <(awk '{if ($5=="DUP") print}' ~{prefix}.bed )> ~{prefix}.DUP.bed cat header <(awk '{if ($5=="INS" || $5=="INS:ME" || $5=="INS:ME:ALU" || $5=="INS:ME:LINE1" || $5=="INS:ME:SVA" || $5=="ALU" || $5=="LINE1" || $5=="SVA" || $5=="HERVK" ) print}' ~{prefix}.bed )> ~{prefix}.INS.bed @@ -283,7 +257,6 @@ task BedtoolsClosest { File bed_a File bed_b String svtype - String contig String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -310,12 +283,10 @@ task BedtoolsClosest { } command <<< - awk '{if ($1=="~{contig}") print}' ~{bed_a} > filea.bed - awk '{if ($1=="~{contig}") print}' ~{bed_b} > fileb.bed - + set -eu paste <(head -1 ~{bed_a}) <(head -1 ~{bed_b}) | sed -e "s/#//g" > ~{svtype}.bed - - bedtools closest -wo -a <(sort -k1,1 -k2,2n filea.bed) -b <(sort -k1,1 -k2,2n fileb.bed) >> ~{svtype}.bed + set -o pipefail + bedtools closest -wo -a <(sort -k1,1 -k2,2n ~{bed_a}) -b <(sort -k1,1 -k2,2n ~{bed_b}) >> ~{svtype}.bed >>> output { @@ -357,6 +328,7 @@ task SelectMatchedSVs { File pop_list = write_lines(population) command <<< + set -euo pipefail Rscript /opt/sv-pipeline/05_annotation/scripts/R1.bedtools_closest_CNV.R \ -i ~{input_bed} \ -o ~{prefix}.comparison \ @@ -421,6 +393,7 @@ task ModifyVcf { File labeled_inv File labeled_bnd File vcf + String prefix String ref_prefix String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override @@ -447,7 +420,6 @@ task ModifyVcf { bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb]) } - String prefix = basename(vcf,'.vcf.gz') command <<< cat ~{labeled_del} > labeled.bed cat ~{labeled_dup} >> labeled.bed diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 67db5e183..7e56f3bc1 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -20,9 +20,6 @@ workflow AnnotateVcf { Int? max_breakend_as_cnv_length String? svannotate_additional_args - Int max_shards_per_chrom_step1 - Int min_records_per_shard_step1 - File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample File? sample_keep_list # List of samples to be retained from the output vcf File? ped_file # Used for M/F AF calculations @@ -47,10 +44,7 @@ workflow AnnotateVcf { RuntimeAttr? runtime_attr_shard_vcf RuntimeAttr? runtime_attr_subset_vcf_by_samples_list RuntimeAttr? runtime_attr_compute_AFs - RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_attr_combine_vcfs - RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_split_ref_bed RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? 
runtime_attr_bedtools_closest @@ -77,8 +71,6 @@ workflow AnnotateVcf { svannotate_additional_args = svannotate_additional_args, max_breakend_as_cnv_length = max_breakend_as_cnv_length, - max_shards_per_chrom_step1 = max_shards_per_chrom_step1, - min_records_per_shard_step1 = min_records_per_shard_step1, sample_pop_assignments = sample_pop_assignments, sample_keep_list = sample_keep_list, ped_file = ped_file, @@ -101,10 +93,7 @@ workflow AnnotateVcf { runtime_attr_svannotate = runtime_attr_svannotate, runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, runtime_attr_compute_AFs = runtime_attr_compute_AFs, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, runtime_attr_modify_vcf = runtime_attr_modify_vcf, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, - runtime_attr_split_vcf = runtime_attr_split_vcf, runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index d61231ee2..b3a97e6a3 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -561,8 +561,6 @@ workflow GATKSVPipelineSingleSample { Int? promoter_window Int? max_breakend_as_cnv_length Int annotation_sv_per_shard - Int annotation_max_shards_per_chrom_step1 - Int annotation_min_records_per_shard_step1 File? external_af_ref_bed # bed file with population AFs for annotation String? 
external_af_ref_bed_prefix # name of external AF bed file call set @@ -1403,8 +1401,6 @@ workflow GATKSVPipelineSingleSample { use_hail = false, sharded_by_contig = false, sv_per_shard = annotation_sv_per_shard, - max_shards_per_chrom_step1 = annotation_max_shards_per_chrom_step1, - min_records_per_shard_step1 = annotation_min_records_per_shard_step1, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, gatk_docker = gatk_docker, diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index 984fd169a..b6bc2706f 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -5,7 +5,7 @@ import "TasksMakeCohortVcf.wdl" as MiniTasks import "HailMerge.wdl" as HailMerge import "AnnotateFunctionalConsequences.wdl" as func import "PruneAndAddVafs.wdl" as pav -import "AnnotateExternalAF.wdl" as eaf +import "AnnotateExternalAFPerShard.wdl" as eaf workflow ShardedAnnotateVcf { @@ -21,9 +21,6 @@ workflow ShardedAnnotateVcf { Int? max_breakend_as_cnv_length String? svannotate_additional_args - Int max_shards_per_chrom_step1 - Int min_records_per_shard_step1 - File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample File? sample_keep_list File? ped_file # Used for M/F AF calculations @@ -46,10 +43,7 @@ workflow ShardedAnnotateVcf { RuntimeAttr? runtime_attr_svannotate RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_subset_vcf_by_samples_list - RuntimeAttr? runtime_attr_combine_vcfs RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_attr_combine_vcfs - RuntimeAttr? runtime_attr_split_vcf RuntimeAttr? runtime_attr_split_ref_bed RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? runtime_attr_bedtools_closest @@ -62,6 +56,16 @@ workflow ShardedAnnotateVcf { RuntimeAttr? 
runtime_attr_fix_header_sharded_cluster } + if (defined(ref_bed)) { + call eaf.SplitRefBed { + input: + bed = select_first([ref_bed]), + contig = contig, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_split_ref_bed + } + } + call MiniTasks.ScatterVcf { input: vcf = vcf, @@ -107,23 +111,21 @@ workflow ShardedAnnotateVcf { } if (defined(ref_bed)) { - call eaf.AnnotateExternalAF as AnnotateExternalAF { + call eaf.AnnotateExternalAFPerShard { input: vcf = PruneAndAddVafs.output_vcf, vcf_idx = PruneAndAddVafs.output_vcf_idx, - ref_bed = select_first([ref_bed]), + split_ref_bed_del = select_first([SplitRefBed.del]), + split_ref_bed_dup = select_first([SplitRefBed.dup]), + split_ref_bed_ins = select_first([SplitRefBed.ins]), + split_ref_bed_inv = select_first([SplitRefBed.inv]), + split_ref_bed_bnd = select_first([SplitRefBed.bnd]), population = select_first([population]), ref_prefix = select_first([ref_prefix]), prefix = "~{prefix}.~{contig}.~{i}", - contigs = [contig], - max_shards_per_chrom_step1 = max_shards_per_chrom_step1, - min_records_per_shard_step1 = min_records_per_shard_step1, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_modify_vcf = runtime_attr_modify_vcf, - runtime_attr_split_vcf = runtime_attr_split_vcf, - runtime_attr_combine_vcfs = runtime_attr_combine_vcfs, - runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, runtime_attr_select_matched_svs = runtime_attr_select_matched_svs @@ -132,8 +134,8 @@ workflow ShardedAnnotateVcf { } output { - Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf]) - Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) + Array[File] sharded_annotated_vcf = 
select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf), PruneAndAddVafs.output_vcf]) + Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) } } From 0880aef577e12e647df4687a78eea0306dea0450 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 7 Jul 2023 13:26:00 -0400 Subject: [PATCH 18/26] remove AnnotateExternalAF.wdl --- wdl/AnnotateExternalAF.wdl | 428 ------------------------------------- 1 file changed, 428 deletions(-) delete mode 100644 wdl/AnnotateExternalAF.wdl diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl deleted file mode 100644 index 4e98af36e..000000000 --- a/wdl/AnnotateExternalAF.wdl +++ /dev/null @@ -1,428 +0,0 @@ -version 1.0 - -# Author: Xuefang Zhao - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as MiniTasks -import "AnnotateExternalAFperShard.wdl" as AnnotateExternalAFperShard - -workflow AnnotateExternalAF { - input { - File vcf - File vcf_idx - File ref_bed - Array[String] population - Array[String] contigs - String ref_prefix - String prefix - - Int max_shards_per_chrom_step1 - Int min_records_per_shard_step1 - - String sv_base_mini_docker - String sv_pipeline_docker - - # overrides for local tasks - RuntimeAttr? runtime_attr_modify_vcf - RuntimeAttr? runtime_attr_combine_vcfs - RuntimeAttr? runtime_attr_split_vcf - RuntimeAttr? runtime_attr_split_ref_bed - RuntimeAttr? runtime_attr_split_query_vcf - RuntimeAttr? runtime_attr_bedtools_closest - RuntimeAttr? 
runtime_attr_select_matched_svs - - } - call SplitRefBed as split_ref_bed { - input: - bed = ref_bed, - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_split_ref_bed - } - call SplitVcf as split_query_vcf { - input: - vcf = vcf, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_split_query_vcf - } - - scatter ( contig in contigs ) { - call AnnotateExternalAFperShard.AnnotateExternalAFperShard { - input: - vcf = vcf, - vcf_idx = vcf_idx, - ref_bed = ref_bed, - split_query_vcf_del = split_query_vcf.del, - split_query_vcf_dup = split_query_vcf.dup, - split_query_vcf_ins = split_query_vcf.ins, - split_query_vcf_inv = split_query_vcf.inv, - split_query_vcf_bnd = split_query_vcf.bnd, - split_ref_bed_del = split_ref_bed.del, - split_ref_bed_dup = split_ref_bed.dup, - split_ref_bed_ins = split_ref_bed.ins, - split_ref_bed_inv = split_ref_bed.inv, - split_ref_bed_bnd = split_ref_bed.bnd, - population = population, - contig = contig, - ref_prefix = ref_prefix, - max_shards_per_chrom_step1 = max_shards_per_chrom_step1, - min_records_per_shard_step1 = min_records_per_shard_step1, - sv_base_mini_docker = sv_base_mini_docker, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_split_vcf = runtime_attr_split_vcf, - runtime_attr_modify_vcf = runtime_attr_modify_vcf, - runtime_attr_select_matched_svs = runtime_attr_select_matched_svs, - runtime_attr_bedtools_closest = runtime_attr_bedtools_closest - } - } - - call MiniTasks.ConcatVcfs as CombineVcfStep2 { - input: - vcfs = AnnotateExternalAFperContig.annotated_vcf, - vcfs_idx = AnnotateExternalAFperContig.annotated_vcf_tbi, - naive = true, - outfile_prefix = "~{prefix}.annotated", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_combine_vcfs - } - - output { - File annotated_vcf = CombineVcfStep2.concat_vcf - File annotated_vcf_tbi = CombineVcfStep2.concat_vcf_idx - } - -} - -task SplitRefBed { - input { - File bed - String 
sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 5, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(bed, ".bed.gz") - - command <<< - zcat ~{bed} | head -1 > header - cat header <(zcat ~{bed} | awk '{if ($6=="DEL") print}') > ~{prefix}.DEL.bed - cat header <(zcat ~{bed} | awk '{if ($6=="DUP") print}') > ~{prefix}.DUP.bed - cat header <(zcat ~{bed} | awk '{if ($6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK" ) print}') > ~{prefix}.INS.bed - cat header <(zcat ~{bed} | awk '{if ($6=="INV" || $6=="CPX") print}' ) > ~{prefix}.INV_CPX.bed - cat header <(zcat ~{bed} | awk '{if ($6=="BND" || $6=="CTX") print}' ) > ~{prefix}.BND_CTX.bed - >>> - - output { - File del = "~{prefix}.DEL.bed" - File dup = "~{prefix}.DUP.bed" - File ins = "~{prefix}.INS.bed" - File inv = "~{prefix}.INV_CPX.bed" - File bnd = "~{prefix}.BND_CTX.bed" - } -} - -task SplitVcf { - input { - File vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 10, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(vcf, ".vcf.gz") - - command <<< - svtk vcf2bed -i SVTYPE -i SVLEN ~{vcf} - | - cut -f1-4,7-8 > tmp.bed - head -1 tmp.bed > ~{prefix}.bed - awk 'NR > 1' < tmp.bed \ - | sort -k1,1V -k2,2n -k3,3n >> ~{prefix}.bed - rm tmp.bed - head -1 ~{prefix}.bed > header - cat header <(awk '{if ($5=="DEL") print}' ~{prefix}.bed )> ~{prefix}.DEL.bed - cat header <(awk '{if ($5=="DUP") print}' ~{prefix}.bed )> ~{prefix}.DUP.bed - cat header <(awk '{if ($5=="INS" || $5=="INS:ME" || $5=="INS:ME:ALU" || $5=="INS:ME:LINE1" || $5=="INS:ME:SVA" || $5=="ALU" || $5=="LINE1" || $5=="SVA" || $5=="HERVK" ) print}' ~{prefix}.bed )> ~{prefix}.INS.bed - cat header <(awk '{if ($5=="INV" || $5=="CPX") print}' ~{prefix}.bed )> ~{prefix}.INV_CPX.bed - cat header <(awk '{if ($5=="BND" || $5=="CTX") print}' ~{prefix}.bed )> ~{prefix}.BND_CTX.bed - >>> - - output { - File bed = "~{prefix}.bed" - File del = "~{prefix}.DEL.bed" - File dup = "~{prefix}.DUP.bed" - File ins = "~{prefix}.INS.bed" - File inv = "~{prefix}.INV_CPX.bed" - File bnd = "~{prefix}.BND_CTX.bed" - } -} - -task BedtoolsClosest { - input { - File bed_a - File bed_b - 
String svtype - String contig - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 5, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - awk '{if ($1=="~{contig}") print}' ~{bed_a} > filea.bed - awk '{if ($1=="~{contig}") print}' ~{bed_b} > fileb.bed - - paste <(head -1 ~{bed_a}) <(head -1 ~{bed_b}) | sed -e "s/#//g" > ~{svtype}.bed - - bedtools closest -wo -a <(sort -k1,1 -k2,2n filea.bed) -b <(sort -k1,1 -k2,2n fileb.bed) >> ~{svtype}.bed - >>> - - output { - File output_bed = "~{svtype}.bed" - } -} - -task SelectMatchedSVs { - input { - File input_bed - String svtype - Array[String] population - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 5, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(input_bed, ".bed") - File pop_list = write_lines(population) - - command <<< - Rscript /opt/sv-pipeline/05_annotation/scripts/R1.bedtools_closest_CNV.R \ - -i ~{input_bed} \ - -o ~{prefix}.comparison \ - -p ~{pop_list} - >>> - - output { - File output_comp = "~{prefix}.comparison" - } -} - -task SelectMatchedINSs { - input { - File input_bed - String svtype - Array[String] population - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 5, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(input_bed, ".bed") - File pop_list = write_lines(population) - - command <<< - Rscript /opt/sv-pipeline/05_annotation/scripts/R2.bedtools_closest_INS.R \ - -i ~{input_bed} \ - -o ~{prefix}.comparison \ - -p ~{pop_list} - >>> - - output { - File output_comp = "~{prefix}.comparison" - } -} - -task ModifyVcf { - input { - Array[File] labeled_del - Array[File] labeled_dup - Array[File] labeled_ins - Array[File] labeled_inv - Array[File] labeled_bnd - File vcf - String ref_prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3, - disk_gb: 5, - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - String prefix = basename(vcf,'.vcf.gz') - command <<< - cat ~{sep=" " labeled_del} > labeled.bed - cat ~{sep=" " labeled_dup} >> labeled.bed - cat ~{sep=" " labeled_ins} >> labeled.bed - cat ~{sep=" " labeled_inv} >> labeled.bed - cat ~{sep=" " labeled_bnd} >> labeled.bed - - python <']) - - fin.close() - fin=open('labeled.bed') - colname = fin.readline().strip().split() - - for j in range(len(colname)-1): - if j>1: - header.append(['##INFO=']) - - for line in fin: - pin=line.strip().split() - if pin[0]=='query_svid': continue - info_add = ["~{ref_prefix}"+'_SVID'+'='+pin[1]] - for j in range(len(colname)-1): - if j>1: - info_add.append("~{ref_prefix}"+'_'+colname[j]+'='+pin[j]) - body[pin[0]][7]+=';'+';'.join(info_add) - fin.close() - - fo=open('~{prefix}.annotated.vcf','w') - for i in header: - print(' '.join(i), file=fo) - for i in SVID_key: - print('\t'.join(body[i]), file=fo) - fo.close() - CODE - - bgzip ~{prefix}.annotated.vcf - tabix ~{prefix}.annotated.vcf.gz - >>> - - output { - File annotated_vcf = "~{prefix}.annotated.vcf.gz" - File annotated_vcf_tbi = "~{prefix}.annotated.vcf.gz.tbi" - } -} - - - - From 
d94df6df797afcc117076e141564d39686986c08 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 10 Jul 2023 14:06:18 -0400 Subject: [PATCH 19/26] make index optional --- wdl/AnnotateVcf.wdl | 8 +++++--- wdl/ShardedAnnotateVcf.wdl | 2 +- wdl/TasksMakeCohortVcf.wdl | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 7e56f3bc1..9440fda95 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -9,10 +9,9 @@ workflow AnnotateVcf { input { Array[File] vcf_list # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Outputs will match - Array[File] vcf_idx_list + Array[File]? vcf_idx_list File contig_list Array[String] prefix_list - Boolean sharded_by_contig # True if providing a vcf_list sharded by contig. False if providing a single full VCF File protein_coding_gtf File? noncoding_bed @@ -39,6 +38,8 @@ workflow AnnotateVcf { String sv_base_mini_docker String gatk_docker + File? NONE_FILE_ + RuntimeAttr? runtime_attr_svannotate RuntimeAttr? runtime_attr_concat_vcfs RuntimeAttr? runtime_attr_shard_vcf @@ -56,13 +57,14 @@ workflow AnnotateVcf { } Array[String] contigs = read_lines(contig_list) + Boolean sharded_by_contig = (length(vcf_list) == length(contigs)) scatter (i in range(length(contigs))) { Int array_index = if (sharded_by_contig) then i else 0 call sharded_annotate_vcf.ShardedAnnotateVcf { input: vcf = vcf_list[array_index], - vcf_idx = vcf_idx_list[array_index], + vcf_idx = if defined(vcf_idx_list) then select_first([vcf_idx_list])[array_index] else NONE_FILE_, contig = contigs[i], prefix = prefix_list[array_index], protein_coding_gtf = protein_coding_gtf, diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index b6bc2706f..b7186771b 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -11,7 +11,7 @@ workflow ShardedAnnotateVcf { input { File vcf - File vcf_idx + File? 
vcf_idx String prefix String contig diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 1cf9237d7..b344c0a42 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -990,6 +990,7 @@ task ScatterVcf { command <<< set -euo pipefail + ~{if !defined(vcf_index) then "tabix ~{vcf}" else ""} # in case the file is empty create an empty shard bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz" bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig} From dcc341d69b987dfd6f92c25b48478cafd72c289b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 10 Jul 2023 14:27:59 -0400 Subject: [PATCH 20/26] remove sharded_by_contig input from jsons --- .../workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl | 1 - .../cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl | 1 - inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl | 1 - 3 files changed, 3 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl index 81b4c20e9..f8dc2a433 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl @@ -14,7 +14,6 @@ "AnnotateVcf.sv_per_shard" : "5000", "AnnotateVcf.prefix_list" : "${this.sample_set_id}", - "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl index 5d378d6c0..b89279374 100644 --- 
a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl @@ -14,7 +14,6 @@ "AnnotateVcf.sv_per_shard" : "5000", "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}", - "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index 150ae0136..ccb9cf6a6 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -14,7 +14,6 @@ "AnnotateVcf.sv_per_shard" : "5000", "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}], - "AnnotateVcf.sharded_by_contig": "false", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }}, From b8ccd52a78c63b0c4250b1d39fa96614dd73fde5 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 10 Jul 2023 14:40:22 -0400 Subject: [PATCH 21/26] womtool validation --- wdl/GATKSVPipelineSingleSample.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index b3a97e6a3..cd5750cfc 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -1399,7 +1399,6 @@ workflow GATKSVPipelineSingleSample { ref_prefix = external_af_ref_bed_prefix, population = external_af_population, use_hail = false, - sharded_by_contig = false, sv_per_shard = annotation_sv_per_shard, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, From 578af9efcd746da98f8638b23059815021c76d17 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 27 Jul 2023 14:43:47 -0400 Subject: [PATCH 22/26] single vcf input. 
require index & infer path --- .../AnnotateVcf.SingleBatch.json.tmpl | 5 +- .../AnnotateVcf.json.tmpl | 5 +- .../test/AnnotateVcf/AnnotateVcf.json.tmpl | 5 +- wdl/AnnotateVcf.wdl | 73 ++++++++----------- wdl/GATKSVPipelineSingleSample.wdl | 17 ++--- wdl/ShardedAnnotateVcf.wdl | 4 +- 6 files changed, 49 insertions(+), 60 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl index f8dc2a433..448606ad5 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl @@ -1,6 +1,5 @@ { - "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}", - "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}", + "AnnotateVcf.vcf" : "${this.cleaned_vcf}", "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}", "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}", @@ -13,7 +12,7 @@ "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}", "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.prefix_list" : "${this.sample_set_id}", + "AnnotateVcf.prefix" : "${this.sample_set_id}", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl index b89279374..fac58dce0 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl @@ -1,6 +1,5 @@ { - "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}", - "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}", + "AnnotateVcf.vcf" : 
"${this.cleaned_vcf}", "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}", "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}", @@ -13,7 +12,7 @@ "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}", "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}", + "AnnotateVcf.prefix" : "${this.sample_set_set_id}", "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}", diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl index ccb9cf6a6..5d6bbb582 100644 --- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl +++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl @@ -1,6 +1,5 @@ { - "AnnotateVcf.vcf_list" : [ {{ test_batch.clean_vcf | tojson }} ], - "AnnotateVcf.vcf_idx_list" : [{{ test_batch.clean_vcf_index | tojson }}], + "AnnotateVcf.vcf": {{ test_batch.clean_vcf | tojson }}, "AnnotateVcf.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }}, "AnnotateVcf.noncoding_bed" : {{ reference_resources.noncoding_bed | tojson }}, @@ -13,7 +12,7 @@ "AnnotateVcf.ped_file": {{ test_batch.ped_file | tojson }}, "AnnotateVcf.sv_per_shard" : "5000", - "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}], + "AnnotateVcf.prefix" : {{ test_batch.name | tojson }}, "AnnotateVcf.use_hail": "false", "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }}, diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 9440fda95..925b25555 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -8,10 +8,9 @@ import "HailMerge.wdl" as HailMerge workflow AnnotateVcf { input { - Array[File] vcf_list # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Outputs will match - Array[File]? vcf_idx_list - File contig_list - Array[String] prefix_list + File vcf # GATK-SV VCF for annotation. 
Index .tbi must be located at the same path + File contig_list # Ordered list of contigs to annotate that are present in the input VCF + String prefix File protein_coding_gtf File? noncoding_bed @@ -38,8 +37,6 @@ workflow AnnotateVcf { String sv_base_mini_docker String gatk_docker - File? NONE_FILE_ - RuntimeAttr? runtime_attr_svannotate RuntimeAttr? runtime_attr_concat_vcfs RuntimeAttr? runtime_attr_shard_vcf @@ -57,16 +54,14 @@ workflow AnnotateVcf { } Array[String] contigs = read_lines(contig_list) - Boolean sharded_by_contig = (length(vcf_list) == length(contigs)) - scatter (i in range(length(contigs))) { - Int array_index = if (sharded_by_contig) then i else 0 + scatter (contig in contigs) { call sharded_annotate_vcf.ShardedAnnotateVcf { input: - vcf = vcf_list[array_index], - vcf_idx = if defined(vcf_idx_list) then select_first([vcf_idx_list])[array_index] else NONE_FILE_, - contig = contigs[i], - prefix = prefix_list[array_index], + vcf = vcf, + vcf_idx = vcf + ".tbi", + contig = contig, + prefix = prefix, protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, promoter_window = promoter_window, @@ -107,43 +102,39 @@ workflow AnnotateVcf { } } - # Concat VCFs to the contig level or fully depending on format of input + # Concat VCF shards # ShardedAnnotateVcf.sharded_annotated_vcf is is an Array[Array[File]] with one inner Array[File] of shards per contig - Array[Array[File]] vcfs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf)] - Array[Array[File]] vcf_idxs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf_idx else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)] + Array[File] vcfs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf) + Array[File] vcf_idxs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx) if (use_hail) { - scatter (i in 
range(length(vcfs_for_concatenation))) { - call HailMerge.HailMerge { - input: - vcfs=vcfs_for_concatenation[i], - prefix="~{prefix_list[i]}.annotated", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), - runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, - runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, - runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster - } + call HailMerge.HailMerge { + input: + vcfs=vcfs_for_concatenation, + prefix="~{prefix}.annotated", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), + runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, + runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster } } if (!use_hail) { - scatter (i in range(length(vcfs_for_concatenation))) { - call MiniTasks.ConcatVcfs { - input: - vcfs=vcfs_for_concatenation[i], - vcfs_idx=vcf_idxs_for_concatenation[i], - allow_overlaps=true, - outfile_prefix="~{prefix_list[i]}.annotated", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_concat_sharded_cluster - } + call MiniTasks.ConcatVcfs { + input: + vcfs=vcfs_for_concatenation, + vcfs_idx=vcf_idxs_for_concatenation, + allow_overlaps=true, + outfile_prefix="~{prefix}.annotated", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat_sharded_cluster } } output { - Array[File] output_vcf_list = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) - Array[File] output_vcf_idx_list = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) + File annotated_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File 
annotated_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) } } \ No newline at end of file diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index cd5750cfc..f82cc96e6 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -1387,9 +1387,8 @@ workflow GATKSVPipelineSingleSample { call annotate.AnnotateVcf { input: - vcf_list = [FilterSample.out], - vcf_idx_list = [FilterSample.out_idx], - prefix_list = [batch], + vcf = FilterSample.out, + prefix = batch, contig_list = primary_contigs_list, protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, @@ -1408,18 +1407,18 @@ workflow GATKSVPipelineSingleSample { call SingleSampleFiltering.VcfToBed as VcfToBed { input: - vcf = AnnotateVcf.output_vcf_list[0], + vcf = AnnotateVcf.annotated_vcf, prefix = batch, sv_pipeline_docker = sv_pipeline_docker } call SingleSampleFiltering.UpdateBreakendRepresentation { input: - vcf=AnnotateVcf.output_vcf_list[0], - vcf_idx=AnnotateVcf.output_vcf_idx_list[0], + vcf=AnnotateVcf.annotated_vcf, + vcf_idx=AnnotateVcf.annotated_vcf_index, ref_fasta=reference_fasta, ref_fasta_idx=reference_index, - prefix=basename(AnnotateVcf.output_vcf_list[0], ".vcf.gz") + ".final_cleanup", + prefix=basename(AnnotateVcf.annotated_vcf, ".vcf.gz") + ".final_cleanup", sv_pipeline_docker=sv_pipeline_docker } @@ -1459,8 +1458,8 @@ workflow GATKSVPipelineSingleSample { # These files contain events reported in the internal VCF representation # They are less VCF-spec compliant but may be useful if components of the pipeline need to be re-run # on the output. 
- File pre_cleanup_vcf = AnnotateVcf.output_vcf_list[0] - File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx_list[0] + File pre_cleanup_vcf = AnnotateVcf.annotated_vcf + File pre_cleanup_vcf_idx = AnnotateVcf.annotated_vcf_index File ploidy_matrix = select_first([GatherBatchEvidence.batch_ploidy_matrix]) File ploidy_plots = select_first([GatherBatchEvidence.batch_ploidy_plots]) diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index b7186771b..b7bd0adc1 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -7,11 +7,13 @@ import "AnnotateFunctionalConsequences.wdl" as func import "PruneAndAddVafs.wdl" as pav import "AnnotateExternalAFPerShard.wdl" as eaf +# Perform annotation per contig + workflow ShardedAnnotateVcf { input { File vcf - File? vcf_idx + File vcf_idx String prefix String contig From b107c0535eb3e4468660af40ddb529cf225e0e5f Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 27 Jul 2023 14:46:20 -0400 Subject: [PATCH 23/26] don't generate index if not present in ScatterVcf --- wdl/AnnotateVcf.wdl | 2 +- wdl/TasksMakeCohortVcf.wdl | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 925b25555..99f95eb92 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -137,4 +137,4 @@ workflow AnnotateVcf { File annotated_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) File annotated_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) } -} \ No newline at end of file +} diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index b344c0a42..1cf9237d7 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -990,7 +990,6 @@ task ScatterVcf { command <<< set -euo pipefail - ~{if !defined(vcf_index) then "tabix ~{vcf}" else ""} # in case the file is empty create an empty shard bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz" bcftools +scatter ~{vcf} -o . 
-O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig} From e1bb41cae3543aeb81458ec1e45bbd467929bea0 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 27 Jul 2023 15:05:12 -0400 Subject: [PATCH 24/26] clean up runtime attrs --- wdl/AnnotateVcf.wdl | 26 +++++++++++--------------- wdl/ShardedAnnotateVcf.wdl | 7 +------ 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 99f95eb92..85d802cb0 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -38,8 +38,7 @@ workflow AnnotateVcf { String gatk_docker RuntimeAttr? runtime_attr_svannotate - RuntimeAttr? runtime_attr_concat_vcfs - RuntimeAttr? runtime_attr_shard_vcf + RuntimeAttr? runtime_attr_scatter_vcf RuntimeAttr? runtime_attr_subset_vcf_by_samples_list RuntimeAttr? runtime_attr_compute_AFs RuntimeAttr? runtime_attr_modify_vcf @@ -47,10 +46,10 @@ workflow AnnotateVcf { RuntimeAttr? runtime_attr_split_query_vcf RuntimeAttr? runtime_attr_bedtools_closest RuntimeAttr? runtime_attr_select_matched_svs - RuntimeAttr? runtime_attr_concat_sharded_cluster - RuntimeAttr? runtime_attr_preconcat_sharded_cluster - RuntimeAttr? runtime_attr_hail_merge_sharded_cluster - RuntimeAttr? runtime_attr_fix_header_sharded_cluster + RuntimeAttr? runtime_attr_concat + RuntimeAttr? runtime_attr_preconcat + RuntimeAttr? runtime_attr_hail_merge + RuntimeAttr? 
runtime_attr_fix_header } Array[String] contigs = read_lines(contig_list) @@ -88,17 +87,14 @@ workflow AnnotateVcf { sv_pipeline_hail_docker = sv_pipeline_hail_docker, runtime_attr_svannotate = runtime_attr_svannotate, + runtime_attr_scatter_vcf = runtime_attr_scatter_vcf, runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, runtime_attr_compute_AFs = runtime_attr_compute_AFs, runtime_attr_modify_vcf = runtime_attr_modify_vcf, runtime_attr_split_ref_bed = runtime_attr_split_ref_bed, runtime_attr_split_query_vcf = runtime_attr_split_query_vcf, runtime_attr_bedtools_closest = runtime_attr_bedtools_closest, - runtime_attr_select_matched_svs = runtime_attr_select_matched_svs, - runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster, - runtime_attr_preconcat_sharded_cluster = runtime_attr_preconcat_sharded_cluster, - runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster, - runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster + runtime_attr_select_matched_svs = runtime_attr_select_matched_svs } } @@ -115,9 +111,9 @@ workflow AnnotateVcf { sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]), - runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster, - runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster, - runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster + runtime_override_preconcat=runtime_attr_preconcat, + runtime_override_hail_merge=runtime_attr_hail_merge, + runtime_override_fix_header=runtime_attr_fix_header } } @@ -129,7 +125,7 @@ workflow AnnotateVcf { allow_overlaps=true, outfile_prefix="~{prefix}.annotated", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_concat_sharded_cluster + runtime_attr_override=runtime_attr_concat } } diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl 
index b7bd0adc1..cd667dddd 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -51,11 +51,6 @@ workflow ShardedAnnotateVcf { RuntimeAttr? runtime_attr_bedtools_closest RuntimeAttr? runtime_attr_select_matched_svs RuntimeAttr? runtime_attr_scatter_vcf - RuntimeAttr? runtime_attr_fix_ends_rescale_GQ - RuntimeAttr? runtime_attr_concat_sharded_cluster - RuntimeAttr? runtime_attr_preconcat_sharded_cluster - RuntimeAttr? runtime_attr_hail_merge_sharded_cluster - RuntimeAttr? runtime_attr_fix_header_sharded_cluster } if (defined(ref_bed)) { @@ -109,7 +104,7 @@ workflow ShardedAnnotateVcf { sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, - runtime_attr_compute_AFs = runtime_attr_compute_AFs, + runtime_attr_compute_AFs = runtime_attr_compute_AFs } if (defined(ref_bed)) { From d52ac3d0efe04615f5ea32f8677a6cc1418e0f35 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 27 Jul 2023 15:16:54 -0400 Subject: [PATCH 25/26] move PruneAndAddVafs tasks to ShardedAnnotateVcf --- wdl/AnnotateVcf.wdl | 2 +- wdl/PruneAndAddVafs.wdl | 49 --------------------- wdl/ShardedAnnotateVcf.wdl | 89 ++++++++++++++++++++++++++++++-------- 3 files changed, 73 insertions(+), 67 deletions(-) diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl index 85d802cb0..d1f5ed2ad 100644 --- a/wdl/AnnotateVcf.wdl +++ b/wdl/AnnotateVcf.wdl @@ -98,7 +98,7 @@ workflow AnnotateVcf { } } - # Concat VCF shards + # Concat VCF shards with or without hail # ShardedAnnotateVcf.sharded_annotated_vcf is is an Array[Array[File]] with one inner Array[File] of shards per contig Array[File] vcfs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf) Array[File] vcf_idxs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx) diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl index e06263aeb..e977c1468 100644 --- 
a/wdl/PruneAndAddVafs.wdl +++ b/wdl/PruneAndAddVafs.wdl @@ -60,52 +60,3 @@ workflow PruneAndAddVafs { } } -task ComputeShardAFs { - input { - File vcf - String prefix - File? sample_pop_assignments - File? ped_file - File? par_bed - File? allosomes_list - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 1.5, - disk_gb: ceil(20 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - set -euo pipefail - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \ - ~{"-p " + sample_pop_assignments} \ - ~{"-f " + ped_file} \ - ~{"-par " + par_bed} \ - ~{"--allosomes-list " + allosomes_list} \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - - tabix -p vcf "~{prefix}.wAFs.vcf.gz" - >>> - - output { - File shard_wAFs = "~{prefix}.wAFs.vcf.gz" - File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index cd667dddd..feefeebb9 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -4,7 +4,6 @@ import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "HailMerge.wdl" as HailMerge import "AnnotateFunctionalConsequences.wdl" as func -import "PruneAndAddVafs.wdl" as pav import "AnnotateExternalAFPerShard.wdl" as eaf 
# Perform annotation per contig @@ -75,11 +74,23 @@ workflow ShardedAnnotateVcf { } scatter (i in range(length(ScatterVcf.shards))) { + String shard_prefix = "~{prefix}.~{contig}.~{i}" + + if (defined(sample_keep_list)) { + call util.SubsetVcfBySamplesList { + input: + vcf = ScatterVcf.shards[i], + list_of_samples = select_first([sample_keep_list]), + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_subset_vcf_by_samples_list + } + } call func.AnnotateFunctionalConsequences { input: - vcf = ScatterVcf.shards[i], - prefix = "~{prefix}.~{contig}.~{i}", + vcf = select_first([SubsetVcfBySamplesList.vcf_subset, ScatterVcf.shards[i]]), + vcf_index = SubsetVcfBySamplesList.vcf_subset_index, + prefix = shard_prefix, protein_coding_gtf = protein_coding_gtf, noncoding_bed = noncoding_bed, promoter_window = promoter_window, @@ -89,29 +100,24 @@ workflow ShardedAnnotateVcf { runtime_attr_svannotate = runtime_attr_svannotate } - call pav.PruneAndAddVafs as PruneAndAddVafs { + # Compute AC, AN, and AF per population & sex combination + call ComputeAFs { input: vcf = AnnotateFunctionalConsequences.annotated_vcf, - vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index, - prefix = "~{prefix}.~{contig}.~{i}", - contig = contig, + prefix = shard_prefix, + sample_pop_assignments = sample_pop_assignments, ped_file = ped_file, par_bed = par_bed, - sample_keep_list = sample_keep_list, allosomes_list = allosomes_list, - sample_pop_assignments = sample_pop_assignments, - - sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list, - runtime_attr_compute_AFs = runtime_attr_compute_AFs + runtime_attr_override = runtime_attr_compute_AFs } if (defined(ref_bed)) { call eaf.AnnotateExternalAFPerShard { input: - vcf = PruneAndAddVafs.output_vcf, - vcf_idx = PruneAndAddVafs.output_vcf_idx, + vcf = ComputeAFs.af_vcf, + vcf_idx = ComputeAFs.af_vcf_idx, 
split_ref_bed_del = select_first([SplitRefBed.del]), split_ref_bed_dup = select_first([SplitRefBed.dup]), split_ref_bed_ins = select_first([SplitRefBed.ins]), @@ -131,8 +137,57 @@ workflow ShardedAnnotateVcf { } output { - Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf), PruneAndAddVafs.output_vcf]) - Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx]) + Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf), ComputeAFs.af_vcf]) + Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi), ComputeAFs.af_vcf_idx]) } } +task ComputeAFs { + input { + File vcf + String prefix + File? sample_pop_assignments + File? ped_file + File? par_bed + File? allosomes_list + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1.5, + disk_gb: ceil(20 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \ + ~{"-p " + sample_pop_assignments} \ + ~{"-f " + ped_file} \ + ~{"-par " + par_bed} \ + ~{"--allosomes-list " + allosomes_list} \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + + tabix -p vcf "~{prefix}.wAFs.vcf.gz" + >>> + + output { + File af_vcf = "~{prefix}.wAFs.vcf.gz" + File af_vcf_idx = "~{prefix}.wAFs.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, 
default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} From ae48efae999c7043d9783ede2d427cf9d7e9300e Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 27 Jul 2023 15:22:09 -0400 Subject: [PATCH 26/26] remove PruneAndAddVafs.wdl --- wdl/PruneAndAddVafs.wdl | 62 -------------------------------------- wdl/ShardedAnnotateVcf.wdl | 1 + 2 files changed, 1 insertion(+), 62 deletions(-) delete mode 100644 wdl/PruneAndAddVafs.wdl diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl deleted file mode 100644 index e977c1468..000000000 --- a/wdl/PruneAndAddVafs.wdl +++ /dev/null @@ -1,62 +0,0 @@ -# Workflow to perform final sample pruning & compute all relevant AF statistics -# for a VCF from the Talkowski SV pipeline - -version 1.0 - -import "TasksMakeCohortVcf.wdl" as MiniTasks -import "Utils.wdl" as util - -# Prune off samples in annotated VCF, add VAF annotation -workflow PruneAndAddVafs { - - input { - - File vcf - File vcf_idx - String prefix - String contig - - File? sample_pop_assignments # Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? ped_file # Used for M/F AF calculations - File? par_bed - File? allosomes_list - File? sample_keep_list # List of samples to be retained from the output vcf - - String sv_base_mini_docker - String sv_pipeline_docker - - RuntimeAttr? runtime_attr_compute_AFs - RuntimeAttr? 
runtime_attr_subset_vcf_by_samples_list - } - - # Prune VCF - if (defined(sample_keep_list)) { - call util.SubsetVcfBySamplesList { - input: - vcf = vcf, - vcf_idx = vcf_idx, - list_of_samples = select_first([sample_keep_list]), - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_subset_vcf_by_samples_list - } - } - - # Compute AC, AN, and AF per population & sex combination - call ComputeShardAFs { - input: - vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]), - prefix = prefix, - sample_pop_assignments = sample_pop_assignments, - ped_file = ped_file, - par_bed = par_bed, - allosomes_list = allosomes_list, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_attr_compute_AFs - } - - output { - File output_vcf = ComputeShardAFs.shard_wAFs - File output_vcf_idx = ComputeShardAFs.shard_wAFs_idx - } -} - diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl index feefeebb9..06a8619d9 100755 --- a/wdl/ShardedAnnotateVcf.wdl +++ b/wdl/ShardedAnnotateVcf.wdl @@ -3,6 +3,7 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "HailMerge.wdl" as HailMerge +import "Utils.wdl" as util import "AnnotateFunctionalConsequences.wdl" as func import "AnnotateExternalAFPerShard.wdl" as eaf