From 3619cf4b14d4c3091f6f481c797ea4e3a8436e1a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 15 Dec 2022 14:45:49 -0500
Subject: [PATCH 01/26] carry over Xuefang's changes to AnnotateVcf from gnomAD
 v3

---
 wdl/AnnotateExternalAF.wdl          |   8 +-
 wdl/AnnotateExternalAFperContig.wdl |  92 ++++-----
 wdl/AnnotateVcf.wdl                 | 136 ++++++-------
 wdl/ChromosomeAlleleFrequencies.wdl | 106 +++++++---
 wdl/HailMerge.wdl                   |  46 ++---
 wdl/PruneAndAddVafs.wdl             | 132 ++++++++-----
 wdl/ShardedAnnotateVcf.wdl          | 290 ++++++++++++++++++++++++++++
 wdl/TasksMakeCohortVcf.wdl          |  18 +-
 8 files changed, 604 insertions(+), 224 deletions(-)
 create mode 100755 wdl/ShardedAnnotateVcf.wdl

diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl
index b44c81a46..322a48a74 100644
--- a/wdl/AnnotateExternalAF.wdl
+++ b/wdl/AnnotateExternalAF.wdl
@@ -24,8 +24,8 @@ workflow AnnotateExternalAF {
 
         # overrides for local tasks
         RuntimeAttr? runtime_attr_modify_vcf
-        RuntimeAttr? runtime_override_combine_vcfs
-        RuntimeAttr? runtime_override_split_vcf
+        RuntimeAttr? runtime_attr_combine_vcfs
+        RuntimeAttr? runtime_attr_split_vcf
         RuntimeAttr? runtime_attr_split_ref_bed
         RuntimeAttr? runtime_attr_split_query_vcf
         RuntimeAttr? runtime_attr_bedtools_closest
@@ -70,7 +70,7 @@ workflow AnnotateExternalAF {
                 min_records_per_shard_step1 = min_records_per_shard_step1,
                 sv_base_mini_docker = sv_base_mini_docker,
                 sv_pipeline_docker = sv_pipeline_docker,
-                runtime_override_split_vcf = runtime_override_split_vcf,
+                runtime_attr_split_vcf = runtime_attr_split_vcf,
                 runtime_attr_modify_vcf = runtime_attr_modify_vcf,
                 runtime_attr_select_matched_svs = runtime_attr_select_matched_svs,
                 runtime_attr_bedtools_closest = runtime_attr_bedtools_closest
@@ -84,7 +84,7 @@ workflow AnnotateExternalAF {
         naive = true,
         outfile_prefix = "~{prefix}.annotated",
         sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_override_combine_vcfs
+        runtime_attr_override = runtime_attr_combine_vcfs
     }
 
      output {
diff --git a/wdl/AnnotateExternalAFperContig.wdl b/wdl/AnnotateExternalAFperContig.wdl
index 18fd17a9f..cd0fc233b 100644
--- a/wdl/AnnotateExternalAFperContig.wdl
+++ b/wdl/AnnotateExternalAFperContig.wdl
@@ -33,8 +33,8 @@ workflow AnnotateExternalAFperContig {
 
         # overrides for local tasks
         RuntimeAttr? runtime_attr_modify_vcf
-        RuntimeAttr? runtime_override_split_vcf
-        RuntimeAttr? runtime_override_combine_vcfs
+        RuntimeAttr? runtime_attr_split_vcf
+        RuntimeAttr? runtime_attr_combine_vcfs
         RuntimeAttr? runtime_attr_bedtools_closest
         RuntimeAttr? runtime_attr_select_matched_svs
     }
@@ -143,7 +143,7 @@ workflow AnnotateExternalAFperContig {
         n_shards=max_shards_per_chrom_step1,
         min_vars_per_shard=min_records_per_shard_step1,
         sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_override_split_vcf
+        runtime_attr_override=runtime_attr_split_vcf
     }
 
 
@@ -169,7 +169,7 @@ workflow AnnotateExternalAFperContig {
         naive = true,
         outfile_prefix = "~{contig}.annotated.vcf",
         sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_override_combine_vcfs
+        runtime_attr_override = runtime_attr_combine_vcfs
     }
 
     output {
@@ -195,16 +195,16 @@ task SplitBed {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_base_mini_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
     String prefix = basename(bed, ".bed.gz")
@@ -243,16 +243,16 @@ task SplitVcf {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
     String prefix = basename(vcf, ".vcf.gz")
@@ -297,16 +297,16 @@ task BedtoolsClosest {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
     
     command <<<
@@ -341,16 +341,16 @@ task SelectMatchedSVs {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
     String prefix = basename(input_bed, ".bed")
@@ -386,16 +386,16 @@ task SelectMatchedINSs {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
     String prefix = basename(input_bed, ".bed")
@@ -435,16 +435,16 @@ task ModifyVcf {
         boot_disk_gb: 10
     }
 
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
     
     runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        memory: "~{select_first([runtime_attr.mem_gb, runtime_default.mem_gb])} GiB"
+        disks: "local-disk ~{select_first([runtime_attr.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
         docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
     String prefix = basename(vcf,'.vcf.gz')
diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index b95e5c55c..1682e025b 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -1,16 +1,15 @@
 version 1.0
 
-import "AnnotateFunctionalConsequences.wdl" as func
-import "PruneAndAddVafs.wdl" as pav
-import "AnnotateExternalAF.wdl" as eaf
+import "Structs.wdl"
+import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf
 
 workflow AnnotateVcf {
 
   input {
-    File vcf
-    File vcf_idx
-    File contig_list
-    String prefix
+    Array[File] vcf_list
+    Array[File] vcf_idx_list
+    Array[String] contig_list
+    Array[String] prefix_list
 
     File protein_coding_gtf
     File? noncoding_bed
@@ -22,92 +21,99 @@ workflow AnnotateVcf {
     Int min_records_per_shard_step1
 
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File? prune_list              # List of samples to be excluded from the output vcf
+    File sample_list              # List of samples to be retained from the output vcf
     File? ped_file                # Used for M/F AF calculations
+    File? par_bed
+    File? allosomes_list
     Int   sv_per_shard
 
     File? ref_bed              # File with external allele frequencies
     String? ref_prefix         # prefix name for external AF call set (required if ref_bed set)
     Array[String]? population  # populations to annotate external AF for (required if ref_bed set)
 
-    String sv_base_mini_docker
+    Boolean use_hail
+    String? gcs_project
+
     String sv_pipeline_docker
+    String sv_pipeline_hail_docker
+    String sv_base_mini_docker
     String gatk_docker
 
     RuntimeAttr? runtime_attr_svannotate
     RuntimeAttr? runtime_attr_concat_vcfs
-    RuntimeAttr? runtime_attr_prune_vcf
     RuntimeAttr? runtime_attr_shard_vcf
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_modify_vcf
-    RuntimeAttr? runtime_override_combine_vcfs
-    RuntimeAttr? runtime_override_split_vcf
+    # runtime_override_combine_vcfs is folded into runtime_attr_combine_vcfs, which is already declared above
+    RuntimeAttr? runtime_attr_split_vcf
     RuntimeAttr? runtime_attr_split_ref_bed
     RuntimeAttr? runtime_attr_split_query_vcf
     RuntimeAttr? runtime_attr_bedtools_closest
     RuntimeAttr? runtime_attr_select_matched_svs
+    RuntimeAttr? runtime_attr_concat_sharded_cluster
+    RuntimeAttr? runtime_attr_preconcat_sharded_cluster
+    RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
+    RuntimeAttr? runtime_attr_fix_header_sharded_cluster
+    RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
   }
 
-  call func.AnnotateFunctionalConsequences {
-    input:
-      vcf = vcf,
-      vcf_index = vcf_idx,
-      prefix = prefix,
-      protein_coding_gtf = protein_coding_gtf,
-      noncoding_bed = noncoding_bed,
-      promoter_window = promoter_window,
-      max_breakend_as_cnv_length = max_breakend_as_cnv_length,
-      additional_args = svannotate_additional_args,
-      gatk_docker = gatk_docker,
-      runtime_attr_svannotate = runtime_attr_svannotate
-  }
-
-  call pav.PruneAndAddVafs as PruneAndAddVafs {
-    input:
-      vcf                    = AnnotateFunctionalConsequences.annotated_vcf,
-      vcf_idx                = AnnotateFunctionalConsequences.annotated_vcf_index,
-      prefix                 = prefix,
-      sample_pop_assignments = sample_pop_assignments,
-      prune_list             = prune_list,
-      ped_file               = ped_file,
-      sv_per_shard           = sv_per_shard,
-      contig_list            = contig_list,
-      sv_base_mini_docker     = sv_base_mini_docker,
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_prune_vcf    = runtime_attr_prune_vcf,
-      runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
-      runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
-      runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
-      runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs
-  }
-
-  if (defined(ref_bed)) {
-    call eaf.AnnotateExternalAF as AnnotateExternalAF {
+  scatter (i in range(length(vcf_list))) {
+    call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{
       input:
-        vcf     = PruneAndAddVafs.output_vcf,
-        vcf_idx = PruneAndAddVafs.output_vcf_idx,
-        ref_bed = select_first([ref_bed]),
-        population = select_first([population]),
-        ref_prefix = select_first([ref_prefix]),
-        prefix = prefix,
-        contigs = read_lines(contig_list),
+        vcf = vcf_list[i],
+        vcf_idx = vcf_idx_list[i],
+        contig = contig_list[i],
+        prefix = prefix_list[i],
+        protein_coding_gtf = protein_coding_gtf,
+        noncoding_bed = noncoding_bed,
+        promoter_window = promoter_window,
+        svannotate_additional_args = svannotate_additional_args,
+        max_breakend_as_cnv_length = max_breakend_as_cnv_length,
+
         max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
         min_records_per_shard_step1 = min_records_per_shard_step1,
-        sv_base_mini_docker = sv_base_mini_docker,
+        sample_pop_assignments = sample_pop_assignments,
+        sample_list = sample_list,
+        ped_file = ped_file,
+        par_bed = par_bed,
+        sv_per_shard = sv_per_shard,
+        allosomes_list = allosomes_list,
+
+        ref_bed = ref_bed,
+        ref_prefix = ref_prefix,
+        population = population,
+
+        use_hail = use_hail,
+        gcs_project = gcs_project,
+
+        gatk_docker = gatk_docker,
         sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_modify_vcf = runtime_attr_modify_vcf,
-        runtime_override_split_vcf = runtime_override_split_vcf,
-        runtime_override_combine_vcfs = runtime_override_combine_vcfs,
-        runtime_attr_split_ref_bed = runtime_attr_split_ref_bed,
-        runtime_attr_split_query_vcf = runtime_attr_split_query_vcf,
-        runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
-        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs
+        sv_base_mini_docker = sv_base_mini_docker,
+        sv_pipeline_hail_docker = sv_pipeline_hail_docker,
+
+        runtime_attr_svannotate = runtime_attr_svannotate ,
+        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs  ,
+        runtime_attr_shard_vcf  = runtime_attr_shard_vcf  ,
+        runtime_attr_compute_AFs  = runtime_attr_compute_AFs  ,
+        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs ,
+        runtime_attr_modify_vcf = runtime_attr_modify_vcf ,
+        # runtime_attr_combine_vcfs is bound once above; it also replaces the former runtime_override_combine_vcfs
+        runtime_attr_split_vcf  = runtime_attr_split_vcf  ,
+        runtime_attr_split_ref_bed  = runtime_attr_split_ref_bed  ,
+        runtime_attr_split_query_vcf  = runtime_attr_split_query_vcf  ,
+        runtime_attr_bedtools_closest = runtime_attr_bedtools_closest ,
+        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs ,
+        runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster ,
+        runtime_attr_preconcat_sharded_cluster  = runtime_attr_preconcat_sharded_cluster  ,
+        runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster ,
+        runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster ,
+        runtime_attr_get_vcf_header_with_members_info_line  = runtime_attr_get_vcf_header_with_members_info_line
     }
   }
 
   output {
-    File output_vcf     = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf])
-    File output_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx])
+    Array[File] output_vcf_list     = ShardedAnnotateVcf.output_vcf
+    Array[File] output_vcf_idx_list = ShardedAnnotateVcf.output_vcf_idx
   }
-}
+}
\ No newline at end of file
diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl
index 4e8ab434d..6e7397a2e 100644
--- a/wdl/ChromosomeAlleleFrequencies.wdl
+++ b/wdl/ChromosomeAlleleFrequencies.wdl
@@ -11,12 +11,14 @@ workflow ChromosomeAlleleFrequencies {
 
     File   vcf
     File   vcf_idx
-    Int    sv_per_shard
     String contig
     String prefix
 
     File? sample_pop_assignments   # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
     File? ped_file                 # Used for M/F AF calculations
+    File? par_bed
+    File? allosomes_list
+
 
     String sv_pipeline_docker
     String sv_base_mini_docker
@@ -27,43 +29,24 @@ workflow ChromosomeAlleleFrequencies {
   }
 
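-  # Tabix to chromosome of interest, and shard input VCF for stats collection
+  # The input VCF is not sharded in this workflow; allele frequencies are computed on it directly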
-  call ShardVcf {
-    input:
-      vcf          = vcf,
-      vcf_idx      = vcf_idx,
-      contig       = contig,
-      sv_per_shard = sv_per_shard,
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override = runtime_attr_shard_vcf
-  }
-
-  # Scatter over VCF shards
+  # Compute AC, AN, and AF with a single task call
-  scatter ( shard in ShardVcf.shard_vcfs ) {
-    # Collect AF summary stats
-    call ComputeShardAlleleFrequencies {
+  call ComputeShardAFs {
       input:
-        vcf                    = shard,
-        prefix                 = "${prefix}.${contig}",
+        vcf = vcf,
+        prefix = "${prefix}.${contig}",
         sample_pop_assignments = sample_pop_assignments,
-        ped_file               = ped_file,
+        ped_file = ped_file,
+        par_bed  = par_bed,
+        allosomes_list = allosomes_list,
         sv_pipeline_docker = sv_pipeline_docker,
         runtime_attr_override = runtime_attr_compute_AFs
-    }
   }
 
-  # Merge shards into single VCF
-  call CombineShardedVcfs {
-    input:
-      vcfs   = ComputeShardAlleleFrequencies.shard_wAFs,
-      prefix = "${prefix}.${contig}",
-      sv_base_mini_docker = sv_base_mini_docker,
-      runtime_attr_override = runtime_attr_combine_vcfs
-  }
 
   # Final output
   output {
-    File vcf_wAFs = CombineShardedVcfs.vcf_out
-    File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx
+    File vcf_wAFs = ComputeShardAFs.shard_wAFs
+    File vcf_wAFs_idx = ComputeShardAFs.shard_wAFs_idx
   }
 }
 
@@ -142,7 +125,8 @@ task ComputeShardAlleleFrequencies {
   }
   
   output {
-    File shard_wAFs = "${prefix}.wAFs.vcf.gz"
+    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
+    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
   }
 
   command <<<
@@ -164,6 +148,8 @@ task ComputeShardAlleleFrequencies {
     /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \
       | bgzip -c \
       > "~{prefix}.wAFs.vcf.gz"
+
+    tabix -p vcf ~{prefix}.wAFs.vcf.gz
   
   >>>
   
@@ -187,6 +173,68 @@ task ComputeShardAlleleFrequencies {
   }
 }
 
+# Annotate one VCF with AC, AN, and AF via compute_AFs.py; emits a bgzipped VCF plus its tabix index
+task ComputeShardAFs {
+  input {
+    File vcf
+    String prefix
+    String sv_pipeline_docker
+    File? sample_pop_assignments
+    File? ped_file
+    File? par_bed
+    File? allosomes_list
+    RuntimeAttr? runtime_attr_override
+  }
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1,
+    mem_gb: 1.5,
+    disk_gb: ceil(20 + size(vcf, "GB") * 2),
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  command <<<
+    set -euo pipefail
+    optionals=" "
+    if [ "~{default="SKIP" sample_pop_assignments}" != "SKIP" ]; then
+      optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}"
+    fi
+    if [ "~{default="SKIP" ped_file}" != "SKIP" ]; then
+      optionals="$( echo "$optionals" ) -f ~{ped_file}"
+    fi
+    if [ "~{default="SKIP" par_bed}" != "SKIP" ]; then
+      optionals="$( echo "$optionals" ) --par ~{par_bed}"
+    fi
+    if [ "~{default="SKIP" allosomes_list}" != "SKIP" ]; then
+      optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}"
+    fi
+    echo -e "OPTIONALS INTERPRETED AS: $optionals"
+    echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout"
+    # Compute AN, AC, and AF; $optionals stays unquoted so it word-splits into separate flags
+    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \
+      | bgzip -c > "~{prefix}.wAFs.vcf.gz"
+    # Index the output so the .tbi declared in the output section exists
+    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
+  >>>
+
+  output {
+    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
+    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
+  }
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}
+
 # Merge VCF shards
 task CombineShardedVcfs {
 
diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl
index 1d1ef7498..9571da1fa 100644
--- a/wdl/HailMerge.wdl
+++ b/wdl/HailMerge.wdl
@@ -12,9 +12,9 @@ workflow HailMerge {
     String sv_base_mini_docker
     String sv_pipeline_docker
     String sv_pipeline_hail_docker
-    RuntimeAttr? runtime_override_preconcat
-    RuntimeAttr? runtime_override_hail_merge
-    RuntimeAttr? runtime_override_fix_header
+    RuntimeAttr? runtime_attr_preconcat
+    RuntimeAttr? runtime_attr_hail_merge
+    RuntimeAttr? runtime_attr_fix_header
   }
 
   # Concatenate vcfs naively to prevent ClassTooLargeException in Hail
@@ -26,27 +26,27 @@ workflow HailMerge {
         generate_index=false,
         outfile_prefix="~{prefix}.preconcat",
         sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_override_preconcat
+        runtime_attr_override=runtime_attr_preconcat
     }
   }
 
-  call HailMergeTask {
+  call HailMerge {
     input:
       vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])],
       prefix = prefix,
       gcs_project = select_first([gcs_project]),
       sv_pipeline_hail_docker=sv_pipeline_hail_docker,
-      runtime_attr_override=runtime_override_hail_merge
+      runtime_attr_override=runtime_attr_hail_merge
   }
 
   call FixHeader {
     input:
-      merged_vcf = HailMergeTask.merged_vcf,
+      merged_vcf = HailMerge.merged_vcf,
       example_vcf = vcfs[0],
       prefix = prefix + ".reheadered",
       reset_cnv_gts = select_first([reset_cnv_gts, false]),
       sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override=runtime_override_fix_header
+      runtime_attr_override=runtime_attr_fix_header
   }
 
   output {
@@ -55,7 +55,7 @@ workflow HailMerge {
   }
 }
 
-task HailMergeTask {
+task HailMerge {
   input {
     Array[File] vcfs
     String prefix
@@ -81,15 +81,15 @@ task HailMergeTask {
                                   max_retries: 1,
                                   boot_disk_gb: 10
                                 }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
   runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+    memory: select_first([runtime_attr.mem_gb, runtime_default.mem_gb]) + " GB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, runtime_default.disk_gb]) + " SSD"
+    cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+    preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
     docker: sv_pipeline_hail_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
   }
 
   command <<<
@@ -155,15 +155,15 @@ task FixHeader {
                                   max_retries: 1,
                                   boot_disk_gb: 10
                                 }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, runtime_default])
   runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+    memory: select_first([runtime_attr.mem_gb, runtime_default.mem_gb]) + " GB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, runtime_default.disk_gb]) + " SSD"
+    cpu: select_first([runtime_attr.cpu_cores, runtime_default.cpu_cores])
+    preemptible: select_first([runtime_attr.preemptible_tries, runtime_default.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, runtime_default.max_retries])
     docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
   }
 
   command <<<
diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index 939744905..10178b7e5 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -13,73 +13,57 @@ workflow PruneAndAddVafs {
 
     File   vcf
     File   vcf_idx
-    File   contig_list
-    Int    sv_per_shard
     String prefix
+    String contig
 
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File? prune_list              # List of samples to be excluded from the output vcf
     File? ped_file                # Used for M/F AF calculations
+    File? par_bed
+    File? allosomes_list
+    File sample_list              # List of samples to be retained from the output vcf
 
     String sv_base_mini_docker
     String sv_pipeline_docker
 
-    RuntimeAttr? runtime_attr_prune_vcf
     RuntimeAttr? runtime_attr_shard_vcf
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_concat_vcfs
+    RuntimeAttr? runtime_attr_extract_subset_samples_from_vcf
   }
-
-  Array[Array[String]] contigs = read_tsv(contig_list)
-
-  # Iterate over chromosomes
-  scatter (contig in contigs) {
-    
-    # Prune VCF
-    call PruneVcf {
-      input:
-
-        vcf        = vcf,
-        vcf_idx    = vcf_idx,
-        contig     = contig[0],
-        prune_list = prune_list,
-        prefix     = prefix,
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_prune_vcf
-    }
-
-    # Compute AC, AN, and AF per population & sex combination
-    call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies {
-      input:
-        vcf                    = PruneVcf.pruned_vcf,
-        vcf_idx                = PruneVcf.pruned_vcf_idx,
-        contig                 = contig[0],
-        sv_per_shard           = sv_per_shard,
-        prefix                 = prefix,
-        sample_pop_assignments = sample_pop_assignments,
-        ped_file               = ped_file,
-        sv_base_mini_docker    = sv_base_mini_docker,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
-        runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs
-    }
+  
+  # Prune VCF
+  call ExtractSubsetSamples {
+    input:
+      vcf        = vcf,
+      vcf_idx    = vcf_idx,
+      sample_list = sample_list,
+      midfix = prefix,
+      sv_pipeline_docker = sv_pipeline_docker,
+      runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf
   }
 
-  # Merge pruned VCFs with allele info
-  call MiniTasks.ConcatVcfs as ConcatVcfs{
+  # Compute AC, AN, and AF per population & sex combination
+  call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies {
     input:
-      vcfs = ChromosomeAlleleFrequencies.vcf_wAFs,
-      vcfs_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx,
-      outfile_prefix = "${prefix}.pruned_wAFs",
-      sv_base_mini_docker = sv_base_mini_docker,
-      runtime_attr_override = runtime_attr_concat_vcfs
+      vcf                    = ExtractSubsetSamples.out_vcf,
+      vcf_idx                = ExtractSubsetSamples.out_vcf_idx,
+      contig                 = contig,
+      prefix                 = prefix,
+      sample_pop_assignments = sample_pop_assignments,
+      ped_file               = ped_file,
+      par_bed                = par_bed,
+      allosomes_list         = allosomes_list,
+      sv_base_mini_docker    = sv_base_mini_docker,
+      sv_pipeline_docker = sv_pipeline_docker,
+      runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
+      runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
+      runtime_attr_combine_vcfs = runtime_attr_combine_vcfs
   }
 
   output {
-    File output_vcf     = ConcatVcfs.concat_vcf
-    File output_vcf_idx = ConcatVcfs.concat_vcf_idx
+    File output_vcf     = ChromosomeAlleleFrequencies.vcf_wAFs
+    File output_vcf_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx
   }
 }
 
@@ -121,7 +105,7 @@ task PruneVcf {
         | fgrep -wf ~{prune_list} \
         | cut -f1 | paste -s -d, )
       zcat ~{contig}.vcf.gz \
-        | cut --complement -f"$dropidx" \
+        | cut --complement -f "$dropidx" \
         | bgzip -c \
         > "~{prefix}.~{contig}.pruned.vcf.gz"
     else
@@ -152,3 +136,53 @@ task PruneVcf {
     docker:                 sv_base_mini_docker
   }
 }
+
+task ExtractSubsetSamples {
+    input {
+        File vcf
+        File vcf_idx
+        File sample_list
+        String midfix
+        String sv_pipeline_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+
+    Float input_size = size(vcf, "GB")
+    Float base_disk_gb = 10.0
+    RuntimeAttr runtime_default = object {
+            mem_gb: 3,
+            disk_gb: ceil(base_disk_gb + (input_size * 2.0)),
+            cpu_cores: 1,
+            preemptible_tries: 3,
+            max_retries: 1,
+            boot_disk_gb: 10
+    }
+    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    runtime {
+            memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
+            disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
+            cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+            preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+            maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+            docker: sv_pipeline_docker
+            bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    }
+
+    String prefix = basename(vcf, '.vcf.gz')
+    command <<<
+        set -eu -o pipefail
+
+        bcftools view -S ~{sample_list} ~{vcf} \
+        | bgzip > ~{prefix}.~{midfix}.vcf.gz
+
+        tabix -p vcf ~{prefix}.~{midfix}.vcf.gz
+
+    >>>
+
+    output {
+        File out_vcf = "~{prefix}.~{midfix}.vcf.gz"
+        File out_vcf_idx = "~{prefix}.~{midfix}.vcf.gz.tbi"
+    }
+}
+
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
new file mode 100755
index 000000000..0ba0a2f45
--- /dev/null
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -0,0 +1,290 @@
+version 1.0
+
+import "Structs.wdl"
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+import "HailMerge.wdl" as HailMerge
+import "AnnotateFunctionalConsequences.wdl" as func
+import "PruneAndAddVafs.wdl" as pav
+import "AnnotateExternalAF.wdl" as eaf
+
+workflow ShardedAnnotateVcf {
+
+  input {
+    File vcf
+    File vcf_idx
+    String prefix
+    String contig
+
+    File protein_coding_gtf
+    File? noncoding_bed
+    Int? promoter_window
+    Int? max_breakend_as_cnv_length
+    String? svannotate_additional_args
+
+    Int max_shards_per_chrom_step1
+    Int min_records_per_shard_step1
+
+    File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
+    File sample_list
+    File? ped_file                # Used for M/F AF calculations
+    File? par_bed
+    File? allosomes_list
+    Int   sv_per_shard
+
+    File? ref_bed              # File with external allele frequencies
+    String? ref_prefix         # prefix name for external AF call set (required if ref_bed set)
+    Array[String]? population  # populations to annotate external AF for (required if ref_bed set)
+
+    Boolean use_hail
+    String? gcs_project
+
+    String sv_pipeline_docker
+    String sv_pipeline_hail_docker
+    String sv_base_mini_docker
+    String gatk_docker
+
+    RuntimeAttr? runtime_attr_svannotate
+    RuntimeAttr? runtime_attr_concat_vcfs
+    RuntimeAttr? runtime_attr_shard_vcf
+    RuntimeAttr? runtime_attr_compute_AFs
+    RuntimeAttr? runtime_attr_combine_vcfs
+    RuntimeAttr? runtime_attr_modify_vcf
+    # runtime_attr_combine_vcfs (declared above) is shared by PruneAndAddVafs and AnnotateExternalAF
+    RuntimeAttr? runtime_attr_split_vcf
+    RuntimeAttr? runtime_attr_split_ref_bed
+    RuntimeAttr? runtime_attr_split_query_vcf
+    RuntimeAttr? runtime_attr_bedtools_closest
+    RuntimeAttr? runtime_attr_select_matched_svs
+    RuntimeAttr? runtime_attr_scatter_vcf
+    RuntimeAttr? runtime_attr_fix_ends_rescale_GQ
+    RuntimeAttr? runtime_attr_concat_sharded_cluster
+    RuntimeAttr? runtime_attr_preconcat_sharded_cluster
+    RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
+    RuntimeAttr? runtime_attr_fix_header_sharded_cluster
+    RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
+  }
+
+  call MiniTasks.ScatterVcf{
+    input:
+      vcf = vcf,
+      prefix = prefix,
+      records_per_shard = sv_per_shard,
+      sv_pipeline_docker = sv_pipeline_docker,
+      runtime_attr_override = runtime_attr_scatter_vcf
+  }
+
+  scatter (i in range(length(ScatterVcf.shards))) {
+
+    call FixEndsRescaleGQ {
+      input:
+        vcf = ScatterVcf.shards[i],
+        prefix = "~{prefix}.~{i}",
+        sv_pipeline_docker = sv_pipeline_docker,
+        runtime_attr_override = runtime_attr_fix_ends_rescale_GQ
+      }
+
+    call func.AnnotateFunctionalConsequences {
+      input:
+        vcf = FixEndsRescaleGQ.out,
+        vcf_index = FixEndsRescaleGQ.out_idx,
+        prefix = "~{prefix}.~{i}",
+        protein_coding_gtf = protein_coding_gtf,
+        noncoding_bed = noncoding_bed,
+        promoter_window = promoter_window,
+        max_breakend_as_cnv_length = max_breakend_as_cnv_length,
+        additional_args = svannotate_additional_args,
+        gatk_docker = gatk_docker,
+        runtime_attr_svannotate = runtime_attr_svannotate
+    }
+
+    call pav.PruneAndAddVafs as PruneAndAddVafs {
+      input:
+        vcf                    = AnnotateFunctionalConsequences.annotated_vcf,
+        vcf_idx                = AnnotateFunctionalConsequences.annotated_vcf_index,
+        prefix                 = prefix,
+        contig                 = contig,
+        ped_file               = ped_file,
+        par_bed                = par_bed,
+        sample_list            = sample_list,
+        allosomes_list         = allosomes_list,
+        sample_pop_assignments = sample_pop_assignments,
+
+        sv_base_mini_docker     = sv_base_mini_docker,
+        sv_pipeline_docker = sv_pipeline_docker,
+        runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
+        runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
+        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
+        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs
+    }
+
+    if (defined(ref_bed)) {
+      call eaf.AnnotateExternalAF as AnnotateExternalAF {
+        input:
+          vcf     = PruneAndAddVafs.output_vcf,
+          vcf_idx = PruneAndAddVafs.output_vcf_idx,
+          ref_bed = select_first([ref_bed]),
+          population = select_first([population]),
+          ref_prefix = select_first([ref_prefix]),
+          prefix = prefix,
+          contigs = [contig],
+          max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
+          min_records_per_shard_step1 = min_records_per_shard_step1,
+          sv_base_mini_docker = sv_base_mini_docker,
+          sv_pipeline_docker = sv_pipeline_docker,
+          runtime_attr_modify_vcf = runtime_attr_modify_vcf,
+          runtime_attr_split_vcf = runtime_attr_split_vcf,
+          runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
+          runtime_attr_split_ref_bed = runtime_attr_split_ref_bed,
+          runtime_attr_split_query_vcf = runtime_attr_split_query_vcf,
+          runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
+          runtime_attr_select_matched_svs = runtime_attr_select_matched_svs
+      }
+    }
+
+  }
+
+  # Use the external-AF-annotated shards when ref_bed is provided (AnnotateExternalAF only runs in that
+  # case, so its per-shard outputs are optional); otherwise fall back to the PruneAndAddVafs outputs.
+  Array[File] sharded_annotated_vcf = if defined(ref_bed) then select_all(AnnotateExternalAF.annotated_vcf) else PruneAndAddVafs.output_vcf
+  Array[File] sharded_annotated_vcf_idx = if defined(ref_bed) then select_all(AnnotateExternalAF.annotated_vcf_tbi) else PruneAndAddVafs.output_vcf_idx
+
+
+  if (length(sharded_annotated_vcf) == 0) {
+    call MiniTasks.GetVcfHeaderWithMembersInfoLine as GetVcfHeader_annotated {
+      input:
+        vcf_gz=vcf,
+        prefix="~{prefix}.annotated",
+        sv_base_mini_docker=sv_base_mini_docker,
+        runtime_attr_override=runtime_attr_get_vcf_header_with_members_info_line
+    }
+  }
+
+  if (length(sharded_annotated_vcf) > 0) {
+    if (use_hail) {
+      call HailMerge.HailMerge as ConcatVcfsHail_annotated {
+        input:
+          vcfs=sharded_annotated_vcf,
+          prefix="~{prefix}.annotated",
+          gcs_project=gcs_project,
+          sv_base_mini_docker=sv_base_mini_docker,
+          sv_pipeline_docker=sv_pipeline_docker,
+          sv_pipeline_hail_docker=sv_pipeline_hail_docker,
+          runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster,
+          runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster,
+          runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster
+      }
+    }
+
+    if (!use_hail) {
+      call MiniTasks.ConcatVcfs as ConcatVcfs_annotated {
+        input:
+          vcfs=sharded_annotated_vcf,
+          vcfs_idx=sharded_annotated_vcf_idx,
+          allow_overlaps=true,
+          outfile_prefix="~{prefix}.annotated",
+          sv_base_mini_docker=sv_base_mini_docker,
+          runtime_attr_override=runtime_attr_concat_sharded_cluster
+      }
+    }
+
+  }
+
+
+  output {
+    File output_vcf = select_first([GetVcfHeader_annotated.out, ConcatVcfs_annotated.concat_vcf, ConcatVcfsHail_annotated.merged_vcf])
+    File output_vcf_idx = select_first([GetVcfHeader_annotated.out_idx, ConcatVcfs_annotated.concat_vcf_idx, ConcatVcfsHail_annotated.merged_vcf_index])
+  }
+}
+
+
+#function to fix BND, CTX, CPX, INS that have END and END2 represent the breakpoint on the 2nd chromosome
+#Note: this is a temp function for the first beta version of gnomad SV callset. It'll be revised and added as part of the manunal revise / clean up script
+task FixEndsRescaleGQ {
+  input {
+    File vcf
+    String prefix
+
+    Boolean? fix_ends
+    Boolean? rescale_gq
+
+    String sv_pipeline_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1,
+    mem_gb: 3.75,
+    disk_gb: ceil(10 + size(vcf, "GB") * 2),
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  String outfile = "~{prefix}.vcf.gz"
+  Boolean fix_ends_ = select_first([fix_ends, true])
+  Boolean rescale_gq_ = select_first([rescale_gq, true])
+
+  output {
+    File out = "~{outfile}"
+    File out_idx = "~{outfile}.tbi"
+  }
+  command <<<
+
+    set -euo pipefail
+
+    python <<CODE
+    import pysam
+    import argparse
+    from math import floor
+
+
+    GQ_FIELDS = ["GQ", "PE_GQ", "SR_GQ", "RD_GQ"]
+
+    filts_for_info = 'PESR_GT_OVERDISPERSION HIGH_SR_BACKGROUND BOTHSIDES_SUPPORT VARIABLE_ACROSS_BATCHES'.split(' ')
+    filts_to_remove = 'HIGH_PCRPLUS_NOCALL_RATE HIGH_PCRMINUS_NOCALL_RATE'.split(' ')
+    filts_to_remove = filts_to_remove + filts_for_info
+
+    def fix_bad_end(record):
+      # pysam converts to 0-based half-open intervals by subtracting 1 from start, but END is unaltered
+      if record.stop < record.start + 2:
+        if record.info["SVTYPE"] == "BND" or record.info["SVTYPE"] == "CTX":
+          record.info["END2"] = record.stop  # just in case it is not already set. not needed for INS or CPX
+        record.stop = record.start + 1
+
+    def rescale_gq(record):
+      for sample in record.samples:
+        for gq_field in GQ_FIELDS:
+          if gq_field in record.samples[sample] and record.samples[sample][gq_field] is not None:
+            record.samples[sample][gq_field] = floor(record.samples[sample][gq_field] / 10)
+
+
+    with pysam.VariantFile("~{vcf}", 'r') as f_in, pysam.VariantFile("~{outfile}", 'w', header=f_in.header) as f_out:
+      for record in f_in:
+        newfilts = [filt for filt in record.filter if filt not in filts_to_remove]
+        record.filter.clear()
+        for filt in newfilts:
+            record.filter.add(filt)
+        if len(record.filter) == 0:
+            record.filter.add('PASS')
+        if "~{fix_ends_}" == "true":
+          fix_bad_end(record)
+        if "~{rescale_gq_}" == "true":
+          rescale_gq(record)
+        f_out.write(record)
+
+    CODE
+    tabix ~{outfile}
+
+  >>>
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}
+
diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl
index 1cf9237d7..0fc76b120 100644
--- a/wdl/TasksMakeCohortVcf.wdl
+++ b/wdl/TasksMakeCohortVcf.wdl
@@ -959,19 +959,19 @@ task RenameVariantIds {
 task ScatterVcf {
   input {
     File vcf
-    File? vcf_index
     String prefix
     Int records_per_shard
     Int? threads = 1
-    String? contig
     String sv_pipeline_docker
     RuntimeAttr? runtime_attr_override
   }
 
   Float input_size = size(vcf, "GB")
+  Float base_disk_gb = 10.0
+
   RuntimeAttr runtime_default = object {
                                   mem_gb: 3.75,
-                                  disk_gb: ceil(10.0 + input_size * 5.0),
+                                  disk_gb: ceil(base_disk_gb + input_size * 5.0),
                                   cpu_cores: 2,
                                   preemptible_tries: 3,
                                   max_retries: 1,
@@ -991,18 +991,20 @@ task ScatterVcf {
   command <<<
     set -euo pipefail
     # in case the file is empty create an empty shard
-    bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz"
-    bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig}
+    bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz
+    bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. --threads ~{threads} -n ~{records_per_shard}
 
-    ls "~{prefix}".*.vcf.gz | sort -k1,1V > vcfs.list
+    ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list
     i=0
-    while read VCF; do
+    while read vcf; do
       shard_no=`printf %06d $i`
-      mv "$VCF" "~{prefix}.shard_${shard_no}.vcf.gz"
+      mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz
+      tabix -p vcf ~{prefix}.shard_${shard_no}.vcf.gz
       i=$((i+1))
     done < vcfs.list
   >>>
   output {
     Array[File] shards = glob("~{prefix}.shard_*.vcf.gz")
+    Array[File] shards_idx = glob("~{prefix}.shard_*.vcf.gz.tbi")
   }
 }

From e8562c96a04506ff11a0dbab4dc7ddbcf9edc66c Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 15 Dec 2022 19:12:01 -0500
Subject: [PATCH 02/26] remove gnomAD-specific END fix and make contigs list a
 file

---
 wdl/AnnotateVcf.wdl        |   6 ++-
 wdl/ShardedAnnotateVcf.wdl | 106 ++-----------------------------------
 2 files changed, 7 insertions(+), 105 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 1682e025b..501b6736a 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -8,7 +8,7 @@ workflow AnnotateVcf {
   input {
     Array[File] vcf_list
     Array[File] vcf_idx_list
-    Array[String] contig_list
+    File contig_list
     Array[String] prefix_list
 
     File protein_coding_gtf
@@ -58,12 +58,14 @@ workflow AnnotateVcf {
     RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
   }
 
+  Array[String] contigs = read_lines(contig_list)
+
   scatter (i in range(length(vcf_list))) {
     call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{
       input:
         vcf = vcf_list[i],
         vcf_idx = vcf_idx_list[i],
-        contig = contig_list[i],
+        contig = contigs[i],
         prefix = prefix_list[i],
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 0ba0a2f45..aa1ddea2e 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -64,7 +64,7 @@ workflow ShardedAnnotateVcf {
     RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
   }
 
-  call MiniTasks.ScatterVcf{
+  call MiniTasks.ScatterVcf {
     input:
       vcf = vcf,
       prefix = prefix,
@@ -75,18 +75,10 @@ workflow ShardedAnnotateVcf {
 
   scatter (i in range(length(ScatterVcf.shards))) {
 
-    call FixEndsRescaleGQ {
-      input:
-        vcf = ScatterVcf.shards[i],
-        prefix = "~{prefix}.~{i}",
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_fix_ends_rescale_GQ
-      }
-
     call func.AnnotateFunctionalConsequences {
       input:
-        vcf = FixEndsRescaleGQ.out,
-        vcf_index = FixEndsRescaleGQ.out_idx,
+        vcf = ScatterVcf.shards[i],
+        vcf_index = ScatterVcf.shards_idx[i],
         prefix = "~{prefix}.~{i}",
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
@@ -196,95 +188,3 @@ workflow ShardedAnnotateVcf {
   }
 }
 
-
-#function to fix BND, CTX, CPX, INS that have END and END2 represent the breakpoint on the 2nd chromosome
-#Note: this is a temp function for the first beta version of gnomad SV callset. It'll be revised and added as part of the manunal revise / clean up script
-task FixEndsRescaleGQ {
-  input {
-    File vcf
-    String prefix
-
-    Boolean? fix_ends
-    Boolean? rescale_gq
-
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 3.75,
-    disk_gb: ceil(10 + size(vcf, "GB") * 2),
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-
-  String outfile = "~{prefix}.vcf.gz"
-  Boolean fix_ends_ = select_first([fix_ends, true])
-  Boolean rescale_gq_ = select_first([rescale_gq, true])
-
-  output {
-    File out = "~{outfile}"
-    File out_idx = "~{outfile}.tbi"
-  }
-  command <<<
-
-    set -euo pipefail
-
-    python <<CODE
-    import pysam
-    import argparse
-    from math import floor
-
-
-    GQ_FIELDS = ["GQ", "PE_GQ", "SR_GQ", "RD_GQ"]
-
-    filts_for_info = 'PESR_GT_OVERDISPERSION HIGH_SR_BACKGROUND BOTHSIDES_SUPPORT VARIABLE_ACROSS_BATCHES'.split(' ')
-    filts_to_remove = 'HIGH_PCRPLUS_NOCALL_RATE HIGH_PCRMINUS_NOCALL_RATE'.split(' ')
-    filts_to_remove = filts_to_remove + filts_for_info
-
-    def fix_bad_end(record):
-      # pysam converts to 0-based half-open intervals by subtracting 1 from start, but END is unaltered
-      if record.stop < record.start + 2:
-        if record.info["SVTYPE"] == "BND" or record.info["SVTYPE"] == "CTX":
-          record.info["END2"] = record.stop  # just in case it is not already set. not needed for INS or CPX
-        record.stop = record.start + 1
-
-    def rescale_gq(record):
-      for sample in record.samples:
-        for gq_field in GQ_FIELDS:
-          if gq_field in record.samples[sample] and record.samples[sample][gq_field] is not None:
-            record.samples[sample][gq_field] = floor(record.samples[sample][gq_field] / 10)
-
-
-    with pysam.VariantFile("~{vcf}", 'r') as f_in, pysam.VariantFile("~{outfile}", 'w', header=f_in.header) as f_out:
-      for record in f_in:
-        newfilts = [filt for filt in record.filter if filt not in filts_to_remove]
-        record.filter.clear()
-        for filt in newfilts:
-            record.filter.add(filt)
-        if len(record.filter) == 0:
-            record.filter.add('PASS')
-        if "~{fix_ends_}" == "true":
-          fix_bad_end(record)
-        if "~{rescale_gq_}" == "true":
-          rescale_gq(record)
-        f_out.write(record)
-
-    CODE
-    tabix ~{outfile}
-
-  >>>
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
-

From d46986f806649e990d846a667a640a0cec11694d Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 15 Dec 2022 19:15:53 -0500
Subject: [PATCH 03/26] make hail docker optional

---
 wdl/AnnotateVcf.wdl        | 2 +-
 wdl/ShardedAnnotateVcf.wdl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 501b6736a..3555f2804 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -35,7 +35,7 @@ workflow AnnotateVcf {
     String? gcs_project
 
     String sv_pipeline_docker
-    String sv_pipeline_hail_docker
+    String? sv_pipeline_hail_docker
     String sv_base_mini_docker
     String gatk_docker
 
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index aa1ddea2e..b6878c399 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -39,7 +39,7 @@ workflow ShardedAnnotateVcf {
     String? gcs_project
 
     String sv_pipeline_docker
-    String sv_pipeline_hail_docker
+    String? sv_pipeline_hail_docker
     String sv_base_mini_docker
     String gatk_docker
 
@@ -160,7 +160,7 @@ workflow ShardedAnnotateVcf {
           gcs_project=gcs_project,
           sv_base_mini_docker=sv_base_mini_docker,
           sv_pipeline_docker=sv_pipeline_docker,
-          sv_pipeline_hail_docker=sv_pipeline_hail_docker,
+          sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
           runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster,
           runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster,
           runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster

From 95fc0260bba9884fcd981f1e8139ff6c568fe4ef Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 15 Dec 2022 19:25:32 -0500
Subject: [PATCH 04/26] make prune vcf samples step & input optional

---
 wdl/AnnotateVcf.wdl        |  4 +-
 wdl/PruneAndAddVafs.wdl    | 93 ++++++--------------------------------
 wdl/ShardedAnnotateVcf.wdl |  4 +-
 3 files changed, 17 insertions(+), 84 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 3555f2804..0853bec7a 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -21,7 +21,7 @@ workflow AnnotateVcf {
     Int min_records_per_shard_step1
 
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File sample_list              # List of samples to be retained from the output vcf
+    File? sample_keep_list              # List of samples to be retained from the output vcf
     File? ped_file                # Used for M/F AF calculations
     File? par_bed
     File? allosomes_list
@@ -76,7 +76,7 @@ workflow AnnotateVcf {
         max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
         min_records_per_shard_step1 = min_records_per_shard_step1,
         sample_pop_assignments = sample_pop_assignments,
-        sample_list = sample_list,
+        sample_keep_list = sample_keep_list,
         ped_file = ped_file,
         par_bed = par_bed,
         sv_per_shard = sv_per_shard,
diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index 10178b7e5..49ff1a93f 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -20,7 +20,7 @@ workflow PruneAndAddVafs {
     File? ped_file                # Used for M/F AF calculations
     File? par_bed
     File? allosomes_list
-    File sample_list              # List of samples to be retained from the output vcf
+    File? sample_keep_list              # List of samples to be retained from the output vcf
 
     String sv_base_mini_docker
     String sv_pipeline_docker
@@ -33,21 +33,23 @@ workflow PruneAndAddVafs {
   }
   
   # Prune VCF
-  call ExtractSubsetSamples {
-    input:
-      vcf        = vcf,
-      vcf_idx    = vcf_idx,
-      sample_list = sample_list,
-      midfix = prefix,
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf
+  if (defined(sample_keep_list)) {
+    call ExtractSubsetSamples {
+      input:
+        vcf        = vcf,
+        vcf_idx    = vcf_idx,
+        sample_list = select_first([sample_keep_list]),
+        midfix = prefix,
+        sv_pipeline_docker = sv_pipeline_docker,
+        runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf
+    }
   }
 
   # Compute AC, AN, and AF per population & sex combination
   call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies {
     input:
-      vcf                    = ExtractSubsetSamples.out_vcf,
-      vcf_idx                = ExtractSubsetSamples.out_vcf_idx,
+      vcf                    = select_first([ExtractSubsetSamples.out_vcf, vcf]),
+      vcf_idx                = select_first([ExtractSubsetSamples.out_vcf_idx, vcf_idx]),
       contig                 = contig,
       prefix                 = prefix,
       sample_pop_assignments = sample_pop_assignments,
@@ -67,75 +69,6 @@ workflow PruneAndAddVafs {
   }
 }
 
-# Prune off samples from annotated VCF
-task PruneVcf {
-  
-  input {
-    File   vcf
-    File   vcf_idx
-    String contig
-    String prefix
-    
-    File? prune_list
-
-    String sv_base_mini_docker
-    
-    RuntimeAttr? runtime_attr_override
-  }
-  
-  output {
-    File pruned_vcf     = "${prefix}.${contig}.pruned.vcf.gz"
-    File pruned_vcf_idx = "${prefix}.${contig}.pruned.vcf.gz.tbi"
-  }
-
-  command <<<
-
-    set -euo pipefail
-    
-    # Tabix chromosome of interest
-    tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz
-    
-    # Get column indexes corresponding to samples to drop, if any exist
-    if ~{defined(prune_list)}; then
-      dropidx=$( zcat ~{contig}.vcf.gz \
-        | sed -n '1,500p' \
-        | grep "^#CHROM" \
-        | sed 's/\t/\n/g' \
-        | awk -v OFS="\t" '{ print NR, $1 }' \
-        | fgrep -wf ~{prune_list} \
-        | cut -f1 | paste -s -d, )
-      zcat ~{contig}.vcf.gz \
-        | cut --complement -f "$dropidx" \
-        | bgzip -c \
-        > "~{prefix}.~{contig}.pruned.vcf.gz"
-    else
-      cp "~{contig}.vcf.gz" "~{prefix}.~{contig}.pruned.vcf.gz"
-    fi
-    
-    tabix -f "~{prefix}.~{contig}.pruned.vcf.gz"
-  
-  >>>
-
-  #########################
-  RuntimeAttr default_attr = object {
-    cpu_cores:          1, 
-    mem_gb:             3.75, 
-    disk_gb:            250,
-    boot_disk_gb:       10,
-    preemptible_tries:  3,
-    max_retries:        1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])  
-  runtime {
-    cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
-    memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
-    preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
-    docker:                 sv_base_mini_docker
-  }
-}
 
 task ExtractSubsetSamples {
     input {
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index b6878c399..f8afd9997 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -25,7 +25,7 @@ workflow ShardedAnnotateVcf {
     Int min_records_per_shard_step1
 
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File sample_list
+    File? sample_keep_list
     File? ped_file                # Used for M/F AF calculations
     File? par_bed
     File? allosomes_list
@@ -97,7 +97,7 @@ workflow ShardedAnnotateVcf {
         contig                 = contig,
         ped_file               = ped_file,
         par_bed                = par_bed,
-        sample_list            = sample_list,
+        sample_keep_list       = sample_keep_list,
         allosomes_list         = allosomes_list,
         sample_pop_assignments = sample_pop_assignments,
 

From b0fa6fdcf4aed069ce47b8d6f4bd1fd86db8447c Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Fri, 6 Jan 2023 17:34:30 -0500
Subject: [PATCH 05/26] create tabix index if not in expected location in
 svannotate

---
 wdl/AnnotateFunctionalConsequences.wdl | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/wdl/AnnotateFunctionalConsequences.wdl b/wdl/AnnotateFunctionalConsequences.wdl
index 5df4f1969..c018c492b 100644
--- a/wdl/AnnotateFunctionalConsequences.wdl
+++ b/wdl/AnnotateFunctionalConsequences.wdl
@@ -75,9 +75,14 @@ task SVAnnotate {
   }
   command <<<
 
-     set -euo pipefail
+    set -euo pipefail
 
-     gatk --java-options "-Xmx~{java_mem_mb}m" SVAnnotate \
+    # check index is in expected location. if not, tabix
+    if [ ! -f "~{vcf}.tbi" ]; then
+      tabix -p vcf ~{vcf}
+    fi
+
+    gatk --java-options "-Xmx~{java_mem_mb}m" SVAnnotate \
       -V ~{vcf} \
       -O ~{outfile} \
       --protein-coding-gtf ~{protein_coding_gtf} \

From 09b695bdc8fa4c1e07d849cbd3f0c7544dee6afc Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 26 Jan 2023 18:56:05 -0500
Subject: [PATCH 06/26] samples list input to compute_AFs.py

---
 src/sv-pipeline/05_annotation/scripts/compute_AFs.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
index 31b5a0c0a..dbbd476d4 100755
--- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
+++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
@@ -294,6 +294,8 @@ def main():
                         'sex-specific AFs).', default=None)
     parser.add_argument('--par', help='BED file of pseudoautosomal regions (used ' +
                         'for sex-specific AFs).', default=None)
+    parser.add_argument('--samples-list', help='List of samples to use for AF calculations',
+                        default=None)
     parser.add_argument(
         'fout', help='Output vcf. Also accepts "stdout" and "-".')
     args = parser.parse_args()
@@ -305,7 +307,10 @@ def main():
         vcf = pysam.VariantFile(args.vcf)
 
     # Get list of all samples in vcf
-    samples_list = list(vcf.header.samples)
+    if args.samples_list is None:
+        samples_list = list(vcf.header.samples)
+    else:
+        samples_list = [line.strip() for line in open(args.samples_list)]
 
     # Get lists of males and females
     parbt = pbt.BedTool('', from_string=True)

From d053a58d12784be0d8fd7576c7a5d0a066fa48c2 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 26 Jan 2023 19:21:35 -0500
Subject: [PATCH 07/26] update WDLs and docker with samples list for compute
 afs

---
 src/sv-pipeline/05_annotation/scripts/compute_AFs.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
index dbbd476d4..31b5a0c0a 100755
--- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
+++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py
@@ -294,8 +294,6 @@ def main():
                         'sex-specific AFs).', default=None)
     parser.add_argument('--par', help='BED file of pseudoautosomal regions (used ' +
                         'for sex-specific AFs).', default=None)
-    parser.add_argument('--samples-list', help='List of samples to use for AF calculations',
-                        default=None)
     parser.add_argument(
         'fout', help='Output vcf. Also accepts "stdout" and "-".')
     args = parser.parse_args()
@@ -307,10 +305,7 @@ def main():
         vcf = pysam.VariantFile(args.vcf)
 
     # Get list of all samples in vcf
-    if args.samples_list is None:
-        samples_list = list(vcf.header.samples)
-    else:
-        samples_list = [line.strip() for line in open(args.samples_list)]
+    samples_list = list(vcf.header.samples)
 
     # Get lists of males and females
     parbt = pbt.BedTool('', from_string=True)

From 1a583bea0ab0940db25f437206f66d706d88eccd Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Tue, 2 May 2023 16:49:34 -0400
Subject: [PATCH 08/26] keep external af annotation, update json templates

---
 .../AnnotateVcf.SingleBatch.json.tmpl                      | 7 ++++---
 .../workflow_configurations/AnnotateVcf.json.tmpl          | 7 ++++---
 inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl    | 7 ++++---
 wdl/ShardedAnnotateVcf.wdl                                 | 4 ++--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
index 73724c902..a0c461b8c 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
@@ -1,6 +1,6 @@
 {
-  "AnnotateVcf.vcf" : "${this.cleaned_vcf}",
-  "AnnotateVcf.vcf_idx" : "${this.cleaned_vcf_index}",
+  "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}",
+  "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}",
 
   "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}",
   "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}",
@@ -15,7 +15,8 @@
   "AnnotateVcf.max_shards_per_chrom_step1" : 200,
   "AnnotateVcf.min_records_per_shard_step1" : 5000,
 
-  "AnnotateVcf.prefix" : "${this.sample_set_id}",
+  "AnnotateVcf.prefix_list" : "${this.sample_set_id}",
+  "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
   "AnnotateVcf.sv_base_mini_docker" : "${workspace.sv_base_mini_docker}",
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
index adc72e9be..ca973e632 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
@@ -1,6 +1,6 @@
 {
-  "AnnotateVcf.vcf" : "${this.cleaned_vcf}",
-  "AnnotateVcf.vcf_idx" : "${this.cleaned_vcf_index}",
+  "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}",
+  "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}",
 
   "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}",
   "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}",
@@ -15,7 +15,8 @@
   "AnnotateVcf.max_shards_per_chrom_step1" : 200,
   "AnnotateVcf.min_records_per_shard_step1" : 5000,
 
-  "AnnotateVcf.prefix" : "${this.sample_set_set_id}",
+  "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}",
+  "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
   "AnnotateVcf.sv_base_mini_docker" : "${workspace.sv_base_mini_docker}",
diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
index 74b441f78..c030a94dc 100644
--- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
+++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
@@ -1,6 +1,6 @@
 {
-  "AnnotateVcf.vcf" :   {{ test_batch.clean_vcf | tojson }},
-  "AnnotateVcf.vcf_idx" : {{ test_batch.clean_vcf_index | tojson }},
+  "AnnotateVcf.vcf_list" :  [ {{ test_batch.clean_vcf | tojson }} ],
+  "AnnotateVcf.vcf_idx_list" : [{{ test_batch.clean_vcf_index | tojson }}],
 
   "AnnotateVcf.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }},
   "AnnotateVcf.noncoding_bed" :       {{ reference_resources.noncoding_bed | tojson }},
@@ -15,7 +15,8 @@
   "AnnotateVcf.max_shards_per_chrom_step1" : 200,
   "AnnotateVcf.min_records_per_shard_step1" :  5000,
 
-  "AnnotateVcf.prefix" : {{ test_batch.name | tojson }},
+  "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}],
+  "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }},
   "AnnotateVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index f8afd9997..4065791d5 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -137,8 +137,8 @@ workflow ShardedAnnotateVcf {
 
   #Array[File?] sharded_annotated_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf])
   #Array[File?] sharded_annotated_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx])
-  Array[File] sharded_annotated_vcf = PruneAndAddVafs.output_vcf
-  Array[File] sharded_annotated_vcf_idx = PruneAndAddVafs.output_vcf_idx
+  Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
+  Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
 
 
   if (length(sharded_annotated_vcf) == 0) {

From 3e65184904a9258b4b4f5a733862c83d4bf6478c Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 15:18:03 -0400
Subject: [PATCH 09/26] shard by contig if inputs are not already

---
 wdl/AnnotateVcf.wdl        | 12 +++++++-----
 wdl/ShardedAnnotateVcf.wdl |  3 +--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 0853bec7a..e16ec32d6 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -6,10 +6,11 @@ import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf
 workflow AnnotateVcf {
 
   input {
-    Array[File] vcf_list
+    Array[File] vcf_list  # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Index & prefix list inputs should match
     Array[File] vcf_idx_list
     File contig_list
     Array[String] prefix_list
+    Boolean sharded_by_contig  # True if providing a vcf_list sharded by contig. False if providing a single full VCF
 
     File protein_coding_gtf
     File? noncoding_bed
@@ -60,13 +61,14 @@ workflow AnnotateVcf {
 
   Array[String] contigs = read_lines(contig_list)
 
-  scatter (i in range(length(vcf_list))) {
+  scatter (i in range(length(contigs))) {
+    Int array_index = if (sharded_by_contig && length(vcf_list) > 1) then i else 0
     call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{
       input:
-        vcf = vcf_list[i],
-        vcf_idx = vcf_idx_list[i],
+        vcf = vcf_list[array_index],
+        vcf_idx = vcf_idx_list[array_index],
         contig = contigs[i],
-        prefix = prefix_list[i],
+        prefix = prefix_list[array_index],
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
         promoter_window = promoter_window,
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 4065791d5..9a791f9be 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -69,6 +69,7 @@ workflow ShardedAnnotateVcf {
       vcf = vcf,
       prefix = prefix,
       records_per_shard = sv_per_shard,
+      contig = contig,
       sv_pipeline_docker = sv_pipeline_docker,
       runtime_attr_override = runtime_attr_scatter_vcf
   }
@@ -135,8 +136,6 @@ workflow ShardedAnnotateVcf {
 
   }
 
-  #Array[File?] sharded_annotated_vcf = select_first([AnnotateExternalAF.annotated_vcf, PruneAndAddVafs.output_vcf])
-  #Array[File?] sharded_annotated_vcf_idx = select_first([AnnotateExternalAF.annotated_vcf_tbi, PruneAndAddVafs.output_vcf_idx])
   Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
   Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
 

From 087364dc68972b7375b23e9d364f2ce02846c73c Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 15:42:48 -0400
Subject: [PATCH 10/26] use latest ScatterVcf. also reverted sample list for
 compute AFs during rebase

---
 wdl/AnnotateFunctionalConsequences.wdl |  4 ++--
 wdl/ShardedAnnotateVcf.wdl             |  3 +--
 wdl/TasksMakeCohortVcf.wdl             | 18 ++++++++----------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/wdl/AnnotateFunctionalConsequences.wdl b/wdl/AnnotateFunctionalConsequences.wdl
index c018c492b..2a08b5d09 100644
--- a/wdl/AnnotateFunctionalConsequences.wdl
+++ b/wdl/AnnotateFunctionalConsequences.wdl
@@ -5,7 +5,7 @@ import "Structs.wdl"
 workflow AnnotateFunctionalConsequences {
   input {
     File vcf
-    File vcf_index
+    File? vcf_index
     String prefix
 
     File protein_coding_gtf
@@ -41,7 +41,7 @@ workflow AnnotateFunctionalConsequences {
 task SVAnnotate {
   input {
     File vcf
-    File vcf_index
+    File? vcf_index
     String prefix
 
     File protein_coding_gtf
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 9a791f9be..cce4aa451 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -79,8 +79,7 @@ workflow ShardedAnnotateVcf {
     call func.AnnotateFunctionalConsequences {
       input:
         vcf = ScatterVcf.shards[i],
-        vcf_index = ScatterVcf.shards_idx[i],
-        prefix = "~{prefix}.~{i}",
+        prefix = "~{prefix}.~{contig}.~{i}",
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
         promoter_window = promoter_window,
diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl
index 0fc76b120..1cf9237d7 100644
--- a/wdl/TasksMakeCohortVcf.wdl
+++ b/wdl/TasksMakeCohortVcf.wdl
@@ -959,19 +959,19 @@ task RenameVariantIds {
 task ScatterVcf {
   input {
     File vcf
+    File? vcf_index
     String prefix
     Int records_per_shard
     Int? threads = 1
+    String? contig
     String sv_pipeline_docker
     RuntimeAttr? runtime_attr_override
   }
 
   Float input_size = size(vcf, "GB")
-  Float base_disk_gb = 10.0
-
   RuntimeAttr runtime_default = object {
                                   mem_gb: 3.75,
-                                  disk_gb: ceil(base_disk_gb + input_size * 5.0),
+                                  disk_gb: ceil(10.0 + input_size * 5.0),
                                   cpu_cores: 2,
                                   preemptible_tries: 3,
                                   max_retries: 1,
@@ -991,20 +991,18 @@ task ScatterVcf {
   command <<<
     set -euo pipefail
     # in case the file is empty create an empty shard
-    bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz
-    bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. --threads ~{threads} -n ~{records_per_shard}
+    bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz"
+    bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig}
 
-    ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list
+    ls "~{prefix}".*.vcf.gz | sort -k1,1V > vcfs.list
     i=0
-    while read vcf; do
+    while read VCF; do
       shard_no=`printf %06d $i`
-      mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz
-      tabix -p vcf ~{prefix}.shard_${shard_no}.vcf.gz
+      mv "$VCF" "~{prefix}.shard_${shard_no}.vcf.gz"
       i=$((i+1))
     done < vcfs.list
   >>>
   output {
     Array[File] shards = glob("~{prefix}.shard_*.vcf.gz")
-    Array[File] shards_idx = glob("~{prefix}.shard_*.vcf.gz.tbi")
   }
 }

From 148319c5c563734cdedf861ff19c6a22e78b49dc Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 16:01:51 -0400
Subject: [PATCH 11/26] womtool validation

---
 .../AnnotateVcf.SingleBatch.json.tmpl         |  1 +
 .../AnnotateVcf.json.tmpl                     |  1 +
 .../test/AnnotateVcf/AnnotateVcf.json.tmpl    |  1 +
 wdl/AnnotateVcf.wdl                           |  2 +-
 wdl/GATKSVPipelineSingleSample.wdl            | 20 ++++++++++---------
 wdl/HailMerge.wdl                             | 18 ++++++++---------
 wdl/ShardedAnnotateVcf.wdl                    |  6 +++---
 7 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
index a0c461b8c..701490993 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
@@ -16,6 +16,7 @@
   "AnnotateVcf.min_records_per_shard_step1" : 5000,
 
   "AnnotateVcf.prefix_list" : "${this.sample_set_id}",
+  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
index ca973e632..404abac9f 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
@@ -16,6 +16,7 @@
   "AnnotateVcf.min_records_per_shard_step1" : 5000,
 
   "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}",
+  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
index c030a94dc..b23b074a1 100644
--- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
+++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
@@ -16,6 +16,7 @@
   "AnnotateVcf.min_records_per_shard_step1" :  5000,
 
   "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}],
+  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }},
diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index e16ec32d6..ad7dd9661 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -6,7 +6,7 @@ import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf
 workflow AnnotateVcf {
 
   input {
-    Array[File] vcf_list  # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Index & prefix list inputs should match
+    Array[File] vcf_list  # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Outputs will match
     Array[File] vcf_idx_list
     File contig_list
     Array[String] prefix_list
diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index 464638389..d61231ee2 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -1389,9 +1389,9 @@ workflow GATKSVPipelineSingleSample {
 
   call annotate.AnnotateVcf {
        input:
-        vcf = FilterSample.out,
-        vcf_idx = FilterSample.out_idx,
-        prefix = batch,
+        vcf_list = [FilterSample.out],
+        vcf_idx_list = [FilterSample.out_idx],
+        prefix_list = [batch],
         contig_list = primary_contigs_list,
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
@@ -1400,6 +1400,8 @@ workflow GATKSVPipelineSingleSample {
         ref_bed = external_af_ref_bed,
         ref_prefix = external_af_ref_bed_prefix,
         population = external_af_population,
+        use_hail = false,
+        sharded_by_contig = false,
         sv_per_shard = annotation_sv_per_shard,
         max_shards_per_chrom_step1 = annotation_max_shards_per_chrom_step1,
         min_records_per_shard_step1 = annotation_min_records_per_shard_step1,
@@ -1411,18 +1413,18 @@ workflow GATKSVPipelineSingleSample {
 
   call SingleSampleFiltering.VcfToBed as VcfToBed {
     input:
-      vcf = AnnotateVcf.output_vcf,
+      vcf = AnnotateVcf.output_vcf_list[0],
       prefix = batch,
       sv_pipeline_docker = sv_pipeline_docker
   }
 
   call SingleSampleFiltering.UpdateBreakendRepresentation {
     input:
-      vcf=AnnotateVcf.output_vcf,
-      vcf_idx=AnnotateVcf.output_vcf_idx,
+      vcf=AnnotateVcf.output_vcf_list[0],
+      vcf_idx=AnnotateVcf.output_vcf_idx_list[0],
       ref_fasta=reference_fasta,
       ref_fasta_idx=reference_index,
-      prefix=basename(AnnotateVcf.output_vcf, ".vcf.gz") + ".final_cleanup",
+      prefix=basename(AnnotateVcf.output_vcf_list[0], ".vcf.gz") + ".final_cleanup",
       sv_pipeline_docker=sv_pipeline_docker
   }
 
@@ -1462,8 +1464,8 @@ workflow GATKSVPipelineSingleSample {
     # These files contain events reported in the internal VCF representation
     # They are less VCF-spec compliant but may be useful if components of the pipeline need to be re-run
     # on the output.
-    File pre_cleanup_vcf = AnnotateVcf.output_vcf
-    File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx
+    File pre_cleanup_vcf = AnnotateVcf.output_vcf_list[0]
+    File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx_list[0]
 
     File ploidy_matrix = select_first([GatherBatchEvidence.batch_ploidy_matrix])
     File ploidy_plots = select_first([GatherBatchEvidence.batch_ploidy_plots])
diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl
index 9571da1fa..31d0bd31c 100644
--- a/wdl/HailMerge.wdl
+++ b/wdl/HailMerge.wdl
@@ -12,9 +12,9 @@ workflow HailMerge {
     String sv_base_mini_docker
     String sv_pipeline_docker
     String sv_pipeline_hail_docker
-    RuntimeAttr? runtime_attr_preconcat
-    RuntimeAttr? runtime_attr_hail_merge
-    RuntimeAttr? runtime_attr_fix_header
+    RuntimeAttr? runtime_override_preconcat
+    RuntimeAttr? runtime_override_hail_merge
+    RuntimeAttr? runtime_override_fix_header
   }
 
   # Concatenate vcfs naively to prevent ClassTooLargeException in Hail
@@ -26,27 +26,27 @@ workflow HailMerge {
         generate_index=false,
         outfile_prefix="~{prefix}.preconcat",
         sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_attr_preconcat
+        runtime_attr_override=runtime_override_preconcat
     }
   }
 
-  call HailMerge {
+  call HailMergeTask {
     input:
       vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])],
       prefix = prefix,
       gcs_project = select_first([gcs_project]),
       sv_pipeline_hail_docker=sv_pipeline_hail_docker,
-      runtime_attr_override=runtime_attr_hail_merge
+      runtime_attr_override=runtime_override_hail_merge
   }
 
   call FixHeader {
     input:
-      merged_vcf = HailMerge.merged_vcf,
+      merged_vcf = HailMergeTask.merged_vcf,
       example_vcf = vcfs[0],
       prefix = prefix + ".reheadered",
       reset_cnv_gts = select_first([reset_cnv_gts, false]),
       sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_fix_header
+      runtime_attr_override=runtime_override_fix_header
   }
 
   output {
@@ -55,7 +55,7 @@ workflow HailMerge {
   }
 }
 
-task HailMerge {
+task HailMergeTask {
   input {
     Array[File] vcfs
     String prefix
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index cce4aa451..a5245099f 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -159,9 +159,9 @@ workflow ShardedAnnotateVcf {
           sv_base_mini_docker=sv_base_mini_docker,
           sv_pipeline_docker=sv_pipeline_docker,
           sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
-          runtime_attr_preconcat=runtime_attr_preconcat_sharded_cluster,
-          runtime_attr_hail_merge=runtime_attr_hail_merge_sharded_cluster,
-          runtime_attr_fix_header=runtime_attr_fix_header_sharded_cluster
+          runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
+          runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
+          runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
       }
     }
 

From fc16f003e879c60f02755406021e189f70f2b9e4 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 16:24:41 -0400
Subject: [PATCH 12/26] cleanup

---
 wdl/AnnotateVcf.wdl                 |  34 +++--
 wdl/ChromosomeAlleleFrequencies.wdl | 197 +---------------------------
 wdl/PruneAndAddVafs.wdl             |  86 +++---------
 wdl/ShardedAnnotateVcf.wdl          |  81 +++++-------
 4 files changed, 72 insertions(+), 326 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index ad7dd9661..982ec7364 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -56,7 +56,6 @@ workflow AnnotateVcf {
     RuntimeAttr? runtime_attr_preconcat_sharded_cluster
     RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
     RuntimeAttr? runtime_attr_fix_header_sharded_cluster
-    RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
   }
 
   Array[String] contigs = read_lines(contig_list)
@@ -96,23 +95,22 @@ workflow AnnotateVcf {
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_hail_docker = sv_pipeline_hail_docker,
 
-        runtime_attr_svannotate = runtime_attr_svannotate ,
-        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs  ,
-        runtime_attr_shard_vcf  = runtime_attr_shard_vcf  ,
-        runtime_attr_compute_AFs  = runtime_attr_compute_AFs  ,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs ,
-        runtime_attr_modify_vcf = runtime_attr_modify_vcf ,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs ,
-        runtime_attr_split_vcf  = runtime_attr_split_vcf  ,
-        runtime_attr_split_ref_bed  = runtime_attr_split_ref_bed  ,
-        runtime_attr_split_query_vcf  = runtime_attr_split_query_vcf  ,
-        runtime_attr_bedtools_closest = runtime_attr_bedtools_closest ,
-        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs ,
-        runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster ,
-        runtime_attr_preconcat_sharded_cluster  = runtime_attr_preconcat_sharded_cluster  ,
-        runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster ,
-        runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster ,
-        runtime_attr_get_vcf_header_with_members_info_line  = runtime_attr_get_vcf_header_with_members_info_line
+        runtime_attr_svannotate = runtime_attr_svannotate,
+        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs,
+        runtime_attr_shard_vcf  = runtime_attr_shard_vcf,
+        runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
+        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
+        runtime_attr_modify_vcf = runtime_attr_modify_vcf,
+        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
+        runtime_attr_split_vcf  = runtime_attr_split_vcf,
+        runtime_attr_split_ref_bed  = runtime_attr_split_ref_bed,
+        runtime_attr_split_query_vcf  = runtime_attr_split_query_vcf,
+        runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
+        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs,
+        runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster,
+        runtime_attr_preconcat_sharded_cluster  = runtime_attr_preconcat_sharded_cluster,
+        runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster,
+        runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster
     }
   }
 
diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl
index 6e7397a2e..8eb422350 100644
--- a/wdl/ChromosomeAlleleFrequencies.wdl
+++ b/wdl/ChromosomeAlleleFrequencies.wdl
@@ -28,8 +28,6 @@ workflow ChromosomeAlleleFrequencies {
     RuntimeAttr? runtime_attr_combine_vcfs
   }
 
-  # Tabix to chromosome of interest, and shard input VCF for stats collection
-  # Scatter over VCF shards
   call ComputeShardAFs {
       input:
         vcf = vcf,
@@ -50,128 +48,6 @@ workflow ChromosomeAlleleFrequencies {
   }
 }
 
-# Shard VCF into fixed size chunks
-task ShardVcf {
-
-  input {
-    File   vcf
-    File   vcf_idx
-    Int    sv_per_shard
-    String contig
-
-    String sv_pipeline_docker
-
-    RuntimeAttr? runtime_attr_override
-  }
-  
-  output {
-    Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz")
-  }
-
-  command <<<
-
-    set -euo pipefail
-
-    # Tabix chromosome of interest
-    tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz
-    
-    # Then shard VCF
-    /opt/sv-pipeline/scripts/shard_VCF.sh \
-      ~{contig}.vcf.gz \
-      ~{sv_per_shard} \
-      "vcf.shard."
-
-    # if there were no shards created just make an empty one
-    if [ ! -e vcf.shard.000000.vcf.gz ]; then
-      cp ~{contig}.vcf.gz vcf.shard.000000.vcf.gz
-    fi
-  >>>
-  
-  #########################
-  RuntimeAttr default_attr = object {
-    cpu_cores:          1, 
-    mem_gb:             3.75, 
-    disk_gb:            250,
-    boot_disk_gb:       10,
-    preemptible_tries:  3,
-    max_retries:        0
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  runtime {
-    cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
-    memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
-    preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
-    docker:                 sv_pipeline_docker
-  }
-}
-
-# Subset a vcf to a single chromosome, and add global AF information (no subpop)
-task ComputeShardAlleleFrequencies {
-
-  input {
-
-    File   vcf
-    String prefix
-    
-    File? sample_pop_assignments
-    File? ped_file
-    
-    String sv_pipeline_docker
-
-    RuntimeAttr? runtime_attr_override
-  }
-  
-  output {
-    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
-    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
-  }
-
-  command <<<
-
-    set -euo pipefail
-    
-    optionals=" "
-    if ~{defined(sample_pop_assignments)}; then
-      optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}"
-    fi
-    
-    if ~{defined(ped_file)}; then
-      optionals="$( echo "$optionals" ) -f ~{ped_file}"
-    fi
-    
-    echo -e "OPTIONALS INTERPRETED AS: $optionals"
-    echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout"
-    # Tabix chromosome of interest & compute AN, AC, and AF
-    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \
-      | bgzip -c \
-      > "~{prefix}.wAFs.vcf.gz"
-
-    tabix -p vcf ~{prefix}.wAFs.vcf.gz
-  
-  >>>
-  
-  RuntimeAttr default_attr = object {
-    cpu_cores:          1, 
-    mem_gb:             3.75, 
-    disk_gb:            20,
-    boot_disk_gb:       10,
-    preemptible_tries:  3,
-    max_retries:        0
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-  runtime {
-    cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
-    memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
-    preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
-    docker:                 sv_pipeline_docker
-  }
-}
 
 task ComputeShardAFs {
   input {
@@ -196,23 +72,11 @@ task ComputeShardAFs {
 
   command <<<
     set -euo pipefail
-    optionals=" "
-    if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then
-      optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}"
-    fi
-    if [ ~{default="SKIP" ped_file} != "SKIP" ]; then
-      optionals="$( echo "$optionals" ) -f ~{ped_file}"
-    fi
-    if [ ~{default="SKIP" par_bed} != "SKIP" ]; then
-      optionals="$( echo "$optionals" ) --par ~{par_bed}"
-    fi
-    if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then
-      optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}"
-    fi
-    echo -e "OPTIONALS INTERPRETED AS: $optionals"
-    echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout"
-    #Tabix chromosome of interest & compute AN, AC, and AF
-    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \
+    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \
+      ~{"-p " + sample_pop_assignments} \
+      ~{"-f " + ped_file} \
+      ~{"-par " + par_bed} \
+      ~{"--allosomes-list " + allosomes_list} \
     | bgzip -c \
     > "~{prefix}.wAFs.vcf.gz"
 
@@ -234,54 +98,3 @@ task ComputeShardAFs {
     maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
   }
 }
-
-# Merge VCF shards
-task CombineShardedVcfs {
-
-  input {
-    
-    Array[File] vcfs
-    String      prefix
-
-    String sv_base_mini_docker
-    
-    RuntimeAttr? runtime_attr_override
-  }
-
-  
-  output {
-    File vcf_out     = "${prefix}.wAFs.vcf.gz"
-    File vcf_out_idx = "${prefix}.wAFs.vcf.gz.tbi"
-  }
-
-  command <<<
-
-    set -euo pipefail
-    vcf-concat ~{sep=" "  vcfs} \
-      | vcf-sort \
-      | bgzip -c \
-      > "~{prefix}.wAFs.vcf.gz";
-    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
-  
-  >>>
- 
-  #########################
-  RuntimeAttr default_attr = object {
-    cpu_cores:          1, 
-    mem_gb:             3.75, 
-    disk_gb:            50,
-    boot_disk_gb:       10,
-    preemptible_tries:  3,
-    max_retries:        0
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) 
-  runtime {
-    cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
-    memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb,      default_attr.boot_disk_gb])
-    preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
-    docker:                 sv_base_mini_docker
-  }
-}
diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index 49ff1a93f..bd685fe97 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -5,6 +5,7 @@ version 1.0
 
 import "TasksMakeCohortVcf.wdl" as MiniTasks
 import "ChromosomeAlleleFrequencies.wdl" as calcAF
+import "Utils.wdl" as util
 
 # Prune off samples in annotated VCF, add VAF annotation
 workflow PruneAndAddVafs {
@@ -29,37 +30,36 @@ workflow PruneAndAddVafs {
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_concat_vcfs
-    RuntimeAttr? runtime_attr_extract_subset_samples_from_vcf
+    RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
   }
   
   # Prune VCF
   if (defined(sample_keep_list)) {
-    call ExtractSubsetSamples {
+    call util.SubsetVcfBySamplesList {
       input:
-        vcf        = vcf,
-        vcf_idx    = vcf_idx,
-        sample_list = select_first([sample_keep_list]),
-        midfix = prefix,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_extract_subset_samples_from_vcf
+        vcf = vcf,
+        vcf_idx = vcf_idx,
+        list_of_samples = select_first([sample_keep_list]),
+        sv_base_mini_docker = sv_base_mini_docker,
+        runtime_attr_override = runtime_attr_subset_vcf_by_samples_list
     }
   }
 
   # Compute AC, AN, and AF per population & sex combination
   call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies {
     input:
-      vcf                    = select_first([ExtractSubsetSamples.out_vcf, vcf]),
-      vcf_idx                = select_first([ExtractSubsetSamples.out_vcf_idx, vcf_idx]),
-      contig                 = contig,
-      prefix                 = prefix,
+      vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]),
+      vcf_idx = select_first([SubsetVcfBySamplesList.vcf_subset_index, vcf_idx]),
+      contig = contig,
+      prefix = prefix,
       sample_pop_assignments = sample_pop_assignments,
-      ped_file               = ped_file,
-      par_bed                = par_bed,
-      allosomes_list         = allosomes_list,
-      sv_base_mini_docker    = sv_base_mini_docker,
+      ped_file = ped_file,
+      par_bed = par_bed,
+      allosomes_list = allosomes_list,
+      sv_base_mini_docker = sv_base_mini_docker,
       sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
-      runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
+      runtime_attr_shard_vcf = runtime_attr_shard_vcf,
+      runtime_attr_compute_AFs = runtime_attr_compute_AFs,
       runtime_attr_combine_vcfs = runtime_attr_combine_vcfs
   }
 
@@ -69,53 +69,3 @@ workflow PruneAndAddVafs {
   }
 }
 
-
-task ExtractSubsetSamples {
-    input {
-        File vcf
-        File vcf_idx
-        File sample_list
-        String midfix
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-
-    Float input_size = size(vcf, "GB")
-    Float base_disk_gb = 10.0
-    RuntimeAttr runtime_default = object {
-            mem_gb: 3,
-            disk_gb: ceil(base_disk_gb + (input_size * 2.0)),
-            cpu_cores: 1,
-            preemptible_tries: 3,
-            max_retries: 1,
-            boot_disk_gb: 10
-    }
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    runtime {
-            memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
-            disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-            cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-            preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-            maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-            docker: sv_pipeline_docker
-            bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(vcf, '.vcf.gz')
-    command <<<
-        set -eu -o pipefail
-
-        bcftools view -S ~{sample_list} ~{vcf} \
-        | bgzip > ~{prefix}.~{midfix}.vcf.gz
-
-        tabix -p vcf ~{prefix}.~{midfix}.vcf.gz
-
-    >>>
-
-    output {
-        File out_vcf = "~{prefix}.~{midfix}.vcf.gz"
-        File out_vcf_idx = "~{prefix}.~{midfix}.vcf.gz.tbi"
-    }
-}
-
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index a5245099f..46abafe4d 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -61,7 +61,6 @@ workflow ShardedAnnotateVcf {
     RuntimeAttr? runtime_attr_preconcat_sharded_cluster
     RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
     RuntimeAttr? runtime_attr_fix_header_sharded_cluster
-    RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
   }
 
   call MiniTasks.ScatterVcf {
@@ -91,28 +90,28 @@ workflow ShardedAnnotateVcf {
 
     call pav.PruneAndAddVafs as PruneAndAddVafs {
       input:
-        vcf                    = AnnotateFunctionalConsequences.annotated_vcf,
-        vcf_idx                = AnnotateFunctionalConsequences.annotated_vcf_index,
-        prefix                 = prefix,
-        contig                 = contig,
-        ped_file               = ped_file,
-        par_bed                = par_bed,
-        sample_keep_list       = sample_keep_list,
-        allosomes_list         = allosomes_list,
+        vcf = AnnotateFunctionalConsequences.annotated_vcf,
+        vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index,
+        prefix = prefix,
+        contig = contig,
+        ped_file = ped_file,
+        par_bed = par_bed,
+        sample_keep_list = sample_keep_list,
+        allosomes_list = allosomes_list,
         sample_pop_assignments = sample_pop_assignments,
 
-        sv_base_mini_docker     = sv_base_mini_docker,
+        sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_shard_vcf    = runtime_attr_shard_vcf,
-        runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
+        runtime_attr_shard_vcf = runtime_attr_shard_vcf,
+        runtime_attr_compute_AFs = runtime_attr_compute_AFs,
         runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
-        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs
+        runtime_attr_concat_vcfs = runtime_attr_concat_vcfs
     }
 
     if (defined(ref_bed)) {
       call eaf.AnnotateExternalAF as AnnotateExternalAF {
         input:
-          vcf     = PruneAndAddVafs.output_vcf,
+          vcf = PruneAndAddVafs.output_vcf,
           vcf_idx = PruneAndAddVafs.output_vcf_idx,
           ref_bed = select_first([ref_bed]),
           population = select_first([population]),
@@ -138,51 +137,37 @@ workflow ShardedAnnotateVcf {
   Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
   Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
 
-
-  if (length(sharded_annotated_vcf) == 0) {
-    call MiniTasks.GetVcfHeaderWithMembersInfoLine as GetVcfHeader_annotated {
+  if (use_hail) {
+    call HailMerge.HailMerge {
       input:
-        vcf_gz=vcf,
+        vcfs=sharded_annotated_vcf,
         prefix="~{prefix}.annotated",
+        gcs_project=gcs_project,
         sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_attr_get_vcf_header_with_members_info_line
+        sv_pipeline_docker=sv_pipeline_docker,
+        sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
+        runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
+        runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
+        runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
     }
   }
 
-  if (length(sharded_annotated_vcf) > 0) {
-    if (use_hail) {
-      call HailMerge.HailMerge as ConcatVcfsHail_annotated {
-        input:
-          vcfs=sharded_annotated_vcf,
-          prefix="~{prefix}.annotated",
-          gcs_project=gcs_project,
-          sv_base_mini_docker=sv_base_mini_docker,
-          sv_pipeline_docker=sv_pipeline_docker,
-          sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
-          runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
-          runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
-          runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
-      }
-    }
-
-    if (!use_hail) {
-      call MiniTasks.ConcatVcfs as ConcatVcfs_annotated {
-        input:
-          vcfs=sharded_annotated_vcf,
-          vcfs_idx=sharded_annotated_vcf_idx,
-          allow_overlaps=true,
-          outfile_prefix="~{prefix}.annotatedd",
-          sv_base_mini_docker=sv_base_mini_docker,
-          runtime_attr_override=runtime_attr_concat_sharded_cluster
-      }
+  if (!use_hail) {
+    call MiniTasks.ConcatVcfs {
+      input:
+        vcfs=sharded_annotated_vcf,
+        vcfs_idx=sharded_annotated_vcf_idx,
+        allow_overlaps=true,
+        outfile_prefix="~{prefix}.annotatedd",
+        sv_base_mini_docker=sv_base_mini_docker,
+        runtime_attr_override=runtime_attr_concat_sharded_cluster
     }
-
   }
 
 
   output {
-    File output_vcf = select_first([GetVcfHeader_annotated.out, ConcatVcfs_annotated.concat_vcf, ConcatVcfsHail_annotated.merged_vcf])
-    File output_vcf_idx = select_first([GetVcfHeader_annotated.out_idx, ConcatVcfs_annotated.concat_vcf_idx, ConcatVcfsHail_annotated.merged_vcf_index])
+    File output_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
+    File output_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
   }
 }
 

From f87ea28f6c0c5334e50337d3108936f91b3ba28d Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 16:43:00 -0400
Subject: [PATCH 13/26] more cleanup

---
 wdl/AnnotateVcf.wdl                 |   4 +-
 wdl/ChromosomeAlleleFrequencies.wdl | 100 ----------------------------
 wdl/PruneAndAddVafs.wdl             |  70 ++++++++++++++-----
 wdl/ShardedAnnotateVcf.wdl          |  13 ++--
 4 files changed, 62 insertions(+), 125 deletions(-)
 delete mode 100644 wdl/ChromosomeAlleleFrequencies.wdl

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 982ec7364..49039c27a 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -43,6 +43,7 @@ workflow AnnotateVcf {
     RuntimeAttr? runtime_attr_svannotate
     RuntimeAttr? runtime_attr_concat_vcfs
     RuntimeAttr? runtime_attr_shard_vcf
+    RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_modify_vcf
@@ -96,8 +97,7 @@ workflow AnnotateVcf {
         sv_pipeline_hail_docker = sv_pipeline_hail_docker,
 
         runtime_attr_svannotate = runtime_attr_svannotate,
-        runtime_attr_concat_vcfs  = runtime_attr_concat_vcfs,
-        runtime_attr_shard_vcf  = runtime_attr_shard_vcf,
+        runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
         runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
         runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
         runtime_attr_modify_vcf = runtime_attr_modify_vcf,
diff --git a/wdl/ChromosomeAlleleFrequencies.wdl b/wdl/ChromosomeAlleleFrequencies.wdl
deleted file mode 100644
index 8eb422350..000000000
--- a/wdl/ChromosomeAlleleFrequencies.wdl
+++ /dev/null
@@ -1,100 +0,0 @@
-# Helper workflow to calculate basic AF statistics for a single chromosome on an input VCF
-
-version 1.0
-
-import "Structs.wdl"
-
-# Add VAF annotation
-workflow ChromosomeAlleleFrequencies {
-
-  input {
-
-    File   vcf
-    File   vcf_idx
-    String contig
-    String prefix
-
-    File? sample_pop_assignments   # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File? ped_file                 # Used for M/F AF calculations
-    File? par_bed
-    File? allosomes_list
-
-
-    String sv_pipeline_docker
-    String sv_base_mini_docker
-
-    RuntimeAttr? runtime_attr_shard_vcf
-    RuntimeAttr? runtime_attr_compute_AFs
-    RuntimeAttr? runtime_attr_combine_vcfs
-  }
-
-  call ComputeShardAFs {
-      input:
-        vcf = vcf,
-        prefix = "${prefix}.${contig}",
-        sample_pop_assignments = sample_pop_assignments,
-        ped_file = ped_file,
-        par_bed  = par_bed,
-        allosomes_list = allosomes_list,
-        sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_override = runtime_attr_compute_AFs
-  }
-
-
-  # Final output
-  output {
-    File vcf_wAFs = ComputeShardAFs.shard_wAFs
-    File vcf_wAFs_idx = ComputeShardAFs.shard_wAFs_idx
-  }
-}
-
-
-task ComputeShardAFs {
-  input {
-    File vcf
-    String prefix
-    String sv_pipeline_docker
-    File? sample_pop_assignments
-    File? ped_file
-    File? par_bed
-    File? allosomes_list
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1, 
-    mem_gb: 1.5,
-    disk_gb: ceil(20 + size(vcf, "GB") * 2),
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \
-      ~{"-p " + sample_pop_assignments} \
-      ~{"-f " + ped_file} \
-      ~{"-par " + par_bed} \
-      ~{"--allosomes-list " + allosomes_list} \
-    | bgzip -c \
-    > "~{prefix}.wAFs.vcf.gz"
-
-    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
-  >>>
-
-  output {
-    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
-    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
-  }
-  
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index bd685fe97..f01b65c3f 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -4,7 +4,6 @@
 version 1.0
 
 import "TasksMakeCohortVcf.wdl" as MiniTasks
-import "ChromosomeAlleleFrequencies.wdl" as calcAF
 import "Utils.wdl" as util
 
 # Prune off samples in annotated VCF, add VAF annotation
@@ -12,8 +11,8 @@ workflow PruneAndAddVafs {
   
   input {
 
-    File   vcf
-    File   vcf_idx
+    File vcf
+    File vcf_idx
     String prefix
     String contig
 
@@ -26,10 +25,7 @@ workflow PruneAndAddVafs {
     String sv_base_mini_docker
     String sv_pipeline_docker
 
-    RuntimeAttr? runtime_attr_shard_vcf
     RuntimeAttr? runtime_attr_compute_AFs
-    RuntimeAttr? runtime_attr_combine_vcfs
-    RuntimeAttr? runtime_attr_concat_vcfs
     RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
   }
   
@@ -46,26 +42,70 @@ workflow PruneAndAddVafs {
   }
 
   # Compute AC, AN, and AF per population & sex combination
-  call calcAF.ChromosomeAlleleFrequencies as ChromosomeAlleleFrequencies {
+  call ComputeShardAFs {
     input:
       vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]),
-      vcf_idx = select_first([SubsetVcfBySamplesList.vcf_subset_index, vcf_idx]),
-      contig = contig,
       prefix = prefix,
       sample_pop_assignments = sample_pop_assignments,
       ped_file = ped_file,
       par_bed = par_bed,
       allosomes_list = allosomes_list,
-      sv_base_mini_docker = sv_base_mini_docker,
       sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_shard_vcf = runtime_attr_shard_vcf,
-      runtime_attr_compute_AFs = runtime_attr_compute_AFs,
-      runtime_attr_combine_vcfs = runtime_attr_combine_vcfs
+      runtime_attr_override = runtime_attr_compute_AFs
   }
 
   output {
-    File output_vcf     = ChromosomeAlleleFrequencies.vcf_wAFs
-    File output_vcf_idx = ChromosomeAlleleFrequencies.vcf_wAFs_idx
+    File output_vcf = ComputeShardAFs.shard_wAFs
+    File output_vcf_idx = ComputeShardAFs.shard_wAFs_idx
   }
 }
 
+task ComputeShardAFs {
+  input {
+    File vcf
+    String prefix
+    File? sample_pop_assignments
+    File? ped_file
+    File? par_bed
+    File? allosomes_list
+    String sv_pipeline_docker
+    RuntimeAttr? runtime_attr_override
+  }
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1, 
+    mem_gb: 1.5,
+    disk_gb: ceil(20 + size(vcf, "GB") * 2),
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  command <<<
+    set -euo pipefail
+    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \
+      ~{"-p " + sample_pop_assignments} \
+      ~{"-f " + ped_file} \
+      ~{"--par " + par_bed} \
+      ~{"--allosomes-list " + allosomes_list} \
+    | bgzip -c \
+    > "~{prefix}.wAFs.vcf.gz"
+
+    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
+  >>>
+
+  output {
+    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
+    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
+  }
+  
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 46abafe4d..3fdd1849c 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -44,9 +44,8 @@ workflow ShardedAnnotateVcf {
     String gatk_docker
 
     RuntimeAttr? runtime_attr_svannotate
-    RuntimeAttr? runtime_attr_concat_vcfs
-    RuntimeAttr? runtime_attr_shard_vcf
     RuntimeAttr? runtime_attr_compute_AFs
+    RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
     RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_modify_vcf
     RuntimeAttr? runtime_attr_combine_vcfs
@@ -92,7 +91,7 @@ workflow ShardedAnnotateVcf {
       input:
         vcf = AnnotateFunctionalConsequences.annotated_vcf,
         vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index,
-        prefix = prefix,
+        prefix = "~{prefix}.~{contig}.~{i}",
         contig = contig,
         ped_file = ped_file,
         par_bed = par_bed,
@@ -102,10 +101,8 @@ workflow ShardedAnnotateVcf {
 
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_shard_vcf = runtime_attr_shard_vcf,
+        runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
         runtime_attr_compute_AFs = runtime_attr_compute_AFs,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
-        runtime_attr_concat_vcfs = runtime_attr_concat_vcfs
     }
 
     if (defined(ref_bed)) {
@@ -116,7 +113,7 @@ workflow ShardedAnnotateVcf {
           ref_bed = select_first([ref_bed]),
           population = select_first([population]),
           ref_prefix = select_first([ref_prefix]),
-          prefix = prefix,
+          prefix = "~{prefix}.~{contig}.~{i}",
           contigs = [contig],
           max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
           min_records_per_shard_step1 = min_records_per_shard_step1,
@@ -158,7 +155,7 @@ workflow ShardedAnnotateVcf {
         vcfs=sharded_annotated_vcf,
         vcfs_idx=sharded_annotated_vcf_idx,
         allow_overlaps=true,
-        outfile_prefix="~{prefix}.annotatedd",
+        outfile_prefix="~{prefix}.annotated",
         sv_base_mini_docker=sv_base_mini_docker,
         runtime_attr_override=runtime_attr_concat_sharded_cluster
     }

From 6e51b6d2b087e47e0e1403a57b37a3727e75e502 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 16:44:12 -0400
Subject: [PATCH 14/26] whitespace

---
 wdl/PruneAndAddVafs.wdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index f01b65c3f..e06263aeb 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -72,7 +72,7 @@ task ComputeShardAFs {
     RuntimeAttr? runtime_attr_override
   }
   RuntimeAttr default_attr = object {
-    cpu_cores: 1, 
+    cpu_cores: 1,
     mem_gb: 1.5,
     disk_gb: ceil(20 + size(vcf, "GB") * 2),
     boot_disk_gb: 10,
@@ -98,7 +98,7 @@ task ComputeShardAFs {
     File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
     File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
   }
-  
+
   runtime {
     cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
     memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"

From b92591b75d5055cb71f2e415ebf84587f637936b Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 8 May 2023 17:28:56 -0400
Subject: [PATCH 15/26] merge output to same level as input

---
 wdl/AnnotateVcf.wdl        | 45 ++++++++++++++++++++++++++++++++++----
 wdl/ShardedAnnotateVcf.wdl | 36 ++----------------------------
 2 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 49039c27a..67db5e183 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -2,6 +2,8 @@ version 1.0
 
 import "Structs.wdl"
 import "ShardedAnnotateVcf.wdl" as sharded_annotate_vcf
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+import "HailMerge.wdl" as HailMerge
 
 workflow AnnotateVcf {
 
@@ -62,8 +64,8 @@ workflow AnnotateVcf {
   Array[String] contigs = read_lines(contig_list)
 
   scatter (i in range(length(contigs))) {
-    Int array_index = if (sharded_by_contig && length(vcf_list) > 1) then i else 0
-    call sharded_annotate_vcf.ShardedAnnotateVcf as ShardedAnnotateVcf{
+    Int array_index = if (sharded_by_contig) then i else 0
+    call sharded_annotate_vcf.ShardedAnnotateVcf {
       input:
         vcf = vcf_list[array_index],
         vcf_idx = vcf_idx_list[array_index],
@@ -114,8 +116,43 @@ workflow AnnotateVcf {
     }
   }
 
+  # Concat VCFs to the contig level or fully depending on format of input
+  # ShardedAnnotateVcf.sharded_annotated_vcf is an Array[Array[File]] with one inner Array[File] of shards per contig
+  Array[Array[File]] vcfs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf)]
+  Array[Array[File]] vcf_idxs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf_idx else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)]
+  if (use_hail) {
+    scatter (i in range(length(vcfs_for_concatenation))) {
+      call HailMerge.HailMerge {
+        input:
+          vcfs=vcfs_for_concatenation[i],
+          prefix="~{prefix_list[i]}.annotated",
+          gcs_project=gcs_project,
+          sv_base_mini_docker=sv_base_mini_docker,
+          sv_pipeline_docker=sv_pipeline_docker,
+          sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
+          runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
+          runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
+          runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
+      }
+    }
+  }
+
+  if (!use_hail) {
+    scatter (i in range(length(vcfs_for_concatenation))) {
+      call MiniTasks.ConcatVcfs {
+        input:
+          vcfs=vcfs_for_concatenation[i],
+          vcfs_idx=vcf_idxs_for_concatenation[i],
+          allow_overlaps=true,
+          outfile_prefix="~{prefix_list[i]}.annotated",
+          sv_base_mini_docker=sv_base_mini_docker,
+          runtime_attr_override=runtime_attr_concat_sharded_cluster
+      }
+    }
+  }
+
   output {
-    Array[File] output_vcf_list     = ShardedAnnotateVcf.output_vcf
-    Array[File] output_vcf_idx_list = ShardedAnnotateVcf.output_vcf_idx
+    Array[File] output_vcf_list = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
+    Array[File] output_vcf_idx_list = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
   }
 }
\ No newline at end of file
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 3fdd1849c..71fbf22ac 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -128,43 +128,11 @@ workflow ShardedAnnotateVcf {
           runtime_attr_select_matched_svs = runtime_attr_select_matched_svs
       }
     }
-
-  }
-
-  Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
-  Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
-
-  if (use_hail) {
-    call HailMerge.HailMerge {
-      input:
-        vcfs=sharded_annotated_vcf,
-        prefix="~{prefix}.annotated",
-        gcs_project=gcs_project,
-        sv_base_mini_docker=sv_base_mini_docker,
-        sv_pipeline_docker=sv_pipeline_docker,
-        sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
-        runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
-        runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
-        runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
-    }
   }
 
-  if (!use_hail) {
-    call MiniTasks.ConcatVcfs {
-      input:
-        vcfs=sharded_annotated_vcf,
-        vcfs_idx=sharded_annotated_vcf_idx,
-        allow_overlaps=true,
-        outfile_prefix="~{prefix}.annotated",
-        sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_attr_concat_sharded_cluster
-    }
-  }
-
-
   output {
-    File output_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
-    File output_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
+    Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
+    Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
   }
 }
 

From 6f4f350847aaf5c7cd177c42ef36dbffd963fb34 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 6 Jul 2023 16:51:08 -0400
Subject: [PATCH 16/26] pass index to scatter

---
 wdl/ShardedAnnotateVcf.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 71fbf22ac..984fd169a 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -65,6 +65,7 @@ workflow ShardedAnnotateVcf {
   call MiniTasks.ScatterVcf {
     input:
       vcf = vcf,
+      vcf_index = vcf_idx,
       prefix = prefix,
       records_per_shard = sv_per_shard,
       contig = contig,

From 28a0a935d6e1da5d96f61c72cfe3ff3ecc8bbb11 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 6 Jul 2023 17:24:05 -0400
Subject: [PATCH 17/26] annotate ext af per shard

---
 .../AnnotateVcf.SingleBatch.json.tmpl         |   2 -
 .../AnnotateVcf.json.tmpl                     |   4 +-
 ...TKSVPipelineSingleSample.no_melt.json.tmpl |   2 -
 .../test/AnnotateVcf/AnnotateVcf.json.tmpl    |   2 -
 .../GATKSVPipelineSingleSample.json.tmpl      |   2 -
 ...TKSVPipelineSingleSample.no_melt.json.tmpl |   2 -
 wdl/AnnotateExternalAF.wdl                    |  10 +-
 ...tig.wdl => AnnotateExternalAFPerShard.wdl} | 134 +++++++-----------
 wdl/AnnotateVcf.wdl                           |  11 --
 wdl/GATKSVPipelineSingleSample.wdl            |   4 -
 wdl/ShardedAnnotateVcf.wdl                    |  36 ++---
 11 files changed, 77 insertions(+), 132 deletions(-)
 rename wdl/{AnnotateExternalAFperContig.wdl => AnnotateExternalAFPerShard.wdl} (81%)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
index 701490993..81b4c20e9 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
@@ -12,8 +12,6 @@
   "AnnotateVcf.contig_list" : "${workspace.primary_contigs_list}",
   "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}",
   "AnnotateVcf.sv_per_shard" : "5000",
-  "AnnotateVcf.max_shards_per_chrom_step1" : 200,
-  "AnnotateVcf.min_records_per_shard_step1" : 5000,
 
   "AnnotateVcf.prefix_list" : "${this.sample_set_id}",
   "AnnotateVcf.sharded_by_contig": "false",
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
index 404abac9f..5d378d6c0 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
@@ -12,9 +12,7 @@
   "AnnotateVcf.contig_list" : "${workspace.primary_contigs_list}",
   "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}",
   "AnnotateVcf.sv_per_shard" : "5000",
-  "AnnotateVcf.max_shards_per_chrom_step1" : 200,
-  "AnnotateVcf.min_records_per_shard_step1" : 5000,
-
+  
   "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}",
   "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl
index 1f07adddc..565512d85 100644
--- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl
+++ b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.no_melt.json.tmpl
@@ -102,8 +102,6 @@
   "GATKSVPipelineSingleSample.external_af_population" :      {{ reference_resources.external_af_population | tojson }},
 
   "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000",
-  "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200,
-  "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 5000,
 
   "GATKSVPipelineSingleSample.ref_samples_list" : "${workspace.ref_panel_samples_list}",
   "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : "${workspace.ref_panel_std_manta_vcf_tar}",
diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
index b23b074a1..150ae0136 100644
--- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
+++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
@@ -12,8 +12,6 @@
   "AnnotateVcf.contig_list" :  {{ reference_resources.primary_contigs_list | tojson }},
   "AnnotateVcf.ped_file":      {{ test_batch.ped_file | tojson }},
   "AnnotateVcf.sv_per_shard" : "5000",
-  "AnnotateVcf.max_shards_per_chrom_step1" : 200,
-  "AnnotateVcf.min_records_per_shard_step1" :  5000,
 
   "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}],
   "AnnotateVcf.sharded_by_contig": "false",
diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl
index 91d9fff1e..c60129a43 100644
--- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl
+++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl
@@ -102,8 +102,6 @@
   "GATKSVPipelineSingleSample.external_af_population" :      {{ reference_resources.external_af_population | tojson }},
 
   "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000",
-  "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200,
-  "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 5000,
 
   "GATKSVPipelineSingleSample.ref_samples_list" : {{ ref_panel.samples_list | tojson }},
   "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : {{ ref_panel.std_manta_vcf_tar | tojson }},
diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl
index 1c894fb63..a265b5444 100644
--- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl
+++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.no_melt.json.tmpl
@@ -104,8 +104,6 @@
   "GATKSVPipelineSingleSample.external_af_population" :      {{ reference_resources.external_af_population | tojson }},
 
   "GATKSVPipelineSingleSample.annotation_sv_per_shard" : "5000",
-  "GATKSVPipelineSingleSample.annotation_max_shards_per_chrom_step1" : 200,
-  "GATKSVPipelineSingleSample.annotation_min_records_per_shard_step1" : 5000,
 
   "GATKSVPipelineSingleSample.ref_samples_list" : {{ ref_panel.samples_list | tojson }},
   "GATKSVPipelineSingleSample.ref_std_manta_vcf_tar" : {{ ref_panel.std_manta_vcf_tar | tojson }},
diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl
index 322a48a74..4e98af36e 100644
--- a/wdl/AnnotateExternalAF.wdl
+++ b/wdl/AnnotateExternalAF.wdl
@@ -4,7 +4,7 @@ version 1.0
 
 import "Structs.wdl"
 import "TasksMakeCohortVcf.wdl" as MiniTasks
-import "AnnotateExternalAFperContig.wdl" as AnnotateExternalAFperContig
+import "AnnotateExternalAFperShard.wdl" as AnnotateExternalAFperShard
 
 workflow AnnotateExternalAF {
     input {
@@ -32,7 +32,7 @@ workflow AnnotateExternalAF {
         RuntimeAttr? runtime_attr_select_matched_svs
 
     }
-    call SplitBed as split_ref_bed {
+    call SplitRefBed as split_ref_bed {
         input:
             bed = ref_bed,
             sv_base_mini_docker = sv_base_mini_docker,
@@ -45,10 +45,8 @@ workflow AnnotateExternalAF {
             runtime_attr_override = runtime_attr_split_query_vcf
     }
 
-    Array[String] svtype_list = ["DEL","DUP","INS","INV_CPX","BND_CTX"]
-
     scatter ( contig in contigs ) {
-        call AnnotateExternalAFperContig.AnnotateExternalAFperContig as AnnotateExternalAFperContig{
+        call AnnotateExternalAFperShard.AnnotateExternalAFperShard {
             input:
                 vcf = vcf,
                 vcf_idx = vcf_idx,
@@ -94,7 +92,7 @@ workflow AnnotateExternalAF {
 
 }
 
-task SplitBed {
+task SplitRefBed {
     input {
         File bed
         String sv_base_mini_docker
diff --git a/wdl/AnnotateExternalAFperContig.wdl b/wdl/AnnotateExternalAFPerShard.wdl
similarity index 81%
rename from wdl/AnnotateExternalAFperContig.wdl
rename to wdl/AnnotateExternalAFPerShard.wdl
index cd0fc233b..c7b00ba3b 100644
--- a/wdl/AnnotateExternalAFperContig.wdl
+++ b/wdl/AnnotateExternalAFPerShard.wdl
@@ -5,16 +5,11 @@ version 1.0
 import "Structs.wdl"
 import "TasksMakeCohortVcf.wdl" as MiniTasks
 
-workflow AnnotateExternalAFperContig {
+workflow AnnotateExternalAFPerShard {
     input {
         File vcf
         File vcf_idx
-        File ref_bed
-        File split_query_vcf_del
-        File split_query_vcf_dup
-        File split_query_vcf_ins
-        File split_query_vcf_inv
-        File split_query_vcf_bnd
+        String prefix
         File split_ref_bed_del
         File split_ref_bed_dup
         File split_ref_bed_ins
@@ -22,69 +17,66 @@ workflow AnnotateExternalAFperContig {
         File split_ref_bed_bnd
 
         Array[String] population
-        String contig
         String ref_prefix
 
-        Int max_shards_per_chrom_step1
-        Int min_records_per_shard_step1
-
         String sv_base_mini_docker
         String sv_pipeline_docker
 
         # overrides for local tasks
         RuntimeAttr? runtime_attr_modify_vcf
-        RuntimeAttr? runtime_attr_split_vcf
-        RuntimeAttr? runtime_attr_combine_vcfs
+        RuntimeAttr? runtime_attr_split_query_vcf
         RuntimeAttr? runtime_attr_bedtools_closest
         RuntimeAttr? runtime_attr_select_matched_svs
     }
 
+    call SplitQueryVcf {
+        input:
+            vcf = vcf,
+            sv_pipeline_docker = sv_pipeline_docker,
+            runtime_attr_override = runtime_attr_split_query_vcf
+    }
+
     call BedtoolsClosest as compare_del {
         input:
-            bed_a = split_query_vcf_del,
+            bed_a = SplitQueryVcf.del,
             bed_b = split_ref_bed_del,
             svtype = "del",
-            contig = contig,
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_bedtools_closest
     }
 
     call BedtoolsClosest as compare_dup {
         input:
-            bed_a = split_query_vcf_dup,
+            bed_a = SplitQueryVcf.dup,
             bed_b = split_ref_bed_dup,
             svtype = "dup",
-            contig = contig,
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_bedtools_closest
     }
 
     call BedtoolsClosest as compare_ins {
         input:
-            bed_a = split_query_vcf_ins,
+            bed_a = SplitQueryVcf.ins,
             bed_b = split_ref_bed_ins,
             svtype = "ins",
-            contig = contig,
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_bedtools_closest
     }
 
     call BedtoolsClosest as compare_inv {
         input:
-            bed_a = split_query_vcf_inv,
+            bed_a = SplitQueryVcf.inv,
             bed_b = split_ref_bed_inv,
             svtype = "inv",
-            contig = contig,
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_bedtools_closest
     }
 
     call BedtoolsClosest as compare_bnd {
         input:
-            bed_a = split_query_vcf_bnd,
+            bed_a = SplitQueryVcf.bnd,
             bed_b = split_ref_bed_bnd,
             svtype = "bnd",
-            contig = contig,
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_bedtools_closest
     }
@@ -133,55 +125,32 @@ workflow AnnotateExternalAFperContig {
             sv_pipeline_docker = sv_pipeline_docker,
             runtime_attr_override = runtime_attr_select_matched_svs
     }
-
-    call MiniTasks.SplitVcf as SplitVcf {
-      input:
-        vcf = vcf,
-        vcf_idx = vcf_idx,
-        contig=contig,
-        prefix="~{contig}.shard_",
-        n_shards=max_shards_per_chrom_step1,
-        min_vars_per_shard=min_records_per_shard_step1,
-        sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_attr_split_vcf
-    }
-
-
-    scatter (vcf_shard in SplitVcf.vcf_shards) {
-        call ModifyVcf {
-            input:
-                labeled_del = calcu_del.output_comp,
-                labeled_dup = calcu_dup.output_comp,
-                labeled_ins = calcu_ins.output_comp,
-                labeled_inv = calcu_inv.output_comp,
-                labeled_bnd = calcu_bnd.output_comp,
-                vcf = vcf_shard,
-                ref_prefix = ref_prefix,
-                sv_pipeline_docker = sv_pipeline_docker,
-                runtime_attr_override = runtime_attr_modify_vcf       
-        }
-    }
-
-    call MiniTasks.ConcatVcfs as CombineVcfStep1 {
-      input:
-        vcfs = ModifyVcf.annotated_vcf,
-        vcfs_idx = ModifyVcf.annotated_vcf_tbi,
-        naive = true,
-        outfile_prefix = "~{contig}.annotated.vcf",
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_combine_vcfs
+ 
+    call ModifyVcf {
+        input:
+            labeled_del = calcu_del.output_comp,
+            labeled_dup = calcu_dup.output_comp,
+            labeled_ins = calcu_ins.output_comp,
+            labeled_inv = calcu_inv.output_comp,
+            labeled_bnd = calcu_bnd.output_comp,
+            vcf = vcf,
+            prefix = prefix,
+            ref_prefix = ref_prefix,
+            sv_pipeline_docker = sv_pipeline_docker,
+            runtime_attr_override = runtime_attr_modify_vcf       
     }
 
     output {
-        File annotated_vcf = CombineVcfStep1.concat_vcf
-        File annotated_vcf_tbi = CombineVcfStep1.concat_vcf_idx
+        File annotated_vcf = ModifyVcf.annotated_vcf
+        File annotated_vcf_tbi = ModifyVcf.annotated_vcf_tbi
     }
 
 }
 
-task SplitBed {
+task SplitRefBed {
     input {
         File bed
+        String contig
         String sv_base_mini_docker
         RuntimeAttr? runtime_attr_override
     }
@@ -210,24 +179,26 @@ task SplitBed {
     String prefix = basename(bed, ".bed.gz")
     
     command <<<
+        set -eu
         zcat ~{bed} | head -1 > header
-        cat header <(zcat ~{bed} | awk '{if ($6=="DEL") print}') > ~{prefix}.DEL.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="DUP") print}') > ~{prefix}.DUP.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK" ) print}') > ~{prefix}.INS.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="INV" || $6=="CPX") print}' ) > ~{prefix}.INV_CPX.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="BND" || $6=="CTX") print}' ) > ~{prefix}.BND_CTX.bed
+        set -o pipefail  # enabled only after the 'head -1' pipeline above, which closes its input early
+        cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="DEL") print}') > ~{prefix}.~{contig}.DEL.bed
+        cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && $6=="DUP") print}') > ~{prefix}.~{contig}.DUP.bed
+        cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && ($6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK")) print}') > ~{prefix}.~{contig}.INS.bed  # SVTYPE alternation parenthesized: awk && binds tighter than ||
+        cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && ($6=="INV" || $6=="CPX")) print}' ) > ~{prefix}.~{contig}.INV_CPX.bed
+        cat header <(zcat ~{bed} | awk '{if ($1=="~{contig}" && ($6=="BND" || $6=="CTX")) print}' ) > ~{prefix}.~{contig}.BND_CTX.bed
     >>>
 
     output {
-        File del = "~{prefix}.DEL.bed"
-        File dup = "~{prefix}.DUP.bed"
-        File ins = "~{prefix}.INS.bed"
-        File inv = "~{prefix}.INV_CPX.bed"
-        File bnd = "~{prefix}.BND_CTX.bed"
+        File del = "~{prefix}.~{contig}.DEL.bed"
+        File dup = "~{prefix}.~{contig}.DUP.bed"
+        File ins = "~{prefix}.~{contig}.INS.bed"
+        File inv = "~{prefix}.~{contig}.INV_CPX.bed"
+        File bnd = "~{prefix}.~{contig}.BND_CTX.bed"
     }    
 }
 
-task SplitVcf {
+task SplitQueryVcf {
     input {
         File vcf
         String sv_pipeline_docker
@@ -258,9 +229,12 @@ task SplitVcf {
     String prefix = basename(vcf, ".vcf.gz")
     
     command <<<
+        set -euo pipefail
         svtk vcf2bed -i SVTYPE -i SVLEN ~{vcf} tmp.bed
         cut -f1-4,7-8 tmp.bed > ~{prefix}.bed
+        set +o pipefail
         head -1 ~{prefix}.bed > header
+        set -o pipefail
         cat header <(awk '{if ($5=="DEL") print}' ~{prefix}.bed )> ~{prefix}.DEL.bed
         cat header <(awk '{if ($5=="DUP") print}' ~{prefix}.bed )> ~{prefix}.DUP.bed
         cat header <(awk '{if ($5=="INS" || $5=="INS:ME" || $5=="INS:ME:ALU" || $5=="INS:ME:LINE1" || $5=="INS:ME:SVA" || $5=="ALU" || $5=="LINE1" || $5=="SVA" || $5=="HERVK" ) print}' ~{prefix}.bed )> ~{prefix}.INS.bed
@@ -283,7 +257,6 @@ task BedtoolsClosest {
         File bed_a
         File bed_b
         String svtype
-        String contig
         String sv_pipeline_docker
         RuntimeAttr? runtime_attr_override
     }
@@ -310,12 +283,10 @@ task BedtoolsClosest {
     }
     
     command <<<
-        awk '{if ($1=="~{contig}") print}' ~{bed_a} > filea.bed
-        awk '{if ($1=="~{contig}") print}' ~{bed_b} > fileb.bed
-
+        set -eu
         paste <(head -1 ~{bed_a}) <(head -1 ~{bed_b}) | sed -e "s/#//g" > ~{svtype}.bed
-
-        bedtools closest -wo -a <(sort -k1,1 -k2,2n filea.bed) -b <(sort -k1,1 -k2,2n fileb.bed) >> ~{svtype}.bed
+        set -o pipefail
+        bedtools closest -wo -a <(sort -k1,1 -k2,2n ~{bed_a}) -b <(sort -k1,1 -k2,2n ~{bed_b}) >> ~{svtype}.bed
     >>>
 
     output {
@@ -357,6 +328,7 @@ task SelectMatchedSVs {
     File pop_list = write_lines(population)
 
     command <<<
+        set -euo pipefail
         Rscript /opt/sv-pipeline/05_annotation/scripts/R1.bedtools_closest_CNV.R \
             -i ~{input_bed} \
             -o ~{prefix}.comparison \
@@ -421,6 +393,7 @@ task ModifyVcf {
         File labeled_inv
         File labeled_bnd
         File vcf
+        String prefix
         String ref_prefix
         String sv_pipeline_docker
         RuntimeAttr? runtime_attr_override
@@ -447,7 +420,6 @@ task ModifyVcf {
         bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, runtime_default.boot_disk_gb])
     }
 
-    String prefix = basename(vcf,'.vcf.gz')
     command <<<
         cat ~{labeled_del} > labeled.bed
         cat ~{labeled_dup} >> labeled.bed
diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 67db5e183..7e56f3bc1 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -20,9 +20,6 @@ workflow AnnotateVcf {
     Int? max_breakend_as_cnv_length
     String? svannotate_additional_args
 
-    Int max_shards_per_chrom_step1
-    Int min_records_per_shard_step1
-
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
     File? sample_keep_list              # List of samples to be retained from the output vcf
     File? ped_file                # Used for M/F AF calculations
@@ -47,10 +44,7 @@ workflow AnnotateVcf {
     RuntimeAttr? runtime_attr_shard_vcf
     RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
     RuntimeAttr? runtime_attr_compute_AFs
-    RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_modify_vcf
-    RuntimeAttr? runtime_attr_combine_vcfs
-    RuntimeAttr? runtime_attr_split_vcf
     RuntimeAttr? runtime_attr_split_ref_bed
     RuntimeAttr? runtime_attr_split_query_vcf
     RuntimeAttr? runtime_attr_bedtools_closest
@@ -77,8 +71,6 @@ workflow AnnotateVcf {
         svannotate_additional_args = svannotate_additional_args,
         max_breakend_as_cnv_length = max_breakend_as_cnv_length,
 
-        max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
-        min_records_per_shard_step1 = min_records_per_shard_step1,
         sample_pop_assignments = sample_pop_assignments,
         sample_keep_list = sample_keep_list,
         ped_file = ped_file,
@@ -101,10 +93,7 @@ workflow AnnotateVcf {
         runtime_attr_svannotate = runtime_attr_svannotate,
         runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
         runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
         runtime_attr_modify_vcf = runtime_attr_modify_vcf,
-        runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
-        runtime_attr_split_vcf  = runtime_attr_split_vcf,
         runtime_attr_split_ref_bed  = runtime_attr_split_ref_bed,
         runtime_attr_split_query_vcf  = runtime_attr_split_query_vcf,
         runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index d61231ee2..b3a97e6a3 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -561,8 +561,6 @@ workflow GATKSVPipelineSingleSample {
     Int? promoter_window
     Int? max_breakend_as_cnv_length
     Int annotation_sv_per_shard
-    Int annotation_max_shards_per_chrom_step1
-    Int annotation_min_records_per_shard_step1
 
     File? external_af_ref_bed             # bed file with population AFs for annotation
     String? external_af_ref_bed_prefix    # name of external AF bed file call set
@@ -1403,8 +1401,6 @@ workflow GATKSVPipelineSingleSample {
         use_hail = false,
         sharded_by_contig = false,
         sv_per_shard = annotation_sv_per_shard,
-        max_shards_per_chrom_step1 = annotation_max_shards_per_chrom_step1,
-        min_records_per_shard_step1 = annotation_min_records_per_shard_step1,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,
         gatk_docker = gatk_docker,
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index 984fd169a..b6bc2706f 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -5,7 +5,7 @@ import "TasksMakeCohortVcf.wdl" as MiniTasks
 import "HailMerge.wdl" as HailMerge
 import "AnnotateFunctionalConsequences.wdl" as func
 import "PruneAndAddVafs.wdl" as pav
-import "AnnotateExternalAF.wdl" as eaf
+import "AnnotateExternalAFPerShard.wdl" as eaf
 
 workflow ShardedAnnotateVcf {
 
@@ -21,9 +21,6 @@ workflow ShardedAnnotateVcf {
     Int? max_breakend_as_cnv_length
     String? svannotate_additional_args
 
-    Int max_shards_per_chrom_step1
-    Int min_records_per_shard_step1
-
     File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
     File? sample_keep_list
     File? ped_file                # Used for M/F AF calculations
@@ -46,10 +43,7 @@ workflow ShardedAnnotateVcf {
     RuntimeAttr? runtime_attr_svannotate
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
-    RuntimeAttr? runtime_attr_combine_vcfs
     RuntimeAttr? runtime_attr_modify_vcf
-    RuntimeAttr? runtime_attr_combine_vcfs
-    RuntimeAttr? runtime_attr_split_vcf
     RuntimeAttr? runtime_attr_split_ref_bed
     RuntimeAttr? runtime_attr_split_query_vcf
     RuntimeAttr? runtime_attr_bedtools_closest
@@ -62,6 +56,16 @@ workflow ShardedAnnotateVcf {
     RuntimeAttr? runtime_attr_fix_header_sharded_cluster
   }
 
+  if (defined(ref_bed)) {
+    call eaf.SplitRefBed {
+      input:
+        bed = select_first([ref_bed]),
+        contig = contig,
+        sv_base_mini_docker = sv_base_mini_docker,
+        runtime_attr_override = runtime_attr_split_ref_bed
+    }
+  }
+
   call MiniTasks.ScatterVcf {
     input:
       vcf = vcf,
@@ -107,23 +111,21 @@ workflow ShardedAnnotateVcf {
     }
 
     if (defined(ref_bed)) {
-      call eaf.AnnotateExternalAF as AnnotateExternalAF {
+      call eaf.AnnotateExternalAFPerShard {
         input:
           vcf = PruneAndAddVafs.output_vcf,
           vcf_idx = PruneAndAddVafs.output_vcf_idx,
-          ref_bed = select_first([ref_bed]),
+          split_ref_bed_del = select_first([SplitRefBed.del]),
+          split_ref_bed_dup = select_first([SplitRefBed.dup]),
+          split_ref_bed_ins = select_first([SplitRefBed.ins]),
+          split_ref_bed_inv = select_first([SplitRefBed.inv]),
+          split_ref_bed_bnd = select_first([SplitRefBed.bnd]),
           population = select_first([population]),
           ref_prefix = select_first([ref_prefix]),
           prefix = "~{prefix}.~{contig}.~{i}",
-          contigs = [contig],
-          max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
-          min_records_per_shard_step1 = min_records_per_shard_step1,
           sv_base_mini_docker = sv_base_mini_docker,
           sv_pipeline_docker = sv_pipeline_docker,
           runtime_attr_modify_vcf = runtime_attr_modify_vcf,
-          runtime_attr_split_vcf = runtime_attr_split_vcf,
-          runtime_attr_combine_vcfs = runtime_attr_combine_vcfs,
-          runtime_attr_split_ref_bed = runtime_attr_split_ref_bed,
           runtime_attr_split_query_vcf = runtime_attr_split_query_vcf,
           runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
           runtime_attr_select_matched_svs = runtime_attr_select_matched_svs
@@ -132,8 +134,8 @@ workflow ShardedAnnotateVcf {
   }
 
   output {
-    Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAF.annotated_vcf), PruneAndAddVafs.output_vcf])
-    Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAF.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
+    Array[File] sharded_annotated_vcf = if defined(ref_bed) then select_all(AnnotateExternalAFPerShard.annotated_vcf) else PruneAndAddVafs.output_vcf  # select_all() always yields a defined array, so select_first() could never fall back
+    Array[File] sharded_annotated_vcf_idx = if defined(ref_bed) then select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi) else PruneAndAddVafs.output_vcf_idx  # external-AF indexes only when ref_bed was supplied
   }
 }
 

From 0880aef577e12e647df4687a78eea0306dea0450 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Fri, 7 Jul 2023 13:26:00 -0400
Subject: [PATCH 18/26] remove AnnotateExternalAF.wdl

---
 wdl/AnnotateExternalAF.wdl | 428 -------------------------------------
 1 file changed, 428 deletions(-)
 delete mode 100644 wdl/AnnotateExternalAF.wdl

diff --git a/wdl/AnnotateExternalAF.wdl b/wdl/AnnotateExternalAF.wdl
deleted file mode 100644
index 4e98af36e..000000000
--- a/wdl/AnnotateExternalAF.wdl
+++ /dev/null
@@ -1,428 +0,0 @@
-version 1.0
-
-# Author: Xuefang Zhao <XZHAO12@mgh.harvard.edu>
-
-import "Structs.wdl"
-import "TasksMakeCohortVcf.wdl" as MiniTasks
-import "AnnotateExternalAFperShard.wdl" as AnnotateExternalAFperShard
-
-workflow AnnotateExternalAF {
-    input {
-        File vcf
-        File vcf_idx
-        File ref_bed
-        Array[String] population
-        Array[String] contigs
-        String ref_prefix
-        String prefix
-
-        Int max_shards_per_chrom_step1
-        Int min_records_per_shard_step1
-
-        String sv_base_mini_docker
-        String sv_pipeline_docker
-
-        # overrides for local tasks
-        RuntimeAttr? runtime_attr_modify_vcf
-        RuntimeAttr? runtime_attr_combine_vcfs
-        RuntimeAttr? runtime_attr_split_vcf
-        RuntimeAttr? runtime_attr_split_ref_bed
-        RuntimeAttr? runtime_attr_split_query_vcf
-        RuntimeAttr? runtime_attr_bedtools_closest
-        RuntimeAttr? runtime_attr_select_matched_svs
-
-    }
-    call SplitRefBed as split_ref_bed {
-        input:
-            bed = ref_bed,
-            sv_base_mini_docker = sv_base_mini_docker,
-            runtime_attr_override = runtime_attr_split_ref_bed
-    }
-    call SplitVcf as split_query_vcf {
-        input:
-            vcf = vcf,
-            sv_pipeline_docker = sv_pipeline_docker,
-            runtime_attr_override = runtime_attr_split_query_vcf
-    }
-
-    scatter ( contig in contigs ) {
-        call AnnotateExternalAFperShard.AnnotateExternalAFperShard {
-            input:
-                vcf = vcf,
-                vcf_idx = vcf_idx,
-                ref_bed = ref_bed,
-                split_query_vcf_del = split_query_vcf.del,
-                split_query_vcf_dup = split_query_vcf.dup,
-                split_query_vcf_ins = split_query_vcf.ins,
-                split_query_vcf_inv = split_query_vcf.inv,
-                split_query_vcf_bnd = split_query_vcf.bnd,
-                split_ref_bed_del = split_ref_bed.del,
-                split_ref_bed_dup = split_ref_bed.dup,
-                split_ref_bed_ins = split_ref_bed.ins,
-                split_ref_bed_inv = split_ref_bed.inv,
-                split_ref_bed_bnd = split_ref_bed.bnd,
-                population = population,
-                contig = contig,
-                ref_prefix = ref_prefix,
-                max_shards_per_chrom_step1 = max_shards_per_chrom_step1,
-                min_records_per_shard_step1 = min_records_per_shard_step1,
-                sv_base_mini_docker = sv_base_mini_docker,
-                sv_pipeline_docker = sv_pipeline_docker,
-                runtime_attr_split_vcf = runtime_attr_split_vcf,
-                runtime_attr_modify_vcf = runtime_attr_modify_vcf,
-                runtime_attr_select_matched_svs = runtime_attr_select_matched_svs,
-                runtime_attr_bedtools_closest = runtime_attr_bedtools_closest
-        }
-    }
-
-    call MiniTasks.ConcatVcfs as CombineVcfStep2 {
-      input:
-        vcfs = AnnotateExternalAFperContig.annotated_vcf,
-        vcfs_idx = AnnotateExternalAFperContig.annotated_vcf_tbi,
-        naive = true,
-        outfile_prefix = "~{prefix}.annotated",
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_combine_vcfs
-    }
-
-     output {
-        File annotated_vcf = CombineVcfStep2.concat_vcf
-        File annotated_vcf_tbi = CombineVcfStep2.concat_vcf_idx
-    }
-
-}
-
-task SplitRefBed {
-    input {
-        File bed
-        String sv_base_mini_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 5,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_base_mini_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(bed, ".bed.gz")
-    
-    command <<<
-        zcat ~{bed} | head -1 > header
-        cat header <(zcat ~{bed} | awk '{if ($6=="DEL") print}') > ~{prefix}.DEL.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="DUP") print}') > ~{prefix}.DUP.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="INS" || $6=="INS:ME" || $6=="INS:ME:ALU" || $6=="INS:ME:LINE1" || $6=="INS:ME:SVA" || $6=="ALU" || $6=="LINE1" || $6=="SVA" || $6=="HERVK" ) print}') > ~{prefix}.INS.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="INV" || $6=="CPX") print}' ) > ~{prefix}.INV_CPX.bed
-        cat header <(zcat ~{bed} | awk '{if ($6=="BND" || $6=="CTX") print}' ) > ~{prefix}.BND_CTX.bed
-    >>>
-
-    output {
-        File del = "~{prefix}.DEL.bed"
-        File dup = "~{prefix}.DUP.bed"
-        File ins = "~{prefix}.INS.bed"
-        File inv = "~{prefix}.INV_CPX.bed"
-        File bnd = "~{prefix}.BND_CTX.bed"
-    }    
-}
-
-task SplitVcf {
-    input {
-        File vcf
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 10,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(vcf, ".vcf.gz")
-    
-    command <<<
-        svtk vcf2bed -i SVTYPE -i SVLEN ~{vcf} - |
-            cut -f1-4,7-8 > tmp.bed
-        head -1 tmp.bed > ~{prefix}.bed
-        awk 'NR > 1' < tmp.bed \
-            | sort -k1,1V -k2,2n -k3,3n >> ~{prefix}.bed
-        rm tmp.bed
-        head -1 ~{prefix}.bed > header
-        cat header <(awk '{if ($5=="DEL") print}' ~{prefix}.bed )> ~{prefix}.DEL.bed
-        cat header <(awk '{if ($5=="DUP") print}' ~{prefix}.bed )> ~{prefix}.DUP.bed
-        cat header <(awk '{if ($5=="INS" || $5=="INS:ME" || $5=="INS:ME:ALU" || $5=="INS:ME:LINE1" || $5=="INS:ME:SVA" || $5=="ALU" || $5=="LINE1" || $5=="SVA" || $5=="HERVK" ) print}' ~{prefix}.bed )> ~{prefix}.INS.bed
-        cat header <(awk '{if ($5=="INV" || $5=="CPX") print}' ~{prefix}.bed )> ~{prefix}.INV_CPX.bed
-        cat header <(awk '{if ($5=="BND" || $5=="CTX") print}' ~{prefix}.bed )> ~{prefix}.BND_CTX.bed
-    >>>
-
-    output {
-        File bed = "~{prefix}.bed"
-        File del = "~{prefix}.DEL.bed"
-        File dup = "~{prefix}.DUP.bed"
-        File ins = "~{prefix}.INS.bed"
-        File inv = "~{prefix}.INV_CPX.bed"
-        File bnd = "~{prefix}.BND_CTX.bed"
-    }
-}
-
-task BedtoolsClosest {
-    input {
-        File bed_a
-        File bed_b
-        String svtype
-        String contig
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 5,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-    
-    command <<<
-        awk '{if ($1=="~{contig}") print}' ~{bed_a} > filea.bed
-        awk '{if ($1=="~{contig}") print}' ~{bed_b} > fileb.bed
-
-        paste <(head -1 ~{bed_a}) <(head -1 ~{bed_b}) | sed -e "s/#//g" > ~{svtype}.bed
-
-        bedtools closest -wo -a <(sort -k1,1 -k2,2n filea.bed) -b <(sort -k1,1 -k2,2n fileb.bed) >> ~{svtype}.bed
-    >>>
-
-    output {
-        File output_bed = "~{svtype}.bed"
-    }
-}
-
-task SelectMatchedSVs {
-    input {
-        File input_bed
-        String svtype
-        Array[String] population
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 5,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(input_bed, ".bed")
-    File pop_list = write_lines(population)
-
-    command <<<
-        Rscript /opt/sv-pipeline/05_annotation/scripts/R1.bedtools_closest_CNV.R \
-            -i ~{input_bed} \
-            -o ~{prefix}.comparison \
-            -p ~{pop_list}
-    >>>
-
-    output {
-        File output_comp = "~{prefix}.comparison"
-    }    
-}
-
-task SelectMatchedINSs {
-    input {
-        File input_bed
-        String svtype
-        Array[String] population
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 5,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(input_bed, ".bed")
-    File pop_list = write_lines(population)
-
-    command <<<
-        Rscript /opt/sv-pipeline/05_annotation/scripts/R2.bedtools_closest_INS.R \
-            -i ~{input_bed} \
-            -o ~{prefix}.comparison \
-            -p ~{pop_list}
-    >>>
-
-    output {
-        File output_comp = "~{prefix}.comparison"
-    }
-}
-
-task ModifyVcf {
-    input {
-        Array[File] labeled_del
-        Array[File] labeled_dup
-        Array[File] labeled_ins
-        Array[File] labeled_inv
-        Array[File] labeled_bnd
-        File vcf
-        String ref_prefix
-        String sv_pipeline_docker
-        RuntimeAttr? runtime_attr_override
-    }
-
-    RuntimeAttr runtime_default = object {
-        mem_gb: 3,
-        disk_gb: 5,
-        cpu_cores: 1,
-        preemptible_tries: 1,
-        max_retries: 1,
-        boot_disk_gb: 10
-    }
-
-    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-    
-    runtime {
-        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
-        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-        docker: sv_pipeline_docker
-        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-    }
-
-    String prefix = basename(vcf,'.vcf.gz')
-    command <<<
-        cat ~{sep=" " labeled_del} > labeled.bed
-        cat ~{sep=" " labeled_dup} >> labeled.bed
-        cat ~{sep=" " labeled_ins} >> labeled.bed
-        cat ~{sep=" " labeled_inv} >> labeled.bed
-        cat ~{sep=" " labeled_bnd} >> labeled.bed
-
-        python <<CODE
-        import os
-        fin=os.popen(r'''zcat %s'''%("~{vcf}"))
-        header = []
-        body = {}
-        SVID_key = []
-        for line in fin:
-            pin=line.strip().split()
-            if pin[0][:2]=='##':
-                header.append(pin)
-            else:
-                body[pin[2]]=pin
-                SVID_key.append(pin[2])
-        header.append(['##INFO=<ID='+"~{ref_prefix}"+'_SVID'+',Number=1,Type=String,Description="SVID of an overlapping event in gnomad used for external allele frequency annotation.">'])
-
-        fin.close()
-        fin=open('labeled.bed')
-        colname = fin.readline().strip().split()
-
-        for j in range(len(colname)-1):
-            if j>1:
-                header.append(['##INFO=<ID='+"~{ref_prefix}"+'_'+colname[j]+',Number=1,Type=Float,Description="Allele frequency (for biallelic sites) or copy-state frequency (for multiallelic sites) of an overlapping event in gnomad.">'])
-
-        for line in fin:
-            pin=line.strip().split()
-            if pin[0]=='query_svid': continue
-            info_add = ["~{ref_prefix}"+'_SVID'+'='+pin[1]]
-            for j in range(len(colname)-1):
-                if j>1:
-                    info_add.append("~{ref_prefix}"+'_'+colname[j]+'='+pin[j])
-            body[pin[0]][7]+=';'+';'.join(info_add)
-        fin.close()
-
-        fo=open('~{prefix}.annotated.vcf','w')
-        for i in header:
-            print(' '.join(i), file=fo)
-        for i in SVID_key:
-            print('\t'.join(body[i]), file=fo)
-        fo.close()
-        CODE
-
-        bgzip ~{prefix}.annotated.vcf
-        tabix ~{prefix}.annotated.vcf.gz
-    >>>
-
-    output {
-        File annotated_vcf = "~{prefix}.annotated.vcf.gz"
-        File annotated_vcf_tbi = "~{prefix}.annotated.vcf.gz.tbi"
-    }        
-}
-
-
-
-

From d94df6df797afcc117076e141564d39686986c08 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 10 Jul 2023 14:06:18 -0400
Subject: [PATCH 19/26] make index optional

---
 wdl/AnnotateVcf.wdl        | 8 +++++---
 wdl/ShardedAnnotateVcf.wdl | 2 +-
 wdl/TasksMakeCohortVcf.wdl | 1 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 7e56f3bc1..9440fda95 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -9,10 +9,9 @@ workflow AnnotateVcf {
 
   input {
     Array[File] vcf_list  # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Outputs will match
-    Array[File] vcf_idx_list
+    Array[File]? vcf_idx_list
     File contig_list
     Array[String] prefix_list
-    Boolean sharded_by_contig  # True if providing a vcf_list sharded by contig. False if providing a single full VCF
 
     File protein_coding_gtf
     File? noncoding_bed
@@ -39,6 +38,8 @@ workflow AnnotateVcf {
     String sv_base_mini_docker
     String gatk_docker
 
+    File? NONE_FILE_
+
     RuntimeAttr? runtime_attr_svannotate
     RuntimeAttr? runtime_attr_concat_vcfs
     RuntimeAttr? runtime_attr_shard_vcf
@@ -56,13 +57,14 @@ workflow AnnotateVcf {
   }
 
   Array[String] contigs = read_lines(contig_list)
+  Boolean sharded_by_contig = (length(vcf_list) == length(contigs))
 
   scatter (i in range(length(contigs))) {
     Int array_index = if (sharded_by_contig) then i else 0
     call sharded_annotate_vcf.ShardedAnnotateVcf {
       input:
         vcf = vcf_list[array_index],
-        vcf_idx = vcf_idx_list[array_index],
+        vcf_idx = if defined(vcf_idx_list) then select_first([vcf_idx_list])[array_index] else NONE_FILE_,
         contig = contigs[i],
         prefix = prefix_list[array_index],
         protein_coding_gtf = protein_coding_gtf,
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index b6bc2706f..b7186771b 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -11,7 +11,7 @@ workflow ShardedAnnotateVcf {
 
   input {
     File vcf
-    File vcf_idx
+    File? vcf_idx
     String prefix
     String contig
 
diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl
index 1cf9237d7..b344c0a42 100644
--- a/wdl/TasksMakeCohortVcf.wdl
+++ b/wdl/TasksMakeCohortVcf.wdl
@@ -990,6 +990,7 @@ task ScatterVcf {
 
   command <<<
     set -euo pipefail
+    ~{if !defined(vcf_index) then "tabix ~{vcf}" else ""}
     # in case the file is empty create an empty shard
     bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz"
     bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig}

From dcc341d69b987dfd6f92c25b48478cafd72c289b Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 10 Jul 2023 14:27:59 -0400
Subject: [PATCH 20/26] remove sharded_by_contig input from jsons

---
 .../workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl    | 1 -
 .../cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl    | 1 -
 inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl          | 1 -
 3 files changed, 3 deletions(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
index 81b4c20e9..f8dc2a433 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
@@ -14,7 +14,6 @@
   "AnnotateVcf.sv_per_shard" : "5000",
 
   "AnnotateVcf.prefix_list" : "${this.sample_set_id}",
-  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
index 5d378d6c0..b89279374 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
@@ -14,7 +14,6 @@
   "AnnotateVcf.sv_per_shard" : "5000",
   
   "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}",
-  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
index 150ae0136..ccb9cf6a6 100644
--- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
+++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
@@ -14,7 +14,6 @@
   "AnnotateVcf.sv_per_shard" : "5000",
 
   "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}],
-  "AnnotateVcf.sharded_by_contig": "false",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }},

From b8ccd52a78c63b0c4250b1d39fa96614dd73fde5 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Mon, 10 Jul 2023 14:40:22 -0400
Subject: [PATCH 21/26] fix womtool validation: drop stale sharded_by_contig input

---
 wdl/GATKSVPipelineSingleSample.wdl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index b3a97e6a3..cd5750cfc 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -1399,7 +1399,6 @@ workflow GATKSVPipelineSingleSample {
         ref_prefix = external_af_ref_bed_prefix,
         population = external_af_population,
         use_hail = false,
-        sharded_by_contig = false,
         sv_per_shard = annotation_sv_per_shard,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,

From 578af9efcd746da98f8638b23059815021c76d17 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 27 Jul 2023 14:43:47 -0400
Subject: [PATCH 22/26] AnnotateVcf: single VCF input; require index, infer path

---
 .../AnnotateVcf.SingleBatch.json.tmpl         |  5 +-
 .../AnnotateVcf.json.tmpl                     |  5 +-
 .../test/AnnotateVcf/AnnotateVcf.json.tmpl    |  5 +-
 wdl/AnnotateVcf.wdl                           | 73 ++++++++-----------
 wdl/GATKSVPipelineSingleSample.wdl            | 17 ++---
 wdl/ShardedAnnotateVcf.wdl                    |  4 +-
 6 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
index f8dc2a433..448606ad5 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.SingleBatch.json.tmpl
@@ -1,6 +1,5 @@
 {
-  "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}",
-  "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}",
+  "AnnotateVcf.vcf" : "${this.cleaned_vcf}",
 
   "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}",
   "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}",
@@ -13,7 +12,7 @@
   "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}",
   "AnnotateVcf.sv_per_shard" : "5000",
 
-  "AnnotateVcf.prefix_list" : "${this.sample_set_id}",
+  "AnnotateVcf.prefix" : "${this.sample_set_id}",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
index b89279374..fac58dce0 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/AnnotateVcf.json.tmpl
@@ -1,6 +1,5 @@
 {
-  "AnnotateVcf.vcf_list" : "${this.cleaned_vcf}",
-  "AnnotateVcf.vcf_idx_list" : "${this.cleaned_vcf_index}",
+  "AnnotateVcf.vcf" : "${this.cleaned_vcf}",
 
   "AnnotateVcf.protein_coding_gtf" : "${workspace.protein_coding_gtf}",
   "AnnotateVcf.noncoding_bed" : "${workspace.noncoding_bed}",
@@ -13,7 +12,7 @@
   "AnnotateVcf.ped_file": "${workspace.cohort_ped_file}",
   "AnnotateVcf.sv_per_shard" : "5000",
   
-  "AnnotateVcf.prefix_list" : "${this.sample_set_set_id}",
+  "AnnotateVcf.prefix" : "${this.sample_set_set_id}",
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker" : "${workspace.gatk_docker}",
diff --git a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
index ccb9cf6a6..5d6bbb582 100644
--- a/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
+++ b/inputs/templates/test/AnnotateVcf/AnnotateVcf.json.tmpl
@@ -1,6 +1,5 @@
 {
-  "AnnotateVcf.vcf_list" :  [ {{ test_batch.clean_vcf | tojson }} ],
-  "AnnotateVcf.vcf_idx_list" : [{{ test_batch.clean_vcf_index | tojson }}],
+  "AnnotateVcf.vcf": {{ test_batch.clean_vcf | tojson }},
 
   "AnnotateVcf.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }},
   "AnnotateVcf.noncoding_bed" :       {{ reference_resources.noncoding_bed | tojson }},
@@ -13,7 +12,7 @@
   "AnnotateVcf.ped_file":      {{ test_batch.ped_file | tojson }},
   "AnnotateVcf.sv_per_shard" : "5000",
 
-  "AnnotateVcf.prefix_list" : [{{ test_batch.name | tojson }}],
+  "AnnotateVcf.prefix" : {{ test_batch.name | tojson }},
   "AnnotateVcf.use_hail": "false",
 
   "AnnotateVcf.gatk_docker":{{ dockers.gatk_docker | tojson }},
diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 9440fda95..925b25555 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -8,10 +8,9 @@ import "HailMerge.wdl" as HailMerge
 workflow AnnotateVcf {
 
   input {
-    Array[File] vcf_list  # Must be either single full VCF (array of length 1) or array of VCFs sharded by contig. Outputs will match
-    Array[File]? vcf_idx_list
-    File contig_list
-    Array[String] prefix_list
+    File vcf  # GATK-SV VCF for annotation. Index .tbi must be located at the same path
+    File contig_list  # Ordered list of contigs to annotate that are present in the input VCF
+    String prefix
 
     File protein_coding_gtf
     File? noncoding_bed
@@ -38,8 +37,6 @@ workflow AnnotateVcf {
     String sv_base_mini_docker
     String gatk_docker
 
-    File? NONE_FILE_
-
     RuntimeAttr? runtime_attr_svannotate
     RuntimeAttr? runtime_attr_concat_vcfs
     RuntimeAttr? runtime_attr_shard_vcf
@@ -57,16 +54,14 @@ workflow AnnotateVcf {
   }
 
   Array[String] contigs = read_lines(contig_list)
-  Boolean sharded_by_contig = (length(vcf_list) == length(contigs))
 
-  scatter (i in range(length(contigs))) {
-    Int array_index = if (sharded_by_contig) then i else 0
+  scatter (contig in contigs) {
     call sharded_annotate_vcf.ShardedAnnotateVcf {
       input:
-        vcf = vcf_list[array_index],
-        vcf_idx = if defined(vcf_idx_list) then select_first([vcf_idx_list])[array_index] else NONE_FILE_,
-        contig = contigs[i],
-        prefix = prefix_list[array_index],
+        vcf = vcf,
+        vcf_idx = vcf + ".tbi",
+        contig = contig,
+        prefix = prefix,
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
         promoter_window = promoter_window,
@@ -107,43 +102,39 @@ workflow AnnotateVcf {
     }
   }
 
-  # Concat VCFs to the contig level or fully depending on format of input
+  # Concat VCF shards
   # ShardedAnnotateVcf.sharded_annotated_vcf is is an Array[Array[File]] with one inner Array[File] of shards per contig
-  Array[Array[File]] vcfs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf)]
-  Array[Array[File]] vcf_idxs_for_concatenation = if sharded_by_contig then ShardedAnnotateVcf.sharded_annotated_vcf_idx else [flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)]
+  Array[File] vcfs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf)
+  Array[File] vcf_idxs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)
   if (use_hail) {
-    scatter (i in range(length(vcfs_for_concatenation))) {
-      call HailMerge.HailMerge {
-        input:
-          vcfs=vcfs_for_concatenation[i],
-          prefix="~{prefix_list[i]}.annotated",
-          gcs_project=gcs_project,
-          sv_base_mini_docker=sv_base_mini_docker,
-          sv_pipeline_docker=sv_pipeline_docker,
-          sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
-          runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
-          runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
-          runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
-      }
+    call HailMerge.HailMerge {
+      input:
+        vcfs=vcfs_for_concatenation,
+        prefix="~{prefix}.annotated",
+        gcs_project=gcs_project,
+        sv_base_mini_docker=sv_base_mini_docker,
+        sv_pipeline_docker=sv_pipeline_docker,
+        sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
+        runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
+        runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
+        runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
     }
   }
 
   if (!use_hail) {
-    scatter (i in range(length(vcfs_for_concatenation))) {
-      call MiniTasks.ConcatVcfs {
-        input:
-          vcfs=vcfs_for_concatenation[i],
-          vcfs_idx=vcf_idxs_for_concatenation[i],
-          allow_overlaps=true,
-          outfile_prefix="~{prefix_list[i]}.annotated",
-          sv_base_mini_docker=sv_base_mini_docker,
-          runtime_attr_override=runtime_attr_concat_sharded_cluster
-      }
+    call MiniTasks.ConcatVcfs {
+      input:
+        vcfs=vcfs_for_concatenation,
+        vcfs_idx=vcf_idxs_for_concatenation,
+        allow_overlaps=true,
+        outfile_prefix="~{prefix}.annotated",
+        sv_base_mini_docker=sv_base_mini_docker,
+        runtime_attr_override=runtime_attr_concat_sharded_cluster
     }
   }
 
   output {
-    Array[File] output_vcf_list = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
-    Array[File] output_vcf_idx_list = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
+    File annotated_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
+    File annotated_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
   }
 }
\ No newline at end of file
diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index cd5750cfc..f82cc96e6 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -1387,9 +1387,8 @@ workflow GATKSVPipelineSingleSample {
 
   call annotate.AnnotateVcf {
        input:
-        vcf_list = [FilterSample.out],
-        vcf_idx_list = [FilterSample.out_idx],
-        prefix_list = [batch],
+        vcf = FilterSample.out,
+        prefix = batch,
         contig_list = primary_contigs_list,
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
@@ -1408,18 +1407,18 @@ workflow GATKSVPipelineSingleSample {
 
   call SingleSampleFiltering.VcfToBed as VcfToBed {
     input:
-      vcf = AnnotateVcf.output_vcf_list[0],
+      vcf = AnnotateVcf.annotated_vcf,
       prefix = batch,
       sv_pipeline_docker = sv_pipeline_docker
   }
 
   call SingleSampleFiltering.UpdateBreakendRepresentation {
     input:
-      vcf=AnnotateVcf.output_vcf_list[0],
-      vcf_idx=AnnotateVcf.output_vcf_idx_list[0],
+      vcf=AnnotateVcf.annotated_vcf,
+      vcf_idx=AnnotateVcf.annotated_vcf_index,
       ref_fasta=reference_fasta,
       ref_fasta_idx=reference_index,
-      prefix=basename(AnnotateVcf.output_vcf_list[0], ".vcf.gz") + ".final_cleanup",
+      prefix=basename(AnnotateVcf.annotated_vcf, ".vcf.gz") + ".final_cleanup",
       sv_pipeline_docker=sv_pipeline_docker
   }
 
@@ -1459,8 +1458,8 @@ workflow GATKSVPipelineSingleSample {
     # These files contain events reported in the internal VCF representation
     # They are less VCF-spec compliant but may be useful if components of the pipeline need to be re-run
     # on the output.
-    File pre_cleanup_vcf = AnnotateVcf.output_vcf_list[0]
-    File pre_cleanup_vcf_idx = AnnotateVcf.output_vcf_idx_list[0]
+    File pre_cleanup_vcf = AnnotateVcf.annotated_vcf
+    File pre_cleanup_vcf_idx = AnnotateVcf.annotated_vcf_index
 
     File ploidy_matrix = select_first([GatherBatchEvidence.batch_ploidy_matrix])
     File ploidy_plots = select_first([GatherBatchEvidence.batch_ploidy_plots])
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index b7186771b..b7bd0adc1 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -7,11 +7,13 @@ import "AnnotateFunctionalConsequences.wdl" as func
 import "PruneAndAddVafs.wdl" as pav
 import "AnnotateExternalAFPerShard.wdl" as eaf
 
+# Perform annotation per contig
+
 workflow ShardedAnnotateVcf {
 
   input {
     File vcf
-    File? vcf_idx
+    File vcf_idx
     String prefix
     String contig
 

From b107c0535eb3e4468660af40ddb529cf225e0e5f Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 27 Jul 2023 14:46:20 -0400
Subject: [PATCH 23/26] ScatterVcf: don't generate index when none is provided

---
 wdl/AnnotateVcf.wdl        | 2 +-
 wdl/TasksMakeCohortVcf.wdl | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 925b25555..99f95eb92 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -137,4 +137,4 @@ workflow AnnotateVcf {
     File annotated_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf])
     File annotated_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index])
   }
-}
\ No newline at end of file
+}
diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl
index b344c0a42..1cf9237d7 100644
--- a/wdl/TasksMakeCohortVcf.wdl
+++ b/wdl/TasksMakeCohortVcf.wdl
@@ -990,7 +990,6 @@ task ScatterVcf {
 
   command <<<
     set -euo pipefail
-    ~{if !defined(vcf_index) then "tabix ~{vcf}" else ""}
     # in case the file is empty create an empty shard
     bcftools view -h ~{vcf} | bgzip -c > "~{prefix}.0.vcf.gz"
     bcftools +scatter ~{vcf} -o . -O z -p "~{prefix}". --threads ~{threads} -n ~{records_per_shard} ~{"-r " + contig}

From e1bb41cae3543aeb81458ec1e45bbd467929bea0 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 27 Jul 2023 15:05:12 -0400
Subject: [PATCH 24/26] clean up runtime attrs

---
 wdl/AnnotateVcf.wdl        | 26 +++++++++++---------------
 wdl/ShardedAnnotateVcf.wdl |  7 +------
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 99f95eb92..85d802cb0 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -38,8 +38,7 @@ workflow AnnotateVcf {
     String gatk_docker
 
     RuntimeAttr? runtime_attr_svannotate
-    RuntimeAttr? runtime_attr_concat_vcfs
-    RuntimeAttr? runtime_attr_shard_vcf
+    RuntimeAttr? runtime_attr_scatter_vcf
     RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
     RuntimeAttr? runtime_attr_compute_AFs
     RuntimeAttr? runtime_attr_modify_vcf
@@ -47,10 +46,10 @@ workflow AnnotateVcf {
     RuntimeAttr? runtime_attr_split_query_vcf
     RuntimeAttr? runtime_attr_bedtools_closest
     RuntimeAttr? runtime_attr_select_matched_svs
-    RuntimeAttr? runtime_attr_concat_sharded_cluster
-    RuntimeAttr? runtime_attr_preconcat_sharded_cluster
-    RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
-    RuntimeAttr? runtime_attr_fix_header_sharded_cluster
+    RuntimeAttr? runtime_attr_concat
+    RuntimeAttr? runtime_attr_preconcat
+    RuntimeAttr? runtime_attr_hail_merge
+    RuntimeAttr? runtime_attr_fix_header
   }
 
   Array[String] contigs = read_lines(contig_list)
@@ -88,17 +87,14 @@ workflow AnnotateVcf {
         sv_pipeline_hail_docker = sv_pipeline_hail_docker,
 
         runtime_attr_svannotate = runtime_attr_svannotate,
+        runtime_attr_scatter_vcf = runtime_attr_scatter_vcf,
         runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
         runtime_attr_compute_AFs  = runtime_attr_compute_AFs,
         runtime_attr_modify_vcf = runtime_attr_modify_vcf,
         runtime_attr_split_ref_bed  = runtime_attr_split_ref_bed,
         runtime_attr_split_query_vcf  = runtime_attr_split_query_vcf,
         runtime_attr_bedtools_closest = runtime_attr_bedtools_closest,
-        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs,
-        runtime_attr_concat_sharded_cluster = runtime_attr_concat_sharded_cluster,
-        runtime_attr_preconcat_sharded_cluster  = runtime_attr_preconcat_sharded_cluster,
-        runtime_attr_hail_merge_sharded_cluster = runtime_attr_hail_merge_sharded_cluster,
-        runtime_attr_fix_header_sharded_cluster = runtime_attr_fix_header_sharded_cluster
+        runtime_attr_select_matched_svs = runtime_attr_select_matched_svs
     }
   }
 
@@ -115,9 +111,9 @@ workflow AnnotateVcf {
         sv_base_mini_docker=sv_base_mini_docker,
         sv_pipeline_docker=sv_pipeline_docker,
         sv_pipeline_hail_docker=select_first([sv_pipeline_hail_docker]),
-        runtime_override_preconcat=runtime_attr_preconcat_sharded_cluster,
-        runtime_override_hail_merge=runtime_attr_hail_merge_sharded_cluster,
-        runtime_override_fix_header=runtime_attr_fix_header_sharded_cluster
+        runtime_override_preconcat=runtime_attr_preconcat,
+        runtime_override_hail_merge=runtime_attr_hail_merge,
+        runtime_override_fix_header=runtime_attr_fix_header
     }
   }
 
@@ -129,7 +125,7 @@ workflow AnnotateVcf {
         allow_overlaps=true,
         outfile_prefix="~{prefix}.annotated",
         sv_base_mini_docker=sv_base_mini_docker,
-        runtime_attr_override=runtime_attr_concat_sharded_cluster
+        runtime_attr_override=runtime_attr_concat
     }
   }
 
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index b7bd0adc1..cd667dddd 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -51,11 +51,6 @@ workflow ShardedAnnotateVcf {
     RuntimeAttr? runtime_attr_bedtools_closest
     RuntimeAttr? runtime_attr_select_matched_svs
     RuntimeAttr? runtime_attr_scatter_vcf
-    RuntimeAttr? runtime_attr_fix_ends_rescale_GQ
-    RuntimeAttr? runtime_attr_concat_sharded_cluster
-    RuntimeAttr? runtime_attr_preconcat_sharded_cluster
-    RuntimeAttr? runtime_attr_hail_merge_sharded_cluster
-    RuntimeAttr? runtime_attr_fix_header_sharded_cluster
   }
 
   if (defined(ref_bed)) {
@@ -109,7 +104,7 @@ workflow ShardedAnnotateVcf {
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,
         runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
-        runtime_attr_compute_AFs = runtime_attr_compute_AFs,
+        runtime_attr_compute_AFs = runtime_attr_compute_AFs
     }
 
     if (defined(ref_bed)) {

From d52ac3d0efe04615f5ea32f8677a6cc1418e0f35 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 27 Jul 2023 15:16:54 -0400
Subject: [PATCH 25/26] move PruneAndAddVafs tasks to ShardedAnnotateVcf

---
 wdl/AnnotateVcf.wdl        |  2 +-
 wdl/PruneAndAddVafs.wdl    | 49 ---------------------
 wdl/ShardedAnnotateVcf.wdl | 89 ++++++++++++++++++++++++++++++--------
 3 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/wdl/AnnotateVcf.wdl b/wdl/AnnotateVcf.wdl
index 85d802cb0..d1f5ed2ad 100644
--- a/wdl/AnnotateVcf.wdl
+++ b/wdl/AnnotateVcf.wdl
@@ -98,7 +98,7 @@ workflow AnnotateVcf {
     }
   }
 
-  # Concat VCF shards
+  # Concat VCF shards with or without hail
   # ShardedAnnotateVcf.sharded_annotated_vcf is is an Array[Array[File]] with one inner Array[File] of shards per contig
   Array[File] vcfs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf)
   Array[File] vcf_idxs_for_concatenation = flatten(ShardedAnnotateVcf.sharded_annotated_vcf_idx)
diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
index e06263aeb..e977c1468 100644
--- a/wdl/PruneAndAddVafs.wdl
+++ b/wdl/PruneAndAddVafs.wdl
@@ -60,52 +60,3 @@ workflow PruneAndAddVafs {
   }
 }
 
-task ComputeShardAFs {
-  input {
-    File vcf
-    String prefix
-    File? sample_pop_assignments
-    File? ped_file
-    File? par_bed
-    File? allosomes_list
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-  RuntimeAttr default_attr = object {
-    cpu_cores: 1,
-    mem_gb: 1.5,
-    disk_gb: ceil(20 + size(vcf, "GB") * 2),
-    boot_disk_gb: 10,
-    preemptible_tries: 3,
-    max_retries: 1
-  }
-  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
-
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \
-      ~{"-p " + sample_pop_assignments} \
-      ~{"-f " + ped_file} \
-      ~{"-par " + par_bed} \
-      ~{"--allosomes-list " + allosomes_list} \
-    | bgzip -c \
-    > "~{prefix}.wAFs.vcf.gz"
-
-    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
-  >>>
-
-  output {
-    File shard_wAFs = "~{prefix}.wAFs.vcf.gz"
-    File shard_wAFs_idx = "~{prefix}.wAFs.vcf.gz.tbi"
-  }
-
-  runtime {
-    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
-    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
-    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
-    docker: sv_pipeline_docker
-    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
-    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
-  }
-}
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index cd667dddd..feefeebb9 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -4,7 +4,6 @@ import "Structs.wdl"
 import "TasksMakeCohortVcf.wdl" as MiniTasks
 import "HailMerge.wdl" as HailMerge
 import "AnnotateFunctionalConsequences.wdl" as func
-import "PruneAndAddVafs.wdl" as pav
 import "AnnotateExternalAFPerShard.wdl" as eaf
 
 # Perform annotation per contig
@@ -75,11 +74,23 @@ workflow ShardedAnnotateVcf {
   }
 
   scatter (i in range(length(ScatterVcf.shards))) {
+    String shard_prefix = "~{prefix}.~{contig}.~{i}"
+
+    if (defined(sample_keep_list)) {
+      call util.SubsetVcfBySamplesList {
+        input:
+          vcf = ScatterVcf.shards[i],
+          list_of_samples = select_first([sample_keep_list]),
+          sv_base_mini_docker = sv_base_mini_docker,
+          runtime_attr_override = runtime_attr_subset_vcf_by_samples_list
+      }
+    }
 
     call func.AnnotateFunctionalConsequences {
       input:
-        vcf = ScatterVcf.shards[i],
-        prefix = "~{prefix}.~{contig}.~{i}",
+        vcf = select_first([SubsetVcfBySamplesList.vcf_subset, ScatterVcf.shards[i]]),
+        vcf_index = SubsetVcfBySamplesList.vcf_subset_index,
+        prefix = shard_prefix,
         protein_coding_gtf = protein_coding_gtf,
         noncoding_bed = noncoding_bed,
         promoter_window = promoter_window,
@@ -89,29 +100,24 @@ workflow ShardedAnnotateVcf {
         runtime_attr_svannotate = runtime_attr_svannotate
     }
 
-    call pav.PruneAndAddVafs as PruneAndAddVafs {
+    # Compute AC, AN, and AF per population & sex combination
+    call ComputeAFs {
       input:
         vcf = AnnotateFunctionalConsequences.annotated_vcf,
-        vcf_idx = AnnotateFunctionalConsequences.annotated_vcf_index,
-        prefix = "~{prefix}.~{contig}.~{i}",
-        contig = contig,
+        prefix = shard_prefix,
+        sample_pop_assignments = sample_pop_assignments,
         ped_file = ped_file,
         par_bed = par_bed,
-        sample_keep_list = sample_keep_list,
         allosomes_list = allosomes_list,
-        sample_pop_assignments = sample_pop_assignments,
-
-        sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_docker = sv_pipeline_docker,
-        runtime_attr_subset_vcf_by_samples_list = runtime_attr_subset_vcf_by_samples_list,
-        runtime_attr_compute_AFs = runtime_attr_compute_AFs
+        runtime_attr_override = runtime_attr_compute_AFs
     }
 
     if (defined(ref_bed)) {
       call eaf.AnnotateExternalAFPerShard {
         input:
-          vcf = PruneAndAddVafs.output_vcf,
-          vcf_idx = PruneAndAddVafs.output_vcf_idx,
+          vcf = ComputeAFs.af_vcf,
+          vcf_idx = ComputeAFs.af_vcf_idx,
           split_ref_bed_del = select_first([SplitRefBed.del]),
           split_ref_bed_dup = select_first([SplitRefBed.dup]),
           split_ref_bed_ins = select_first([SplitRefBed.ins]),
@@ -131,8 +137,57 @@ workflow ShardedAnnotateVcf {
   }
 
   output {
-    Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf), PruneAndAddVafs.output_vcf])
-    Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi), PruneAndAddVafs.output_vcf_idx])
+    Array[File] sharded_annotated_vcf = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf), ComputeAFs.af_vcf])
+    Array[File] sharded_annotated_vcf_idx = select_first([select_all(AnnotateExternalAFPerShard.annotated_vcf_tbi), ComputeAFs.af_vcf_idx])
   }
 }
 
+task ComputeAFs {  # Runs compute_AFs.py on one VCF, then bgzips and tabix-indexes the result
+  input {
+    File vcf  # VCF handed to compute_AFs.py as its input
+    String prefix  # output basename; the task writes <prefix>.wAFs.vcf.gz
+    File? sample_pop_assignments  # optional; forwarded to compute_AFs.py as -p when defined
+    File? ped_file  # optional; forwarded to compute_AFs.py as -f when defined
+    File? par_bed  # optional; forwarded to compute_AFs.py as -par when defined
+    File? allosomes_list  # optional; forwarded to compute_AFs.py as --allosomes-list when defined
+    String sv_pipeline_docker  # image the task runs in (see runtime.docker)
+    RuntimeAttr? runtime_attr_override  # optional; preferred over default_attr when supplied
+  }
+  RuntimeAttr default_attr = object {  # fallback runtime settings
+    cpu_cores: 1,
+    mem_gb: 1.5,
+    disk_gb: ceil(20 + size(vcf, "GB") * 2),  # 20 GB plus twice the input VCF size, rounded up
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])  # override, if given, is taken as a whole
+
+  command <<<
+    set -euo pipefail
+    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py "~{vcf}" stdout \
+      ~{"-p " + sample_pop_assignments} \
+      ~{"-f " + ped_file} \
+      ~{"-par " + par_bed} \
+      ~{"--allosomes-list " + allosomes_list} \
+    | bgzip -c \
+    > "~{prefix}.wAFs.vcf.gz"
+
+    tabix -p vcf "~{prefix}.wAFs.vcf.gz"
+  >>>
+
+  output {
+    File af_vcf = "~{prefix}.wAFs.vcf.gz"  # bgzip-compressed output of compute_AFs.py
+    File af_vcf_idx = "~{prefix}.wAFs.vcf.gz.tbi"  # index produced by the tabix call above
+  }
+
+  runtime {  # each setting falls back to default_attr; presumably RuntimeAttr fields are optional - confirm in Structs.wdl
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}

From ae48efae999c7043d9783ede2d427cf9d7e9300e Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman <epierceh@broadinstitute.org>
Date: Thu, 27 Jul 2023 15:22:09 -0400
Subject: [PATCH 26/26] remove PruneAndAddVafs.wdl

---
 wdl/PruneAndAddVafs.wdl    | 62 --------------------------------------
 wdl/ShardedAnnotateVcf.wdl |  1 +
 2 files changed, 1 insertion(+), 62 deletions(-)
 delete mode 100644 wdl/PruneAndAddVafs.wdl

diff --git a/wdl/PruneAndAddVafs.wdl b/wdl/PruneAndAddVafs.wdl
deleted file mode 100644
index e977c1468..000000000
--- a/wdl/PruneAndAddVafs.wdl
+++ /dev/null
@@ -1,62 +0,0 @@
-# Workflow to perform final sample pruning & compute all relevant AF statistics
-# for a VCF from the Talkowski SV pipeline
-
-version 1.0
-
-import "TasksMakeCohortVcf.wdl" as MiniTasks
-import "Utils.wdl" as util
-
-# Prune off samples in annotated VCF, add VAF annotation
-workflow PruneAndAddVafs {
-  
-  input {
-
-    File vcf
-    File vcf_idx
-    String prefix
-    String contig
-
-    File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
-    File? ped_file                # Used for M/F AF calculations
-    File? par_bed
-    File? allosomes_list
-    File? sample_keep_list              # List of samples to be retained from the output vcf
-
-    String sv_base_mini_docker
-    String sv_pipeline_docker
-
-    RuntimeAttr? runtime_attr_compute_AFs
-    RuntimeAttr? runtime_attr_subset_vcf_by_samples_list
-  }
-  
-  # Prune VCF
-  if (defined(sample_keep_list)) {
-    call util.SubsetVcfBySamplesList {
-      input:
-        vcf = vcf,
-        vcf_idx = vcf_idx,
-        list_of_samples = select_first([sample_keep_list]),
-        sv_base_mini_docker = sv_base_mini_docker,
-        runtime_attr_override = runtime_attr_subset_vcf_by_samples_list
-    }
-  }
-
-  # Compute AC, AN, and AF per population & sex combination
-  call ComputeShardAFs {
-    input:
-      vcf = select_first([SubsetVcfBySamplesList.vcf_subset, vcf]),
-      prefix = prefix,
-      sample_pop_assignments = sample_pop_assignments,
-      ped_file = ped_file,
-      par_bed = par_bed,
-      allosomes_list = allosomes_list,
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override = runtime_attr_compute_AFs
-  }
-
-  output {
-    File output_vcf = ComputeShardAFs.shard_wAFs
-    File output_vcf_idx = ComputeShardAFs.shard_wAFs_idx
-  }
-}
-
diff --git a/wdl/ShardedAnnotateVcf.wdl b/wdl/ShardedAnnotateVcf.wdl
index feefeebb9..06a8619d9 100755
--- a/wdl/ShardedAnnotateVcf.wdl
+++ b/wdl/ShardedAnnotateVcf.wdl
@@ -3,6 +3,7 @@ version 1.0
 import "Structs.wdl"
 import "TasksMakeCohortVcf.wdl" as MiniTasks
 import "HailMerge.wdl" as HailMerge
+import "Utils.wdl" as util
 import "AnnotateFunctionalConsequences.wdl" as func
 import "AnnotateExternalAFPerShard.wdl" as eaf