From 5e0bc160e0c9ca4532d3ab81dbdf02264bd8cf0b Mon Sep 17 00:00:00 2001
From: Justin Lim <jlim21@mgh.harvard.edu>
Date: Mon, 19 Aug 2024 17:22:06 -0400
Subject: [PATCH 1/4] use chrX and chrY to match sex chromosomes

The ploidy estimation script uses string comparison to "X" and "Y" to
identify sex chromsomes, but the ploidy matrix input to the script uses
"chrX" and "chrY" for the sex chromsomes. This commit updates the script
to correctly identify sex chrosomomes.
---
 src/WGD/bin/estimatePloidy.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R
index fa9b8de83..bdfb28dab 100755
--- a/src/WGD/bin/estimatePloidy.R
+++ b/src/WGD/bin/estimatePloidy.R
@@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
 #############################################################
 #####Helper function to normalize contigs for a single sample
 #############################################################
-normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
+normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
 ###########################################################################
 #####Helper function to remove rows where >X% of samples have <=Y% coverage
 ###########################################################################
-filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
+filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
 ##################################################
 #####Helper function to run PCA on a binCov matrix
 ##################################################
-binCovPCA <- function(dat, exclude=c("X", "Y"), topPCs=10){
+binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){
   #Runs PCA
   PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)
 
@@ -1007,7 +1007,7 @@ if(!dir.exists(OUTDIR)){
 #####PART 1: DATA PROCESSING#####
 #Read, normalize, and clean coverage data
 dat <- readMatrix(INFILE)
-dat <- normalizeContigsPerSample(dat, exclude=c("X", "Y"), ploidy=2)
+dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2)
 dat <- filterZeroBins(dat)
 chr.dat <- medianPerContigPerSample(dat)
 # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))
@@ -1016,7 +1016,7 @@ chr.dat <- medianPerContigPerSample(dat)
 #Only run if kmeans is optioned
 if(kmeans==T){
   #Perform PCA on full matrix
-  PCs <- binCovPCA(dat, exclude=c("X", "Y"), topPCs=nPCs)
+  PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs)
 
   #Cluster samples based on dosage PCA
   #Note: tries this with seeds 1-100 (iterated sequentially) until first success
@@ -1193,7 +1193,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
 bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
 bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
 colnames(binwise.dat) <- c("ID", bin.IDs)
-sapply(setdiff(unique(dat[, 1]), c("X", "Y")), function(contig){
+sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
   png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)
   plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
@@ -1209,7 +1209,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs)
 binwise.dat.females <- data.frame(chr.dat.females$ID, 
                                 t(dat[, which(colnames(dat) %in% chr.dat.females$ID)]))
 colnames(binwise.dat.females) <- c("ID", bin.IDs)
-sapply(intersect(unique(dat[, 1]), c("X", "Y")), function(contig){
+sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
   #Males
   png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)

From 31cafce9a560ec5a7327d6494da29ab5a896310d Mon Sep 17 00:00:00 2001
From: Justin Lim <jlim21@mgh.harvard.edu>
Date: Wed, 18 Sep 2024 15:25:48 -0400
Subject: [PATCH 2/4] Add allosome contigs input to EvidenceQC

Update the ploidy estimation script and associated WDLs to accept a
custom set of allosome contigs, with the default being chrX and chrY.
---
 src/WGD/bin/estimatePloidy.R       | 20 ++++++++++++--------
 wdl/EvidenceQC.wdl                 |  2 ++
 wdl/GATKSVPipelineBatch.wdl        |  1 +
 wdl/GATKSVPipelineSingleSample.wdl |  1 +
 wdl/PloidyEstimation.wdl           |  6 +++++-
 5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R
index bdfb28dab..e37a06dc0 100755
--- a/src/WGD/bin/estimatePloidy.R
+++ b/src/WGD/bin/estimatePloidy.R
@@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
 #############################################################
 #####Helper function to normalize contigs for a single sample
 #############################################################
-normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
+normalizeContigsPerSample <- function(mat, exclude, ploidy=2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
 ###########################################################################
 #####Helper function to remove rows where >X% of samples have <=Y% coverage
 ###########################################################################
-filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){
+filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0
 ##################################################
 #####Helper function to run PCA on a binCov matrix
 ##################################################
-binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){
+binCovPCA <- function(dat, exclude, topPCs=10){
   #Runs PCA
   PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)
 
@@ -961,6 +961,9 @@ option_list <- list(
   make_option(c("--maxBatch"), type="integer", default=150, 
               help="maximum number of samples per batch (requires -k) [default: %default]", 
               metavar="integer")
+  make_option(c("--allosomes"), type="character", default=NULL,
+              help="file with allosome contigs, one per line",
+              metavar="character")
 )
 
 #Get command-line arguments & options
@@ -986,6 +989,7 @@ nPCs <- args$options$dimensions
 batch.ideal <- args$options$batchSize
 batch.min <- args$options$minBatch
 batch.max <- args$options$batchMax
+allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes))
 
 # ##Jan 2020 dev parameters (on local machine)
 # # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz"
@@ -1007,8 +1011,8 @@ if(!dir.exists(OUTDIR)){
 #####PART 1: DATA PROCESSING#####
 #Read, normalize, and clean coverage data
 dat <- readMatrix(INFILE)
-dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2)
-dat <- filterZeroBins(dat)
+dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2)
+dat <- filterZeroBins(dat, exclude=allosomes)
 chr.dat <- medianPerContigPerSample(dat)
 # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))
 
@@ -1016,7 +1020,7 @@ chr.dat <- medianPerContigPerSample(dat)
 #Only run if kmeans is optioned
 if(kmeans==T){
   #Perform PCA on full matrix
-  PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs)
+  PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs)
 
   #Cluster samples based on dosage PCA
   #Note: tries this with seeds 1-100 (iterated sequentially) until first success
@@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
 bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
 bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
 colnames(binwise.dat) <- c("ID", bin.IDs)
-sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
+sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){
   png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)
   plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
@@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs)
 binwise.dat.females <- data.frame(chr.dat.females$ID, 
                                 t(dat[, which(colnames(dat) %in% chr.dat.females$ID)]))
 colnames(binwise.dat.females) <- c("ID", bin.IDs)
-sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
+sapply(intersect(unique(dat[, 1]), allosomes), function(contig){
   #Males
   png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)
diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl
index 91daa1ab4..5c06033dc 100644
--- a/wdl/EvidenceQC.wdl
+++ b/wdl/EvidenceQC.wdl
@@ -23,6 +23,7 @@ workflow EvidenceQC {
 
     # Global files
     File genome_file
+    File allosome_contigs
 
     # Coverage files
     Array[File] counts
@@ -88,6 +89,7 @@ workflow EvidenceQC {
       input:
         bincov_matrix = MakeBincovMatrix.merged_bincov,
         batch = batch,
+        allosome_contigs = allosome_contigs,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,
diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl
index e328806bc..057d3af19 100644
--- a/wdl/GATKSVPipelineBatch.wdl
+++ b/wdl/GATKSVPipelineBatch.wdl
@@ -178,6 +178,7 @@ workflow GATKSVPipelineBatch {
       genome_file=genome_file,
       counts=counts_files_,
       run_ploidy = false,
+      allosome_contigs=allosome_file,
       sv_pipeline_docker=sv_pipeline_docker,
       sv_pipeline_qc_docker=sv_pipeline_qc_docker,
       sv_base_mini_docker=sv_base_mini_docker,
diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index d9e55d5e6..53569fd3f 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -671,6 +671,7 @@ workflow GATKSVPipelineSingleSample {
       counts=[case_counts_file_],
       run_ploidy = false,
       wgd_scoring_mask=wgd_scoring_mask,
+      allosome_contigs=allosome_file,
       sv_pipeline_docker=sv_pipeline_docker,
       sv_pipeline_qc_docker=sv_pipeline_qc_docker,
       sv_base_mini_docker=sv_base_mini_docker,
diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl
index d56d0dda9..0f2d73bdd 100644
--- a/wdl/PloidyEstimation.wdl
+++ b/wdl/PloidyEstimation.wdl
@@ -8,6 +8,7 @@ workflow Ploidy {
     String batch
     String sv_base_mini_docker
     String sv_pipeline_qc_docker
+    File allosome_contigs
     RuntimeAttr? runtime_attr_score
     RuntimeAttr? runtime_attr_build
   }
@@ -27,6 +28,7 @@ workflow Ploidy {
     input:
       ploidy_matrix = BuildPloidyMatrix.ploidy_matrix,
       batch = batch,
+      allosome_contigs = allosome_contigs,
       sv_pipeline_qc_docker = sv_pipeline_qc_docker,
       runtime_attr_override = runtime_attr_score
   }
@@ -94,6 +96,7 @@ task PloidyScore {
   input {
     File ploidy_matrix
     String batch
+    File allosome_contigs
     String sv_pipeline_qc_docker
     RuntimeAttr? runtime_attr_override
   }
@@ -114,7 +117,8 @@ task PloidyScore {
 
     set -euo pipefail
     mkdir ploidy_est
-    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix}
+    cut -f 1 '~{allosome_contigs}' > allosome_contigs.list
+    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix}
 
     #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
     sleep 10

From 2f29a6f113bd401acadb5298bb2bb19220fd1f78 Mon Sep 17 00:00:00 2001
From: Justin Lim <jlim21@mgh.harvard.edu>
Date: Thu, 26 Sep 2024 12:00:44 -0400
Subject: [PATCH 3/4] Separate allosome inputs for PloidyEstimation

Expose separate inputs to specify the allosomal contig names in
PloidyEstimation with the defaults being "chrX" and "chrY".
---
 wdl/EvidenceQC.wdl          |  7 +++++--
 wdl/GatherBatchEvidence.wdl |  6 ++++++
 wdl/PloidyEstimation.wdl    | 16 +++++++++++-----
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl
index 5c06033dc..b7b25b89f 100644
--- a/wdl/EvidenceQC.wdl
+++ b/wdl/EvidenceQC.wdl
@@ -23,7 +23,6 @@ workflow EvidenceQC {
 
     # Global files
     File genome_file
-    File allosome_contigs
 
     # Coverage files
     Array[File] counts
@@ -37,6 +36,9 @@ workflow EvidenceQC {
     # WGD files
     File wgd_scoring_mask
 
+    String? chr_x
+    String? chr_y
+
     # Runtime parameters
     String sv_base_mini_docker
     String sv_base_docker
@@ -89,7 +91,8 @@ workflow EvidenceQC {
       input:
         bincov_matrix = MakeBincovMatrix.merged_bincov,
         batch = batch,
-        allosome_contigs = allosome_contigs,
+        chr_x = chr_x,
+        chr_y = chr_y,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,
diff --git a/wdl/GatherBatchEvidence.wdl b/wdl/GatherBatchEvidence.wdl
index f3f01a7e1..4637446ff 100644
--- a/wdl/GatherBatchEvidence.wdl
+++ b/wdl/GatherBatchEvidence.wdl
@@ -126,6 +126,10 @@ workflow GatherBatchEvidence {
     # QC files
     Int matrix_qc_distance
 
+    # Allosomes
+    String? chr_x
+    String? chr_y
+
     # Module metrics parameters
     # Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense
     Boolean? run_module_metrics
@@ -206,6 +210,8 @@ workflow GatherBatchEvidence {
       input:
         bincov_matrix = merged_bincov_,
         batch = batch,
+        chr_x = chr_x,
+        chr_y = chr_y,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,
diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl
index 0f2d73bdd..6072c1cb0 100644
--- a/wdl/PloidyEstimation.wdl
+++ b/wdl/PloidyEstimation.wdl
@@ -8,7 +8,8 @@ workflow Ploidy {
     String batch
     String sv_base_mini_docker
     String sv_pipeline_qc_docker
-    File allosome_contigs
+    String? chr_x
+    String? chr_y
     RuntimeAttr? runtime_attr_score
     RuntimeAttr? runtime_attr_build
   }
@@ -28,7 +29,8 @@ workflow Ploidy {
     input:
       ploidy_matrix = BuildPloidyMatrix.ploidy_matrix,
       batch = batch,
-      allosome_contigs = allosome_contigs,
+      chr_x = chr_x,
+      chr_y = chr_y,
       sv_pipeline_qc_docker = sv_pipeline_qc_docker,
       runtime_attr_override = runtime_attr_score
   }
@@ -96,11 +98,14 @@ task PloidyScore {
   input {
     File ploidy_matrix
     String batch
-    File allosome_contigs
+    String? chr_x
+    String? chr_y
     String sv_pipeline_qc_docker
     RuntimeAttr? runtime_attr_override
   }
 
+  Array[String] allosomes = [select_first([chr_x, "chrX"]), select_first([chr_y, "chrY"])]
+
   RuntimeAttr default_attr = object {
     cpu_cores: 1, 
     mem_gb: 3.75,
@@ -117,8 +122,9 @@ task PloidyScore {
 
     set -euo pipefail
     mkdir ploidy_est
-    cut -f 1 '~{allosome_contigs}' > allosome_contigs.list
-    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix}
+    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est \
+      --allosomes '~{write_lines(allosomes)}' \
+      ~{ploidy_matrix}
 
     #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
     sleep 10

From 3b18fb28e3fd65a40388e3400e5ed77f0536db2e Mon Sep 17 00:00:00 2001
From: Justin Lim <jlim21@mgh.harvard.edu>
Date: Thu, 26 Sep 2024 12:24:03 -0400
Subject: [PATCH 4/4] Revert usage of allosomal contigs file

The method to pass the names of the allosomes to PloidyEstimation was
changed from passing a file to passing two strings, but some calling
workflows were not updated to reflect the change. This commit fixes the
inputs.
---
 wdl/GATKSVPipelineBatch.wdl        | 1 -
 wdl/GATKSVPipelineSingleSample.wdl | 1 -
 2 files changed, 2 deletions(-)

diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl
index 057d3af19..e328806bc 100644
--- a/wdl/GATKSVPipelineBatch.wdl
+++ b/wdl/GATKSVPipelineBatch.wdl
@@ -178,7 +178,6 @@ workflow GATKSVPipelineBatch {
       genome_file=genome_file,
       counts=counts_files_,
       run_ploidy = false,
-      allosome_contigs=allosome_file,
       sv_pipeline_docker=sv_pipeline_docker,
       sv_pipeline_qc_docker=sv_pipeline_qc_docker,
       sv_base_mini_docker=sv_base_mini_docker,
diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl
index 53569fd3f..d9e55d5e6 100644
--- a/wdl/GATKSVPipelineSingleSample.wdl
+++ b/wdl/GATKSVPipelineSingleSample.wdl
@@ -671,7 +671,6 @@ workflow GATKSVPipelineSingleSample {
       counts=[case_counts_file_],
       run_ploidy = false,
       wgd_scoring_mask=wgd_scoring_mask,
-      allosome_contigs=allosome_file,
       sv_pipeline_docker=sv_pipeline_docker,
       sv_pipeline_qc_docker=sv_pipeline_qc_docker,
       sv_base_mini_docker=sv_base_mini_docker,