broadinstitute · CuriousTim · Aug 19, 2024 · Sep 17, 2024 · Sep 18, 2024 · Sep 26, 2024
diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R
@@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
 #############################################################
 #####Helper function to normalize contigs for a single sample
 #############################################################
-normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
+normalizeContigsPerSample <- function(mat, exclude, ploidy=2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){
 ###########################################################################
 #####Helper function to remove rows where >X% of samples have <=Y% coverage
 ###########################################################################
-filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
+filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){
 ##################################################
 #####Helper function to run PCA on a binCov matrix
 ##################################################
-binCovPCA <- function(dat, exclude=c("X", "Y"), topPCs=10){
+binCovPCA <- function(dat, exclude, topPCs=10){
   #Runs PCA
   PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)
 
@@ -961,6 +961,9 @@ option_list <- list(
   make_option(c("--maxBatch"), type="integer", default=150, 
               help="maximum number of samples per batch (requires -k) [default: %default]", 
               metavar="integer")
+  make_option(c("--allosomes"), type="character", default=NULL,
+              help="file with allosome contigs, one per line",
+              metavar="character")
 )
 
 #Get command-line arguments & options
@@ -986,6 +989,7 @@ nPCs <- args$options$dimensions
 batch.ideal <- args$options$batchSize
 batch.min <- args$options$minBatch
 batch.max <- args$options$batchMax
+allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes))
 
 # ##Jan 2020 dev parameters (on local machine)
 # # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz"
@@ -1007,16 +1011,16 @@ if(!dir.exists(OUTDIR)){
 #####PART 1: DATA PROCESSING#####
 #Read, normalize, and clean coverage data
 dat <- readMatrix(INFILE)
-dat <- normalizeContigsPerSample(dat, exclude=c("X", "Y"), ploidy=2)
-dat <- filterZeroBins(dat)
+dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2)
+dat <- filterZeroBins(dat, exclude=allosomes)
 chr.dat <- medianPerContigPerSample(dat)
 # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))
 
 #####PART 2: DOSAGE-BASED BATCHING#####
 #Only run if kmeans is optioned
 if(kmeans==T){
   #Perform PCA on full matrix
-  PCs <- binCovPCA(dat, exclude=c("X", "Y"), topPCs=nPCs)
+  PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs)
 
   #Cluster samples based on dosage PCA
   #Note: tries this with seeds 1-100 (iterated sequentially) until first success
@@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
 bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
 bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
 colnames(binwise.dat) <- c("ID", bin.IDs)
-sapply(setdiff(unique(dat[, 1]), c("X", "Y")), function(contig){
+sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){
   png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)
   plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
@@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs)
 binwise.dat.females <- data.frame(chr.dat.females$ID, 
                                 t(dat[, which(colnames(dat) %in% chr.dat.females$ID)]))
 colnames(binwise.dat.females) <- c("ID", bin.IDs)
-sapply(intersect(unique(dat[, 1]), c("X", "Y")), function(contig){
+sapply(intersect(unique(dat[, 1]), allosomes), function(contig){
   #Males
   png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)

diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl
@@ -36,6 +36,9 @@ workflow EvidenceQC {
     # WGD files
     File wgd_scoring_mask
 
+    String? chr_x
+    String? chr_y
+
     # Runtime parameters
     String sv_base_mini_docker
     String sv_base_docker
@@ -88,6 +91,8 @@ workflow EvidenceQC {
       input:
         bincov_matrix = MakeBincovMatrix.merged_bincov,
         batch = batch,
+        chr_x = chr_x,
+        chr_y = chr_y,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,

diff --git a/wdl/GatherBatchEvidence.wdl b/wdl/GatherBatchEvidence.wdl
@@ -126,6 +126,10 @@ workflow GatherBatchEvidence {
     # QC files
     Int matrix_qc_distance
 
+    # Allosomes
+    String? chr_x
+    String? chr_y
+
     # Module metrics parameters
     # Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense
     Boolean? run_module_metrics
@@ -206,6 +210,8 @@ workflow GatherBatchEvidence {
       input:
         bincov_matrix = merged_bincov_,
         batch = batch,
+        chr_x = chr_x,
+        chr_y = chr_y,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,

diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl
@@ -8,6 +8,8 @@ workflow Ploidy {
     String batch
     String sv_base_mini_docker
     String sv_pipeline_qc_docker
+    String? chr_x
+    String? chr_y
     RuntimeAttr? runtime_attr_score
     RuntimeAttr? runtime_attr_build
   }
@@ -27,6 +29,8 @@ workflow Ploidy {
     input:
       ploidy_matrix = BuildPloidyMatrix.ploidy_matrix,
       batch = batch,
+      chr_x = chr_x,
+      chr_y = chr_y,
       sv_pipeline_qc_docker = sv_pipeline_qc_docker,
       runtime_attr_override = runtime_attr_score
   }
@@ -94,10 +98,14 @@ task PloidyScore {
   input {
     File ploidy_matrix
     String batch
+    String? chr_x
+    String? chr_y
     String sv_pipeline_qc_docker
     RuntimeAttr? runtime_attr_override
   }
 
+  Array[String] allosomes = [select_first([chr_x, "chrX"]), select_first([chr_y, "chrY"])]
+
   RuntimeAttr default_attr = object {
     cpu_cores: 1, 
     mem_gb: 3.75,
@@ -114,7 +122,9 @@ task PloidyScore {
 
     set -euo pipefail
     mkdir ploidy_est
-    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix}
+    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est \
+      --allosomes '~{write_lines(allosomes)}' \
+      ~{ploidy_matrix}
 
     #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
     sleep 10