Add allosome contigs input to EvidenceQC

Update the ploidy estimation script and associated WDLs to accept a custom set of allosome contigs, with the default being chrX and chrY.
broadinstitute · Sep 18, 2024 · 1477a28 · 1477a28
1 parent e3fe34a
commit 1477a28
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 9 deletions.
diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R
@@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
 #############################################################
 #####Helper function to normalize contigs for a single sample
 #############################################################
-normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
+normalizeContigsPerSample <- function(mat, exclude, ploidy=2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
 ###########################################################################
 #####Helper function to remove rows where >X% of samples have <=Y% coverage
 ###########################################################################
-filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){
+filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){
   #Convert vals to numeric
   mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)
 
@@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0
 ##################################################
 #####Helper function to run PCA on a binCov matrix
 ##################################################
-binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){
+binCovPCA <- function(dat, exclude, topPCs=10){
   #Runs PCA
   PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)
 
@@ -961,6 +961,9 @@ option_list <- list(
   make_option(c("--maxBatch"), type="integer", default=150, 
               help="maximum number of samples per batch (requires -k) [default: %default]", 
               metavar="integer")
+  make_option(c("--allosomes"), type="character", default=NULL,
+              help="file with allosome contigs, one per line",
+              metavar="character")
 )
 
 #Get command-line arguments & options
@@ -986,6 +989,7 @@ nPCs <- args$options$dimensions
 batch.ideal <- args$options$batchSize
 batch.min <- args$options$minBatch
 batch.max <- args$options$batchMax
+allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes))
 
 # ##Jan 2020 dev parameters (on local machine)
 # # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz"
@@ -1007,16 +1011,16 @@ if(!dir.exists(OUTDIR)){
 #####PART 1: DATA PROCESSING#####
 #Read, normalize, and clean coverage data
 dat <- readMatrix(INFILE)
-dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2)
-dat <- filterZeroBins(dat)
+dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2)
+dat <- filterZeroBins(dat, exclude=allosomes)
 chr.dat <- medianPerContigPerSample(dat)
 # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))
 
 #####PART 2: DOSAGE-BASED BATCHING#####
 #Only run if kmeans is optioned
 if(kmeans==T){
   #Perform PCA on full matrix
-  PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs)
+  PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs)
 
   #Cluster samples based on dosage PCA
   #Note: tries this with seeds 1-100 (iterated sequentially) until first success
@@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
 bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
 bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
 colnames(binwise.dat) <- c("ID", bin.IDs)
-sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
+sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){
   png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)
   plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
@@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs)
 binwise.dat.females <- data.frame(chr.dat.females$ID, 
                                 t(dat[, which(colnames(dat) %in% chr.dat.females$ID)]))
 colnames(binwise.dat.females) <- c("ID", bin.IDs)
-sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
+sapply(intersect(unique(dat[, 1]), allosomes), function(contig){
   #Males
   png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), 
       height=1250, width=2500, res=300)

diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl
@@ -23,6 +23,7 @@ workflow EvidenceQC {
 
     # Global files
     File genome_file
+    File allosome_contigs
 
     # Coverage files
     Array[File] counts
@@ -88,6 +89,7 @@ workflow EvidenceQC {
       input:
         bincov_matrix = MakeBincovMatrix.merged_bincov,
         batch = batch,
+        allosome_contigs = allosome_contigs,
         sv_base_mini_docker = sv_base_mini_docker,
         sv_pipeline_qc_docker = sv_pipeline_qc_docker,
         runtime_attr_score = ploidy_score_runtime_attr,

diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl
@@ -8,6 +8,7 @@ workflow Ploidy {
     String batch
     String sv_base_mini_docker
     String sv_pipeline_qc_docker
+    File allosome_contigs
     RuntimeAttr? runtime_attr_score
     RuntimeAttr? runtime_attr_build
   }
@@ -94,6 +95,7 @@ task PloidyScore {
   input {
     File ploidy_matrix
     String batch
+    File allosome_contigs
     String sv_pipeline_qc_docker
     RuntimeAttr? runtime_attr_override
   }
@@ -114,7 +116,8 @@ task PloidyScore {
 
     set -euo pipefail
     mkdir ploidy_est
-    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix}
+    cut -f 1 '~{allosome_contigs}' > allosome_contigs.list
+    Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix}
 
     #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
     sleep 10