diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R index fa9b8de83..e37a06dc0 100755 --- a/src/WGD/bin/estimatePloidy.R +++ b/src/WGD/bin/estimatePloidy.R @@ -26,7 +26,7 @@ readMatrix <- function(INFILE){ ############################################################# #####Helper function to normalize contigs for a single sample ############################################################# -normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){ +normalizeContigsPerSample <- function(mat, exclude, ploidy=2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){ ########################################################################### #####Helper function to remove rows where >X% of samples have <=Y% coverage ########################################################################### -filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){ +filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){ ################################################## #####Helper function to run PCA on a binCov matrix ################################################## -binCovPCA <- function(dat, exclude=c("X", "Y"), topPCs=10){ +binCovPCA <- function(dat, exclude, topPCs=10){ #Runs PCA PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T) @@ -961,6 +961,9 @@ option_list <- list( make_option(c("--maxBatch"), type="integer", default=150, help="maximum number of samples per batch (requires -k) [default: %default]", metavar="integer") + make_option(c("--allosomes"), type="character", default=NULL, + help="file with allosome contigs, one per line", + metavar="character") ) #Get command-line arguments & options @@ -986,6 +989,7 @@ nPCs <- args$options$dimensions batch.ideal <- args$options$batchSize batch.min <- args$options$minBatch batch.max <- args$options$batchMax +allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes)) # ##Jan 2020 dev parameters (on local machine) # # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz" @@ -1007,8 +1011,8 @@ if(!dir.exists(OUTDIR)){ #####PART 1: DATA PROCESSING##### #Read, normalize, and clean coverage data dat <- readMatrix(INFILE) -dat <- normalizeContigsPerSample(dat, exclude=c("X", "Y"), ploidy=2) -dat <- filterZeroBins(dat) +dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2) +dat <- filterZeroBins(dat, exclude=allosomes) chr.dat <- medianPerContigPerSample(dat) # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y")) @@ -1016,7 +1020,7 @@ chr.dat <- medianPerContigPerSample(dat) #Only run if kmeans is optioned if(kmeans==T){ #Perform PCA on full matrix - PCs <- binCovPCA(dat, exclude=c("X", "Y"), topPCs=nPCs) + PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs) #Cluster samples based on dosage PCA #Note: tries this with seeds 1-100 (iterated sequentially) until first success @@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)], bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep="")) bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)})) colnames(binwise.dat) <- c("ID", bin.IDs) -sapply(setdiff(unique(dat[, 1]), c("X", "Y")), function(contig){ +sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){ png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), height=1250, width=2500, res=300) plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)] @@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs) binwise.dat.females <- data.frame(chr.dat.females$ID, t(dat[, which(colnames(dat) %in% chr.dat.females$ID)])) colnames(binwise.dat.females) <- c("ID", bin.IDs) -sapply(intersect(unique(dat[, 1]), c("X", "Y")), function(contig){ +sapply(intersect(unique(dat[, 1]), allosomes), function(contig){ #Males png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), height=1250, width=2500, res=300) diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl index 91daa1ab4..b7b25b89f 100644 --- a/wdl/EvidenceQC.wdl +++ b/wdl/EvidenceQC.wdl @@ -36,6 +36,9 @@ workflow EvidenceQC { # WGD files File wgd_scoring_mask + String? chr_x + String? chr_y + # Runtime parameters String sv_base_mini_docker String sv_base_docker @@ -88,6 +91,8 @@ workflow EvidenceQC { input: bincov_matrix = MakeBincovMatrix.merged_bincov, batch = batch, + chr_x = chr_x, + chr_y = chr_y, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_score = ploidy_score_runtime_attr, diff --git a/wdl/GatherBatchEvidence.wdl b/wdl/GatherBatchEvidence.wdl index f3f01a7e1..4637446ff 100644 --- a/wdl/GatherBatchEvidence.wdl +++ b/wdl/GatherBatchEvidence.wdl @@ -126,6 +126,10 @@ workflow GatherBatchEvidence { # QC files Int matrix_qc_distance + # Allosomes + String? chr_x + String? chr_y + # Module metrics parameters # Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense Boolean? run_module_metrics @@ -206,6 +210,8 @@ workflow GatherBatchEvidence { input: bincov_matrix = merged_bincov_, batch = batch, + chr_x = chr_x, + chr_y = chr_y, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_score = ploidy_score_runtime_attr, diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl index d56d0dda9..6072c1cb0 100644 --- a/wdl/PloidyEstimation.wdl +++ b/wdl/PloidyEstimation.wdl @@ -8,6 +8,8 @@ workflow Ploidy { String batch String sv_base_mini_docker String sv_pipeline_qc_docker + String? chr_x + String? chr_y RuntimeAttr? runtime_attr_score RuntimeAttr? runtime_attr_build } @@ -27,6 +29,8 @@ workflow Ploidy { input: ploidy_matrix = BuildPloidyMatrix.ploidy_matrix, batch = batch, + chr_x = chr_x, + chr_y = chr_y, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_override = runtime_attr_score } @@ -94,10 +98,14 @@ task PloidyScore { input { File ploidy_matrix String batch + String? chr_x + String? chr_y String sv_pipeline_qc_docker RuntimeAttr? runtime_attr_override } + Array[String] allosomes = [select_first([chr_x, "chrX"]), select_first([chr_y, "chrY"])] + RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 3.75, @@ -114,7 +122,9 @@ task PloidyScore { set -euo pipefail mkdir ploidy_est - Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix} + Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est \ + --allosomes '~{write_lines(allosomes)}' \ + ~{ploidy_matrix} #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script sleep 10