From 5e0bc160e0c9ca4532d3ab81dbdf02264bd8cf0b Mon Sep 17 00:00:00 2001 From: Justin Lim Date: Mon, 19 Aug 2024 17:22:06 -0400 Subject: [PATCH 1/4] use chrX and chrY to match sex chromosomes The ploidy estimation script uses string comparison to "X" and "Y" to identify sex chromsomes, but the ploidy matrix input to the script uses "chrX" and "chrY" for the sex chromsomes. This commit updates the script to correctly identify sex chrosomomes. --- src/WGD/bin/estimatePloidy.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R index fa9b8de83..bdfb28dab 100755 --- a/src/WGD/bin/estimatePloidy.R +++ b/src/WGD/bin/estimatePloidy.R @@ -26,7 +26,7 @@ readMatrix <- function(INFILE){ ############################################################# #####Helper function to normalize contigs for a single sample ############################################################# -normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){ +normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("X", "Y"), ploidy=2){ ########################################################################### #####Helper function to remove rows where >X% of samples have <=Y% coverage ########################################################################### -filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){ +filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("X", "Y"), minSamp=0.8, minCov=0.2){ ################################################## #####Helper function to run PCA on a binCov matrix ################################################## -binCovPCA <- function(dat, exclude=c("X", "Y"), topPCs=10){ +binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){ #Runs PCA PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T) @@ -1007,7 +1007,7 @@ if(!dir.exists(OUTDIR)){ #####PART 1: DATA PROCESSING##### #Read, normalize, and clean coverage data dat <- readMatrix(INFILE) -dat <- normalizeContigsPerSample(dat, exclude=c("X", "Y"), ploidy=2) +dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2) dat <- filterZeroBins(dat) chr.dat <- medianPerContigPerSample(dat) # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y")) @@ -1016,7 +1016,7 @@ chr.dat <- medianPerContigPerSample(dat) #Only run if kmeans is optioned if(kmeans==T){ #Perform PCA on full matrix - PCs <- binCovPCA(dat, exclude=c("X", "Y"), topPCs=nPCs) + PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs) #Cluster samples based on dosage PCA #Note: tries this with seeds 1-100 (iterated sequentially) until first success @@ -1193,7 +1193,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)], bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep="")) bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)})) colnames(binwise.dat) <- c("ID", bin.IDs) -sapply(setdiff(unique(dat[, 1]), c("X", "Y")), function(contig){ +sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){ png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), height=1250, width=2500, res=300) plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)] @@ -1209,7 +1209,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs) binwise.dat.females <- data.frame(chr.dat.females$ID, t(dat[, which(colnames(dat) %in% chr.dat.females$ID)])) colnames(binwise.dat.females) <- c("ID", bin.IDs) -sapply(intersect(unique(dat[, 1]), c("X", "Y")), function(contig){ +sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){ #Males png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), height=1250, width=2500, res=300) From 31cafce9a560ec5a7327d6494da29ab5a896310d Mon Sep 17 00:00:00 2001 From: Justin Lim Date: Wed, 18 Sep 2024 15:25:48 -0400 Subject: [PATCH 2/4] Add allosome contigs input to EvidenceQC Update the ploidy estimation script and associated WDLs to accept a custom set of allosome contigs, with the default being chrX and chrY. --- src/WGD/bin/estimatePloidy.R | 20 ++++++++++++-------- wdl/EvidenceQC.wdl | 2 ++ wdl/GATKSVPipelineBatch.wdl | 1 + wdl/GATKSVPipelineSingleSample.wdl | 1 + wdl/PloidyEstimation.wdl | 6 +++++- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/WGD/bin/estimatePloidy.R b/src/WGD/bin/estimatePloidy.R index bdfb28dab..e37a06dc0 100755 --- a/src/WGD/bin/estimatePloidy.R +++ b/src/WGD/bin/estimatePloidy.R @@ -26,7 +26,7 @@ readMatrix <- function(INFILE){ ############################################################# #####Helper function to normalize contigs for a single sample ############################################################# -normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){ +normalizeContigsPerSample <- function(mat, exclude, ploidy=2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){ ########################################################################### #####Helper function to remove rows where >X% of samples have <=Y% coverage ########################################################################### -filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){ +filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){ #Convert vals to numeric mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric) @@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0 ################################################## #####Helper function to run PCA on a binCov matrix ################################################## -binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){ +binCovPCA <- function(dat, exclude, topPCs=10){ #Runs PCA PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T) @@ -961,6 +961,9 @@ option_list <- list( make_option(c("--maxBatch"), type="integer", default=150, help="maximum number of samples per batch (requires -k) [default: %default]", metavar="integer") + make_option(c("--allosomes"), type="character", default=NULL, + help="file with allosome contigs, one per line", + metavar="character") ) #Get command-line arguments & options @@ -986,6 +989,7 @@ nPCs <- args$options$dimensions batch.ideal <- args$options$batchSize batch.min <- args$options$minBatch batch.max <- args$options$batchMax +allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes)) # ##Jan 2020 dev parameters (on local machine) # # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz" @@ -1007,8 +1011,8 @@ if(!dir.exists(OUTDIR)){ #####PART 1: DATA PROCESSING##### #Read, normalize, and clean coverage data dat <- readMatrix(INFILE) -dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2) -dat <- filterZeroBins(dat) +dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2) +dat <- filterZeroBins(dat, exclude=allosomes) chr.dat <- medianPerContigPerSample(dat) # chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y")) @@ -1016,7 +1020,7 @@ chr.dat <- medianPerContigPerSample(dat) #Only run if kmeans is optioned if(kmeans==T){ #Perform PCA on full matrix - PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs) + PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs) #Cluster samples based on dosage PCA #Note: tries this with seeds 1-100 (iterated sequentially) until first success @@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)], bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep="")) bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)})) colnames(binwise.dat) <- c("ID", bin.IDs) -sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){ +sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){ png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""), height=1250, width=2500, res=300) plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)] @@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs) binwise.dat.females <- data.frame(chr.dat.females$ID, t(dat[, which(colnames(dat) %in% chr.dat.females$ID)])) colnames(binwise.dat.females) <- c("ID", bin.IDs) -sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){ +sapply(intersect(unique(dat[, 1]), allosomes), function(contig){ #Males png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""), height=1250, width=2500, res=300) diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl index 91daa1ab4..5c06033dc 100644 --- a/wdl/EvidenceQC.wdl +++ b/wdl/EvidenceQC.wdl @@ -23,6 +23,7 @@ workflow EvidenceQC { # Global files File genome_file + File allosome_contigs # Coverage files Array[File] counts @@ -88,6 +89,7 @@ workflow EvidenceQC { input: bincov_matrix = MakeBincovMatrix.merged_bincov, batch = batch, + allosome_contigs = allosome_contigs, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_score = ploidy_score_runtime_attr, diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index e328806bc..057d3af19 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -178,6 +178,7 @@ workflow GATKSVPipelineBatch { genome_file=genome_file, counts=counts_files_, run_ploidy = false, + allosome_contigs=allosome_file, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index d9e55d5e6..53569fd3f 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -671,6 +671,7 @@ workflow GATKSVPipelineSingleSample { counts=[case_counts_file_], run_ploidy = false, wgd_scoring_mask=wgd_scoring_mask, + allosome_contigs=allosome_file, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl index d56d0dda9..0f2d73bdd 100644 --- a/wdl/PloidyEstimation.wdl +++ b/wdl/PloidyEstimation.wdl @@ -8,6 +8,7 @@ workflow Ploidy { String batch String sv_base_mini_docker String sv_pipeline_qc_docker + File allosome_contigs RuntimeAttr? runtime_attr_score RuntimeAttr? runtime_attr_build } @@ -27,6 +28,7 @@ workflow Ploidy { input: ploidy_matrix = BuildPloidyMatrix.ploidy_matrix, batch = batch, + allosome_contigs = allosome_contigs, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_override = runtime_attr_score } @@ -94,6 +96,7 @@ task PloidyScore { input { File ploidy_matrix String batch + File allosome_contigs String sv_pipeline_qc_docker RuntimeAttr? runtime_attr_override } @@ -114,7 +117,8 @@ task PloidyScore { set -euo pipefail mkdir ploidy_est - Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix} + cut -f 1 '~{allosome_contigs}' > allosome_contigs.list + Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix} #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script sleep 10 From 2f29a6f113bd401acadb5298bb2bb19220fd1f78 Mon Sep 17 00:00:00 2001 From: Justin Lim Date: Thu, 26 Sep 2024 12:00:44 -0400 Subject: [PATCH 3/4] Separate allosome inputs for PloidyEstimation Expose separate inputs to specify the allosomal contig names in PloidyEstimation with the defaults being "chrX" and "chrY". --- wdl/EvidenceQC.wdl | 7 +++++-- wdl/GatherBatchEvidence.wdl | 6 ++++++ wdl/PloidyEstimation.wdl | 16 +++++++++++----- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/wdl/EvidenceQC.wdl b/wdl/EvidenceQC.wdl index 5c06033dc..b7b25b89f 100644 --- a/wdl/EvidenceQC.wdl +++ b/wdl/EvidenceQC.wdl @@ -23,7 +23,6 @@ workflow EvidenceQC { # Global files File genome_file - File allosome_contigs # Coverage files Array[File] counts @@ -37,6 +36,9 @@ workflow EvidenceQC { # WGD files File wgd_scoring_mask + String? chr_x + String? chr_y + # Runtime parameters String sv_base_mini_docker String sv_base_docker @@ -89,7 +91,8 @@ workflow EvidenceQC { input: bincov_matrix = MakeBincovMatrix.merged_bincov, batch = batch, - allosome_contigs = allosome_contigs, + chr_x = chr_x, + chr_y = chr_y, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_score = ploidy_score_runtime_attr, diff --git a/wdl/GatherBatchEvidence.wdl b/wdl/GatherBatchEvidence.wdl index f3f01a7e1..4637446ff 100644 --- a/wdl/GatherBatchEvidence.wdl +++ b/wdl/GatherBatchEvidence.wdl @@ -126,6 +126,10 @@ workflow GatherBatchEvidence { # QC files Int matrix_qc_distance + # Allosomes + String? chr_x + String? chr_y + # Module metrics parameters # Run module metrics workflow at the end - off by default for GatherBatchEvidence because of runtime/expense Boolean? run_module_metrics @@ -206,6 +210,8 @@ workflow GatherBatchEvidence { input: bincov_matrix = merged_bincov_, batch = batch, + chr_x = chr_x, + chr_y = chr_y, sv_base_mini_docker = sv_base_mini_docker, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_score = ploidy_score_runtime_attr, diff --git a/wdl/PloidyEstimation.wdl b/wdl/PloidyEstimation.wdl index 0f2d73bdd..6072c1cb0 100644 --- a/wdl/PloidyEstimation.wdl +++ b/wdl/PloidyEstimation.wdl @@ -8,7 +8,8 @@ workflow Ploidy { String batch String sv_base_mini_docker String sv_pipeline_qc_docker - File allosome_contigs + String? chr_x + String? chr_y RuntimeAttr? runtime_attr_score RuntimeAttr? runtime_attr_build } @@ -28,7 +29,8 @@ workflow Ploidy { input: ploidy_matrix = BuildPloidyMatrix.ploidy_matrix, batch = batch, - allosome_contigs = allosome_contigs, + chr_x = chr_x, + chr_y = chr_y, sv_pipeline_qc_docker = sv_pipeline_qc_docker, runtime_attr_override = runtime_attr_score } @@ -96,11 +98,14 @@ task PloidyScore { input { File ploidy_matrix String batch - File allosome_contigs + String? chr_x + String? chr_y String sv_pipeline_qc_docker RuntimeAttr? runtime_attr_override } + Array[String] allosomes = [select_first([chr_x, "chrX"]), select_first([chr_y, "chrY"])] + RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 3.75, @@ -117,8 +122,9 @@ task PloidyScore { set -euo pipefail mkdir ploidy_est - cut -f 1 '~{allosome_contigs}' > allosome_contigs.list - Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix} + Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est \ + --allosomes '~{write_lines(allosomes)}' \ + ~{ploidy_matrix} #TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script sleep 10 From 3b18fb28e3fd65a40388e3400e5ed77f0536db2e Mon Sep 17 00:00:00 2001 From: Justin Lim Date: Thu, 26 Sep 2024 12:24:03 -0400 Subject: [PATCH 4/4] Revert usage of allosomal contigs file The method to pass the names of the allosomes to PloidyEstimation was changed from passing a file to passing two strings, but some calling workflows were not updated to reflect the change. This commit fixes the inputs. --- wdl/GATKSVPipelineBatch.wdl | 1 - wdl/GATKSVPipelineSingleSample.wdl | 1 - 2 files changed, 2 deletions(-) diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index 057d3af19..e328806bc 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -178,7 +178,6 @@ workflow GATKSVPipelineBatch { genome_file=genome_file, counts=counts_files_, run_ploidy = false, - allosome_contigs=allosome_file, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index 53569fd3f..d9e55d5e6 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -671,7 +671,6 @@ workflow GATKSVPipelineSingleSample { counts=[case_counts_file_], run_ploidy = false, wgd_scoring_mask=wgd_scoring_mask, - allosome_contigs=allosome_file, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker,