Skip to content

Commit

Permalink
Add allosome contigs input to EvidenceQC
Browse files Browse the repository at this point in the history
Update the ploidy estimation script and associated WDLs to accept a
custom set of allosome contigs, with the default being chrX and chrY.
  • Loading branch information
CuriousTim committed Sep 18, 2024
1 parent e3fe34a commit 1477a28
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 9 deletions.
20 changes: 12 additions & 8 deletions src/WGD/bin/estimatePloidy.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ readMatrix <- function(INFILE){
#############################################################
#####Helper function to normalize contigs for a single sample
#############################################################
normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
normalizeContigsPerSample <- function(mat, exclude, ploidy=2){
#Convert vals to numeric
mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)

Expand All @@ -53,7 +53,7 @@ normalizeContigsPerSample <- function(mat, exclude=c("chrX", "chrY"), ploidy=2){
###########################################################################
#####Helper function to remove rows where >X% of samples have <=Y% coverage
###########################################################################
filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0.2){
filterZeroBins <- function(mat, exclude, minSamp=0.8, minCov=0.2){
#Convert vals to numeric
mat[, -c(1:3)] <- apply(mat[, -c(1:3)], 2, as.numeric)

Expand All @@ -74,7 +74,7 @@ filterZeroBins <- function(mat, exclude=c("chrX", "chrY"), minSamp=0.8, minCov=0
##################################################
#####Helper function to run PCA on a binCov matrix
##################################################
binCovPCA <- function(dat, exclude=c("chrX", "chrY"), topPCs=10){
binCovPCA <- function(dat, exclude, topPCs=10){
#Runs PCA
PCA <- prcomp(dat[which(!(dat[, 1] %in% exclude)), -c(1:3)], center=T, scale=T)

Expand Down Expand Up @@ -961,6 +961,9 @@ option_list <- list(
make_option(c("--maxBatch"), type="integer", default=150,
help="maximum number of samples per batch (requires -k) [default: %default]",
metavar="integer")
make_option(c("--allosomes"), type="character", default=NULL,
help="file with allosome contigs, one per line",
metavar="character")
)

#Get command-line arguments & options
Expand All @@ -986,6 +989,7 @@ nPCs <- args$options$dimensions
batch.ideal <- args$options$batchSize
batch.min <- args$options$minBatch
batch.max <- args$options$batchMax
allosomes <- if (is.null(args$options$allosomes)) c("chrX", "chrY") else trimws(readLines(args$options$allosomes))

# ##Jan 2020 dev parameters (on local machine)
# # INFILE <- "/Users/rlc/scratch/1KGP_2504_sub_batch_10_ploidy_matrix.bed.gz"
Expand All @@ -1007,16 +1011,16 @@ if(!dir.exists(OUTDIR)){
#####PART 1: DATA PROCESSING#####
#Read, normalize, and clean coverage data
dat <- readMatrix(INFILE)
dat <- normalizeContigsPerSample(dat, exclude=c("chrX", "chrY"), ploidy=2)
dat <- filterZeroBins(dat)
dat <- normalizeContigsPerSample(dat, exclude=allosomes, ploidy=2)
dat <- filterZeroBins(dat, exclude=allosomes)
chr.dat <- medianPerContigPerSample(dat)
# chr.dat.norm <- normalizeContigsPerMatrix(chr.dat, scale.exclude=c("X", "Y"))

#####PART 2: DOSAGE-BASED BATCHING#####
#Only run if kmeans is optioned
if(kmeans==T){
#Perform PCA on full matrix
PCs <- binCovPCA(dat, exclude=c("chrX", "chrY"), topPCs=nPCs)
PCs <- binCovPCA(dat, exclude=allosomes, topPCs=nPCs)

#Cluster samples based on dosage PCA
#Note: tries this with seeds 1-100 (iterated sequentially) until first success
Expand Down Expand Up @@ -1193,7 +1197,7 @@ binwise.dat <- data.frame(colnames(dat)[-c(1:3)],
bin.IDs <- as.character(apply(dat[1:3], 1, paste, collapse="_", sep=""))
bin.IDs <- as.character(sapply(bin.IDs, function(ID){gsub(" ", "", ID, fixed=T)}))
colnames(binwise.dat) <- c("ID", bin.IDs)
sapply(setdiff(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
sapply(setdiff(unique(dat[, 1]), allosomes), function(contig){
png(paste(OUTDIR, "/estimated_CN_per_bin.all_samples.", contig, ".png", sep=""),
height=1250, width=2500, res=300)
plot.dat <- binwise.dat[, c(1, which(dat[, 1]==contig)+1)]
Expand All @@ -1209,7 +1213,7 @@ colnames(binwise.dat.males) <- c("ID", bin.IDs)
binwise.dat.females <- data.frame(chr.dat.females$ID,
t(dat[, which(colnames(dat) %in% chr.dat.females$ID)]))
colnames(binwise.dat.females) <- c("ID", bin.IDs)
sapply(intersect(unique(dat[, 1]), c("chrX", "chrY")), function(contig){
sapply(intersect(unique(dat[, 1]), allosomes), function(contig){
#Males
png(paste(OUTDIR, "/estimated_CN_per_bin.chrX_lessThan_2copies.", contig, ".png", sep=""),
height=1250, width=2500, res=300)
Expand Down
2 changes: 2 additions & 0 deletions wdl/EvidenceQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ workflow EvidenceQC {

# Global files
File genome_file
File allosome_contigs

# Coverage files
Array[File] counts
Expand Down Expand Up @@ -88,6 +89,7 @@ workflow EvidenceQC {
input:
bincov_matrix = MakeBincovMatrix.merged_bincov,
batch = batch,
allosome_contigs = allosome_contigs,
sv_base_mini_docker = sv_base_mini_docker,
sv_pipeline_qc_docker = sv_pipeline_qc_docker,
runtime_attr_score = ploidy_score_runtime_attr,
Expand Down
5 changes: 4 additions & 1 deletion wdl/PloidyEstimation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ workflow Ploidy {
String batch
String sv_base_mini_docker
String sv_pipeline_qc_docker
File allosome_contigs
RuntimeAttr? runtime_attr_score
RuntimeAttr? runtime_attr_build
}
Expand Down Expand Up @@ -94,6 +95,7 @@ task PloidyScore {
input {
File ploidy_matrix
String batch
File allosome_contigs
String sv_pipeline_qc_docker
RuntimeAttr? runtime_attr_override
}
Expand All @@ -114,7 +116,8 @@ task PloidyScore {

set -euo pipefail
mkdir ploidy_est
Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est ~{ploidy_matrix}
cut -f 1 '~{allosome_contigs}' > allosome_contigs.list
Rscript /opt/WGD/bin/estimatePloidy.R -z -O ./ploidy_est --allosomes allosome_contigs.list ~{ploidy_matrix}

#TODO: hotfix for "file changed as we read it" error caused by non-blocking system() calls in the R script
sleep 10
Expand Down

0 comments on commit 1477a28

Please sign in to comment.