Skip to content

Commit

Permalink
Fix header issues when combining SV counts from multiple VCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
RCollins13 committed Feb 14, 2023
1 parent 6442927 commit c9b332f
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 14 deletions.
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d",
"gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f",
"str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,32 @@
# in final_outlier_sample_filter.wdl


###Set parameters & read arguments
### Set parameters & read arguments
options(stringsAsFactors=F, scipen=1000)
args <- commandArgs(trailingOnly=TRUE)
INFILE <- args[1]
OUTFILE <- args[2]

###Read input data & reformat
### Read input data & reformat
dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="")
if(dat[1, 1] %in% c("sample", "#sample")){
dat <- dat[-c(1), ]
drop.rows <- which(dat[, 1] %in% c("sample", "#sample"))
if(length(drop.rows) > 0){
dat <- dat[-drop.rows, ]
}
colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)]
dat$count <- as.numeric(dat$count)
samples <- as.character(unique(dat$sample))
svtypes <- as.character(unique(dat$svtype))

###Get sum of counts per sample per svtype
### Get sum of counts per sample per svtype
summed.res <- do.call("rbind", lapply(samples, function(sample){
return(do.call("rbind", lapply(svtypes, function(svtype){
return(data.frame("sample"=sample,
do.call("rbind", lapply(svtypes, function(svtype){
data.frame("sample"=sample,
"svtype"=svtype,
"count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T)))
})))
"count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,
na.rm=T))
}))
}))

###Write summed results to outfile
### Write summed results to outfile
write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F)
10 changes: 9 additions & 1 deletion wdl/FilterOutlierSamples.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ workflow FilterOutlierSamples {
String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes
String? bcftools_preprocessing_options
Boolean plot_counts = false
Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs
Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset
Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset
String sv_pipeline_docker
String sv_base_mini_docker
String linux_docker
Expand All @@ -28,9 +29,15 @@ workflow FilterOutlierSamples {
RuntimeAttr? runtime_attr_filter_samples
RuntimeAttr? runtime_attr_ids_from_vcf
RuntimeAttr? runtime_attr_count_svs
RuntimeAttr? runtime_attr_combine_counts
RuntimeAttr? runtime_attr_plot_svcounts
}
if (defined(sample_subset_prefixes) && defined(sample_subset_lists)) {
Array[Pair[String, File]]? sample_subsets = zip(select_first([sample_subset_prefixes]),
select_first([sample_subset_lists]))
}

call identify_outliers.IdentifyOutlierSamples {
input:
vcfs = vcfs,
Expand All @@ -51,6 +58,7 @@ workflow FilterOutlierSamples {
runtime_attr_cat_outliers = runtime_attr_cat_outliers,
runtime_attr_subset_counts = runtime_attr_subset_counts,
runtime_attr_count_svs = runtime_attr_count_svs,
runtime_attr_combine_counts = runtime_attr_combine_counts,
runtime_attr_plot_svcounts = runtime_attr_plot_svcounts
}
Expand Down
6 changes: 4 additions & 2 deletions wdl/IdentifyOutlierSamples.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ workflow IdentifyOutlierSamples {
RuntimeAttr? runtime_attr_cat_outliers
RuntimeAttr? runtime_attr_subset_counts
RuntimeAttr? runtime_attr_count_svs
RuntimeAttr? runtime_attr_combine_counts
RuntimeAttr? runtime_attr_plot_svcounts
}
Expand All @@ -41,7 +42,7 @@ workflow IdentifyOutlierSamples {
runtime_attr_override = runtime_attr_ids_from_vcf
}
}
Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]])
Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]])
# Collect SV counts for each VCF in parallel unless sv_counts is provided
if (!defined(sv_counts)) {
Expand Down Expand Up @@ -75,7 +76,8 @@ workflow IdentifyOutlierSamples {
input:
svcounts = CountPerVcf.sv_counts,
prefix = prefix,
sv_pipeline_docker = sv_pipeline_docker
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_combine_counts
}
}
File final_counts = select_first([sv_counts, Combine.summed_svcounts])
Expand Down

0 comments on commit c9b332f

Please sign in to comment.