From c9b332fa6a37f53f5a01782ec8a1e8a1e16369d0 Mon Sep 17 00:00:00 2001 From: Ryan Collins Date: Thu, 9 Feb 2023 11:04:59 -0500 Subject: [PATCH] Fix header issues when combining SV counts from multiple VCFs --- inputs/values/dockers.json | 2 +- .../sum_svcounts_perSample.R | 23 +++++++++++-------- wdl/FilterOutlierSamples.wdl | 10 +++++++- wdl/IdentifyOutlierSamples.wdl | 6 +++-- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 8448074b6..8d20e0d3e 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -33,4 +33,4 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f", "str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670" -} \ No newline at end of file +} diff --git a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R index e49b8f129..8989a6b2d 100755 --- a/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R +++ b/src/sv-pipeline/scripts/downstream_analysis_and_filtering/sum_svcounts_perSample.R @@ -4,29 +4,32 @@ # in final_outlier_sample_filter.wdl -###Set parameters & read arguments +### Set parameters & read arguments options(stringsAsFactors=F, scipen=1000) args <- commandArgs(trailingOnly=TRUE) INFILE <- args[1] OUTFILE <- args[2] -###Read input data & reformat +### Read input data & reformat dat <- read.table(INFILE, header=F, check.names=F, sep="\t", comment.char="") -if(dat[1, 1] %in% c("sample", "#sample")){ - dat <- dat[-c(1), ] +drop.rows <- which(dat[, 1] %in% c("sample", "#sample")) +if(length(drop.rows) > 0){ + dat <- dat[-drop.rows, ] } colnames(dat) <- c("sample", "svtype", "count", "chrom")[1:ncol(dat)] +dat$count <- as.numeric(dat$count) samples <- as.character(unique(dat$sample)) svtypes <- as.character(unique(dat$svtype)) -###Get sum of counts per sample per svtype +### Get sum of counts per sample per svtype summed.res <- do.call("rbind", lapply(samples, function(sample){ - return(do.call("rbind", lapply(svtypes, function(svtype){ - return(data.frame("sample"=sample, + do.call("rbind", lapply(svtypes, function(svtype){ + data.frame("sample"=sample, "svtype"=svtype, - "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count,na.rm=T))) - }))) + "count"=sum(dat[which(dat$sample==sample & dat$svtype==svtype), ]$count, + na.rm=T)) + })) })) -###Write summed results to outfile +### Write summed results to outfile write.table(summed.res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F) diff --git a/wdl/FilterOutlierSamples.wdl b/wdl/FilterOutlierSamples.wdl index 3e0fcfa68..93be211e6 100644 --- a/wdl/FilterOutlierSamples.wdl +++ b/wdl/FilterOutlierSamples.wdl @@ -16,7 +16,8 @@ workflow FilterOutlierSamples { String? vcf_identifier # required (enter algorithm here) if providing outlier_cutoff_table, otherwise used in some file prefixes String? bcftools_preprocessing_options Boolean plot_counts = false - Array[Pair[String, File]]? sample_subsets # if provided, will identify outliers separately within each subset. Expected format is array of pairs, where pair.left is the subset name and pair.right is a text file with all relevant sample IDs + Array[String]? sample_subset_prefixes # if provided, will identify outliers separately within each subset + Array[String]? sample_subset_lists # if provided, will identify outliers separately within each subset String sv_pipeline_docker String sv_base_mini_docker String linux_docker @@ -28,9 +29,15 @@ workflow FilterOutlierSamples { RuntimeAttr? runtime_attr_filter_samples RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_combine_counts RuntimeAttr? runtime_attr_plot_svcounts } + if (defined(sample_subset_prefixes) && defined(sample_subset_lists)) { + Array[Pair[String, File]]? sample_subsets = zip(select_first([sample_subset_prefixes]), + select_first([sample_subset_lists])) + } + call identify_outliers.IdentifyOutlierSamples { input: vcfs = vcfs, @@ -51,6 +58,7 @@ workflow FilterOutlierSamples { runtime_attr_cat_outliers = runtime_attr_cat_outliers, runtime_attr_subset_counts = runtime_attr_subset_counts, runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_combine_counts = runtime_attr_combine_counts, runtime_attr_plot_svcounts = runtime_attr_plot_svcounts } diff --git a/wdl/IdentifyOutlierSamples.wdl b/wdl/IdentifyOutlierSamples.wdl index 87beb8bdd..83cbe10d5 100644 --- a/wdl/IdentifyOutlierSamples.wdl +++ b/wdl/IdentifyOutlierSamples.wdl @@ -28,6 +28,7 @@ workflow IdentifyOutlierSamples { RuntimeAttr? runtime_attr_cat_outliers RuntimeAttr? runtime_attr_subset_counts RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_combine_counts RuntimeAttr? runtime_attr_plot_svcounts } @@ -41,7 +42,7 @@ workflow IdentifyOutlierSamples { runtime_attr_override = runtime_attr_ids_from_vcf } } - Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_first([GetSamplesList.out_file]))]]) + Array[Pair[String, File]] subsets_to_eval = select_first([sample_subsets, [("ALL", select_all([GetSamplesList.out_file]))]]) # Collect SV counts for each VCF in parallel unless sv_counts is provided if (!defined(sv_counts)) { @@ -75,7 +76,8 @@ workflow IdentifyOutlierSamples { input: svcounts = CountPerVcf.sv_counts, prefix = prefix, - sv_pipeline_docker = sv_pipeline_docker + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_combine_counts } } File final_counts = select_first([sv_counts, Combine.summed_svcounts])