Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce SVConcordance memory footprint #8623

Merged
merged 3 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
Utils.nonNull(dictionary);
validatePosition(contigA, positionA, dictionary);
validatePosition(contigB, positionB, dictionary);
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
// CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
// B references the source sequence.
if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
}
}

private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
*/
public class ClosestSVFinder {

protected final boolean sortOutput;
protected final Map<Long, SVCallRecord> truthIdToItemMap;
protected final Map<Long, ActiveClosestPair> idToClusterMap;
private final SVConcordanceLinkage linkage;
Expand All @@ -49,7 +50,9 @@ public class ClosestSVFinder {
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function<ClosestPair, SVCallRecord> collapser,
final boolean sortOutput,
final SAMSequenceDictionary dictionary) {
this.sortOutput = sortOutput;
this.linkage = Utils.nonNull(linkage);
this.collapser = Utils.nonNull(collapser);
outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary));
Expand All @@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage,
lastItemContig = null;
}

/**
* Sorts output by default
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function<ClosestPair, SVCallRecord> collapser,
final SAMSequenceDictionary dictionary) {
this(linkage, collapser, true, dictionary);
}

/**
* Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered.
* @param force flushes all variants in the output buffer regardless of position
Expand All @@ -70,8 +82,12 @@ public List<SVCallRecord> flush(final boolean force) {
.map(c -> new ClosestPair(c.getItem(), c.getClosest()))
.map(collapser)
.collect(Collectors.toList());
outputBuffer.addAll(collapsedRecords);
return flushBuffer(force);
if (sortOutput) {
outputBuffer.addAll(collapsedRecords);
return flushBuffer(force);
} else {
return collapsedRecords;
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
Expand Down Expand Up @@ -30,8 +32,8 @@
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import picard.vcf.GenotypeConcordance;

import java.util.HashSet;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;

/**
* <p>This tool calculates SV genotype concordance between an "evaluation" VCF and a "truth" VCF. For each evaluation
Expand All @@ -50,6 +52,8 @@
* of the specific fields. For multi-allelic CNVs, only a copy state concordance metric is
* annotated. Allele frequencies will be recalculated automatically if unavailable in the provided VCFs.
*
* For large inputs, users may enable the --do-not-sort flag to reduce memory usage.
*
* <h3>Inputs</h3>
*
* <ul>
Expand Down Expand Up @@ -90,7 +94,7 @@
@DocumentedFeature
public final class SVConcordance extends AbstractConcordanceWalker {

public static final String USE_TRUTH_AF_LONG_NAME = "use-truth-af";
public static final String UNSORTED_OUTPUT_LONG_NAME = "do-not-sort";

@Argument(
doc = "Output VCF",
Expand All @@ -99,6 +103,12 @@ public final class SVConcordance extends AbstractConcordanceWalker {
)
private GATKPath outputFile;

@Argument(
doc = "Skip output sorting. This can substantially reduce memory usage.",
fullName = UNSORTED_OUTPUT_LONG_NAME
)
private boolean doNotSort;

@ArgumentCollection
private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection();

Expand Down Expand Up @@ -136,8 +146,11 @@ public void onTraversalStart() {
new HashSet<>(getEvalHeader().getGenotypeSamples()),
new HashSet<>(getTruthHeader().getGenotypeSamples()));
final SVConcordanceAnnotator collapser = new SVConcordanceAnnotator(commonSamples);
engine = new ClosestSVFinder(linkage, collapser::annotate, dictionary);
engine = new ClosestSVFinder(linkage, collapser::annotate, !doNotSort, dictionary);

if (doNotSort) {
createOutputVariantIndex = false;
}
writer = createVCFWriter(outputFile);
writer.writeHeader(createHeader(getEvalHeader()));
}
Expand Down Expand Up @@ -172,10 +185,32 @@ private void add(final VariantContext variant, final boolean isTruth) {
flushClusters(true);
currentContig = record.getContigA();
}
if (isTruth) {
record = minimizeTruthFootprint(record);
}
engine.add(record, isTruth);
flushClusters(false);
}

/**
* Strips unneeded attributes from a truth variant to save memory.
*/
protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) {
final List<Genotype> genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList());
return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(),
item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(),
item.getCpxSubtype(), item.getLength(), item.getAlgorithms(), item.getAlleles(), genotypes,
item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
}

private static Genotype stripTruthGenotype(final Genotype genotype) {
final GenotypeBuilder builder = new GenotypeBuilder(genotype.getSampleName()).alleles(genotype.getAlleles());
if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT));
}
return builder.make();
}

@Override
protected boolean areVariantsAtSameLocusConcordant(final VariantContext truth, final VariantContext eval) {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public class SVConcordanceIntegrationTest extends CommandLineProgramTest {

@Test
public void testRefPanel() {
final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
/**
* Test file produced from raw standardized VCFs from manta, melt, wham, and cnv callers with SVCluster parameters:
Expand Down Expand Up @@ -171,7 +171,7 @@ public void testSelf() {

// Run a vcf against itself and check that every variant matched itself

final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String vcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

final ArgumentsBuilder args = new ArgumentsBuilder()
Expand Down Expand Up @@ -260,7 +260,7 @@ public void testSelfEvalSubset() {

// Run a vcf against itself but with extra samples and variants

final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sample_subset.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

Expand Down Expand Up @@ -322,7 +322,7 @@ private void assertPerfectConcordance(final File output, final String evalVcfPat

@Test
public void testSitesOnly() {
final File output = createTempFile("concord_sites_only", ".vcf");
final File output = createTempFile("concord_sites_only", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sites_only.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.raw_calls.chr22_chrY.sites_only.vcf.gz";

Expand Down Expand Up @@ -358,7 +358,7 @@ public void testSitesOnly() {

@Test(expectedExceptions = UserException.IncompatibleSequenceDictionaries.class)
public void testHeaderContigsOutOfOrder() {
final File output = createTempFile("concord_sites_only", ".vcf");
final File output = createTempFile("concord_sites_only", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.contigs_out_of_order.vcf.gz";

Expand All @@ -370,4 +370,24 @@ public void testHeaderContigsOutOfOrder() {

runCommandLine(args, SVConcordance.class.getSimpleName());
}

@Test
public void testSelfUnsorted() {

// Run a vcf against itself but don't sort the output

final File output = createTempFile("concord", ".vcf");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

final ArgumentsBuilder args = new ArgumentsBuilder()
.addOutput(output)
.add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
.add(AbstractConcordanceWalker.EVAL_VARIANTS_LONG_NAME, evalVcfPath)
.add(AbstractConcordanceWalker.TRUTH_VARIANTS_LONG_NAME, truthVcfPath)
.add(SVConcordance.UNSORTED_OUTPUT_LONG_NAME, true);

runCommandLine(args, SVConcordance.class.getSimpleName());
assertPerfectConcordance(output, evalVcfPath);
}
}
Loading