diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java index 8edcd595881..0f95878b389 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java @@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) { Utils.nonNull(dictionary); validatePosition(contigA, positionA, dictionary); validatePosition(contigB, positionB, dictionary); - Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0, - "End coordinate cannot precede start"); + // CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and + // B references the source sequence. + if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) { + Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0, + "End coordinate cannot precede start"); + } } private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java index da4e66f3fb6..9ca5056a679 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java @@ -31,6 +31,7 @@ */ public class ClosestSVFinder { + protected final boolean sortOutput; protected final Map truthIdToItemMap; protected final Map idToClusterMap; private final SVConcordanceLinkage linkage; @@ -49,7 +50,9 @@ public class ClosestSVFinder { */ public ClosestSVFinder(final SVConcordanceLinkage linkage, final Function collapser, + final boolean sortOutput, final SAMSequenceDictionary dictionary) { + this.sortOutput = sortOutput; this.linkage = Utils.nonNull(linkage); this.collapser = Utils.nonNull(collapser); outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary)); @@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage, lastItemContig = null; } + /** + * Sorts output by default + */ + public ClosestSVFinder(final SVConcordanceLinkage linkage, + final Function collapser, + final SAMSequenceDictionary dictionary) { + this(linkage, collapser, true, dictionary); + } + /** * Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered. * @param force flushes all variants in the output buffer regardless of position @@ -70,8 +82,12 @@ public List flush(final boolean force) { .map(c -> new ClosestPair(c.getItem(), c.getClosest())) .map(collapser) .collect(Collectors.toList()); - outputBuffer.addAll(collapsedRecords); - return flushBuffer(force); + if (sortOutput) { + outputBuffer.addAll(collapsedRecords); + return flushBuffer(force); + } else { + return collapsedRecords; + } } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java index f5a9d247c0a..d53c3a22b0b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java @@ -2,6 +2,8 @@ import com.google.common.collect.Sets; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; @@ -30,8 +32,8 @@ import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils; import picard.vcf.GenotypeConcordance; -import java.util.HashSet; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; /** *

This tool calculates SV genotype concordance between an "evaluation" VCF and a "truth" VCF. For each evaluation @@ -50,6 +52,8 @@ * of the specific fields. For multi-allelic CNVs, only a copy state concordance metric is * annotated. Allele frequencies will be recalculated automatically if unavailable in the provided VCFs. * + * For large inputs, users may enable the --do-not-sort flag to reduce memory usage. + * *

Inputs

* *
    @@ -90,7 +94,7 @@ @DocumentedFeature public final class SVConcordance extends AbstractConcordanceWalker { - public static final String USE_TRUTH_AF_LONG_NAME = "use-truth-af"; + public static final String UNSORTED_OUTPUT_LONG_NAME = "do-not-sort"; @Argument( doc = "Output VCF", @@ -99,6 +103,12 @@ public final class SVConcordance extends AbstractConcordanceWalker { ) private GATKPath outputFile; + @Argument( + doc = "Skip output sorting. This can substantially reduce memory usage.", + fullName = UNSORTED_OUTPUT_LONG_NAME + ) + private boolean doNotSort; + @ArgumentCollection private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection(); @@ -136,8 +146,11 @@ public void onTraversalStart() { new HashSet<>(getEvalHeader().getGenotypeSamples()), new HashSet<>(getTruthHeader().getGenotypeSamples())); final SVConcordanceAnnotator collapser = new SVConcordanceAnnotator(commonSamples); - engine = new ClosestSVFinder(linkage, collapser::annotate, dictionary); + engine = new ClosestSVFinder(linkage, collapser::annotate, !doNotSort, dictionary); + if (doNotSort) { + createOutputVariantIndex = false; + } writer = createVCFWriter(outputFile); writer.writeHeader(createHeader(getEvalHeader())); } @@ -172,10 +185,32 @@ private void add(final VariantContext variant, final boolean isTruth) { flushClusters(true); currentContig = record.getContigA(); } + if (isTruth) { + record = minimizeTruthFootprint(record); + } engine.add(record, isTruth); flushClusters(false); } + /** + * Strips unneeded attributes from a truth variant to save memory. + */ + protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) { + final List genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList()); + return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(), + item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(), + item.getCpxSubtype(), item.getLength(), item.getAlgorithms(), item.getAlleles(), genotypes, + item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary); + } + + private static Genotype stripTruthGenotype(final Genotype genotype) { + final GenotypeBuilder builder = new GenotypeBuilder(genotype.getSampleName()).alleles(genotype.getAlleles()); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { + builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)); + } + return builder.make(); + } + @Override protected boolean areVariantsAtSameLocusConcordant(final VariantContext truth, final VariantContext eval) { return true; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordanceIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordanceIntegrationTest.java index 7f846624de4..6f4157724d4 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordanceIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordanceIntegrationTest.java @@ -38,7 +38,7 @@ public class SVConcordanceIntegrationTest extends CommandLineProgramTest { @Test public void testRefPanel() { - final File output = createTempFile("concord", ".vcf"); + final File output = createTempFile("concord", ".vcf.gz"); final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; /** * Test file produced from raw standardized VCFs from manta, melt, wham, and cnv callers with SVCluster parameters: @@ -171,7 +171,7 @@ public void testSelf() { // Run a vcf against itself and check that every variant matched itself - final File output = createTempFile("concord", ".vcf"); + final File output = createTempFile("concord", ".vcf.gz"); final String vcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; final ArgumentsBuilder args = new ArgumentsBuilder() @@ -260,7 +260,7 @@ public void testSelfEvalSubset() { // Run a vcf against itself but with extra samples and variants - final File output = createTempFile("concord", ".vcf"); + final File output = createTempFile("concord", ".vcf.gz"); final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sample_subset.vcf.gz"; final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; @@ -322,7 +322,7 @@ private void assertPerfectConcordance(final File output, final String evalVcfPat @Test public void testSitesOnly() { - final File output = createTempFile("concord_sites_only", ".vcf"); + final File output = createTempFile("concord_sites_only", ".vcf.gz"); final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sites_only.vcf.gz"; final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.raw_calls.chr22_chrY.sites_only.vcf.gz"; @@ -358,7 +358,7 @@ public void testSitesOnly() { @Test(expectedExceptions = UserException.IncompatibleSequenceDictionaries.class) public void testHeaderContigsOutOfOrder() { - final File output = createTempFile("concord_sites_only", ".vcf"); + final File output = createTempFile("concord_sites_only", ".vcf.gz"); final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.contigs_out_of_order.vcf.gz"; @@ -370,4 +370,24 @@ public void testHeaderContigsOutOfOrder() { runCommandLine(args, SVConcordance.class.getSimpleName()); } + + @Test + public void testSelfUnsorted() { + + // Run a vcf against itself but don't sort the output + + final File output = createTempFile("concord", ".vcf"); + final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; + final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz"; + + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT) + .add(AbstractConcordanceWalker.EVAL_VARIANTS_LONG_NAME, evalVcfPath) + .add(AbstractConcordanceWalker.TRUTH_VARIANTS_LONG_NAME, truthVcfPath) + .add(SVConcordance.UNSORTED_OUTPUT_LONG_NAME, true); + + runCommandLine(args, SVConcordance.class.getSimpleName()); + assertPerfectConcordance(output, evalVcfPath); + } }