-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds VcfComparator tool #8933
Merged
Merged
Adds VcfComparator tool #8933
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
49c5c83
Systematically compare the outputs of two pipeline versions, especially
ldgauthier ddb6ea7
Subset annotations to account for different alleles
ldgauthier a8236e5
Refactor and clean up
ldgauthier ce06629
Check ref blocks
ldgauthier 1f4a335
Be explicit about AS_RAW annotation differences
ldgauthier fea7be4
Update changed star attributes from NaNs to missing; add option to
ldgauthier 5efa74a
Ignoring star attributes still needs work
ldgauthier 82d5e53
Lookin' pretty good...
ldgauthier 9982fff
Add new mode to log all differences before failing
ldgauthier db49952
Misplaced brace prevented PL checks
ldgauthier 6a6872f
Check on */* genotypes was failing DRAGEN outputs -- removed
ldgauthier 45825a9
clean up after rebase
meganshand 923d1d5
adding test
meganshand 2b041df
style
meganshand 684f021
not sure why we need to cap values for the comparator so removing it
meganshand 9f0310c
addressing comments
meganshand 6618ca5
more comments
meganshand File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
206 changes: 206 additions & 0 deletions
206
src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerGroupedByOverlap.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
package org.broadinstitute.hellbender.engine; | ||
|
||
import htsjdk.samtools.util.Locatable; | ||
import htsjdk.samtools.util.OverlapDetector; | ||
import htsjdk.variant.variantcontext.Allele; | ||
import htsjdk.variant.variantcontext.VariantContext; | ||
import org.broadinstitute.barclay.argparser.Advanced; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.hellbender.exceptions.GATKException; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.stream.Collectors; | ||
|
||
|
||
/** | ||
* A MultiVariantWalker that walks over multiple variant context sources in reference order and emits to client tools | ||
* groups of all input variant contexts by their start position. This is intended to mimic GATK3 traversal behavior for | ||
* some tools. | ||
* | ||
* As such, the argument '-ignore-variants-starting-outside-interval' has been provided to mimic GATK3's behavior | ||
* only presenting variants that start inside the requested interval regardless of whether there is a spanning variant. | ||
* | ||
* Client tools must implement apply(List<VariantContext> variantContexts, ReferenceContext referenceContext) | ||
*/ | ||
public abstract class MultiVariantWalkerGroupedByOverlap extends MultiVariantWalker { | ||
private List<VariantContext> currentVariants = new ArrayList<>(); | ||
private String lastCurrentVariantContig; | ||
private int lastCurrentVariantEnd; | ||
private List<ReadsContext> currentReadsContexts = new ArrayList<>(); | ||
private OverlapDetector<SimpleInterval> overlapDetector; | ||
|
||
public static final String IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL = "ignore-variants-starting-outside-interval"; | ||
|
||
public static final String COMBINE_VARIANTS_DISTANCE = "combine-variants-distance"; | ||
|
||
public static final String MAX_COMBINED_DISTANCE = "max-distance"; | ||
|
||
public static final String REFERENCE_WINDOW_PADDING = "ref-padding"; | ||
|
||
/** | ||
* this option has no effect unless intervals are specified. | ||
* <p> | ||
* This exists to mimic GATK3 interval traversal patterns | ||
*/ | ||
@Advanced | ||
@Argument(fullName = IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL, | ||
doc = "Restrict variant output to sites that start within provided intervals (only applies when an interval is specified)", | ||
optional = true) | ||
protected boolean ignoreIntervalsOutsideStart = false; | ||
|
||
@Advanced | ||
@Argument(fullName = COMBINE_VARIANTS_DISTANCE, doc = "Maximum distance for variants to be grouped together", optional = true) | ||
protected int distanceToCombineVariants = defaultDistanceToGroupVariants(); | ||
|
||
@Advanced | ||
@Argument(fullName = MAX_COMBINED_DISTANCE, doc = "Maximum distance for variants to be grouped together", optional = true) | ||
protected int maxCombinedDistance = defaultMaxGroupedSpan(); | ||
|
||
@Advanced | ||
@Argument(fullName = REFERENCE_WINDOW_PADDING, doc = "Number of bases on either side to expand spanning reference window", optional = true) | ||
protected int referenceWindowPadding = defaultReferenceWindowPadding(); | ||
|
||
@Advanced | ||
@Argument(fullName = "ignore-reference-blocks", shortName = "ignore-ref-blocks", optional = true) | ||
protected boolean ignoreReferenceBlocks = false; | ||
|
||
// override to group variants that start nearby but not at the same locus | ||
protected int defaultDistanceToGroupVariants() { | ||
return 10000; | ||
} | ||
|
||
// override to change reference padding | ||
protected int defaultReferenceWindowPadding() { | ||
return 1; | ||
} | ||
|
||
// override to cap the size span of combined variants | ||
protected int defaultMaxGroupedSpan() { | ||
return Integer.MAX_VALUE; | ||
} | ||
|
||
@Override | ||
public boolean requiresReference() { | ||
return true; | ||
} | ||
|
||
/** | ||
* This method keeps track of all the variants it is passed and will feed all the variants that start at the same | ||
* site to the reduce method. | ||
* | ||
* {@inheritDoc} | ||
*/ | ||
@Override | ||
public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { | ||
|
||
// Filtering out variants that start outside of the specified intervals | ||
if (ignoreIntervalsOutsideStart && !isWithinInterval(new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart()))) { | ||
return; | ||
} | ||
|
||
if (ignoreReferenceBlocks && variant.getAlternateAlleles().size() ==1 && variant.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE)) { | ||
return; | ||
} | ||
|
||
// Collecting all the reads that start at a particular base into one. | ||
if (currentVariants.isEmpty()) { | ||
lastCurrentVariantContig = variant.getContig(); | ||
} else if (!currentVariants.get(0).contigsMatch(variant) | ||
|| variant.getStart() > lastCurrentVariantEnd) { | ||
// Emptying any sites which should emit a new VC since the last one | ||
apply(new ArrayList<>(currentVariants), currentReadsContexts); | ||
currentVariants.clear(); | ||
currentReadsContexts.clear(); | ||
} | ||
|
||
currentVariants.add(variant); | ||
currentReadsContexts.add(readsContext); | ||
if (variant.getEnd() > lastCurrentVariantEnd || !lastCurrentVariantContig.equals(variant.getContig())) { | ||
lastCurrentVariantEnd = variant.getEnd(); | ||
lastCurrentVariantContig = variant.getContig(); | ||
} | ||
} | ||
|
||
/** | ||
* This method must be implemented by tool authors. | ||
* | ||
* This is the primary traversal for any MultiVariantWalkerGroupedOnStart walkers. Will traverse over input variant contexts | ||
* and call #apply() exactly once for each unique reference start position. All variants starting at each locus | ||
* across source files will be grouped and passed as a list of VariantContext objects. | ||
* @param variantContexts VariantContexts from driving variants with matching start position | ||
* NOTE: This will never be empty | ||
* @param referenceContext ReferenceContext object covering the reference of the longest spanning VariantContext | ||
* @param readsContexts | ||
*/ | ||
public abstract void apply(final List<VariantContext> variantContexts, final ReferenceContext referenceContext, final List<ReadsContext> readsContexts); | ||
|
||
public void apply(List<VariantContext> variantContexts, final List<ReadsContext> readsContexts) { | ||
apply(variantContexts, makeSpanningReferenceContext(variantContexts, referenceWindowPadding), readsContexts); | ||
} | ||
|
||
/** | ||
* Helper method that ensures the reference context it returns is adequate to span the length of all the accumulated | ||
* VariantContexts. It assumes that all variant contexts in currentVariants have the same contig. | ||
*/ | ||
private ReferenceContext makeSpanningReferenceContext(final List<VariantContext> variantContexts, final int referenceWindowPadding) { | ||
Utils.nonEmpty(variantContexts, "Must have at least one current variant context"); | ||
final List<String> contigs = variantContexts.stream().map(VariantContext::getContig).distinct().collect(Collectors.toList()); | ||
Utils.validate(contigs.size() == 1, "variant contexts should all have the same contig"); | ||
final int minStart = variantContexts.stream().mapToInt(VariantContext::getStart).min().getAsInt(); | ||
final int maxEnd = variantContexts.stream().mapToInt(VariantContext::getEnd).max().getAsInt(); | ||
final SimpleInterval combinedInterval = new SimpleInterval(contigs.get(0), minStart, maxEnd); | ||
|
||
final ReferenceContext combinedReferenceContext = new ReferenceContext(reference, combinedInterval); | ||
combinedReferenceContext.setWindow(referenceWindowPadding,referenceWindowPadding); | ||
return combinedReferenceContext; | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* Implementation of multi-variant grouped on start traversal. | ||
* | ||
* NOTE: You should only override {@link #traverse()} if you are writing a new walker base class in the | ||
* engine package that extends this class. It is not meant to be overridden by tools outside of the | ||
* engine package. | ||
*/ | ||
@Override | ||
public void traverse() { | ||
beforeTraverse(); | ||
super.traverse(); | ||
afterTraverse(); | ||
} | ||
|
||
/** | ||
* Marked final so that tool authors don't override it. Tool authors should override {@link #onTraversalStart} instead. | ||
*/ | ||
private void beforeTraverse() { | ||
overlapDetector = hasUserSuppliedIntervals() ? OverlapDetector.create(intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary())) : null; | ||
} | ||
|
||
/** | ||
* @param loc locatable to query | ||
* @return true if the query loc is entirely contained by the interval, true if no interval | ||
*/ | ||
protected final boolean isWithinInterval(Locatable loc) { | ||
return (overlapDetector==null || overlapDetector.overlapsAny(loc)); | ||
} | ||
|
||
/** | ||
* Clear accumulated reads before {@link #onTraversalSuccess()} is accessed | ||
*/ | ||
private void afterTraverse() { | ||
// Clearing the accumulator | ||
if (currentVariants.isEmpty()) { | ||
logger.warn("Error: The requested interval contained no data in source VCF files"); | ||
|
||
} else { | ||
apply(currentVariants, currentReadsContexts); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not gonna lie, I have no memory of where this class comes from. Apparently this was two years ago, so maybe it shouldn't be a surprise. I think it's a modified version of MultiVariantWalkerGroupedOnStart. I suspect we needed it because I wanted the star alleles to be returned with their deletions? I just talked myself into keeping this, although we probably don't need it for this exact implementation if you turned off the spandel checks.