Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds VcfComparator tool #8933

Merged
merged 17 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;


/**
* A MultiVariantWalker that walks over multiple variant context sources in reference order and emits to client tools
* groups of all input variant contexts by their start position. This is intended to mimic GATK3 traversal behavior for
* some tools.
*
* As such, the argument '-ignore-variants-starting-outside-interval' has been provided to mimic GATK3's behavior
* only presenting variants that start inside the requested interval regardless of whether there is a spanning variant.
*
* Client tools must implement apply(List<VariantContext> variantContexts, ReferenceContext referenceContext)
*/
public abstract class MultiVariantWalkerGroupedByOverlap extends MultiVariantWalker {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not gonna lie, I have no memory of where this class comes from. Apparently this was two years ago, so maybe it shouldn't be a surprise. I think it's a modified version of MultiVariantWalkerGroupedOnStart. I suspect we needed it because I wanted the star alleles to be returned with their deletions? I just talked myself into keeping this, although we probably don't need it for this exact implementation if you turned off the spandel checks.

private List<VariantContext> currentVariants = new ArrayList<>();
private String lastCurrentVariantContig;
private int lastCurrentVariantEnd;
private List<ReadsContext> currentReadsContexts = new ArrayList<>();
private OverlapDetector<SimpleInterval> overlapDetector;

public static final String IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL = "ignore-variants-starting-outside-interval";

public static final String COMBINE_VARIANTS_DISTANCE = "combine-variants-distance";

public static final String MAX_COMBINED_DISTANCE = "max-distance";

public static final String REFERENCE_WINDOW_PADDING = "ref-padding";

/**
* this option has no effect unless intervals are specified.
* <p>
* This exists to mimic GATK3 interval traversal patterns
*/
@Advanced
@Argument(fullName = IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL,
doc = "Restrict variant output to sites that start within provided intervals (only applies when an interval is specified)",
optional = true)
protected boolean ignoreIntervalsOutsideStart = false;

@Advanced
@Argument(fullName = COMBINE_VARIANTS_DISTANCE, doc = "Maximum distance for variants to be grouped together", optional = true)
protected int distanceToCombineVariants = defaultDistanceToGroupVariants();

@Advanced
@Argument(fullName = MAX_COMBINED_DISTANCE, doc = "Maximum distance for variants to be grouped together", optional = true)
protected int maxCombinedDistance = defaultMaxGroupedSpan();

@Advanced
@Argument(fullName = REFERENCE_WINDOW_PADDING, doc = "Number of bases on either side to expand spanning reference window", optional = true)
protected int referenceWindowPadding = defaultReferenceWindowPadding();

@Advanced
@Argument(fullName = "ignore-reference-blocks", shortName = "ignore-ref-blocks", optional = true)
protected boolean ignoreReferenceBlocks = false;

// override to group variants that start nearby but not at the same locus
protected int defaultDistanceToGroupVariants() {
return 10000;
}

// override to change reference padding
protected int defaultReferenceWindowPadding() {
return 1;
}

// override to cap the size span of combined variants
protected int defaultMaxGroupedSpan() {
return Integer.MAX_VALUE;
}

@Override
public boolean requiresReference() {
return true;
}

/**
* This method keeps track of all the variants it is passed and will feed all the variants that start at the same
* site to the reduce method.
*
* {@inheritDoc}
*/
@Override
public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {

// Filtering out variants that start outside of the specified intervals
if (ignoreIntervalsOutsideStart && !isWithinInterval(new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart()))) {
return;
}

if (ignoreReferenceBlocks && variant.getAlternateAlleles().size() ==1 && variant.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE)) {
return;
}

// Collecting all the reads that start at a particular base into one.
if (currentVariants.isEmpty()) {
lastCurrentVariantContig = variant.getContig();
} else if (!currentVariants.get(0).contigsMatch(variant)
|| variant.getStart() > lastCurrentVariantEnd) {
// Emptying any sites which should emit a new VC since the last one
apply(new ArrayList<>(currentVariants), currentReadsContexts);
currentVariants.clear();
currentReadsContexts.clear();
}

currentVariants.add(variant);
currentReadsContexts.add(readsContext);
if (variant.getEnd() > lastCurrentVariantEnd || !lastCurrentVariantContig.equals(variant.getContig())) {
lastCurrentVariantEnd = variant.getEnd();
lastCurrentVariantContig = variant.getContig();
}
}

/**
* This method must be implemented by tool authors.
*
* This is the primary traversal for any MultiVariantWalkerGroupedOnStart walkers. Will traverse over input variant contexts
* and call #apply() exactly once for each unique reference start position. All variants starting at each locus
* across source files will be grouped and passed as a list of VariantContext objects.
* @param variantContexts VariantContexts from driving variants with matching start position
* NOTE: This will never be empty
* @param referenceContext ReferenceContext object covering the reference of the longest spanning VariantContext
* @param readsContexts
*/
public abstract void apply(final List<VariantContext> variantContexts, final ReferenceContext referenceContext, final List<ReadsContext> readsContexts);

public void apply(List<VariantContext> variantContexts, final List<ReadsContext> readsContexts) {
apply(variantContexts, makeSpanningReferenceContext(variantContexts, referenceWindowPadding), readsContexts);
}

/**
* Helper method that ensures the reference context it returns is adequate to span the length of all the accumulated
* VariantContexts. It assumes that all variant contexts in currentVariants have the same contig.
*/
private ReferenceContext makeSpanningReferenceContext(final List<VariantContext> variantContexts, final int referenceWindowPadding) {
Utils.nonEmpty(variantContexts, "Must have at least one current variant context");
final List<String> contigs = variantContexts.stream().map(VariantContext::getContig).distinct().collect(Collectors.toList());
Utils.validate(contigs.size() == 1, "variant contexts should all have the same contig");
final int minStart = variantContexts.stream().mapToInt(VariantContext::getStart).min().getAsInt();
final int maxEnd = variantContexts.stream().mapToInt(VariantContext::getEnd).max().getAsInt();
final SimpleInterval combinedInterval = new SimpleInterval(contigs.get(0), minStart, maxEnd);

final ReferenceContext combinedReferenceContext = new ReferenceContext(reference, combinedInterval);
combinedReferenceContext.setWindow(referenceWindowPadding,referenceWindowPadding);
return combinedReferenceContext;
}

/**
* {@inheritDoc}
*
* Implementation of multi-variant grouped on start traversal.
*
* NOTE: You should only override {@link #traverse()} if you are writing a new walker base class in the
* engine package that extends this class. It is not meant to be overridden by tools outside of the
* engine package.
*/
@Override
public void traverse() {
beforeTraverse();
super.traverse();
afterTraverse();
}

/**
* Marked final so that tool authors don't override it. Tool authors should override {@link #onTraversalStart} instead.
*/
private void beforeTraverse() {
overlapDetector = hasUserSuppliedIntervals() ? OverlapDetector.create(intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary())) : null;
}

/**
* @param loc locatable to query
* @return true if the query loc is entirely contained by the interval, true if no interval
*/
protected final boolean isWithinInterval(Locatable loc) {
return (overlapDetector==null || overlapDetector.overlapsAny(loc));
}

/**
* Clear accumulated reads before {@link #onTraversalSuccess()} is accessed
*/
private void afterTraverse() {
// Clearing the accumulator
if (currentVariants.isEmpty()) {
logger.warn("Error: The requested interval contained no data in source VCF files");

} else {
apply(currentVariants, currentReadsContexts);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
import org.apache.commons.lang3.StringUtils;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ private boolean isMonomorphicCallWithAlts(final VariantContext result) {
final Genotype genotype = result.getGenotype(0);
return (hasGenotypeValuesArray(posteriorsKey, genotype)
&& (genotype.isHomRef() || isNoCalledHomRef(posteriorsKey, genotype) || (genotype.hasPL() && MathUtils.minElementIndex(genotype.getPL()) == 0))
&& result.getAlternateAlleles().stream().anyMatch(this::isConcreteAlt));
&& result.getAlternateAlleles().stream().anyMatch(GATKVariantContextUtils::isConcreteAlt));
}

/**
Expand Down Expand Up @@ -526,26 +526,13 @@ boolean shouldBeReblocked(final VariantContext vc) {

final List<Allele> finalAlleles = alleleCounts.asAlleleList(vc.getAlleles());
return (pls != null && pls[0] < rgqThreshold)
|| !genotypeHasConcreteAlt(finalAlleles)
|| !GATKVariantContextUtils.genotypeHasConcreteAlt(finalAlleles)
|| finalAlleles.stream().anyMatch(a -> a.equals(Allele.NON_REF_ALLELE))
|| (!genotype.hasPL() && !genotype.hasGQ()
// If TREE_SCORE threshold is set to be > 0 then sites without TREE_SCORE should be reblocked.
|| vc.getAttributeAsDouble(GATKVCFConstants.TREE_SCORE, 0) < treeScoreThreshold);
}

/**
meganshand marked this conversation as resolved.
Show resolved Hide resolved
* Is there a real ALT allele that's not <NON_REF> or * ?
* @param alleles alleles in the called genotype
* @return true if the genotype has a called allele that is a "real" alternate
*/
private boolean genotypeHasConcreteAlt(final List<Allele> alleles) {
return alleles.stream().anyMatch(this::isConcreteAlt);
}

private boolean isConcreteAlt(final Allele a) {
return !a.isReference() && !a.isSymbolic() && !a.equals(Allele.SPAN_DEL);
}

/**
* "reblock" a variant by converting its genotype to homRef, changing PLs, adding reblock END tags and other attributes
* @param lowQualityVariant a variant already determined to be low quality
Expand Down
Loading
Loading