Skip to content

Commit

Permalink
Adding a new GATKTool level argument to control which if any output v…
Browse files Browse the repository at this point in the history
…ariants are filtered.
  • Loading branch information
lbergelson committed Aug 5, 2020
1 parent e7faa26 commit 20c049d
Show file tree
Hide file tree
Showing 11 changed files with 440 additions and 129 deletions.
33 changes: 31 additions & 2 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
import java.time.ZonedDateTime;
import java.util.*;
import java.util.stream.Stream;

import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
Expand All @@ -42,6 +45,7 @@
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

//TODO:
//UserException overloads
Expand Down Expand Up @@ -123,6 +127,14 @@ public abstract class GATKTool extends CommandLineProgram {
doc = "If true, don't emit genotype fields when writing vcf file output.", optional = true)
public boolean outputSitesOnlyVCFs = false;

/**
 * Long name of the command line argument that selects how output variants are filtered against the intervals.
 */
public static final String VARIANT_OUTPUT_INTERVAL_FILTERING_MODE = "variant-output-interval-filtering-mode";
/**
 * Interval filtering mode applied to variants written through {@link #createVCFWriter}.
 * When this is non-null the created writer is wrapped in an {@link IntervalFilteringVcfWriter};
 * when it is null the writer is returned unwrapped. Startup validation throws a
 * {@link CommandLineException.MissingArgument} if a mode is set but no user intervals are present.
 *
 * The initial value comes from {@link #getDefaultVariantOutputFilterMode()}, which returns null in this class.
 * NOTE: that overridable method is invoked from a field initializer, i.e. during construction, so overrides
 * must not depend on subclass instance state.
 */
@Argument(fullName = VARIANT_OUTPUT_INTERVAL_FILTERING_MODE,
doc = "Restrict the output variants to ones that match the specified intervals according to the specified matching mode.",
optional = true)
@Advanced
public IntervalFilteringVcfWriter.Mode outputVariantIntervalFilteringMode = getDefaultVariantOutputFilterMode();


/**
* Master sequence dictionary to be used instead of all other dictionaries (if provided).
*/
Expand Down Expand Up @@ -413,6 +425,13 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
*/
public String getProgressMeterRecordLabel() { return ProgressMeter.DEFAULT_RECORD_LABEL; }

/**
 * Supplies the initial value for the variant output interval filtering mode argument.
 *
 * @return the default interval filtering mode for variant output; this base implementation returns
 *         {@code null}. Subclasses may override this to set a different default.
 */
public IntervalFilteringVcfWriter.Mode getDefaultVariantOutputFilterMode() {
    final IntervalFilteringVcfWriter.Mode defaultMode = null;
    return defaultMode;
}

/**
 * Produces the intervals to traverse from the supplied intervals.
 * This base implementation performs no transformation: it hands back the very list it was given
 * (the same instance, not a copy). The method is protected and non-final, so subclasses may override it.
 *
 * @param getIntervals intervals to be transformed
 * @param sequenceDictionary sequence dictionary; not used by this implementation
 * @return {@code getIntervals}, unchanged
 */
protected List<SimpleInterval> transformTraversalIntervals(
        final List<SimpleInterval> getIntervals,
        final SAMSequenceDictionary sequenceDictionary) {
    final List<SimpleInterval> unchangedIntervals = getIntervals;
    return unchangedIntervals;
}
Expand Down Expand Up @@ -710,12 +729,16 @@ protected void onStartup() {

initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source

if ( seqValidationArguments.performSequenceDictionaryValidation()) {
if (seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}

checkToolRequirements();

if (outputVariantIntervalFilteringMode != null && userIntervals == null){
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + VARIANT_OUTPUT_INTERVAL_FILTERING_MODE + " was specified.");
}

progressMeter = new ProgressMeter(secondsBetweenProgressUpdates);
progressMeter.setRecordLabel(getProgressMeterRecordLabel());
}
Expand Down Expand Up @@ -884,11 +907,17 @@ public VariantContextWriter createVCFWriter(final Path outPath) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}

return GATKVariantContextUtils.createVCFWriter(
final VariantContextWriter vcfWriter = GATKVariantContextUtils.createVCFWriter(
outPath,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[options.size()]));

if(outputVariantIntervalFilteringMode != null){
return new IntervalFilteringVcfWriter(vcfWriter, intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()), outputVariantIntervalFilteringMode);
} else {
return vcfWriter;
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantLocusWalker;
Expand All @@ -24,11 +28,22 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalSetRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

import java.io.File;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;

/**
* Perform joint genotyping on one or more samples pre-called with HaplotypeCaller
Expand Down Expand Up @@ -108,7 +123,7 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
/**
* Import all data between specified intervals. Improves performance using large lists of intervals, as in exome
* sequencing, especially if GVCF data only exists for specified intervals. Use with
* --only-output-calls-starting-in-intervals if input GVCFs contain calls outside the specified intervals.
* --{@value GATKTool#VARIANT_OUTPUT_INTERVAL_FILTERING_MODE} if input GVCFs contain calls outside the specified intervals.
*/
@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
Expand Down Expand Up @@ -149,16 +164,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName= ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;


@Argument(fullName = FORCE_OUTPUT_INTERVALS_NAME,
suppressFileExpansion = true, doc = "sites at which to output genotypes even if non-variant in samples", optional = true)
protected final List<String> forceOutputIntervalStrings = new ArrayList<>();
Expand All @@ -177,15 +182,14 @@ public final class GenotypeGVCFs extends VariantLocusWalker {

private VariantContextWriter vcfWriter;

/** these are used when {@link #onlyOutputCallsStartingInIntervals} is true */
private List<SimpleInterval> intervals;

private OverlapDetector<GenomeLoc> forceOutputIntervals;

private boolean forceOutputIntervalsPresent;

private GenotypeGVCFsEngine gvcfEngine;



/**
* Get the largest interval per contig that contains the intervals specified on the command line.
* @param getIntervals intervals to be transformed
Expand Down Expand Up @@ -248,29 +252,19 @@ public void onTraversalStart() {

final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !hasUserSuppliedIntervals()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}

intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);

merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput);

//methods that cannot be called in the engine because they are protected
Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
vcfWriter = createVCFWriter(outputFile);

//create engine object
gvcfEngine = new GenotypeGVCFsEngine(annotationEngine, genotypeArgs, includeNonVariants, inputVCFHeader);

//call initialize method in engine class that creates VCFWriter object and writes a header to it
vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter);

}

@Override
Expand All @@ -281,9 +275,7 @@ public void apply(final Locatable loc, List<VariantContext> variants, ReadsConte
final VariantContext regenotypedVC = gvcfEngine.callRegion(loc, variants, ref, features, merger, somaticInput, tlodThreshold, afTolerance, forceOutput);

if (regenotypedVC != null) {
final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart());
if ((inForceOutputIntervals || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC)) &&
(!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains (variantStart)))) {
if ((inForceOutputIntervals || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC))) {
vcfWriter.add(regenotypedVC);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,6 @@ public final class GnarlyGenotyper extends VariantWalker {
@Argument(fullName = "keep-all-sites", doc="Retain low quality and non-variant sites, applying appropriate filters", optional=true)
private boolean keepAllSites = false;

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName = GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;

@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
doc = "Boolean flag to read in all data in between intervals. Improves performance reading from GenomicsDB " +
Expand Down Expand Up @@ -146,9 +137,6 @@ public final class GnarlyGenotyper extends VariantWalker {
private final RMSMappingQuality mqCalculator = RMSMappingQuality.getInstance();
private final Set<Class<? extends InfoFieldAnnotation>> allAlleleSpecificAnnotations = new HashSet<>();

/** these are used when {@link #onlyOutputCallsStartingInIntervals} is true */
private List<SimpleInterval> intervals;

@Override
public boolean requiresReference() {
return true;
Expand Down Expand Up @@ -183,14 +171,6 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
public void onTraversalStart() {
final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !intervalArgumentCollection.intervalsSpecified()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}
intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());

setupVCFWriter(inputVCFHeader, samples);
Expand Down Expand Up @@ -266,11 +246,11 @@ private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) {
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public void apply(VariantContext variant, ReadsContext reads, ReferenceContext ref, FeatureContext features) {
SimpleInterval variantStart = new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart());
//return early if there's no non-symbolic ALT since GDB already did the merging
if ( !variant.isVariant() || !GATKVariantContextUtils.isProperlyPolymorphic(variant)
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0
|| (onlyOutputCallsStartingInIntervals && !intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0 )
// todo this change is a slight de-optimization since we will now process some sites which were previously ignored
{
if (keepAllSites) {
VariantContextBuilder builder = new VariantContextBuilder(mqCalculator.finalizeRawMQ(variant)); //don't fill in QUAL here because there's no alt data
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);
Expand All @@ -297,7 +277,7 @@ public void apply(VariantContext variant, ReadsContext reads, ReferenceContext r
finalizedVC = genotyperEngine.finalizeGenotype(variant);
}
//could return null if the variant didn't pass the genotyping arg calling/emission threshold
if (finalizedVC != null && (!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
if (finalizedVC != null) {
vcfWriter.add(finalizedVC);
}
}
Expand Down
Loading

0 comments on commit 20c049d

Please sign in to comment.