From 70e047d04d7596c5d9c907d97fa1bdd00fa3976a Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 21 Sep 2023 12:05:04 -0400 Subject: [PATCH] More refactoring PDHCE and preparing for joint detection (#8492) * streamlining EventGroup code * PartiallyDeterminedHaplotype can handle multiple determined events * Branching in terms of included events, not excluded events * unit tests for event group clustering and branching --- .../haplotypecaller/AssemblyResultSet.java | 6 +- .../HaplotypeCallerEngine.java | 2 +- ...yDeterminedHaplotypeComputationEngine.java | 467 +++++++++--------- .../PileupDetectionArgumentCollection.java | 2 +- .../hellbender/utils/SmallBitSet.java | 6 + .../hellbender/utils/haplotype/EventMap.java | 2 +- .../PartiallyDeterminedHaplotype.java | 67 +-- ...nedHaplotypeComputationEngineUnitTest.java | 218 +++++++- .../hellbender/utils/SmallBitSetUnitTest.java | 9 + 9 files changed, 469 insertions(+), 310 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java index c85ca8dc4cc..e158644bea4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java @@ -588,7 +588,7 @@ public void clearHaplotypes() { haplotypes.clear(); refHaplotype = null; } - public void replaceAllHaplotypes(Set list) { + public void replaceAllHaplotypes(Collection list) { haplotypes.clear(); refHaplotype = null; for ( Haplotype h : list ) { @@ -612,8 +612,8 @@ private void removeHaplotype(final Haplotype hap) { } } - public void setPartiallyDeterminedMode() { - this.isPartiallyDeterminedList = true; + public void setPartiallyDeterminedMode(final boolean isPartiallyDetermined) { + this.isPartiallyDeterminedList = isPartiallyDetermined; } public boolean isPartiallyDeterminedList() { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 8cbc46dce49..71f6d64ede7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -832,7 +832,7 @@ public List callRegion(final AssemblyRegion region, final Featur final SWParameters readToHaplotypeSWParameters = hcArgs.getReadToHaplotypeSWParameters(); // TODO Yes we skip realignment entirely when we are in DRAGEN-GATK PDHMM mode. Realignment of the reads makes no sense when // TODO the bases of the haplotypes used for calling no longer reflect specified variants present. 
- if (!(hcArgs.pileupDetectionArgs.generatePDHaplotypes && !hcArgs.pileupDetectionArgs.determinePDHaps)) { + if (!(hcArgs.pileupDetectionArgs.generatePDHaplotypes && !hcArgs.pileupDetectionArgs.useDeterminedHaplotypesDespitePdhmmMode)) { final Map readRealignments = AssemblyBasedCallerUtils.realignReadsToTheirBestHaplotype(subsettedReadLikelihoodsFinal, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc(), aligner, readToHaplotypeSWParameters); subsettedReadLikelihoodsFinal.changeEvidence(readRealignments); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngine.java index c96d51e4e12..3ab3995a7ce 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngine.java @@ -5,12 +5,11 @@ import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.util.Locatable; -import htsjdk.samtools.util.Tuple; import htsjdk.variant.variantcontext.Allele; import org.apache.commons.lang3.ArrayUtils; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; -import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.IndexRange; import org.broadinstitute.hellbender.utils.SmallBitSet; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.haplotype.Event; @@ -95,7 +94,7 @@ public static AssemblyResultSet generatePDHaplotypes(final AssemblyResultSet sou final List eventsInOrder = makeFinalListOfEventsInOrder(sourceSet, badPileupEvents, goodPileupEvents, referenceHaplotype, pileupArgs, debug); // TODO this is where we filter out if indels > 32 (a heuristic known from DRAGEN that is not implemented here) - SortedMap> variantsByStartPos = eventsInOrder.stream() + SortedMap> eventsByStartPos = eventsInOrder.stream() .collect(Collectors.groupingBy(Event::getStart, TreeMap::new, Collectors.toList())); List> disallowedCombinations = smithWatermanRealignPairsOfVariantsForEquivalentEvents(referenceHaplotype, aligner, args.getHaplotypeToReferenceSWParameters(), debug, eventsInOrder); @@ -109,160 +108,88 @@ public static AssemblyResultSet generatePDHaplotypes(final AssemblyResultSet sou } Utils.printIf(debug,() -> "Event groups after merging:\n"+eventGroups.stream().map(eg -> eg.toDisplayString(referenceHaplotype.getStart())).collect(Collectors.joining("\n"))); - Set outputHaplotypes = new LinkedHashSet<>(); - if (pileupArgs.determinePDHaps) { - // only add the reference haplotype if we are producing regular haplotype objects (not PD haplotypes for the haplotype alleles) - outputHaplotypes.add(referenceHaplotype); - } - /* - Overall Loop: - Iterate over every cluster of variants with the same start position. + Outer loop: iterate over all loci with events, setting each in turn as the determined locus with all other loci being undetermined + Inner loop: iterate over all alleles at the determined locus, including the reference allele unless we are making determined haplotypes, to set as + the determined allele. 
For each choice of determined allele generate all possible partially determined haplotypes */ - for (final int thisEventGroupStart : variantsByStartPos.keySet()) { // it's a SortedMap -- iterating over its keyset is okay! - final List allEventsHere = variantsByStartPos.get(thisEventGroupStart); - Utils.printIf(debug, () -> "working with variants: " + allEventsHere + " at position " + thisEventGroupStart); + final Set outputHaplotypes = pileupArgs.useDeterminedHaplotypesDespitePdhmmMode ? Sets.newLinkedHashSet(List.of(referenceHaplotype)) : Sets.newLinkedHashSet(); // NOTE: output changes if this is not a LinkedHashSet! + for (final int determinedLocus : eventsByStartPos.keySet()) { // it's a SortedMap -- iterating over its keyset is okay! + final List allEventsHere = eventsByStartPos.get(determinedLocus); + Utils.printIf(debug, () -> "working with variants: " + allEventsHere + " at position " + determinedLocus); - if (!Range.closed(callingSpan.getStart(), callingSpan.getEnd()).contains(thisEventGroupStart)) { + if (!Range.closed(callingSpan.getStart(), callingSpan.getEnd()).contains(determinedLocus)) { Utils.printIf(debug, () -> "Skipping determined hap construction! Outside of span: " + callingSpan); continue; } - /* - Determined Event Loop: - We iterate over every ref position and select single alleles (including ref) from that reference position (in R space) to be "determined" - - NOTE: we skip the reference allele in the event that we are making determined haplotypes instead of undetermined haplotypes - */ - for (int determinedAlleleIndex = (pileupArgs.determinePDHaps?0:-1); determinedAlleleIndex < allEventsHere.size(); determinedAlleleIndex++) { //note -1 for I here corresponds to the reference allele at this site - final boolean isRef = determinedAlleleIndex == -1; - final Set determinedEvents = isRef ? Set.of() : Set.of(allEventsHere.get(determinedAlleleIndex)); - final Event determinedEventToTest = allEventsHere.get(isRef ? 0 : determinedAlleleIndex); - Utils.printIf(debug, () -> "Working with allele at site: "+(isRef? "[ref:"+(thisEventGroupStart-referenceHaplotype.getStart())+"]" : PartiallyDeterminedHaplotype.getDRAGENDebugEventString(referenceHaplotype.getStart()).apply(determinedEventToTest))); - // This corresponds to the DRAGEN code for - // 0 0 - // 0 1 - // 1 0 - - - /* - * Here we handle any of the necessary work to deal with the event groups and maybe forming compound branches out of the groups - */ - - // Loop over eventGroups, have each of them return a list of VariantContexts - List> branchExcludeAlleles = new ArrayList<>(); - branchExcludeAlleles.add(new HashSet<>()); // Add the null branch (assuming no exclusions) - - /* Note for future posterity: - * An assembly region could potentially have any number of (within some limitations) of event groups. When we are constructing - * haplotypes out of the assembled variants we want to take the dot product of the branches for each set of event groups that - * we find. I.E. if an event group with mutex variants (B,C) requires two branches for Variant A and Variant A also leads to two branches in - * another event group with mutex variants (D,E). Then we would want to ultimately generate the branches A,B,D -> A,B,E -> A,C,D -> A,C,E. - * This is why we iterate over branchExcludeAlleles internally here. 
- */ - for(EventGroup group : eventGroups ) { - if (group.causesBranching()) { - List> branchingSets = group.setsForBranching(allEventsHere, determinedEvents, true); - // Combinatorially expand the branches as necessary - List> newBranchesToAdd = new ArrayList<>(); - for (Set excludedVars : branchExcludeAlleles) { - //For every exclude group, fork it by each subset we have: - for (int i = 1; i < branchingSets.size(); i++) { //NOTE: iterate starting at 1 here because we special case that branch at the end - newBranchesToAdd.add(Sets.union(excludedVars, branchingSets.get(i)).immutableCopy()); - } - // Be careful since this event group might have returned nothing - if (!branchingSets.isEmpty()) { - excludedVars.addAll(branchingSets.get(0)); - } - } - branchExcludeAlleles.addAll(newBranchesToAdd); - - if (branchExcludeAlleles.size() > MAX_BRANCH_PD_HAPS) { - Utils.printIf(debug, () -> "Found too many branches for variants at: " + determinedEventToTest.getStart() + " aborting and falling back to Assembly Variants!"); - return sourceSet; - } - } - } - - branchExcludeAllelesMessage(referenceHaplotype, debug, eventsInOrder, branchExcludeAlleles); - + for (int determinedAlleleIndex = (pileupArgs.useDeterminedHaplotypesDespitePdhmmMode ?0:-1); determinedAlleleIndex < allEventsHere.size(); determinedAlleleIndex++) { //note -1 for I here corresponds to the reference allele at this site + final boolean determinedAlleleIsRef = determinedAlleleIndex == -1; + final Set determinedEvents = determinedAlleleIsRef ? Set.of() : Set.of(allEventsHere.get(determinedAlleleIndex)); + Utils.printIf(debug, () -> "Working with determined allele(s) at site: "+(determinedAlleleIsRef? "[ref:"+(determinedLocus-referenceHaplotype.getStart())+"]" : + determinedEvents.stream().map(PartiallyDeterminedHaplotype.getDRAGENDebugEventString(referenceHaplotype.getStart())).collect(Collectors.joining(", ")))); - /* - Now handle each branch independently of the others. (the logic is the same in every case except we must ensure that none of the excluded alleles get included when constructing haps. - */ - for (Set excludeEvents : branchExcludeAlleles) { + final List> branches = computeBranches(eventGroups, determinedEvents, allEventsHere); - List branchHaps = new ArrayList<>(); - List newBranch = new ArrayList<>(); + if (branches == null) { + Utils.printIf(debug, () -> "Found too many branches for variants at: " + determinedLocus + " aborting and falling back to Assembly Variants!"); + return sourceSet; + } - // Handle the simple case of making PD haplotypes - if (!pileupArgs.determinePDHaps) { - for (final int otherEventGroupStart : variantsByStartPos.keySet()) { - if (otherEventGroupStart == thisEventGroupStart) { - newBranch.add(determinedEventToTest); - } else { - // We know here that nothing illegally overlaps because there are no groups. - // Also exclude any events that overlap the determined allele since we cant construct them (also this stops compound alleles from being formed) - // NOTE: it is important that we allow reference alleles to overlap undetermined variants as it leads to mismatches against DRAGEN otherwise. 
- variantsByStartPos.get(otherEventGroupStart).stream().filter(vc -> !excludeEvents.contains(vc)).forEach(newBranch::add);
- }
- }
- newBranch.sort(HAPLOTYPE_SNP_FIRST_COMPARATOR);
- PartiallyDeterminedHaplotype newPDHaplotypeFromEvents = createNewPDHaplotypeFromEvents(referenceHaplotype, determinedEventToTest, isRef, newBranch);
- newPDHaplotypeFromEvents.setAllDeterminedEventsAtThisSite(allEventsHere); // accounting for determined variants for later in case we are in optimization mode
- branchHaps.add(newPDHaplotypeFromEvents);
+ branchExcludeAllelesMessage(referenceHaplotype, debug, eventsInOrder, determinedEvents, branches);
+ // from each branch (a set of mutually allowed events) make a single PD haplotype or a combinatorial number of determined haplotypes
+ for (Set<Event> branch : branches) {
+ if (!pileupArgs.useDeterminedHaplotypesDespitePdhmmMode) {
+ final List<Event> eventsInPDHap = branch.stream().sorted(HAPLOTYPE_SNP_FIRST_COMPARATOR).toList();
+ PartiallyDeterminedHaplotype newPDHaplotype = createNewPDHaplotypeFromEvents(referenceHaplotype, determinedEvents, determinedLocus, eventsInPDHap, allEventsHere);
+ branchHaplotypesDebugMessage(referenceHaplotype, debug, branch, List.of(newPDHaplotype));
+ outputHaplotypes.add(newPDHaplotype);
} else {
// TODO currently this approach doesn't properly handle a bunch of duplicate events...
- // If we are producing determined bases, then we want to enforce that every new event at least has THIS event as a variant.
- List<List<Event>> growingEventGroups = new ArrayList<>();
- growingEventGroups.add(new ArrayList<>());
- for (final int otherEventGroupStart : variantsByStartPos.keySet()) {
- // Iterate through the growing combinatorial expansion of haps, split it into either having or not having the variant.
- if (growingEventGroups.size() > MAX_BRANCH_PD_HAPS) {
- Utils.printIf(debug, () -> "Too many branch haplotypes ["+growingEventGroups.size()+"] generated from site, falling back on assembly variants!");
- return sourceSet;
- } else if (otherEventGroupStart == thisEventGroupStart) {
- growingEventGroups.forEach(group -> group.add(determinedEventToTest));
- } else if (thisEventGroupStart < otherEventGroupStart) {
- variantsByStartPos.get(otherEventGroupStart).stream()
- .filter(event -> !excludeEvents.contains(event))
- .flatMap(event -> growingEventGroups.stream().map(group -> growEventGroup(group, event)))
- .forEach(growingEventGroups::add);
+ // Start with a single "seed" haplotype containing the determined event(s). At each downstream locus,
+ // every haplotype can grow by every event, or remain unchanged. For example, if we have haplotypes
+ // H1 and H2 so far and reach a locus with events A and B, our list grows to H1/ref, H2/ref, H1/A, H1/B, H2/A, H2/B.
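+ // In the worst case each downstream locus multiplies the list size by (1 + its event count), hence the MAX_BRANCH_PD_HAPS check below.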
+ List<List<Event>> fullyDeterminedHaplotypes = new ArrayList<>();
+ fullyDeterminedHaplotypes.add(new ArrayList<>(determinedEvents)); // the seed haplotype
+
+ for (final int locus : eventsByStartPos.keySet()) {
+ if (determinedLocus < locus) {
+ final List<List<Event>> children = eventsByStartPos.get(locus).stream().filter(branch::contains)
+ .flatMap(event -> fullyDeterminedHaplotypes.stream().map(group -> growEventList(group, event))).toList();
+ children.forEach(child -> child.sort(HAPLOTYPE_SNP_FIRST_COMPARATOR));
+ fullyDeterminedHaplotypes.addAll(children);
+
+ if (fullyDeterminedHaplotypes.size() > MAX_BRANCH_PD_HAPS) {
+ Utils.printIf(debug, () -> "Too many branch haplotypes [" + fullyDeterminedHaplotypes.size() + "] generated from site, falling back on assembly variants!");
+ return sourceSet;
+ }
+ }
+ }
- for (List<Event> subset : growingEventGroups) {
- subset.sort(HAPLOTYPE_SNP_FIRST_COMPARATOR);
- Utils.printIf(debug, () -> "Constructing Hap From Events:"+ formatEventsLikeDragenLogs(subset, referenceHaplotype.getStart()));
- branchHaps.add(constructHaplotypeFromEvents(referenceHaplotype, subset, true));
- }
+ fullyDeterminedHaplotypes.forEach(events -> Utils.printIf(debug, () -> "Constructing Haplotype From Events:" + formatEventsLikeDragenLogs(events, referenceHaplotype.getStart())));
+ final List<Haplotype> branchHaps = fullyDeterminedHaplotypes.stream()
+ .map(events -> constructHaplotypeFromEvents(referenceHaplotype, events, true)).toList();
+ branchHaplotypesDebugMessage(referenceHaplotype, debug, branch, branchHaps);
+ outputHaplotypes.addAll(branchHaps);
}
- branchHaplotypesDebugMessage(referenceHaplotype, debug, excludeEvents, branchHaps);
- outputHaplotypes.addAll(branchHaps);
if (outputHaplotypes.size() > MAX_PD_HAPS_TO_GENERATE) {
- Utils.printIf(debug,() -> "Too many Haps ["+outputHaplotypes.size()+"] generated at this site! Aborting!");
+ Utils.printIf(debug,() -> "Too many branch haplotypes found, aborting ["+outputHaplotypes.size()+"]");
return sourceSet;
}
}
}
}
- if (outputHaplotypes.size() > MAX_PD_HAPS_TO_GENERATE) {
- Utils.printIf(debug,() -> "Too many branch haplotypes found, aborting ["+outputHaplotypes.size()+"]");
- return sourceSet;
- }
sourceSet.storeAssemblyHaplotypes();
- // TODO this is an entirely unnecessary step that can be done away with but I leave it in because it makes debugging against previous versions much easier.
- final Set<Haplotype> result = outputHaplotypes.stream().sorted(Comparator.comparing(Haplotype::getBaseString)).collect(Collectors.toCollection(LinkedHashSet::new));
+ // TODO: Sorting haplotypes is unnecessary but makes debugging against previous versions much easier.
+ final List<Haplotype> result = outputHaplotypes.stream().sorted(Comparator.comparing(Haplotype::getBaseString)).toList();
sourceSet.replaceAllHaplotypes(result);
Utils.printIf(debug, () -> "Constructed Haps for Branch"+sourceSet.getHaplotypeList().stream().map(Haplotype::toString).collect(Collectors.joining("\n")));
- if (!pileupArgs.determinePDHaps) {
- // Setting a boolean on the source-set to indicate to downstream code that we have PD haplotypes
- sourceSet.setPartiallyDeterminedMode();
- }
+
+ // Set PDHMM flag to enable later PDHMM genotyping. We only set it now because earlier code might fail and revert to non-PDHMM mode.
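+ // Note the negation: when useDeterminedHaplotypesDespitePdhmmMode is set the output haplotypes are fully determined, so PD mode is off.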
+ sourceSet.setPartiallyDeterminedMode(!pileupArgs.useDeterminedHaplotypesDespitePdhmmMode);
Utils.printIf(debug, () -> "Returning "+outputHaplotypes.size()+" to the HMM");
return sourceSet;
}
@@ -382,16 +309,34 @@ private static List<List<Event>> smithWatermanRealignPairsOfVariantsForEquivalen
}
/**
- * Partition events into clusters that must be considered together, either because they overlap or because they belong to the
- * same mutually exclusive pair or trio. To find this clustering we calculate the connected components of an undirected graph
- * with an edge connecting events that overlap or are mutually excluded.
+ * Partition events into the largest possible clusters such that events in distinct clusters are mutually compatible,
+ * i.e. can exist on the same haplotype.
+ *
+ * The resulting event groups are a disjoint cover of the input events; that is, every input event belongs to exactly one
+ * output EventGroup.
+ *
+ * In the common case that all events are compatible -- they are non-overlapping SNPs, for example -- the result is a collection of singleton event groups.
*
- * return null if any event group exceeds the allowed size -- this tells the calling code to fall back on the original
- * GATK assembly.
+ * Incompatibility comes in two types, the first and more obvious being overlap. The second is defined by the
+ * heuristic in which we inject two or three events into the reference haplotype and realign the resulting two- or three-event
+ * haplotype against the reference haplotype via Smith-Waterman. If the resulting reduced representation contains other events
+ * that have already been found, the two or three events are mutually forbidden.
+ *
+ * To find the desired partition we calculate the connected components of an undirected graph whose edges correspond to
+ * either type of incompatibility. That is, overlapping events and Smith-Waterman-forbidden pairs get an edge; Smith-Waterman-forbidden
+ * trios get three edges, one for every two events in the trio.
+ *
+ * Note: Although the resulting partition is the same as DRAGEN's, the graph-based algorithm below is a very different
+ * implementation.
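+ *
+ * For example (hypothetical events): if a deletion D spans SNPs S1 and S2 while SNP S3 lies elsewhere, the graph
+ * gets edges D-S1 and D-S2, so its connected components -- and hence the EventGroups -- are {D, S1, S2} and {S3}.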
+ * + * @param eventsInOrder all events in a calling region + * @param swForbiddenPairsAndTrios list of lists of two or three non-overlapping events that are mutually excluded according + * to the Smith-Waterman heuristic in which injecting them into the reference haplotype and + * simplifying via Smith Waterman yields other events */ - private static List getEventGroupClusters(List eventsInOrder, List> disallowedPairsAndTrios) { - final Graph graph = new SimpleGraph<>(DefaultEdge.class); - eventsInOrder.forEach(graph::addVertex); + @VisibleForTesting + static List getEventGroupClusters(List eventsInOrder, List> swForbiddenPairsAndTrios) { + final List> allMutexes = new ArrayList<>(swForbiddenPairsAndTrios); // edges due to overlapping position for (int e1 = 0; e1 < eventsInOrder.size(); e1++) { @@ -399,22 +344,55 @@ private static List getEventGroupClusters(List eventsInOrder, for (int e2 = e1 + 1; e2 < eventsInOrder.size() && eventsInOrder.get(e2).getStart() <= event1.getEnd() + 1; e2++) { final Event event2 = eventsInOrder.get(e2); if (eventsOverlapForPDHapsCode(event1, event2)) { - graph.addEdge(event1, event2); + allMutexes.add(List.of(event1, event2)); } } } - // edges due to mutual exclusion - for (final List excludedGroup : disallowedPairsAndTrios) { - graph.addEdge(excludedGroup.get(0), excludedGroup.get(1)); - if (excludedGroup.size() == 3) { - graph.addEdge(excludedGroup.get(1), excludedGroup.get(2)); - } - } + // for each mutex, add edges 0-1, 1-2. . . to put all mutex events in one connected component + final Graph graph = new SimpleGraph<>(DefaultEdge.class); + eventsInOrder.forEach(graph::addVertex); + allMutexes.forEach(mutex -> new IndexRange(0, mutex.size() - 1).forEach(n -> graph.addEdge(mutex.get(n), mutex.get(n+1)))); final List> components = new ConnectivityInspector<>(graph).connectedSets(); return components.stream().anyMatch(comp -> comp.size() > MAX_VAR_IN_EVENT_GROUP) ? null : - components.stream().map(component -> new EventGroup(component, disallowedPairsAndTrios)).toList(); + components.stream().map(component -> new EventGroup(component, allMutexes)).toList(); + } + + /** + * Compute the branches (sets of mutually allowed Events) that define partially determined haplotypes. For a given set of determined events + * each event group has one or more 1) maximal subsets that 2) contain all the determined events (if they belong to the event group) + * and 3) contain no mutually-exclusive pairs or trios of events. + * + * When constructing PD haplotypes we take all possible unions of these subsets over all event groups. + * + * Returns null if the combinatorial explosion of branches becomes too large. + */ + @VisibleForTesting + static List> computeBranches(List eventGroups, Set determinedEvents, final List allEventsAtDeterminedLocus) { + List> branches = new ArrayList<>(); + branches.add(new HashSet<>()); // start with a single empty branch + + for (EventGroup group : eventGroups) { + final List> setsToAdd = group.eventSetsForPDHaplotypes(determinedEvents, allEventsAtDeterminedLocus); + + // Take every possible union of existing branches and this event group's sets to add. As an optimization + // we add the 0th set's elements in-place, and append unions with the 1st, 2nd etc sets to add. + final List> extraBranches = setsToAdd.size() < 2 ? 
List.of() : branches.stream() + .flatMap(branch -> setsToAdd.stream().skip(1).map(setToAdd -> Sets.newHashSet(Sets.union(branch, setToAdd)))) + .toList(); + + // add the 0th exclusion set in-place + if (!setsToAdd.isEmpty()) { + branches.forEach(branch -> branch.addAll(setsToAdd.get(0))); + } + branches.addAll(extraBranches); + + if (branches.size() > MAX_BRANCH_PD_HAPS) { + return null; + } + } + return branches; } /** @@ -555,8 +533,10 @@ private static byte[] basesAfterFirst(final Allele altAllele) { */ @VisibleForTesting //TODO When we implement JointDetection we will need to allow multiple eventWithVariants to be prsent... - static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haplotype refHap, final Event eventWithVariant, final boolean useRef, final List constituentEvents) { + static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haplotype refHap, final Set determinedEvents, final int determinedLocus, + final List constituentEvents, final List allEventsAtDeterminedLocus) { Utils.validate(refHap.isReference() && refHap.getCigar().numCigarElements() == 1, "This is not a valid base haplotype for construction"); + final boolean refIsDetermined = determinedEvents.isEmpty(); //TODO add a more stringent check that the format of constituentEvents works int refStart = refHap.getStart(); @@ -576,14 +556,14 @@ static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haploty final int actualStart = event.getStart() + (event.isIndel() ? 1 : 0); // the +1 for indels accounts for the initial alt=ref dummy base int basesBeforeNextEvent = actualStart - lastPositionAdded; - // Special case for two SNPs at the same position + // Special case for two SNPs at the same position -- merge them into a single undetermined SNP via bitwise OR if (basesBeforeNextEvent == -1 && event.isSNP() && lastEventWasSnp) { final byte byteForThisSnp = PartiallyDeterminedHaplotype.getPDBytesForHaplotypes(event.refAllele(), event.altAllele())[0]; final int bufferPositionOfLastSnp = pdBytes.position() - 1; final byte byteForLastSnp = pdBytes.get(bufferPositionOfLastSnp); pdBytes.put(bufferPositionOfLastSnp,(byte) (byteForLastSnp | byteForThisSnp) ); continue; - } else if (event.getStart() == eventWithVariant.getStart() && useRef) { + } else if (event.getStart() == determinedLocus && refIsDetermined) { // Ref alleles (even if they overlap undetermined events) should be skipped continue; } @@ -594,19 +574,19 @@ static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haploty Allele altAllele = event.altAllele(); final int altRefLengthDiff = altAllele.length() - refAllele.length(); - boolean isInsertion = altRefLengthDiff > 0; // If its an insertion we flip to "ADD" the bases to the ref. - final boolean isEvent = event.equals(eventWithVariant); + boolean isInsertion = altRefLengthDiff > 0; // If it's an insertion we flip to "ADD" the bases to the ref. + final boolean isEvent = determinedEvents.stream().anyMatch(event::equals); runningCigar.add(new CigarElement(basesBeforeNextEvent, CigarOperator.M)); // Figure out the cigar element to add: // - If we are in the ref, simply add the cigar corresponding to the allele we are using if (event.isSNP()) { // SNPs are straightforward, whether or not it's the special event - runningCigar.add(new CigarElement(refAllele.length(), useRef || !isEvent ? CigarOperator.M : CigarOperator.X)); + runningCigar.add(new CigarElement(refAllele.length(), refIsDetermined || !isEvent ? 
CigarOperator.M : CigarOperator.X));
} else if (isEvent) { // special indel
// The event is considered a deletion of the longer allele, regardless of which is ref
// we subtract 1 for the dummy initial indel base.
- final int elementLength = isInsertion && useRef ? 0 : Math.max(refAllele.length(), altAllele.length()) - 1;
- final CigarOperator operator = isInsertion ? CigarOperator.I : (useRef ? CigarOperator.M : CigarOperator.D);
+ final int elementLength = isInsertion && refIsDetermined ? 0 : Math.max(refAllele.length(), altAllele.length()) - 1;
+ final CigarOperator operator = isInsertion ? CigarOperator.I : (refIsDetermined ? CigarOperator.M : CigarOperator.D);
runningCigar.add(new CigarElement(elementLength, operator));
} else { // non-special indel. Insertions are treated as such; deletions become matches
runningCigar.add(new CigarElement(Math.abs(altRefLengthDiff), altRefLengthDiff > 0 ? CigarOperator.I : CigarOperator.M));
@@ -618,7 +598,7 @@ static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haploty
// Now add event bases. If this is the blessed variant, add the ref or alt as appropriate
// Otherwise make sure we are adding the longest allele (for indels) or the ref allele for snps.
- final boolean alleleToUseIsRef = (isEvent && useRef) || (!isEvent && altRefLengthDiff <= 0);
+ final boolean alleleToUseIsRef = (isEvent && refIsDetermined) || (!isEvent && altRefLengthDiff <= 0);
final Allele alleleToUse = alleleToUseIsRef ? refAllele : altAllele;
final Allele otherAllele = alleleToUseIsRef ? altAllele : refAllele;
final byte[] basesToAdd = event.isIndel() ? basesAfterFirst(alleleToUse) : alleleToUse.getBases();
@@ -638,131 +618,138 @@ static PartiallyDeterminedHaplotype createNewPDHaplotypeFromEvents(final Haploty
return new PartiallyDeterminedHaplotype(
new Haplotype(ArrayUtils.subarray(newHapBases.array(), 0, newHapBases.position()), false, refHap.getGenomeLocation(), runningCigar.make()),
- useRef,
ArrayUtils.subarray(pdBytes.array(), 0, pdBytes.position()),
constituentEvents,
- eventWithVariant,
+ determinedEvents,
runningCigar.make(),
- eventWithVariant.getStart(),
+ determinedLocus,
+ allEventsAtDeterminedLocus,
refHap.getAlignmentStartHapwrtRef());
}
- // A helper class for managing mutually exclusive event clusters and the logic arround forming valid events vs eachother.
- private static class EventGroup {
+ // A helper class for managing mutually exclusive event clusters and the logic around forming valid events vs each other.
+ @VisibleForTesting
+ static class EventGroup {
private final ImmutableList<Event> eventsInOrder;
private final ImmutableMap<Event, Integer> eventIndices;
- private final BitSet allowedEvents;
+
+ // There are BitSets and SmallBitSets going around. The SmallBitSets represent sets of events -- mutexes and other subsets
+ // of the event group. This BitSet is a set of sets of events! Each element/bit represents a subset of events. If the bit is true
+ // then the corresponding subset of events is allowed, i.e. doesn't contain a mutex. For example, if bit 11 is true we know
+ // that the subset containing events 0, 1, and 3 (11 = 2^3 + 2^1 + 2^0) is allowed because none of {0,1}, {0,3}, {1,3}, or {0,1,3}
+ // is mutually excluded.
+ private final BitSet allowedSubsets;
// Optimization to save ourselves recomputing the subsets at every point it's necessary to do so.
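+ // (The cached sets are reusable only while the determined events lie outside this event group; see eventSetsForPDHaplotypes.)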
List<Set<Event>> cachedEventSets = null;
- public EventGroup(final Collection<Event> events, List<List<Event>> disallowedCombinations) {
+ /**
+ * @param events all events in this event group
+ * @param mutexPairsAndTrios all mutexes of two or three events, defined by overlap and by the Smith-Waterman heuristic,
+ * that determined the event groups in this calling region
+ */
+ public EventGroup(final Collection<Event> events, List<List<Event>> mutexPairsAndTrios) {
Utils.validate(events.size() <= MAX_VAR_IN_EVENT_GROUP, () -> "Too many events (" + events.size() + ") for populating bitset.");
eventsInOrder = events.stream().sorted(HAPLOTYPE_SNP_FIRST_COMPARATOR).collect(ImmutableList.toImmutableList());
eventIndices = IntStream.range(0, events.size()).boxed().collect(ImmutableMap.toImmutableMap(eventsInOrder::get, n -> n));
- allowedEvents = new BitSet(1 << eventsInOrder.size());
+ allowedSubsets = new BitSet(1 << eventsInOrder.size());
- final List<List<Event>> overlappingMutexes = disallowedCombinations.stream()
- .filter(mutex -> mutex.stream().anyMatch(eventIndices::containsKey)).toList();
+ final List<List<Event>> overlappingMutexes = mutexPairsAndTrios.stream()
+ // overlapping SNPs are mutexes for forming event groups, but they are NOT forbidden combinations for PD haplotypes!
+ .filter(mutex -> !(mutex.size() == 2 && mutex.get(0).getStart() == mutex.get(1).getStart() && mutex.get(0).isSNP() && mutex.get(1).isSNP()))
+ .filter(mutex -> mutex.stream().anyMatch(eventIndices::containsKey)).toList();
for (final List<Event> mutex : overlappingMutexes) {
Utils.validate(mutex.stream().allMatch(eventIndices::containsKey), () -> "Mutex group " + mutex + " only partially overlaps event group " + this);
}
- populateBitset(overlappingMutexes);
+ computeAllowedSubsets(overlappingMutexes);
}
/**
- * This is the primary method for handling mutually exclusive events in this subgroup. This code and methods come directly from DRAGEN:
+ * Compute the BitSet of allowed subsets of events, i.e. subsets of events that do not contain a mutex group.
*
- * Create a #Variants bitset to store valid pairings:
- * - The index of each element corresponds to an enumerated subset of alleles in this group
- * - Each bit in the index corresponds to the presence or absence of a variant from the vcf list.
- * - For example with variants [A,B,C] the number 5 corresponds to subset [A,C]
- * - A false in the bitset corresponds to a disallowed pair.
- * - NOTE: we can use 32bit ints for these bitshift operations by virtue of the fact that we limit ourselves to at most 22 variants per group.
- * Iterate through pairs of Variants that overlap and mark off any pairings including this.
- * Iterate through the mutex variants and ensure pairs containing all mutex variant groups are marked as true
+ * NOTE: we can use SmallBitSets because we limit ourselves to at most 22 variants per group.
*
- * @param mutexes Groups of mutually forbidden events. Note that when this is called we have already ensured
- * that each mutex group comprises only events contained in this EventGroup.
+ * @param mutexPairsAndTrios The groups of mutually forbidden events that generated all the event groups in this calling region.
*/
- private void populateBitset(List<List<Event>> mutexes) {
- if (eventsInOrder.size() < 2) {
- return;
- }
- // initialize all events as being allowed and then disallow them in turn.
- allowedEvents.set(1, 1 << eventsInOrder.size());
- final List<SmallBitSet> forbiddenCombinations = new ArrayList<>();
+ private void computeAllowedSubsets(List<List<Event>> mutexPairsAndTrios) {
+ if (eventsInOrder.size() < 2) {
+ return;
+ }
- // Mark as disallowed all events that overlap each other, excluding pairs of SNPs
- for (int i = 0; i < eventsInOrder.size(); i++) {
- final Event first = eventsInOrder.get(i);
- for (int j = i+1; j < eventsInOrder.size(); j++) {
- final Event second = eventsInOrder.get(j);
- if (!(first.isSNP() && second.isSNP()) && eventsOverlapForPDHapsCode(first, second)) {
- forbiddenCombinations.add(new SmallBitSet(i,j));
- }
- }
- }
+ // initialize all subsets as allowed and then disallow them in turn.
+ allowedSubsets.set(1, 1 << eventsInOrder.size());
- // make SmallBitSet of the event indices of each mutex
- mutexes.stream().map(mutex -> new SmallBitSet(mutex.stream().map(eventIndices::get).toList())).forEach(forbiddenCombinations::add);
-
- // Now forbid all subsets that contain forbidden combinations
- //TODO This method is potentially very inefficient! We don't technically have to iterate over every i,
- //TODO we know there is an optimization involving minimizing the number of checks necessary here by iterating
- //TODO using the bitmask values themselves for the loop
- if (!forbiddenCombinations.isEmpty()) {
- for (final SmallBitSet subset = new SmallBitSet().increment(); !subset.hasElementGreaterThan(eventsInOrder.size()); subset.increment()) {
- if (forbiddenCombinations.stream().anyMatch(subset::contains)) {
- allowedEvents.set(subset.index(), false);
- }
- }
- }
- }
+ // make SmallBitSet of the event indices of each mutex
+ final List<SmallBitSet> mutexes = mutexPairsAndTrios.stream().map(mutex -> new SmallBitSet(mutex.stream().map(eventIndices::get).toList())).toList();
+
+ // Now forbid all subsets that contain forbidden combinations
+ //TODO This method is potentially very inefficient! We don't technically have to iterate over every i,
+ //TODO we know there is an optimization involving minimizing the number of checks necessary here by iterating
+ //TODO using the bitmask values themselves for the loop
+ if (!mutexes.isEmpty()) {
+ for (final SmallBitSet subset = new SmallBitSet().increment(); !subset.hasElementGreaterThan(eventsInOrder.size()); subset.increment()) {
+ if (mutexes.stream().anyMatch(subset::contains)) {
+ allowedSubsets.set(subset.index(), false);
+ }
+ }
+ }
+ }
/**
- * This method handles the logic involved in getting all of the allowed subsets of alleles for this event group.
+ * For a given set of determined events, find all subsets of the event group such that each subset
+ *
+ * 1) contains all determined events that overlap this event group
+ * 2) contains no two or three mutually exclusive elements, i.e. is an allowed subset as computed in this object's constructor
+ * 3) is not a subset of any larger subset that satisfies 1) and 2)
+ *
+ * Note that the subsets need not be disjoint and need not cover all events in this EventGroup.
+ *
+ * Because these subsets contain no mutexes, one can sensibly make partially-determined haplotypes out of them,
+ * with each subset entailing a unique set of undetermined alleles. Property 3) above ensures that we make as
+ * few partially-determined haplotypes as possible.
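+ *
+ * For example (hypothetical): for events {A, B, C} with a single mutex {B, C}, if A is determined the
+ * maximal allowed subsets containing A are {A, B} and {A, C}, so this method returns those two sets.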
+ *
- * @param disallowSubsets
 * @return the maximal allowed subsets of events, each containing every determined event in this group
 */
- public List<Set<Event>> setsForBranching(final List<Event> locusEvents, final Set<Event> determinedEvents, final boolean disallowSubsets) {
+ public List<Set<Event>> eventSetsForPDHaplotypes(final Set<Event> determinedEvents, final List<Event> locusEvents) {
final SmallBitSet locusOverlapSet = overlapSet(locusEvents);
final SmallBitSet determinedOverlapSet = overlapSet(determinedEvents);
- // Special case (if we are determining bases outside of this mutex cluster we can reuse the work from previous iterations)
- if (locusOverlapSet.isEmpty() && cachedEventSets != null) {
+ if (eventsInOrder.size() == 1) {
+ // if this is the determined locus and ref is determined, return an empty set; otherwise return a singleton set of the lone event
+ return (!locusOverlapSet.isEmpty() && determinedEvents.isEmpty()) ? Collections.singletonList(Set.of()) :
+ Collections.singletonList(Set.of(eventsInOrder.get(0))); // if only one event, there are no mutexes
+ }
+
+ // the repeated case is when the determined locus is external to this event group
+ final boolean cachedCase = locusOverlapSet.isEmpty() && determinedOverlapSet.isEmpty();
+
+ // We use a cache for the recurring case where the determined events do not belong to this event group
+ if (cachedCase && cachedEventSets != null) {
return cachedEventSets;
}
- final List<SmallBitSet> allowedAndDetermined = new ArrayList<>();
+ final List<SmallBitSet> maximalAllowedContainingDetermined = new ArrayList<>();
+ // Iterate from the full set (containing every event) to the empty set (no events), which lets us output the largest possible subsets
// NOTE: we skip over 0 here since that corresponds to ref-only events, handle those externally to this code
for (final SmallBitSet subset = SmallBitSet.fullSet(eventsInOrder.size()); !subset.isEmpty(); subset.decrement()) {
- if (allowedEvents.get(subset.index()) && subset.intersection(locusOverlapSet).equals(determinedOverlapSet)) {
+ if (allowedSubsets.get(subset.index()) && subset.intersection(locusOverlapSet).equals(determinedOverlapSet)) {
// Only check for subsets if we need to
- if (!disallowSubsets || allowedAndDetermined.stream().noneMatch(group -> group.contains(subset))) {
- allowedAndDetermined.add(subset.copy()); // copy subset since the decrement() mutates it in-place
+ if (maximalAllowedContainingDetermined.stream().noneMatch(group -> group.contains(subset))) {
+ maximalAllowedContainingDetermined.add(subset.copy()); // copy subset since the decrement() mutates it in-place
}
}
}
- // Now that we have all the mutex groups, unpack them into lists of variants
- List<Set<Event>> output = new ArrayList<>();
- for (SmallBitSet grp : allowedAndDetermined) {
- Set<Event> newGrp = new HashSet<>();
- for (int i = 0; i < eventsInOrder.size(); i++) {
- if (!grp.get(i)) {
- newGrp.add(eventsInOrder.get(i));
- }
- }
- output.add(newGrp);
- }
+ // Now that we have all the maximal sets, map bitset indices to Events
+ List<Set<Event>> output = maximalAllowedContainingDetermined.stream()
+ .map(bitset -> bitset.stream(eventsInOrder.size()).mapToObj(eventsInOrder::get).collect(Collectors.toSet())).toList();
+
// Cache the result
- if(locusOverlapSet.isEmpty()) {
+ if(cachedCase) {
cachedEventSets = Collections.unmodifiableList(output);
}
return output;
}
@@ -783,10 +770,13 @@ public String toDisplayString(int startPos) {
}
public int size() { return eventsInOrder.size(); }
+
+ @VisibleForTesting
+ List<Event> eventsInOrderForTesting() { return eventsInOrder; }
}
- private static List<Event> growEventGroup(final List<Event> group, final Event event) {
- final List<Event> result = new ArrayList<>(group);
+ private static List<Event> growEventList(final List<Event> eventList, final Event event) {
+ final List<Event> result = new ArrayList<>(eventList);
result.add(event);
return result;
}
@@ -826,15 +816,16 @@ private static void dragenDisallowedGroupsMessage(int refStart, boolean debug, L
Utils.printIf(debug, () -> "disallowed groups:" + disallowedPairs.stream().map(group -> formatEventsLikeDragenLogs(group, refStart)).collect(Collectors.joining("\n")));
}
- private static void branchExcludeAllelesMessage(Haplotype referenceHaplotype, boolean debug, Collection<Event> eventsInOrder, List<Set<Event>> branchExcludeAlleles) {
+ private static void branchExcludeAllelesMessage(Haplotype referenceHaplotype, boolean debug, Collection<Event> eventsInOrder,
+ final Set<Event> determinedEvents, List<Set<Event>> branches) {
if (debug) {
System.out.println("Branches:");
- for (int i = 0; i < branchExcludeAlleles.size(); i++) {
- final int ifinal = i;
- System.out.println("Branch "+i+" VCs:");
- System.out.println("exclude:" + formatEventsLikeDragenLogs(branchExcludeAlleles.get(i), referenceHaplotype.getStart()));
+ for (int i = 0; i < branches.size(); i++) {
+ System.out.println("Branch " + i + " VCs:");
+ final Set<Event> included = branches.get(i);
+ final Set<Event> excluded = eventsInOrder.stream().filter(event -> !included.contains(event)).collect(Collectors.toSet());
+ System.out.println("exclude:" + formatEventsLikeDragenLogs(excluded, referenceHaplotype.getStart()));
//to match dragen debug output for personal sanity
- final Collection<Event> included = eventsInOrder.stream().filter(variantContext -> !branchExcludeAlleles.get(ifinal).contains(variantContext)).collect(Collectors.toList());
System.out.println("include:" + formatEventsLikeDragenLogs(included, referenceHaplotype.getStart()));
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PileupDetectionArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PileupDetectionArgumentCollection.java
index 2e134ce8d3a..5d3e33be0b2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PileupDetectionArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PileupDetectionArgumentCollection.java
@@ -77,7 +77,7 @@ public final class PileupDetectionArgumentCollection {
public GATKPath pdhmmDebugOutputResults = null;
@Hidden
@Argument(fullName= DETERMINE_PD_HAPS, doc = "PDHMM: As an alternative to using the PDHMM, run all of the haplotype branching/determination code and instead of using the PDHMM use the old HMM with determined haplotypes. NOTE: this often fails and falls back to other code due to combinatorial expansion. (Requires '--"+GENERATE_PARTIALLY_DETERMINED_HAPLOTYPES_LONG_NAME+"' argument)", optional = true)
- public boolean determinePDHaps = false;
+ public boolean useDeterminedHaplotypesDespitePdhmmMode = false;
@Hidden
@Argument(fullName= DEBUG_PILEUPCALLING_ARG, doc = "PDHMM: If set, print to stdout a prodigious amount of debugging information about each of the steps involved in artificial haplotype construction and filtering.
(Requires '--"+GENERATE_PARTIALLY_DETERMINED_HAPLOTYPES_LONG_NAME+"' argument)", optional = true) public boolean debugPileupStdout = false; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/SmallBitSet.java b/src/main/java/org/broadinstitute/hellbender/utils/SmallBitSet.java index 80b75d3562b..2f49433d4f5 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/SmallBitSet.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/SmallBitSet.java @@ -6,6 +6,7 @@ import java.util.Collection; import java.util.Iterator; +import java.util.stream.IntStream; /** * Small BitSet with a capacity of 30 elements, corresponding to the number of bits in an int. @@ -121,6 +122,11 @@ public boolean get(final int element) { return (bits & elementIndex(element)) != 0; } + // IntStream of indices set to true i.e. contained in the set + public IntStream stream(final int capacity) { + return IntStream.range(0, capacity).filter(this::get); + } + public boolean isEmpty() { return bits == 0; } public boolean hasElementGreaterThan(final int element) { return bits >= 1 << element; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java index 1794b22dcd2..795c66a98ff 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java @@ -250,7 +250,7 @@ public static void buildEventMapsForHaplotypes( final Collection hapl //TODO h.recomputeAndSetEventMap() with overrides for the two haplotype classes should replace this casting+checking code here which is prone to error. // Since PD haplotypes Know what alleles are variants, simply ask it and generate the map that way. - final EventMap events = h.isPartiallyDetermined() ? new EventMap(((PartiallyDeterminedHaplotype) h).getDeterminedAlleles()) : + final EventMap events = h.isPartiallyDetermined() ? new EventMap(((PartiallyDeterminedHaplotype) h).getDeterminedEvents()) : fromHaplotype(h, ref, refLoc, maxMnpDistance); h.setEventMap(events); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/PartiallyDeterminedHaplotype.java b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/PartiallyDeterminedHaplotype.java index e0eec0975a9..4975f0308ed 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/PartiallyDeterminedHaplotype.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/PartiallyDeterminedHaplotype.java @@ -7,10 +7,7 @@ import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -75,48 +72,43 @@ public byte[] getAlternateBases() { private final byte[] alternateBases; private final List constituentBuiltEvents; - // NOTE: we must store ALL of the determined events at this site (which is different than the constituent events, we expect the constituent - // events for one of these objects to only be a single element) for the purposes of the overlapping reads PDHMM optimization. - // At Multiallelic sites, we ultimately genotype all of the alleles at once. 
If we aren't careful, reads that only overlap some - // alleles at a site will end up with incorrect/undetermined PDHMM scores for a subset of alleles in the genotyper which can - // lead to false positives/poorly called sites. - private List allDeterminedEventsAtThisSite; //TODO Eventually these will have to be refactored to support multiple determined alleles per PDHaplotype - private final Event alleleBearingEvent; // NOTE this must be in both of the previous lists - private final long determinedPosition; - private final boolean isDeterminedAlleleRef; + private final Set determinedEvents; // NOTE this must be a subset (possibly empty if the determined allele is ref) of both of the previous lists + private final int determinedPosition; - private SimpleInterval cachedExtent; - - /** - * Compares two haplotypes first by their lengths and then by lexicographic order of their bases. - */ - public static final Comparator SIZE_AND_BASE_ORDER = - Comparator.comparingInt((Haplotype hap) -> hap.getBases().length) - .thenComparing(Allele::getBaseString); + // NOTE: we need all of the events at the determined site (all of which are determined in *some* PD haplotype) for the purposes + // of the overlapping reads PDHMM optimization. At Multiallelic sites, we ultimately genotype all of the alleles at once. + // If we aren't careful, reads that only overlap some alleles at a site will end up with incorrect/undetermined PDHMM scores + // for a subset of alleles in the genotyper which can lead to false positives/poorly called sites. + //NOTE: we never want the genotyper to handle reads that were not HMM scored, caching this extent helps keep us safe from messy sites + private final SimpleInterval determinedExtent; /** * @param base base (reference) haplotype used to construct the PDHaplotype - * @param isRefAllele is the determined allele reference or alt * @param pdBytes array of bytes indicating what bases are skips for the pdhmm * @param constituentEvents events (both determined and undetermined) covered by this haplotype, should follow the rules for PD variants - * @param eventWithVariant event from @param constituentEvents that is determined for this pd haplotype + * @param determinedEvents events from @param constituentEvents that are determined for this pd haplotype. 
Empty if determined allele is ref
 * @param cigar haplotype cigar against the reference
 * @param determinedPosition position (wrt the reference contig) that the haplotype should be considered determined //TODO this will be refactored to be a range of events in JointDetection
 * @param getAlignmentStartHapwrtRef alignment startHapwrtRef from baseHaplotype corresponding to the in-memory storage of reference bases (must be set for trimming/clipping ops to work)
 */
- public PartiallyDeterminedHaplotype(final Haplotype base, boolean isRefAllele, byte[] pdBytes, List<Event> constituentEvents,
- final Event eventWithVariant, final Cigar cigar, long determinedPosition, int getAlignmentStartHapwrtRef) {
+ public PartiallyDeterminedHaplotype(final Haplotype base, byte[] pdBytes, List<Event> constituentEvents, final Set<Event> determinedEvents,
+ final Cigar cigar, final int determinedPosition, List<Event> allEventsAtDeterminedLocus, int getAlignmentStartHapwrtRef) {
super(base.getBases(), false, base.getAlignmentStartHapwrtRef(), cigar);
Utils.validateArg(base.length() == pdBytes.length, "pdBytes array must have same length as base haplotype.");
this.setGenomeLocation(base.getGenomeLocation());
this.alternateBases = pdBytes;
this.constituentBuiltEvents = constituentEvents;
- this.alleleBearingEvent = eventWithVariant;
- this.allDeterminedEventsAtThisSite = Collections.singletonList(eventWithVariant);
+ // TODO: this needs to generalize to a set of determined events or empty if ref is determined
+ this.determinedEvents = determinedEvents;
+
+ final int minStart = allEventsAtDeterminedLocus.stream().mapToInt(Event::getStart).min().orElse(determinedPosition);
+ final int maxEnd = allEventsAtDeterminedLocus.stream().mapToInt(Event::getEnd).max().orElse(determinedPosition);
+ determinedExtent = new SimpleInterval(getContig(), minStart, maxEnd);
+
+ // TODO: eventually determined events can be at different positions
this.determinedPosition = determinedPosition;
- this.isDeterminedAlleleRef = isRefAllele;
setAlignmentStartHapwrtRef(getAlignmentStartHapwrtRef);
}
@@ -136,7 +128,7 @@ public String toString() {
String output = "HapLen:"+length() +", "+new String(getDisplayBases());
output = output + "\nUnresolved Bases["+alternateBases.length+"] "+Arrays.toString(alternateBases);
return output + "\n"+getCigar().toString()+" "+ constituentBuiltEvents.stream()
- .map(v ->(v==this.alleleBearingEvent ?"*":"")+ getDRAGENDebugEventString( getStart()).apply(v) )
+ .map(v ->(determinedEvents.contains(v) ?"*":"")+ getDRAGENDebugEventString( getStart()).apply(v) )
.collect(Collectors.joining("->"));
}
@@ -162,24 +154,13 @@ public int hashCode() {
return Objects.hash(Arrays.hashCode(getBases()),Arrays.hashCode(alternateBases));
}
- public List<Event> getDeterminedAlleles() {
- return isDeterminedAlleleRef ? Collections.emptyList() : Collections.singletonList(alleleBearingEvent);
- }
-
- public void setAllDeterminedEventsAtThisSite(List<Event> allDeterminedEventsAtThisSite) {
- this.allDeterminedEventsAtThisSite = allDeterminedEventsAtThisSite;
- cachedExtent = null;
+ public Set<Event> getDeterminedEvents() {
+ return determinedEvents;
}
//NOTE: we never want the genotyper to handle reads that were not HMM scored, caching this extent helps keep us safe from messy sites
public SimpleInterval getMaximumExtentOfSiteDeterminedAlleles() {
- if (cachedExtent == null) {
- cachedExtent = new SimpleInterval(alleleBearingEvent);
- for( Event event : allDeterminedEventsAtThisSite) {
- cachedExtent = cachedExtent.mergeWithContiguous(event);
- }
- }
- return cachedExtent;
+ return determinedExtent;
}
public long getDeterminedPosition() {
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngineUnitTest.java
index 95445ac65f1..b173cea9448 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngineUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/PartiallyDeterminedHaplotypeComputationEngineUnitTest.java
@@ -14,7 +14,10 @@
import java.util.Collections;
import java.util.List;
+import java.util.OptionalInt;
+import java.util.Set;
import java.util.function.BiPredicate;
+import java.util.stream.Collectors;
public class PartiallyDeterminedHaplotypeComputationEngineUnitTest extends GATKBaseTest {
@@ -45,6 +48,174 @@ public class PartiallyDeterminedHaplotypeComputationEngineUnitTest extends GATKB
// TODO THESE ARE FOR INVALID TEST CASES
Event SNP_C_99 = new Event("20",99, Allele.REF_A,Allele.ALT_C);
Event SNP_C_120 = new Event("20",120, Allele.REF_A,Allele.ALT_C);
+ Event SNP_G_120 = new Event("20",120, Allele.REF_A,Allele.ALT_G);
+
+ @DataProvider
+ public Object[][] makeEventGroupClustersDataProvider() {
+ // format:
+ // 1) list of all events
+ // 2) list of 2- and 3-element groups of events that are mutually excluded due to the Smith-Waterman heuristic
+ // (note that there is no reference sequence in this test, hence we can set whatever exclusions we want here)
+ // (note also that this is in addition to any mutexes due to overlapping loci)
+ // 3) list of sets of events that make up the desired partition into event groups
+ // for convenience, 2) and 3) are represented by indices within the list 1).
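+ // e.g. in the first Smith-Waterman case below, List.of(0,1) denotes the mutex pair (SNP_C_90, SNP_C_100)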
+        return new Object[][] {
+                // no mutexes; singleton event groups result
+                { List.of(SNP_C_90), List.of(), List.of(List.of(0))},
+                { List.of(SNP_C_90, SNP_C_100), List.of(), List.of(List.of(0), List.of(1))},
+                { List.of(SNP_C_90, SNP_C_100, SNP_C_105), List.of(), List.of(List.of(0), List.of(1), List.of(2))},
+                { List.of(SNP_C_90, SNP_C_100, INS_TT_105, SNP_C_109), List.of(), List.of(List.of(0), List.of(1), List.of(2), List.of(3))},
+
+                // all events are connected by a path of overlaps; everything belongs to a single event group
+                { List.of(SNP_C_105, SNP_G_105), List.of(), List.of(List.of(0,1))},
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105), List.of(), List.of(List.of(0,1,2))},
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_106), List.of(), List.of(List.of(0,1,2,3))},
+
+                // multiple event groups due to independent overlaps -- note that insertions have 0.5 added to their start, per the DRAGEN convention
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_120), List.of(), List.of(List.of(0,1,2), List.of(3))},
+                { List.of(SNP_C_105, SNP_G_105, INS_TT_105), List.of(), List.of(List.of(0,1), List.of(2))},
+                { List.of(SNP_C_105, SNP_G_105, INS_GGG_106, SNP_C_107), List.of(), List.of(List.of(0,1), List.of(2), List.of(3))},
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106), List.of(), List.of(List.of(0,1), List.of(2,3))},
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(), List.of(List.of(0,1), List.of(2,3), List.of(4))},
+
+                // Smith-Waterman pair mutex joining event groups that would otherwise be independent
+                { List.of(SNP_C_90, SNP_C_100), List.of(List.of(0,1)), List.of(List.of(0,1))},
+                { List.of(SNP_C_90, SNP_C_100, SNP_C_105), List.of(List.of(0,1)), List.of(List.of(0,1), List.of(2))},
+                { List.of(SNP_C_90, SNP_C_100, SNP_C_105), List.of(List.of(0,2)), List.of(List.of(0,2), List.of(1))}, // this example is unrealistic
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106), List.of(List.of(1,2)), List.of(List.of(0,1,2,3))},
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(1,2)), List.of(List.of(0,1,2,3), List.of(4))},
+
+                // two Smith-Waterman pair mutexes transitively combining three event groups
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(1,2), List.of(3,4)), List.of(List.of(0,1,2,3,4))},
+
+                // Smith-Waterman pair mutex doing nothing because it is redundant with an overlap mutex
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(2,3)), List.of(List.of(0,1), List.of(2,3), List.of(4))},
+
+                // Smith-Waterman trio mutex transitively combining three event groups
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(1,2,4)), List.of(List.of(0,1,2,3,4))},
+        };
+    }
+
+    @Test(dataProvider = "makeEventGroupClustersDataProvider")
+    public void testMakeEventGroupClusters(List<Event> eventsInOrder, List<List<Integer>> swMutexes, List<List<Integer>> expectedEventGroups) {
+        // convert indices to events
+        final List<List<Event>> mutexes = swMutexes.stream()
+                .map(mutexIndices -> mutexIndices.stream().map(eventsInOrder::get).toList())
+                .toList();
+
+        // the actual partition, as a set of lists -- each list is in HAPLOTYPE_SNP_FIRST_COMPARATOR order because that is the order the engine emits
+        final Set<List<Event>> actualPartition = PartiallyDeterminedHaplotypeComputationEngine.getEventGroupClusters(eventsInOrder, mutexes).stream()
+                .map(eventGroup -> eventGroup.eventsInOrderForTesting())
+                .collect(Collectors.toSet());
+
+        // the expected partition, as a set of lists -- each list is in HAPLOTYPE_SNP_FIRST_COMPARATOR order because we sort it explicitly
+        final Set<List<Event>> expectedPartition = expectedEventGroups.stream()
+                .map(eventsList -> eventsList.stream().map(eventsInOrder::get).sorted(PartiallyDeterminedHaplotypeComputationEngine.HAPLOTYPE_SNP_FIRST_COMPARATOR).toList())
+                .collect(Collectors.toSet());
+
+        Assert.assertEquals(expectedPartition, actualPartition);
+    }
+
+    @DataProvider
+    public Object[][] makeBranchesDataProvider() {
+        // format:
+        // 1) list of all events
+        // 2) list of 2- and 3-element groups of events that are mutually excluded due to the Smith-Waterman heuristic
+        //    (note that there is no reference sequence in this test, hence we can set whatever exclusions we want here)
+        //    (note also that this is in addition to any mutexes due to overlapping loci)
+        // 3) determined locus
+        // 4) OptionalInt of the determined event's index, empty if the ref allele is determined at the determined locus
+        // 5) expected branches, as a list of sets
+        // for convenience, 2), 4), and 5) are represented by indices within list 1)
+        return new Object[][] {
+                // no mutexes, hence a single branch with all events, except alt events at a determined ref locus
+                { List.of(SNP_C_90), List.of(), 90, OptionalInt.empty(), List.of(Set.of())},
+                { List.of(SNP_C_90), List.of(), 90, OptionalInt.of(0), List.of(Set.of(0))},
+                { List.of(SNP_C_90, SNP_C_100), List.of(), 100, OptionalInt.empty(), List.of(Set.of(0))},
+                { List.of(SNP_C_90, SNP_C_100), List.of(), 100, OptionalInt.of(1), List.of(Set.of(0,1))},
+                { List.of(SNP_C_90, SNP_C_100, SNP_C_105), List.of(), 100, OptionalInt.empty(), List.of(Set.of(0,2))},
+                { List.of(SNP_C_90, SNP_C_100, SNP_C_105), List.of(), 100, OptionalInt.of(1), List.of(Set.of(0,1,2))},
+                { List.of(SNP_C_90, SNP_C_100, INS_TT_105, SNP_C_109), List.of(), 90, OptionalInt.of(0), List.of(Set.of(0,1,2,3))},
+
+                // all events are connected by a path of overlaps; everything belongs to a single event group
+                { List.of(SNP_C_105, SNP_G_105), List.of(), 105, OptionalInt.empty(), List.of(Set.of())},
+                { List.of(SNP_C_105, SNP_G_105), List.of(), 105, OptionalInt.of(0), List.of(Set.of(0))},
+
+                // ref is determined at the spanning deletion's locus, the two SNPs coexist as undetermined alleles
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105), List.of(), 102, OptionalInt.empty(), List.of(Set.of(1,2))},
+
+                // spanning deletion is determined, the two SNPs are incompatible
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105), List.of(), 102, OptionalInt.of(0), List.of(Set.of(0))},
+
+                // ref is determined at the biallelic SNP locus, spanning deletion is a valid undetermined allele
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105), List.of(), 105, OptionalInt.empty(), List.of(Set.of(0))},
+
+                // one SNP is determined, spanning deletion and the other SNP are invalid
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105), List.of(), 105, OptionalInt.of(1), List.of(Set.of(1))},
+
+                // SNP at 106 is incompatible with the spanning deletion, both SNPs at 105 are undetermined
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_106), List.of(), 106, OptionalInt.of(3), List.of(Set.of(1,2,3))},
+
+                // ref is determined at 106, hence we branch! Either we have the spanning deletion or the two SNPs
+                // note that spanning deletions being compatible with the ref allele at a SNP *is* DRAGEN behavior, even if it's suspect
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_106), List.of(), 106, OptionalInt.empty(), List.of(Set.of(0), Set.of(1,2))},
+
+                // spanning deletion forbids spanned SNPs and allows the other SNP
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_120), List.of(), 102, OptionalInt.of(0), List.of(Set.of(0,3))},
+
+                // spanning deletion forbids spanned SNPs and allows the other two overlapping SNPs
+                { List.of(DEL_AAAAAAA_102, SNP_C_105, SNP_G_105, SNP_C_120, SNP_G_120), List.of(), 102, OptionalInt.of(0), List.of(Set.of(0,3,4))},
+
+                // ref is determined at 105, insertion at 106 and SNP at 107 don't overlap
+                { List.of(SNP_C_105, SNP_G_105, INS_GGG_106, SNP_C_107), List.of(), 105, OptionalInt.empty(), List.of(Set.of(2,3))},
+
+                // deletion at 105 is determined, so spanned SNP at 106 is forbidden. SNP at 120 is always allowed, and the
+                // deletion at 100 and its spanned SNP at 101 induce branching
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(), 105, OptionalInt.of(2), List.of(Set.of(0,2,4), Set.of(1,2,4))},
+
+                // just a mess -- the deletion at 100 spans the SNP at 101, which is SW mutexed with the deletion at 105, which spans the SNP at 106,
+                // which has an (unrealistic) SW mutex with the SNP at 120. We're testing here that although there is only one event group we still
+                // get several branches with more than one event.
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(1,2), List.of(3,4)), 120, OptionalInt.of(4),
+                        List.of(Set.of(0,2,4), Set.of(1,4))},
+
+                // another example from the same mess
+                { List.of(DEL_AA_100, SNP_G_101, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(List.of(1,2), List.of(3,4)), 105, OptionalInt.of(2),
+                        List.of(Set.of(0,2,4))},
+
+                // another messy one -- note that the deletion at 98 overlaps events that start at 104 but not 105
+                { List.of(DEL_AAAAAAA_98, DEL_AA_100, SNP_G_101, DEL_AAAAAAA_102, DEL_AA_105, SNP_C_106, SNP_C_120), List.of(), 120, OptionalInt.of(6),
+                        List.of(Set.of(0,4,6), Set.of(0,5,6), Set.of(1,3,6), Set.of(1,4,6), Set.of(1,5,6), Set.of(2,3,6), Set.of(2,4,6), Set.of(2,5,6))},
+        };
+    }
+
+    @Test(dataProvider = "makeBranchesDataProvider")
+    public void testMakeBranches(List<Event> eventsInOrder, List<List<Integer>> swMutexes, final int determinedLocus, final OptionalInt determinedEvent,
+                                 final List<Set<Integer>> expectedBranchIndices) {
+        // convert indices to events
+        final List<List<Event>> mutexes = swMutexes.stream()
+                .map(mutexIndices -> mutexIndices.stream().map(eventsInOrder::get).toList())
+                .toList();
+
+        final List<PartiallyDeterminedHaplotypeComputationEngine.EventGroup> eventGroups =
+                PartiallyDeterminedHaplotypeComputationEngine.getEventGroupClusters(eventsInOrder, mutexes);
+
+        // an absent determined event denotes that the ref allele is determined
+        // TODO: eventually this needs to be generalized to multiple determined events
+        final Set<Event> determinedEvents = determinedEvent.isPresent() ?
+                Set.of(eventsInOrder.get(determinedEvent.getAsInt())) : Set.of();
+
+        final List<Event> allEventsAtDeterminedLocus = eventsInOrder.stream().filter(event -> event.getStart() == determinedLocus).toList();
+
+        final Set<Set<Event>> actualBranches = PartiallyDeterminedHaplotypeComputationEngine.computeBranches(eventGroups, determinedEvents, allEventsAtDeterminedLocus)
+                .stream().collect(Collectors.toSet());
+
+        final Set<Set<Event>> expectedBranches = expectedBranchIndices.stream()
+                .map(indexSet -> indexSet.stream().map(eventsInOrder::get).collect(Collectors.toSet()))
+                .collect(Collectors.toSet());
+
+        Assert.assertEquals(actualBranches, expectedBranches);
+    }

 @DataProvider
 public Object[][] testConstructHaplotypeFromVariantsDataProvider() {
@@ -159,41 +330,42 @@ public void testMessyAlignmentSite() {
 @DataProvider
 public Object[][] testGeneratePDHaplotypeDataProvider() {
     return new Object[][] {
-        {List.of(SNP_C_105, SNP_C_106), SNP_C_106, false, "AAAAAACAAA", new byte[]{0,0,0,0,0,17,0,0,0,0}, "6M1X3M"},
-        {List.of(SNP_C_105, SNP_C_106), SNP_C_106, true , "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,0,0,0,0}, "10M"},
+        {List.of(SNP_C_105, SNP_C_106), Set.of(SNP_C_106), 106, "AAAAAACAAA", new byte[]{0,0,0,0,0,17,0,0,0,0}, "6M1X3M"},
+        {List.of(SNP_C_105, SNP_C_106), Set.of(), 106, "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,0,0,0,0}, "10M"},

-        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), INS_TT_103, false, "AAAATAAAAAA", new byte[]{0,0,0,0,0,0,17,17,0,0,0}, "4M1I6M"},
-        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), INS_TT_103, true , "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,17,0,0,0}, "10M"},
-        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), SNP_C_105, false, "AAAATACAAAA", new byte[]{0,0,0,0,6,0,0,17,0,0,0}, "4M1I1M1X4M"},
-        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), SNP_C_105, true , "AAAATAAAAAA", new byte[]{0,0,0,0,6,0,0,17,0,0,0}, "4M1I6M"},
+        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), Set.of(INS_TT_103), 103, "AAAATAAAAAA", new byte[]{0,0,0,0,0,0,17,17,0,0,0}, "4M1I6M"},
+        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), Set.of(), 103, "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,17,0,0,0}, "10M"},
+        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), Set.of(SNP_C_105), 105, "AAAATACAAAA", new byte[]{0,0,0,0,6,0,0,17,0,0,0}, "4M1I1M1X4M"},
+        {List.of(INS_TT_103, SNP_C_105, SNP_C_106), Set.of(), 105, "AAAATAAAAAA", new byte[]{0,0,0,0,6,0,0,17,0,0,0}, "4M1I6M"},

-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), DEL_AAA_102, false, "AAAAAAAA"  , new byte[]{0,0,0,17,17,0,0,0}, "3M2D5M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), DEL_AAA_102, true , "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,17,0,0,0}, "10M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), SNP_C_105, false, "AAAAACAAAA", new byte[]{0,0,0,2,4,0,17,0,0,0}, "5M1X4M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), SNP_C_105, true , "AAAAAAAAAA", new byte[]{0,0,0,2,4,0,17,0,0,0}, "10M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), SNP_C_106, false, "AAAAAACAAA", new byte[]{0,0,0,2,4,17,0,0,0,0}, "6M1X3M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), SNP_C_106, true , "AAAAAAAAAA", new byte[]{0,0,0,2,4,17,0,0,0,0}, "10M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(DEL_AAA_102), 102, "AAAAAAAA"  , new byte[]{0,0,0,17,17,0,0,0}, "3M2D5M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(), 102, "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,17,0,0,0}, "10M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(SNP_C_105), 105, "AAAAACAAAA", new byte[]{0,0,0,2,4,0,17,0,0,0}, "5M1X4M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(), 105, "AAAAAAAAAA", new byte[]{0,0,0,2,4,0,17,0,0,0}, "10M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(SNP_C_106), 106, "AAAAAACAAA", new byte[]{0,0,0,2,4,17,0,0,0,0}, "6M1X3M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106), Set.of(), 106, "AAAAAAAAAA", new byte[]{0,0,0,2,4,17,0,0,0,0}, "10M"},

 // making sure we support "complex alleles" from DRAGEN
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, INS_GGG_106), SNP_C_105, false , "AAAAACAGGAAA", new byte[]{0,0,0,2,4,0,17,2,4,0,0,0}, "5M1X1M2I3M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, SNP_T_106, INS_GGG_106), SNP_C_105, true , "AAAAAAAGGAAA", new byte[]{0,0,0,2,4,0,81,2,4,0,0,0}, "7M2I3M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, INS_GGG_106), DEL_AAA_102, false , "AAAAAGGAAA", new byte[]{0,0,0,17,17,2,4,0,0,0}, "3M2D2M2I3M"},
-        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, SNP_T_106, INS_GGG_106), DEL_AAA_102, true , "AAAAAAAGGAAA", new byte[]{0,0,0,0,0,17,81,2,4,0,0,0}, "7M2I3M"},
-        {List.of(SNP_G_101, SNP_C_105, DEL_AA_105), SNP_G_101, false , "AGAAAAAAAA", new byte[]{0,0,0,0,0,17,6,0,0,0}, "1M1X8M"},
-        {List.of(SNP_G_101, SNP_C_105, DEL_AA_105), SNP_G_101, true , "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,6,0,0,0}, "10M"},
-
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, INS_GGG_106), Set.of(SNP_C_105), 105, "AAAAACAGGAAA", new byte[]{0,0,0,2,4,0,17,2,4,0,0,0}, "5M1X1M2I3M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, SNP_T_106, INS_GGG_106), Set.of(), 105, "AAAAAAAGGAAA", new byte[]{0,0,0,2,4,0,81,2,4,0,0,0}, "7M2I3M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, INS_GGG_106), Set.of(DEL_AAA_102), 102, "AAAAAGGAAA", new byte[]{0,0,0,17,17,2,4,0,0,0}, "3M2D2M2I3M"},
+        {List.of(DEL_AAA_102, SNP_C_105, SNP_C_106, SNP_T_106, INS_GGG_106), Set.of(), 102, "AAAAAAAGGAAA", new byte[]{0,0,0,0,0,17,81,2,4,0,0,0}, "7M2I3M"},
+        {List.of(SNP_G_101, SNP_C_105, DEL_AA_105), Set.of(SNP_G_101), 101, "AGAAAAAAAA", new byte[]{0,0,0,0,0,17,6,0,0,0}, "1M1X8M"},
+        {List.of(SNP_G_101, SNP_C_105, DEL_AA_105), Set.of(), 101, "AAAAAAAAAA", new byte[]{0,0,0,0,0,17,6,0,0,0}, "10M"},
     };
 }

 @Test(dataProvider = "testGeneratePDHaplotypeDataProvider")
-    public void testGeneratePDHaplotypeFromVariants(List<Event> events, Event targetEvent, boolean useRefBase, String expectedBases, byte[] expectedAltArray, String expectedCigar) {
+    public void testGeneratePDHaplotypeFromVariants(List<Event> events, Set<Event> determinedEvents, int determinedLocus, String expectedBases, byte[] expectedAltArray, String expectedCigar) {
     Haplotype ref = new Haplotype("AAAAAAAAAA".getBytes(), true, 500, TextCigarCodec.decode("10M"));
     ref.setGenomeLocation(new SimpleInterval("20", 100, 110));

-        PartiallyDeterminedHaplotype result = PartiallyDeterminedHaplotypeComputationEngine.createNewPDHaplotypeFromEvents(ref, targetEvent, useRefBase, events);
+        final List<Event> allEventsAtDeterminedLocus = events.stream().filter(event -> event.getStart() == determinedLocus).toList();
+
+        PartiallyDeterminedHaplotype result = PartiallyDeterminedHaplotypeComputationEngine.createNewPDHaplotypeFromEvents(ref, determinedEvents, determinedLocus, events, allEventsAtDeterminedLocus);
     Assert.assertEquals(new String(result.getBases()), expectedBases);
     Assert.assertEquals(result.getAlternateBases(), expectedAltArray);
     Assert.assertEquals(result.getCigar(), TextCigarCodec.decode(expectedCigar));
-        Assert.assertEquals(result.getDeterminedPosition(), targetEvent.getStart());
+        Assert.assertEquals(result.getDeterminedPosition(), determinedLocus);
 }

 // NOTE: This is an enforcement of a behavior that I consider to be a bug in DRAGEN. Specifically my assumption that we needn't ever concern
@@ -207,7 +379,7 @@ public void testDeletionUnderlappingDeterminedBases() {
     Haplotype ref = new Haplotype("AAAAAAAAAA".getBytes(), true, 500, TextCigarCodec.decode("10M"));
     ref.setGenomeLocation(new SimpleInterval("20", 100, 110));

-        PartiallyDeterminedHaplotype result = PartiallyDeterminedHaplotypeComputationEngine.createNewPDHaplotypeFromEvents(ref, DEL_AA_105, true, List.of(DEL_AAAAAAA_102, DEL_AA_105));
+        PartiallyDeterminedHaplotype result = PartiallyDeterminedHaplotypeComputationEngine.createNewPDHaplotypeFromEvents(ref, Set.of(), 105, List.of(DEL_AAAAAAA_102, DEL_AA_105), List.of(DEL_AA_105));
     Assert.assertEquals(new String(result.getBases()), "AAAAAAAAAA");
     Assert.assertEquals(result.getAlternateBases(), new byte[]{0,0,0,2,0,0,0,0,4,0});
     Assert.assertEquals(result.getCigar(), TextCigarCodec.decode("10M"));
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/SmallBitSetUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/SmallBitSetUnitTest.java
index beb4ad6c924..ff5540bc4c3 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/SmallBitSetUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/SmallBitSetUnitTest.java
@@ -46,6 +46,15 @@ public void testUnion() {
     new SmallBitSet(List.of(1,3,5,7,9)));
 }

+    @Test
+    public void testStream() {
+        Assert.assertEquals((new SmallBitSet(1,3,5)).stream(10).toArray(),
+                new int[] {1,3,5});
+
+        Assert.assertEquals((new SmallBitSet(1,5,11)).stream(10).toArray(),
+                new int[] {1, 5});
+    }
+
 @Test
 public void testContains() {
     Assert.assertTrue(THREE_ELEMENTS.contains(TWO_ELEMENTS));
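Editorial note on the new tests above. The expected partitions in testMakeEventGroupClusters amount to connected components of a graph whose edges are physical overlaps plus Smith-Waterman mutexes. Below is a minimal union-find sketch of that contract; it illustrates what the tests assert, not the engine's implementation, and the SimpleEvent record (with the 0.5 insertion-start convention mentioned in the test comments) is an assumption of the sketch.

    import java.util.*;

    // Union-find sketch of the event-group clustering contract in testMakeEventGroupClusters:
    // groups are connected components under "overlaps, or shares a Smith-Waterman mutex".
    // NOT the engine's implementation; SimpleEvent is a hypothetical stand-in for GATK's Event.
    public final class EventGroupClusterSketch {

        // Fractional coordinates implement the DRAGEN convention from the test comments: an
        // insertion "at 105" starts at 105.5, so it overlaps neither position 105 nor 106.
        public record SimpleEvent(double start, double end) {
            public static SimpleEvent snp(final int pos) { return new SimpleEvent(pos, pos); }
            public static SimpleEvent insertion(final int pos) { return new SimpleEvent(pos + 0.5, pos + 0.5); }
            public static SimpleEvent deletion(final int pos, final int refLength) { return new SimpleEvent(pos, pos + refLength - 1); }
            public boolean overlaps(final SimpleEvent other) { return start <= other.end() && other.start() <= end; }
        }

        public static Collection<List<Integer>> cluster(final List<SimpleEvent> events, final List<List<Integer>> swMutexes) {
            final int n = events.size();
            final int[] parent = new int[n];
            for (int i = 0; i < n; i++) { parent[i] = i; }

            for (int i = 0; i < n; i++) {           // overlap edges
                for (int j = i + 1; j < n; j++) {
                    if (events.get(i).overlaps(events.get(j))) { union(parent, i, j); }
                }
            }
            for (final List<Integer> mutex : swMutexes) {   // SW mutex edges, pairs and trios alike
                for (final int index : mutex) { union(parent, mutex.get(0), index); }
            }

            final Map<Integer, List<Integer>> components = new TreeMap<>();
            for (int i = 0; i < n; i++) { components.computeIfAbsent(find(parent, i), k -> new ArrayList<>()).add(i); }
            return components.values();
        }

        private static int find(final int[] parent, final int i) { return parent[i] == i ? i : (parent[i] = find(parent, parent[i])); }
        private static void union(final int[] parent, final int i, final int j) { parent[find(parent, i)] = find(parent, j); }
    }

For example, cluster(List.of(deletion(100, 2), snp(101), deletion(105, 2), snp(106)), List.of()) yields [[0, 1], [2, 3]], matching the DEL_AA_100 / SNP_G_101 / DEL_AA_105 / SNP_C_106 case, and adding the mutex List.of(1, 2) merges everything into one group, as in the Smith-Waterman cases.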
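Similarly, the expected branches in testMakeBranches can be modeled as maximal conflict-free subsets of the undetermined events, unioned with the determined events. The filtering and conflict rules below are reverse-engineered from the expectations (in particular, events sharing a start position coexist as undetermined alleles, and a deletion merely spanning a determined ref locus survives); the real computeBranches works event group by event group, which is equivalent on these small cases. The sketch reuses SimpleEvent from the previous one.

    import java.util.*;
    import java.util.function.BiPredicate;
    import java.util.stream.Collectors;
    import java.util.stream.IntStream;

    // Brute-force model of the branching contract in testMakeBranches; rules inferred from the
    // expected outputs, so treat this as documentation of the tests, not of the engine.
    public final class BranchSketch {
        public static Set<Set<Integer>> branches(final List<EventGroupClusterSketch.SimpleEvent> events,
                                                 final List<List<Integer>> swMutexes,
                                                 final int determinedLocus,
                                                 final OptionalInt determinedEventIndex) {
            final int n = events.size();
            final Set<Integer> determined = determinedEventIndex.isPresent()
                    ? Set.of(determinedEventIndex.getAsInt()) : Set.of();

            final boolean[][] swMutex = new boolean[n][n];
            for (final List<Integer> group : swMutexes) {
                for (final int i : group) { for (final int j : group) { if (i != j) { swMutex[i][j] = true; } } }
            }

            // conflict = SW mutex, or physical overlap between events at *different* integer starts;
            // events sharing a start coexist as alternate undetermined alleles of one locus
            final BiPredicate<Integer, Integer> conflict = (i, j) -> swMutex[i][j]
                    || ((int) events.get(i).start() != (int) events.get(j).start() && events.get(i).overlaps(events.get(j)));

            // undetermined candidates: drop the determined events, any other event at the determined
            // locus, and anything conflicting with a determined event; a deletion merely spanning a
            // determined ref locus survives -- the suspect DRAGEN behavior the test comments mention
            final List<Integer> candidates = IntStream.range(0, n).boxed()
                    .filter(i -> !determined.contains(i))
                    .filter(i -> (int) events.get(i).start() != determinedLocus)
                    .filter(i -> determined.stream().noneMatch(d -> conflict.test(i, d)))
                    .toList();

            // all conflict-free subsets of the candidates, via bitmask enumeration (fine for small n)
            final List<Set<Integer>> consistent = new ArrayList<>();
            for (int mask = 0; mask < (1 << candidates.size()); mask++) {
                final int m = mask;
                final Set<Integer> subset = IntStream.range(0, candidates.size())
                        .filter(b -> (m & (1 << b)) != 0).mapToObj(candidates::get).collect(Collectors.toSet());
                if (subset.stream().allMatch(i -> subset.stream().allMatch(j -> i.equals(j) || !conflict.test(i, j)))) {
                    consistent.add(subset);
                }
            }

            // keep only maximal subsets; each branch is a maximal subset plus the determined events
            final Set<Set<Integer>> result = new HashSet<>();
            for (final Set<Integer> subset : consistent) {
                if (consistent.stream().noneMatch(other -> other.size() > subset.size() && other.containsAll(subset))) {
                    final Set<Integer> branch = new HashSet<>(subset);
                    branch.addAll(determined);
                    result.add(branch);
                }
            }
            return result;
        }
    }

Run against the "just a mess" case (SW mutexes {1,2} and {3,4}, determined event 4 at locus 120), the candidate set is {0,1,2} (event 3 is SW-mutexed with the determined event), the conflicts are 0-1 (overlap) and 1-2 (SW), and the maximal subsets {0,2} and {1} reproduce the expected branches {0,2,4} and {1,4}.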
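The byte arrays asserted in testGeneratePDHaplotypeDataProvider are PartiallyDeterminedHaplotype alt-base flag vectors. The flag values in this sketch are inferred from the expectations themselves (17 = SNP|C, 81 = SNP|C|T for the multiallelic C/T case, 6 = DEL_START|DEL_END for a one-base undetermined deletion), so treat them as an illustration rather than as the class's documented constants.

    // Hypothetical decoder for the alt-base flag bytes in testGeneratePDHaplotypeDataProvider.
    // Flag values inferred from the expected arrays; illustrative only.
    public final class PDFlagDecodeSketch {
        static final byte SNP = 1, DEL_START = 2, DEL_END = 4, A = 8, C = 16, G = 32, T = 64;

        static String decode(final byte flags) {
            if (flags == 0) { return "determined base"; }
            final StringBuilder description = new StringBuilder();
            if ((flags & SNP) != 0) { description.append("SNP "); }
            if ((flags & DEL_START) != 0) { description.append("del-start "); }
            if ((flags & DEL_END) != 0) { description.append("del-end "); }
            if ((flags & A) != 0) { description.append("A "); }
            if ((flags & C) != 0) { description.append("C "); }
            if ((flags & G) != 0) { description.append("G "); }
            if ((flags & T) != 0) { description.append("T "); }
            return description.toString().trim();
        }

        public static void main(final String[] args) {
            System.out.println(decode((byte) 17)); // SNP C             -- one undetermined C SNP
            System.out.println(decode((byte) 81)); // SNP C T           -- undetermined multiallelic C/T SNP
            System.out.println(decode((byte) 6));  // del-start del-end -- one-base undetermined deletion
        }
    }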
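Finally, the new SmallBitSet testStream pins down stream(bound) as the indices of set bits strictly below the bound, in ascending order. A minimal sketch of those semantics, assuming an int bit-mask representation (the class's actual fields are not shown in this patch):

    import java.util.stream.IntStream;

    // Sketch of the stream(bound) semantics asserted by testStream; assumes an int bit-mask
    // field, which is an assumption of this sketch rather than a quote from SmallBitSet.
    public final class SmallBitSetStreamSketch {
        private final int bits;

        public SmallBitSetStreamSketch(final int... elements) {
            int mask = 0;
            for (final int element : elements) { mask |= 1 << element; }
            this.bits = mask;
        }

        public IntStream stream(final int bound) {
            return IntStream.range(0, bound).filter(index -> (bits & (1 << index)) != 0);
        }
    }

    // new SmallBitSetStreamSketch(1, 5, 11).stream(10) yields {1, 5}: element 11 falls outside the bound,
    // matching the second assertion in testStream.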