From d744b2b06d1feb1db85da6139cbabd15dcf7fe59 Mon Sep 17 00:00:00 2001
From: Mark Walker
Date: Fri, 23 Aug 2024 14:32:48 -0400
Subject: [PATCH 1/2] Require both overlap and breakend proximity for
depth-only SV clustering
This is a combination of 2 commits.
Implement SVStratify
Interchrom events
Start unit tests
Finish unit tests
Multiple contexts per stratum
Integration tests; fix empty context bug
Test duplicate context name
Handle CPX/CTX, add some tests
Compiler error
Prioritize SR and PE evidence types for representative breakpoint strategy
Add SVStratifiedCluster
Integration tests for SVStratifiedClustering
Unused line
Spooled sorting in cluster tools
Start fixing JointGCNVSegmentation
Fix JointGCNVSegmentation integration test
Rename to GroupedSVCluster
Fix sorting bug
Comment
Use RD_CN for CNV sample overlap; improve testing
Documentation
Add comment about requiresOverlapAndProximity
Allow empty EVIDENCE list
Add STRAT INFO field
Representative breakpoint by variant quality
Handle BOTHSIDES_SUPPORT and HIGH_SR_BACKGROUND in variant collapsing
Add expected exception
---
.../spark/sv/utils/GATKSVVCFConstants.java | 17 +
.../hellbender/tools/sv/SVCallRecord.java | 16 +-
.../tools/sv/SVCallRecordUtils.java | 35 +-
.../sv/cluster/CanonicalSVCollapser.java | 90 +-
.../tools/sv/cluster/SVClusterEngine.java | 101 +-
.../sv/cluster/SVClusterEngineFactory.java | 6 +-
.../tools/sv/cluster/SVClusterLinkage.java | 28 +-
.../tools/sv/cluster/SVClusterWalker.java | 281 +++++
.../StratifiedClusteringTableParser.java | 44 +
.../sv/stratify/SVStatificationEngine.java | 347 ++++++
...ratificationEngineArgumentsCollection.java | 70 ++
.../tools/walkers/sv/GroupedSVCluster.java | 241 ++++
.../sv/JointGermlineCNVSegmentation.java | 53 +-
.../tools/walkers/sv/SVCluster.java | 257 +----
.../tools/walkers/sv/SVConcordance.java | 2 +-
.../tools/walkers/sv/SVStratify.java | 291 +++++
.../tools/sv/SVCallRecordUnitTest.java | 6 +-
.../tools/sv/SVCallRecordUtilsUnitTest.java | 48 +-
.../hellbender/tools/sv/SVTestUtils.java | 91 +-
.../sv/cluster/BinnedCNVDefragmenterTest.java | 24 +-
.../tools/sv/cluster/CNVDefragmenterTest.java | 22 +-
.../cluster/CanonicalSVCollapserUnitTest.java | 386 ++++++-
.../tools/sv/cluster/SVClusterEngineTest.java | 186 ++-
.../SVStratificationEngineUnitTest.java | 555 +++++++++
.../sv/GroupedSVClusterIntegrationTest.java | 112 ++
...ermlineCNVSegmentationIntegrationTest.java | 25 +-
.../walkers/sv/SVClusterIntegrationTest.java | 54 +-
.../walkers/sv/SVStratifyIntegrationTest.java | 169 +++
.../tools/sv/sv_stratify_config.tsv | 8 +
.../stratified_cluster_params.tsv | 3 +
.../stratified_cluster_strata.tsv | 3 +
.../bwa_melt.cleaned.chr22_chrY.vcf.gz | Bin 0 -> 468997 bytes
.../bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi | Bin 0 -> 7957 bytes
...f_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz | Bin 501331 -> 502596 bytes
...nel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi | Bin 7763 -> 7763 bytes
.../sv/SVStratify/bwa_melt.chr22.vcf.gz | Bin 0 -> 599446 bytes
.../sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi | Bin 0 -> 6704 bytes
.../SVStratify/hg38.RM.chr22_subsampled.bed | 1000 +++++++++++++++++
.../sv/SVStratify/hg38.SegDup.chr22.bed | 229 ++++
.../walkers/sv/SVStratify/test_config.tsv | 7 +
.../sv/SVStratify/test_config_duplicate.tsv | 8 +
.../sv/SVStratify/test_config_redundant.tsv | 8 +
42 files changed, 4281 insertions(+), 542 deletions(-)
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv
create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 5224fe5f7d6..5f1a2dc17ab 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -13,6 +13,7 @@ public final class GATKSVVCFConstants {
// VCF standard keys reserved for sv
public static final String SVTYPE = "SVTYPE";
public static final String SVLEN = "SVLEN";
+ public static final String EVIDENCE = "EVIDENCE";
public static final String IMPRECISE = "IMPRECISE";
public static final String CIPOS = "CIPOS";
public static final String CIEND = "CIEND";
@@ -31,6 +32,14 @@ public final class GATKSVVCFConstants {
public static final Allele DEL_ALLELE = Allele.create("", false);
public static final Allele DUP_ALLELE = Allele.create("", false);
+ // Evidence types
+ public enum EvidenceTypes {
+ BAF,
+ PE,
+ RD,
+ SR
+ }
+
// GATK-SV specific header lines
// TODO: 10/3/17 the following comment is a goal we are trying to achieve
// applicable to all records all the time
@@ -136,8 +145,13 @@ public enum ComplexVariantSubtype {
public static final String BND_DELETION_STRANDS = "+-";
public static final String BND_DUPLICATION_STRANDS = "-+";
+ // SR support
+ public static final String BOTHSIDES_SUPPORT_ATTRIBUTE = "BOTHSIDES_SUPPORT";
+ public static final String HIGH_SR_BACKGROUND_ATTRIBUTE = "HIGH_SR_BACKGROUND";
+
// format block
public static final String COPY_NUMBER_FORMAT = "CN";
+ public static final String DEPTH_GENOTYPE_COPY_NUMBER_FORMAT = "RD_CN";
public static final String EXPECTED_COPY_NUMBER_FORMAT = "ECN";
public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ";
@@ -175,6 +189,9 @@ public enum ComplexVariantSubtype {
public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN";
public static final String TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF";
+ // stratification
+ public static final String STRATUM_INFO_KEY = "STRAT";
+
// functional annotations
public static final String LOF = "PREDICTED_LOF";
public static final String INT_EXON_DUP = "PREDICTED_INTRAGENIC_EXON_DUP";
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 3f3258d6161..3b0466f4bd6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -21,6 +21,7 @@
import java.util.stream.Stream;
import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;
+import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT;
public class SVCallRecord implements SVLocatable {
@@ -31,6 +32,7 @@ public class SVCallRecord implements SVLocatable {
VCFConstants.END_KEY,
GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
GATKSVVCFConstants.SVLEN,
+ GATKSVVCFConstants.EVIDENCE,
GATKSVVCFConstants.CONTIG2_ATTRIBUTE,
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
@@ -48,6 +50,7 @@ public class SVCallRecord implements SVLocatable {
private final Boolean strandB;
private final GATKSVVCFConstants.StructuralVariantAnnotationType type;
private final Integer length;
+ private final List evidence;
private final List algorithms;
private final List alleles;
private final Allele refAllele;
@@ -72,6 +75,7 @@ public SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List cpxIntervals,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
final List genotypes,
@@ -79,7 +83,7 @@ public SVCallRecord(final String id,
final Set filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
- this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
+ this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, evidence, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}
@@ -94,6 +98,7 @@ protected SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List cpxIntervals,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
final List genotypes,
@@ -106,6 +111,7 @@ protected SVCallRecord(final String id,
Utils.nonNull(attributes);
Utils.nonNull(filters);
Utils.nonNull(cpxIntervals);
+ Utils.nonNull(evidence);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
@@ -123,6 +129,7 @@ protected SVCallRecord(final String id,
this.genotypes = GenotypesContext.copy(genotypes).immutable();
this.attributes = validateAttributes(attributes);
this.length = inferLength(type, positionA, positionB, length);
+ this.evidence = evidence;
final Pair strands = inferStrands(type, strandA, strandB);
this.strandA = strands.getLeft();
this.strandB = strands.getRight();
@@ -272,7 +279,8 @@ private boolean isCarrier(final Genotype genotype) {
}
// Otherwise, try to infer status if it's a biallelic CNV with a copy number call
- final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber);
+ final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT,
+ VariantContextGetters.getAttributeAsInt(genotype, DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, expectedCopyNumber));
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) {
return copyNumber < expectedCopyNumber;
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
@@ -370,6 +378,10 @@ public Integer getLength() {
return length;
}
+ public List getEvidence() {
+ return evidence;
+ }
+
public List getAlgorithms() {
return algorithms;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
index cf31d654727..4a13d62119e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
@@ -18,6 +18,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4;
import static org.broadinstitute.hellbender.tools.sv.SVCallRecord.UNDEFINED_LENGTH;
public final class SVCallRecordUtils {
@@ -91,6 +92,9 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
+ if (!record.getEvidence().isEmpty()) {
+ builder.attribute(GATKSVVCFConstants.EVIDENCE, record.getEvidence());
+ }
if (!record.getFilters().isEmpty()) {
builder.filters(record.getFilters());
}
@@ -173,12 +177,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
*/
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}
@@ -291,10 +295,10 @@ public static Stream convertInversionsToBreakends(final SVCallReco
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
- record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
+ record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
- record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
+ record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}
@@ -319,8 +323,9 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
- final List cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
+ final List cpxIntervals = parseComplexIntervals(variant, dictionary);
final List algorithms = getAlgorithms(variant);
+ final List evidence = getEvidence(variant);
final String strands;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL
@@ -375,12 +380,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final Map sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
- cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
+ cpxIntervals, length, evidence, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
variant.getFilters(), log10PError);
}
- private static List parseComplexIntervals(final List intervals, final SAMSequenceDictionary dictionary) {
- return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
+ private static List parseComplexIntervals(final VariantContext variant, final SAMSequenceDictionary dictionary) {
+ return variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null).stream()
+ .map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
}
private static Map sanitizeAttributes(final Map attributes) {
@@ -402,6 +408,19 @@ private static Integer getLength(final VariantContext variant, final GATKSVVCFCo
return length;
}
+ public static List getEvidence(final VariantContext variant) {
+ Utils.nonNull(variant);
+ final List value = variant.getAttributeAsStringList(GATKSVVCFConstants.EVIDENCE, null);
+ if (value == null) {
+ return Collections.emptyList();
+ } else {
+ return value.stream()
+ .filter(v -> v != null && !v.equals(MISSING_VALUE_v4))
+ .map(GATKSVVCFConstants.EvidenceTypes::valueOf)
+ .collect(Collectors.toList());
+ }
+ }
+
public static List getAlgorithms(final VariantContext variant) {
Utils.nonNull(variant);
Utils.validateArg(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE), "Expected " + GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE + " field for variant " + variant.getID());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
index 39228617d26..824973fc577 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
@@ -23,6 +23,7 @@
import java.util.*;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
/**
* Class for collapsing a collection of similar {@link SVCallRecord} objects, such as clusters produced by
@@ -79,6 +80,32 @@ public enum AltAlleleSummaryStrategy {
}
+ /**
+ * Flag field logic
+ */
+ public enum FlagFieldLogic {
+ /**
+ * Require all members to have the flag set
+ */
+ AND,
+
+ /**
+ * Require at least one member to have the flag set
+ */
+ OR,
+
+ /**
+ * Always set to false
+ */
+ ALWAYS_FALSE
+
+ }
+
+ public static final Set FLAG_TYPE_INFO_FIELDS = Sets.newHashSet(
+ GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE,
+ GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE
+ );
+
private static final Set SUPPORTED_SV_TYPES = Sets.newHashSet(
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
@@ -90,6 +117,8 @@ public enum AltAlleleSummaryStrategy {
GATKSVVCFConstants.StructuralVariantAnnotationType.CTX
);
+ private static final BreakpointEvidenceComparator breakpointEvidenceComparator = new BreakpointEvidenceComparator();
+
/**
* Comparators used for picking the representative genotype for a given sample
*/
@@ -139,16 +168,19 @@ public int compare(Genotype o1, Genotype o2) {
private final AltAlleleSummaryStrategy altAlleleSummaryStrategy;
private final BreakpointSummaryStrategy breakpointSummaryStrategy;
+ private final FlagFieldLogic flagFieldLogic;
private final ReferenceSequenceFile reference;
private final SAMSequenceDictionary dictionary;
public CanonicalSVCollapser(final ReferenceSequenceFile reference,
final AltAlleleSummaryStrategy altAlleleSummaryStrategy,
- final BreakpointSummaryStrategy breakpointSummaryStrategy) {
+ final BreakpointSummaryStrategy breakpointSummaryStrategy,
+ final FlagFieldLogic flagFieldLogic) {
this.reference = Utils.nonNull(reference);
this.dictionary = reference.getSequenceDictionary();
this.altAlleleSummaryStrategy = altAlleleSummaryStrategy;
this.breakpointSummaryStrategy = breakpointSummaryStrategy;
+ this.flagFieldLogic = flagFieldLogic;
}
private static final int distance(final SVCallRecord item, final int newStart, final int newEnd) {
@@ -193,7 +225,7 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
end, strandB, type, representative.getComplexSubtype(), representative.getComplexEventIntervals(),
- length, algorithms, alleles, genotypes, attributes, filters, quality, dictionary);
+ length, representative.getEvidence(), algorithms, alleles, genotypes, attributes, filters, quality, dictionary);
}
protected List collapseAlleles(final List altAlleles, final Allele refAllele) {
@@ -562,15 +594,37 @@ public static List makeBiallelicList(final Allele alt, final Allele ref,
return alleles;
}
+ private Stream getItemFlagStream(final String key, final Collection items) {
+ return items.stream()
+ .map(item ->item.getAttributes().get(key) != null && item.getAttributes().get(key).equals(Boolean.TRUE));
+ }
+
protected Map collapseAttributes(final SVCallRecord representative,
final Collection items) {
Utils.nonNull(items);
Utils.nonEmpty(items);
final Map attributes = new HashMap<>();
for (final Map.Entry entry : representative.getAttributes().entrySet()) {
- attributes.put(entry.getKey(), entry.getValue());
+ if (!FLAG_TYPE_INFO_FIELDS.contains(entry.getKey())) {
+ attributes.put(entry.getKey(), entry.getValue());
+ }
}
attributes.put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, items.stream().map(SVCallRecord::getId).sorted().collect(Collectors.toList()));
+ for (final String key : FLAG_TYPE_INFO_FIELDS) {
+ if (flagFieldLogic == FlagFieldLogic.AND) {
+ if (getItemFlagStream(key, items).allMatch(Boolean::booleanValue)) {
+ attributes.put(key, Boolean.TRUE);
+ }
+ } else if (flagFieldLogic == FlagFieldLogic.OR) {
+ if (getItemFlagStream(key, items).anyMatch(Boolean::booleanValue)) {
+ attributes.put(key, Boolean.TRUE);
+ }
+ } else if (flagFieldLogic == FlagFieldLogic.ALWAYS_FALSE) {
+ // Leave empty to imply FALSE
+ } else {
+ throw new IllegalArgumentException("Unsupported " + FlagFieldLogic.class.getSimpleName() + " value: " + flagFieldLogic.name());
+ }
+ }
return attributes;
}
@@ -671,16 +725,40 @@ private SVCallRecord getRepresentativeIntervalItem(final Collection qualityComparator = Comparator.comparing(r -> r.getLog10PError() == null ? 0 : r.getLog10PError());
final Comparator carrierCountComparator = Comparator.comparing(r -> -r.getCarrierGenotypeList().size());
final Comparator distanceComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends));
- final Comparator idComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends)); // stabilizes order
+ final Comparator idComparator = Comparator.comparing(SVCallRecord::getId); // stabilizes order
return records.stream().min(
- carrierCountComparator
+ qualityComparator
+ .thenComparing(breakpointEvidenceComparator)
+ .thenComparing(carrierCountComparator)
.thenComparing(distanceComparator)
.thenComparing(idComparator)).get();
}
+ protected static class BreakpointEvidenceComparator implements Comparator {
+ @Override
+ public int compare(final SVCallRecord a, final SVCallRecord b) {
+ final Set evidenceA = new HashSet<>(a.getEvidence());
+ final Set evidenceB = new HashSet<>(b.getEvidence());
+ // SR < PE and if neither they are considered equal
+ // Note sorting is in ascending order, and we want the highest-priority record first
+ if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) {
+ return -1;
+ } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) {
+ return 1;
+ } else if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) {
+ return -1;
+ } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ }
+
protected static long getDistance(final int posA,
final int posB,
final int[] starts,
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
index 0199eaae3e3..a08989bc947 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
@@ -23,8 +23,6 @@
*
* NOTE: precise implementation of {@link SVClusterLinkage#getMaxClusterableStartingPosition(SVLocatable)}
* is important for efficiency because it determines when a cluster can be finalized and omitted from further clustering tests.
- *
- * @param class of items to cluster
*/
public class SVClusterEngine {
@@ -41,7 +39,6 @@ public enum CLUSTERING_TYPE {
private Map idToClusterMap; // Active clusters
private final Map idToItemMap; // Active items
protected final CLUSTERING_TYPE clusteringType;
- private final ItemSortingBuffer buffer;
private final Comparator itemComparator;
private String currentContig;
@@ -65,30 +62,12 @@ public SVClusterEngine(final CLUSTERING_TYPE clusteringType,
currentContig = null;
idToItemMap = new HashMap<>();
itemComparator = SVCallRecordUtils.getSVLocatableComparator(dictionary);
- buffer = new ItemSortingBuffer();
nextItemId = 0;
nextClusterId = 0;
lastStart = 0;
minActiveStartingPositionItemId = null;
}
-
- /**
- * Flushes all active clusters, adding them to the output buffer. Results from the output buffer are then copied out
- * and the buffer is cleared. This should be called between contigs to save memory.
- */
- public final List forceFlush() {
- flushClusters();
- return buffer.forceFlush();
- }
-
- /**
- * Gets any available finalized clusters.
- */
- public final List flush() {
- return buffer.flush();
- }
-
@VisibleForTesting
public Function getCollapser() {
return collapser;
@@ -109,25 +88,26 @@ public SVCallRecord getMinActiveStartingPositionItem() {
* Returns true if there are any active or finalized clusters.
*/
public final boolean isEmpty() {
- return idToClusterMap.isEmpty() && buffer.isEmpty();
+ return idToClusterMap.isEmpty();
}
/**
* Adds and clusters the given item. Note that items must be added in order of increasing start position.
* @param item item to cluster
*/
- public final void add(final SVCallRecord item) {
+ public final List add(final SVCallRecord item) {
// Start a new cluster if on a new contig
if (!item.getContigA().equals(currentContig)) {
- flushClusters();
+ final List result = flush();
currentContig = item.getContigA();
lastStart = 0;
seedCluster(registerItem(item));
- return;
+ return result;
+ } else {
+ final int itemId = registerItem(item);
+ final List clusterIdsToProcess = cluster(itemId);
+ return processClusters(clusterIdsToProcess);
}
- final int itemId = registerItem(item);
- final List clusterIdsToProcess = cluster(itemId);
- processClusters(clusterIdsToProcess);
}
private final int registerItem(final SVCallRecord item) {
@@ -263,12 +243,12 @@ private final void combineClusters(final Collection clusterIds, final I
/**
* Finalizes a single cluster, removing it from the currently active set and adding it to the output buffer.
*/
- private final void processCluster(final int clusterIndex) {
+ private final SVCallRecord processCluster(final int clusterIndex) {
final Cluster cluster = getCluster(clusterIndex);
idToClusterMap.remove(clusterIndex);
final List clusterItemIds = cluster.getItemIds();
final OutputCluster outputCluster = new OutputCluster(clusterItemIds.stream().map(idToItemMap::get).collect(Collectors.toList()));
- buffer.add(collapser.apply(outputCluster));
+ final SVCallRecord result = collapser.apply(outputCluster);
// Clean up item id map
if (clusterItemIds.size() == 1) {
// Singletons won't be present in any other clusters
@@ -289,6 +269,7 @@ private final void processCluster(final int clusterIndex) {
if (clusterItemIds.contains(minActiveStartingPositionItemId)) {
findAndSetMinActiveStart();
}
+ return result;
}
/**
@@ -309,25 +290,29 @@ private final void findAndSetMinActiveStart() {
/**
* Finalizes a set of clusters.
*/
- private final void processClusters(final List clusterIdsToProcess) {
+ private final List processClusters(final List clusterIdsToProcess) {
+ final List result = new ArrayList<>(clusterIdsToProcess.size());
for (final Integer clusterId : clusterIdsToProcess) {
- processCluster(clusterId);
+ result.add(processCluster(clusterId));
}
+ return result;
}
/**
* Finalizes all active clusters and adds them to the output buffer. Also clears the currently active set of clusters
* and items.
*/
- private final void flushClusters() {
+ public final List flush() {
final List clustersToFlush = new ArrayList<>(idToClusterMap.keySet());
+ final List result = new ArrayList<>(clustersToFlush.size());
for (final Integer clusterId : clustersToFlush) {
- processCluster(clusterId);
+ result.add(processCluster(clusterId));
}
idToItemMap.clear();
minActiveStartingPositionItemId = null;
nextItemId = 0;
nextClusterId = 0;
+ return result;
}
/**
@@ -431,52 +416,4 @@ public int hashCode() {
return Objects.hash(itemIds);
}
}
-
- private final class ItemSortingBuffer {
- private PriorityQueue buffer;
-
- public ItemSortingBuffer() {
- Utils.nonNull(itemComparator);
- this.buffer = new PriorityQueue<>(itemComparator);
- }
-
- public void add(final SVCallRecord record) {
- buffer.add(record);
- }
-
- /**
- * Returns any records that can be safely flushed based on the current minimum starting position
- * of items still being actively clustered.
- */
- public List flush() {
- if (buffer.isEmpty()) {
- return Collections.emptyList();
- }
- final SVCallRecord minActiveStartItem = getMinActiveStartingPositionItem();
- if (minActiveStartItem == null) {
- forceFlush();
- }
- final List out = new ArrayList<>();
- while (!buffer.isEmpty() && buffer.comparator().compare(buffer.peek(), minActiveStartItem) < 0) {
- out.add(buffer.poll());
- }
- return out;
- }
-
- /**
- * Returns all buffered records, regardless of any active clusters. To be used only when certain that no
- * active clusters can be clustered with any future inputs.
- */
- public List forceFlush() {
- final List result = new ArrayList<>(buffer.size());
- while (!buffer.isEmpty()) {
- result.add(buffer.poll());
- }
- return result;
- }
-
- public boolean isEmpty() {
- return buffer.isEmpty();
- }
- }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
index 8f3bcaf6112..525910cf2d8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
@@ -25,7 +25,7 @@ public static SVClusterEngine createCanonical(final SVClusterEngine.CLUSTERING_T
linkage.setDepthOnlyParams(depthParameters);
linkage.setMixedParams(mixedParameters);
linkage.setEvidenceParams(pesrParameters);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(type, collapser::collapse, linkage, dictionary);
}
@@ -35,7 +35,7 @@ public static SVClusterEngine createCNVDefragmenter(final SAMSequenceDictionary
final double paddingFraction,
final double minSampleOverlap) {
final SVClusterLinkage linkage = new CNVLinkage(dictionary, paddingFraction, minSampleOverlap);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary);
}
@@ -46,7 +46,7 @@ public static SVClusterEngine createBinnedCNVDefragmenter(final SAMSequenceDicti
final double minSampleOverlap,
final List coverageIntervals) {
final SVClusterLinkage linkage = new BinnedCNVLinkage(dictionary, paddingFraction, minSampleOverlap, coverageIntervals);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
index 433ec4ab46e..a467a1118b1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
@@ -77,17 +77,17 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco
final Set samples = new HashSet<>(SVUtils.hashMapCapacity(genotypesA.size() + genotypesB.size()));
samples.addAll(genotypesA.getSampleNames());
samples.addAll(genotypesB.getSampleNames());
+ if (samples.isEmpty()) {
+ // Empty case considered perfect overlap
+ return true;
+ }
int numMatches = 0;
for (final String sample : samples) {
final Genotype genotypeA = genotypesA.get(sample);
final Genotype genotypeB = genotypesB.get(sample);
// If one sample doesn't exist in the other set, assume reference copy state
- final int cnA = genotypeA == null ?
- VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0)
- : VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
- final int cnB = genotypeB == null ?
- VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0)
- : VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
+ final int cnA = getCopyState(genotypeA, genotypeB);
+ final int cnB = getCopyState(genotypeB, genotypeA);
if (cnA == cnB) {
numMatches++;
}
@@ -105,4 +105,20 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco
}
}
+ /**
+ * Tries to get the best copy state from the genotype. If the genotype is null, uses ploidy from a "backup"
+ * genotype as the default. If we have no clue, just return 0.
+ */
+ private static int getCopyState(final Genotype genotype, final Genotype matchedSampleGenotype) {
+ if (genotype == null) {
+ if (matchedSampleGenotype != null) {
+ return VariantContextGetters.getAttributeAsInt(matchedSampleGenotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
+ } else {
+ return 0;
+ }
+ } else {
+ return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT,
+ VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, 0));
+ }
+ }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java
new file mode 100644
index 00000000000..59454a8fcda
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java
@@ -0,0 +1,281 @@
+package org.broadinstitute.hellbender.tools.sv.cluster;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.util.SortingCollection;
+import htsjdk.variant.variantcontext.GenotypesContext;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
+import org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation;
+import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
+
+import java.util.Set;
+
+import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME;
+import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.FLAG_FIELD_LOGIC_LONG_NAME;
+
+/***
+ * Base class for tools that provides a simple interface for utilizing {@link SVClusterEngine}. It handles input/output,
+ * including output sorting with spilling to disk to avoid excessive memory usage.
+ */
+public abstract class SVClusterWalker extends MultiVariantWalker {
+ public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table";
+ public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix";
+ public static final String ENABLE_CNV_LONG_NAME = "enable-cnv";
+ public static final String ALGORITHM_LONG_NAME = "algorithm";
+ public static final String FAST_MODE_LONG_NAME = "fast-mode";
+ public static final String OMIT_MEMBERS_LONG_NAME = "omit-members";
+ public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call";
+ public static final String MAX_RECORDS_IN_RAM_LONG_NAME = "max-records-in-ram";
+
+ /**
+ * The enum Cluster algorithm.
+ */
+ public enum CLUSTER_ALGORITHM {
+ /**
+ * Defragment cnv cluster algorithm. Not supported with stratification.
+ */
+ DEFRAGMENT_CNV,
+ /**
+ * Single linkage cluster algorithm.
+ */
+ SINGLE_LINKAGE,
+ /**
+ * Max clique cluster algorithm.
+ */
+ MAX_CLIQUE
+ }
+
+ @Argument(
+ doc = "Output VCF",
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+ )
+ protected GATKPath outputFile;
+
+ /**
+ * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns
+ * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy
+ * integers in their respective columns.
+ */
+ @Argument(
+ doc = "Sample ploidy table (.tsv)",
+ fullName = PLOIDY_TABLE_LONG_NAME
+ )
+ protected GATKPath ploidyTablePath;
+
+ @Argument(
+ doc = "If supplied, generate variant IDs with this prefix",
+ fullName = VARIANT_PREFIX_LONG_NAME,
+ optional = true
+ )
+ protected String variantPrefix = null;
+
+ /**
+ * When enabled, DEL and DUP variants will be clustered together. The resulting records with have an SVTYPE of CNV.
+ */
+ @Argument(
+ doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)",
+ fullName = ENABLE_CNV_LONG_NAME,
+ optional = true
+ )
+ protected boolean enableCnv = false;
+
+ /**
+     * Results in substantial space and time savings for large sample sets by clearing genotypes that are not needed for
+ * clustering, but any associated annotation fields will be set to null in the output.
+ */
+ @Argument(
+ doc = "Fast mode. Drops hom-ref and no-call genotype fields and emits them as no-calls.",
+ fullName = FAST_MODE_LONG_NAME,
+ optional = true
+ )
+ protected boolean fastMode = false;
+
+ @Argument(
+ doc = "Omit cluster member ID annotations",
+ fullName = OMIT_MEMBERS_LONG_NAME,
+ optional = true
+ )
+ protected boolean omitMembers = false;
+
+ @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME,
+ doc = "Strategy to use for choosing a representative value for a breakpoint cluster.",
+ optional = true)
+ protected CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy =
+ CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE;
+
+ @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME,
+ doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " +
+ "different subtypes.",
+ optional = true)
+ protected CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy =
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
+
+ @Argument(fullName = FLAG_FIELD_LOGIC_LONG_NAME,
+ doc = "Logic for collapsing Flag type INFO and FORMAT fields",
+ optional = true)
+ protected CanonicalSVCollapser.FlagFieldLogic flagFieldLogic = CanonicalSVCollapser.FlagFieldLogic.OR;
+
+ @Argument(fullName = ALGORITHM_LONG_NAME,
+ doc = "Clustering algorithm",
+ optional = true
+ )
+ protected CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE;
+
+ /**
+ * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different
+ * variants and samples are provided.
+ */
+ @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME,
+ doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" +
+ " available",
+ optional = true
+ )
+ protected boolean defaultNoCall = false;
+
+ @Argument(fullName = MAX_RECORDS_IN_RAM_LONG_NAME,
+ doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
+ "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
+ "VCF file, and increases the amount of RAM needed.",
+ optional=true)
+ public int maxRecordsInRam = 10000;
+
+ protected SAMSequenceDictionary dictionary;
+ protected ReferenceSequenceFile reference;
+ protected PloidyTable ploidyTable;
+ protected SortingCollection sortingBuffer;
+ protected VariantContextWriter writer;
+ protected VCFHeader header;
+ protected Set samples;
+ protected String currentContig;
+ protected int numVariantsBuilt = 0;
+
+ @Override
+ public boolean requiresReference() {
+ return true;
+ }
+
+ @Override
+ public void onTraversalStart() {
+ reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
+ dictionary = reference.getSequenceDictionary();
+ if (dictionary == null) {
+ throw new UserException("Reference sequence dictionary required");
+ }
+ ploidyTable = new PloidyTable(ploidyTablePath.toPath());
+ samples = getSamplesForVariants();
+ writer = createVCFWriter(outputFile);
+ header = createHeader();
+ writer.writeHeader(header);
+ currentContig = null;
+ sortingBuffer = SortingCollection.newInstance(
+ VariantContext.class,
+ new VCFRecordCodec(header, true),
+ header.getVCFRecordComparator(),
+ maxRecordsInRam,
+ tmpDir.toPath());
+ }
+
+ @Override
+ public Object onTraversalSuccess() {
+ for (final VariantContext variant : sortingBuffer) {
+ writer.add(variant);
+ }
+ return super.onTraversalSuccess();
+ }
+
+ @Override
+ public void closeTool() {
+ super.closeTool();
+ if (sortingBuffer != null) {
+ sortingBuffer.cleanup();
+ }
+ if (writer != null) {
+ writer.close();
+ }
+ }
+
+ /**
+ * Subclasses should override this method
+ */
+ public abstract void applyRecord(final SVCallRecord record);
+
+ @Override
+ public void apply(final VariantContext variant, final ReadsContext readsContext,
+ final ReferenceContext referenceContext, final FeatureContext featureContext) {
+ SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
+ if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
+ // Strip out non-carrier genotypes to save memory and compute
+ // Don't do for multi-allelic CNVs since carrier status can't be determined
+ final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList());
+ call = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes);
+ }
+ // Update current contig
+ if (!call.getContigA().equals(currentContig)) {
+ currentContig = call.getContigA();
+ logger.info("Processing contig " + currentContig + "...");
+ }
+ applyRecord(call);
+ }
+
+ protected VCFHeader createHeader() {
+ final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
+ header.setSequenceDictionary(dictionary);
+
+ // Required info lines
+ header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
+ header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
+ header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1,
+ VCFHeaderLineType.Integer, "Second position"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1,
+ VCFHeaderLineType.String, "Second contig"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1,
+ VCFHeaderLineType.String, "First and second strands"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
+ VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms"));
+ if (!omitMembers) {
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY,
+ VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids"));
+ }
+ // Required format lines
+ header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
+ return header;
+ }
+
+ protected void write(final SVCallRecord call) {
+ sortingBuffer.add(buildVariantContext(call));
+ }
+
+ protected VariantContext buildVariantContext(final SVCallRecord call) {
+ // Add genotypes for missing samples
+ final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(
+ call, samples, !defaultNoCall, ploidyTable, header);
+
+ // Assign new variant ID
+ final String newId = variantPrefix == null ? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++);
+
+ // Build new variant
+ final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
+ call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
+ call.getComplexEventIntervals(), call.getLength(), call.getEvidence(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
+ call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
+ final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
+ if (omitMembers) {
+ builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
+ }
+ return builder.make();
+ }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
new file mode 100644
index 00000000000..2bc5720393b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.hellbender.tools.sv.cluster;
+
+import com.google.common.collect.ImmutableSet;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.util.Set;
+import java.util.function.Function;
+
+public class StratifiedClusteringTableParser {
+
+ // Configuration table column names
+ public static final String NAME_COLUMN = "NAME";
+ public static final String RECIPROCAL_OVERLAP_COLUMN = "RECIPROCAL_OVERLAP";
+ public static final String SIZE_SIMILARITY_COLUMN = "SIZE_SIMILARITY";
+ public static final String BREAKEND_WINDOW_COLUMN = "BREAKEND_WINDOW";
+ public static final String SAMPLE_OVERLAP_COLUMN = "SAMPLE_OVERLAP";
+ protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, RECIPROCAL_OVERLAP_COLUMN, SIZE_SIMILARITY_COLUMN, BREAKEND_WINDOW_COLUMN, SAMPLE_OVERLAP_COLUMN);
+
+ public static Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+ for (final String column : COLUMN_NAMES) {
+ if (!columns.contains(column)) {
+ throw exceptionFactory.apply("Missing column " + column);
+ }
+ }
+ if (columns.columnCount() != COLUMN_NAMES.size()) {
+ throw exceptionFactory.apply("Expected " + columns.columnCount() + " columns but found " + columns.columnCount());
+ }
+ return StratifiedClusteringTableParser::parseTableLine;
+ }
+
+ protected static StratumParameters parseTableLine(final DataLine dataLine) {
+ final String name = dataLine.get(NAME_COLUMN);
+ final double reciprocalOverlap = dataLine.getDouble(RECIPROCAL_OVERLAP_COLUMN);
+ final double sizeSimilarity = dataLine.getDouble(SIZE_SIMILARITY_COLUMN);
+ final double sampleOverlap = dataLine.getDouble(SAMPLE_OVERLAP_COLUMN);
+ final int breakendWindow = dataLine.getInt(BREAKEND_WINDOW_COLUMN);
+ return new StratumParameters(name, reciprocalOverlap, sizeSimilarity, breakendWindow, sampleOverlap);
+ }
+
+ public record StratumParameters(String name, double reciprocalOverlap, double sizeSimilarity,
+ int breakendWindow, double sampleOverlap) {
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
new file mode 100644
index 00000000000..e53bf0624b6
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
@@ -0,0 +1,347 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.utils.IntervalMergingRule;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+// Groups variants by SVTYPE, SVLEN, and overlap with one or more interval sets
+public class SVStatificationEngine {
+
+ // Configuration table column names
+ public static final String NAME_COLUMN = "NAME";
+ public static final String SVTYPE_COLUMN = "SVTYPE";
+ public static final String MIN_SIZE_COLUMN = "MIN_SIZE";
+ public static final String MAX_SIZE_COLUMN = "MAX_SIZE";
+ public static final String CONTEXT_COLUMN = "CONTEXT";
+ protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, SVTYPE_COLUMN, MIN_SIZE_COLUMN, MAX_SIZE_COLUMN, CONTEXT_COLUMN);
+ public static final String CONTEXT_COLUMN_DELIMITER = ",";
+
+ public static final Set NULL_TABLE_VALUES = Set.of("-1", "", "NULL", "NA");
+
+ protected final Map> contextMap;
+ protected final Map strata;
+ protected final SAMSequenceDictionary dictionary;
+
+ public SVStatificationEngine(final SAMSequenceDictionary dictionary) {
+ contextMap = new HashMap<>();
+ strata = new HashMap<>();
+ this.dictionary = Utils.nonNull(dictionary);
+ }
+
+ public void addContext(final String name, final List intervals) {
+ Utils.nonNull(name);
+ Utils.nonNull(intervals);
+ Utils.validateArg(!contextMap.containsKey(name), "Context with name " + name + " already exists");
+ contextMap.put(name, OverlapDetector.create(intervals));
+ }
+
+ /**
+ * Adds a new stratification group
+ * @param name a unique ID
+ * @param svType SV type, may be null
+ * @param minSize minimum size in bp (inclusive), may be null
+ * @param maxSize maximum size in bp (exclusive), may be null
+ * @param contextNames reference context names
+ */
+ public void addStratification(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final Integer minSize, final Integer maxSize, final Set contextNames) {
+ addStratification(new Stratum(name, svType, minSize, maxSize, contextNames));
+ }
+
+ protected void addStratification(final Stratum stratification) {
+ Utils.validateArg(!strata.containsKey(stratification.getName()), "Encountered duplicate name " + stratification.getName());
+ strata.put(stratification.getName(), stratification);
+ }
+
+ /**
+ * Retrieves intervals for the given context
+ * @param name context ID
+ * @return searchable interval set
+ */
+ public OverlapDetector getContextIntervals(final String name) {
+ return contextMap.get(name);
+ }
+
+ /**
+ * Factory method for creating a new engine from a config file and set of reference contexts. The config file
+ * is a table parsable by {@link TableReader}, with mandatory columns defined in {@link #COLUMN_NAMES}.
+ * @param contextMap map from reference context name to interval set
+ * @param configFilePath path to stratification config table
+ * @param dictionary reference dict
+ * @return new engine
+ */
+ public static SVStatificationEngine create(final Map> contextMap,
+ final GATKPath configFilePath,
+ final SAMSequenceDictionary dictionary) {
+ Utils.nonNull(contextMap);
+ Utils.nonNull(configFilePath);
+ final SVStatificationEngine engine = new SVStatificationEngine(dictionary);
+ for (final Map.Entry> entry : contextMap.entrySet()) {
+ engine.addContext(entry.getKey(), entry.getValue());
+ }
+ try (final TableReader tableReader = TableUtils.reader(configFilePath.toPath(), engine::tableParser)) {
+ for (final Stratum stratification : tableReader) {
+ engine.addStratification(stratification);
+ }
+ } catch (final IOException e) {
+ throw new GATKException("IO error while reading config table", e);
+ }
+ return engine;
+ }
+
+ /**
+ * Get all stratification groups matching a given query record.
+ * @param record query record
+ * @param overlapFraction minimum overlap fraction (0 to 1)
+ * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the reference context(s) (0, 1, 2)
+ * @param numBreakpointOverlapsInterchrom minimum breakpoint ends for interchromosomal variants (1, 2)
+ * @return all matching strata
+ */
+ public Collection getMatches(final SVCallRecord record, final double overlapFraction, final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+ Utils.nonNull(record);
+ final List result = new ArrayList<>();
+ for (final Stratum stratification : strata.values()) {
+ if (stratification.matches(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom)) {
+ result.add(stratification);
+ }
+ }
+ return result;
+ }
+
+    protected Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+        // Check for expected columns
+        for (final String column : COLUMN_NAMES) {
+            if (!columns.contains(column)) {
+                throw exceptionFactory.apply("Missing column " + column);
+            }
+        }
+        // Check there are no extra columns
+        if (columns.columnCount() != COLUMN_NAMES.size()) {
+            throw exceptionFactory.apply("Expected " + COLUMN_NAMES.size() + " columns but found " + columns.columnCount());
+        }
+        return this::parseTableLine;
+    }
+
+ protected Stratum parseTableLine(final DataLine dataLine) {
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svType = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(dataLine.get(SVTYPE_COLUMN));
+ final String name = dataLine.get(NAME_COLUMN);
+ final Integer minSize = parseIntegerMaybeNull(dataLine.get(MIN_SIZE_COLUMN));
+ final Integer maxSize = parseIntegerMaybeNull(dataLine.get(MAX_SIZE_COLUMN));
+ final Set contextNames = parseContextString(dataLine.get(CONTEXT_COLUMN));
+ return new Stratum(name, svType, minSize, maxSize, contextNames);
+ }
+
+ protected Set parseContextString(final String val) {
+ if (NULL_TABLE_VALUES.contains(val)) {
+ return Collections.emptySet();
+ } else {
+ final String[] contextArray = val.split(CONTEXT_COLUMN_DELIMITER);
+ for (final String context : contextArray) {
+ if (!contextMap.containsKey(context)) {
+ throw new GATKException("Could not find context with name " + context);
+ }
+ }
+ return Lists.newArrayList(contextArray).stream().collect(Collectors.toUnmodifiableSet());
+ }
+ }
+
+ protected Integer parseIntegerMaybeNull(final String val) {
+ if (NULL_TABLE_VALUES.contains(val)) {
+ return null;
+ } else {
+ return Integer.valueOf(val);
+ }
+ }
+
+ public Collection getStrata() {
+ return strata.values();
+ }
+
+ public class Stratum {
+
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svType;
+ final int minSize; // inclusive
+ final int maxSize; // exclusive
+ final List contextNames;
+ final String name;
+
+ Stratum(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final Integer minSize, final Integer maxSize, final Set contextNames) {
+ this.name = Utils.nonNull(name);
+ for (final String contextName : contextNames) {
+ if (contextName != null && !contextMap.containsKey(contextName)) {
+ throw new IllegalArgumentException("Unregistered context name " + contextName);
+ }
+ }
+ if (maxSize != null && minSize != null && maxSize <= minSize) {
+ throw new IllegalArgumentException("Min size must be strictly less than max size");
+ }
+ if (maxSize != null && maxSize < 0) {
+ throw new IllegalArgumentException("Max size cannot be less than 0");
+ }
+ if (maxSize != null && maxSize == Integer.MAX_VALUE) {
+ throw new IllegalArgumentException("Max size " + Integer.MAX_VALUE + " is reserved");
+ }
+ if (minSize != null && minSize < 0) {
+ throw new IllegalArgumentException("Min size cannot be less than 0");
+ }
+ if ((svType == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || svType == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && (minSize != null || maxSize != null)) {
+ throw new IllegalArgumentException("BND/CTX categories cannot have min or max size (" + name + ")");
+ }
+ this.svType = svType;
+            // Map a null min size to negative infinity
+ if (minSize == null) {
+ this.minSize = Integer.MIN_VALUE;
+ } else {
+ this.minSize = minSize;
+ }
+            // Map a null max size to infinity
+ if (maxSize == null) {
+ this.maxSize = Integer.MAX_VALUE;
+ } else {
+ this.maxSize = maxSize;
+ }
+ this.contextNames = contextNames.stream().sorted().collect(Collectors.toList());
+ }
+
+ protected boolean matches(final SVCallRecord record, final double overlapFraction,
+ final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+ return matchesType(record) && matchesSize(record) && matchesContext(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom);
+ }
+
+ protected boolean matchesType(final SVCallRecord record) {
+ return record.getType() == svType;
+ }
+
+ protected boolean matchesSize(final SVCallRecord record) {
+ final Integer length = record.getLength();
+ if (length == null) {
+ // Undefined length requires null min/max boundaries
+ return minSize == Integer.MIN_VALUE && maxSize == Integer.MAX_VALUE;
+ } else {
+ return length >= minSize && length < maxSize;
+ }
+ }
+
+ /**
+ * Determines whether a given query record belongs to this context.
+ * @param record query record
+ * @param overlapFraction minimum variant interval overlap fraction
+ * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the context
+         * @param numBreakpointOverlapsInterchrom minimum breakpoint ends if the variant is interchromosomal
+ * @return true if the SV matches this context
+ */
+ public boolean matchesContext(final SVCallRecord record,
+ final double overlapFraction,
+ final int numBreakpointOverlaps,
+ final int numBreakpointOverlapsInterchrom) {
+ Utils.nonNull(record);
+ Utils.validate(overlapFraction >= 0 && overlapFraction <= 1,
+ "Overlap fraction threshold " + overlapFraction + " must be on [0, 1]");
+ Utils.validate(numBreakpointOverlaps >= 0 && numBreakpointOverlaps <= 2,
+ "Breakpoint overlaps threshold " + numBreakpointOverlaps + " must be 0, 1, or 2");
+ Utils.validate(numBreakpointOverlapsInterchrom == 1 || numBreakpointOverlapsInterchrom == 2,
+ "Interchromosomal breakpoint overlaps threshold " + numBreakpointOverlapsInterchrom + " must be 1 or 2");
+ Utils.validate(!(overlapFraction == 0 && numBreakpointOverlaps == 0),
+ "Overlap fraction and overlapping breakpoints thresholds cannot both be 0");
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+ // Just require the insertion locus to fall in an interval
+ return matchesContextBreakpointOverlap(record, 1);
+ } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
+ // Interchromosomal variants
+ return matchesContextBreakpointOverlap(record, numBreakpointOverlapsInterchrom);
+ } else {
+ return matchesContextIntrachromosomal(record, overlapFraction, numBreakpointOverlaps);
+ }
+ }
+
+ protected boolean matchesContextIntrachromosomal(final SVCallRecord record,
+ final double overlapFraction,
+ final int numBreakpointOverlaps) {
+ return matchesContextOverlapFraction(record, overlapFraction) && matchesContextBreakpointOverlap(record, numBreakpointOverlaps);
+ }
+
+ protected boolean matchesContextOverlapFraction(final SVCallRecord record, final double overlapFraction) {
+ if (overlapFraction > 0 && !contextNames.isEmpty()) {
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ throw new GATKException("Context overlap for CPX types not currently supported (" + name + ")");
+ }
+ final SimpleInterval interval = new SimpleInterval(record.getContigA(), record.getPositionA(), record.getPositionB());
+ final List overlaps = new ArrayList<>();
+ for (final String context : contextNames) {
+ overlaps.addAll(contextMap.get(context).getOverlaps(interval).stream().map(SimpleInterval::new).collect(Collectors.toList()));
+ }
+ final List mergedOverlaps = IntervalUtils.sortAndMergeIntervals(overlaps, dictionary, IntervalMergingRule.ALL)
+ .values().stream().flatMap(List::stream).collect(Collectors.toList());
+ long overlapLength = 0;
+ for (final Locatable overlap : mergedOverlaps) {
+ overlapLength += interval.intersect(overlap).size();
+ }
+ return overlapLength / (double) interval.getLengthOnReference() >= overlapFraction;
+ } else {
+ return true;
+ }
+ }
+
+ protected boolean matchesContextBreakpointOverlap(final SVCallRecord record, final int numBreakpointOverlaps) {
+ if (numBreakpointOverlaps > 0 && !contextNames.isEmpty()) {
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ throw new GATKException("Context overlap for CPX types not currently supported (" + name + ")");
+ }
+ final SimpleInterval intervalA = new SimpleInterval(record.getContigA(), record.getPositionA(), record.getPositionA());
+ final SimpleInterval intervalB = new SimpleInterval(record.getContigB(), record.getPositionB(), record.getPositionB());
+ return countAnyContextOverlap(intervalA) + countAnyContextOverlap(intervalB) >= numBreakpointOverlaps;
+ } else {
+ return true;
+ }
+ }
+
+ protected int countAnyContextOverlap(final SimpleInterval interval) {
+ for (final String context : contextNames) {
+ if (contextMap.get(context).overlapsAny(interval)) {
+ return 1;
+ }
+ }
+ return 0;
+ }
+
+ public GATKSVVCFConstants.StructuralVariantAnnotationType getSvType() {
+ return svType;
+ }
+
+ public Integer getMinSize() {
+ return minSize;
+ }
+
+ public Integer getMaxSize() {
+ return maxSize;
+ }
+
+ public List getContextNames() {
+ return contextNames;
+ }
+
+ public String getName() {
+ return name;
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
new file mode 100644
index 00000000000..470bbb3a679
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
@@ -0,0 +1,70 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Arguments for use with {@link SVStatificationEngine}.
+ */
+public class SVStratificationEngineArgumentsCollection implements Serializable {
+ // Command-line arguments
+ public static final String STRATIFY_CONFIG_FILE_LONG_NAME = "stratify-config";
+ public static final String CONTEXT_NAME_FILE_LONG_NAME = "context-name";
+ public static final String CONTEXT_INTERVAL_FILE_LONG_NAME = "context-intervals";
+ public static final String OVERLAP_FRACTION_LONG_NAME = "stratify-overlap-fraction";
+ public static final String NUM_BREAKPOINT_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps";
+ public static final String NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps-interchromosomal";
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Expected format is tab-delimited and contains columns NAME, SVTYPE, MIN_SIZE, MAX_SIZE, CONTEXT. First line must
+ * be a header with column names. Comment lines starting with {@link TableUtils#COMMENT_PREFIX} are ignored.
+ */
+ @Argument(
+ doc = "Stratification configuration file (.tsv)",
+ fullName = STRATIFY_CONFIG_FILE_LONG_NAME
+ )
+ public GATKPath configFile;
+
+ @Argument(
+ doc = "Context intervals file. Can be specified multiple times.",
+ fullName = CONTEXT_INTERVAL_FILE_LONG_NAME,
+ optional = true
+ )
+ public List contextFileList;
+
+ @Argument(
+ doc = "Context names. Must be specified once for each --" + CONTEXT_INTERVAL_FILE_LONG_NAME,
+ fullName = CONTEXT_NAME_FILE_LONG_NAME,
+ optional = true
+ )
+ public List contextNameList;
+
+ @Argument(
+ doc = "Minimum overlap fraction for contexts",
+ minValue = 0,
+ maxValue = 1,
+ fullName = OVERLAP_FRACTION_LONG_NAME
+ )
+ public double overlapFraction = 0;
+
+ @Argument(
+ doc = "Minimum number of variant endpoint overlaps for contexts",
+ minValue = 0,
+ maxValue = 2,
+ fullName = NUM_BREAKPOINT_OVERLAPS_LONG_NAME
+ )
+ public int numBreakpointOverlaps = 1;
+
+ @Argument(
+ doc = "Minimum number of breakpoint overlaps for contexts for interchromosomal variants (e.g. BNDs)",
+ minValue = 1,
+ maxValue = 2,
+ fullName = NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME
+ )
+ public int numBreakpointOverlapsInterchrom = 1;
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
new file mode 100644
index 00000000000..274716c3c29
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
@@ -0,0 +1,241 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.vcf.VCFHeader;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.cluster.*;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are
+ * grouped according to customizable stratification criteria including:
+ *
+ * - SV type
+ * - Size range
+ * - Reference context
+ *
+ * The first step is to define these groups in a stratification configuration TSV file. Please see the
+ * {@link SVStratify} tool for a description of the stratification method and expected table format.
+ *
+ * Each SV is only clustered with other SVs in its own group. Groups must be mutually exclusive, meaning that
+ * any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be
+ * clustered.
+ *
+ * The second step is to define the clustering configuration for each group. This is again done by creating a TSV
+ * file with the following columns defined on the first line:
+ *
+ * - NAME
+ * - RECIPROCAL_OVERLAP
+ * - SIZE_SIMILARITY
+ * - BREAKEND_WINDOW
+ * - SAMPLE_OVERLAP
+ *
+ * where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given
+ * a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files
+ * (order does not matter).
+ *
+ *
+ * The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information
+ * on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only,
+ * PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to
+ * all three cases.
+ *
+ * For example,
+ *
+ *
+ * NAME | RECIPROCAL_OVERLAP | SIZE_SIMILARITY | BREAKEND_WINDOW | SAMPLE_OVERLAP |
+ *
+ *
+ * DEL_large_SD | 0.3 | 0.5 | 1000000 | 0.1 |
+ *
+ *
+ * DUP_large_SD | 0.3 | 0.5 | 1000000 | 0.1 |
+ *
+ *
+ * DEL_small_SR_RM | 0.5 | 0.5 | 100 | 0.1 |
+ *
+ *
+ * DUP_small_SR_RM | 0.5 | 0.5 | 100 | 0.1 |
+ *
+ *
+ * INS_SR | 0.5 | 0.5 | 100 | 0 |
+ *
+ *
+ *
+ * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
+ *
+ * This tool does not support CNV defragmentation via the {@link #algorithm} parameter.
+ *
+ * Inputs
+ *
+ *
+ * -
+ * One or more SV VCFs
+ *
+ * -
+ * Stratification configuration TSV file
+ *
+ * -
+ * Clustering configuration TSV file
+ *
+ * -
+ * Reference fasta
+ *
+ *
+ *
+ * Output
+ *
+ *
+ * -
+ * Clustered VCF
+ *
+ *
+ *
+ * Usage example
+ *
+ *
+ * gatk GroupedSVCluster \
+ * -R reference.fasta \
+ * -V variants.vcf.gz \
+ * -O clustered.vcf.gz \
+ * --context-name repeatmasker \
+ * --context-intervals repeatmasker.bed \
+ * --stratify-config strata.tsv \
+ * --clustering-config cluster.tsv
+ *
+ *
+ * @author Mark Walker <markw@broadinstitute.org>
+ */
+@CommandLineProgramProperties(
+ summary = "Clusters structural variants within independent stratification groups",
+ oneLineSummary = "Clusters structural variants within independent stratification groups",
+ programGroup = StructuralVariantDiscoveryProgramGroup.class
+)
+@BetaFeature
+@DocumentedFeature
+public final class GroupedSVCluster extends SVClusterWalker {
+ public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config";
+
+ @ArgumentCollection
+ private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();
+
+ /**
+ * Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW,
+ * SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with
+ * {@link TableUtils#COMMENT_PREFIX} are ignored.
+ */
+ @Argument(
+ doc = "Configuration file (.tsv) containing the clustering parameters for each group",
+ fullName = CLUSTERING_CONFIG_FILE_LONG_NAME
+ )
+ public GATKPath strataClusteringConfigFile;
+
+ private SVStatificationEngine stratEngine;
+ private final Map clusterEngineMap = new HashMap<>();
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+ // sorting not guaranteed
+ createOutputVariantIndex = false;
+ stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary);
+ Utils.validate(!stratEngine.getStrata().isEmpty(),
+ "No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME);
+ readStrataClusteringConfig();
+ Utils.validate(stratEngine.getStrata().size() == clusterEngineMap.size(),
+ "Stratification and clustering configurations have a different number of groups.");
+ for (final SVStatificationEngine.Stratum stratum : stratEngine.getStrata()) {
+ Utils.validate(clusterEngineMap.containsKey(stratum.getName()),
+ "Could not find group " + stratum.getName() + " in clustering configuration.");
+ }
+ }
+
+ @Override
+ protected VCFHeader createHeader() {
+ final VCFHeader header = super.createHeader();
+ SVStratify.addStratifyMetadata(header);
+ return header;
+ }
+
+ private void readStrataClusteringConfig() {
+ try (final TableReader tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) {
+ for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) {
+ // Identical parameters for each linkage type
+ final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+ final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+ final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+ final SVClusterEngine clusterEngine = createClusteringEngine(pesrParams, mixedParams, depthParams);
+ clusterEngineMap.put(parameters.name(), clusterEngine);
+ }
+ } catch (final IOException e) {
+ throw new GATKException("IO error while reading config table", e);
+ }
+ }
+
+ private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) {
+ if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) {
+ final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ?
+ SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE;
+ return SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy,
+ altAlleleSummaryStrategy, dictionary, reference, enableCnv,
+ depthParams, mixedParams, pesrParams);
+ } else {
+ throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
+ }
+ }
+
+ @Override
+ public Object onTraversalSuccess() {
+ for (final SVClusterEngine engine : clusterEngineMap.values()) {
+ engine.flush().stream().forEach(this::write);
+ }
+ return super.onTraversalSuccess();
+ }
+
+ @Override
+ public void closeTool() {
+ super.closeTool();
+ }
+
+ @Override
+ public void applyRecord(final SVCallRecord record) {
+ final Collection stratifications = stratEngine.getMatches(record,
+ stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
+ if (stratifications.size() > 1) {
+ // don't allow more than one match since it would proliferate variants
+ final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
+ throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString);
+ } else if (stratifications.isEmpty()) {
+ // no match, don't cluster
+ record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId()));
+ record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM));
+ write(record);
+ } else {
+ // exactly one match
+ final SVStatificationEngine.Stratum stratum = stratifications.iterator().next();
+ Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName());
+ record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName()));
+ clusterAndWrite(record, clusterEngineMap.get(stratum.getName()));
+ }
+ }
+
+ private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) {
+ clusterEngine.add(record).stream().forEach(this::write);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
index e447d2c1086..af2d6a0d266 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
@@ -105,6 +105,8 @@ public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnSta
private SampleDB sampleDB;
private boolean isMultiSampleInput = false;
private ReferenceSequenceFile reference;
+ private Collection defragmentBuffer;
+ private Collection outputBuffer;
private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY"));
class CopyNumberAndEndRecord {
@@ -132,6 +134,7 @@ public int getEndPosition() {
public static final String MODEL_CALL_INTERVALS_LONG_NAME = "model-call-intervals";
public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy";
public static final String ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy";
+ public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic";
@Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true)
private int minQS = 20;
@@ -200,6 +203,13 @@ public boolean requiresReference() {
// Cannot require sample overlap when clustering across samples
private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0;
+ @Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME,
+ doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
+ "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
+ "VCF file, and increases the amount of RAM needed.",
+ optional=true)
+ public int maxRecordsInRam = 10000;
+
@Override
public void onTraversalStart() {
reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
@@ -223,6 +233,8 @@ public void onTraversalStart() {
clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy,
dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS);
+ defragmentBuffer = new ArrayList<>();
+ outputBuffer = new ArrayList<>();
vcfWriter = getVCFWriter();
if (getSamplesForVariants().size() != 1) {
@@ -285,14 +297,38 @@ public void apply(final List variantContexts, final ReferenceCon
final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB);
if (record != null) {
if (!isMultiSampleInput) {
- defragmenter.add(record);
+ bufferDefragmenterOutput(defragmenter.add(record));
} else {
- clusterEngine.add(record);
+ bufferClusterOutput(clusterEngine.add(record));
}
}
}
}
+ private void bufferDefragmenterOutput(final List records) {
+ defragmentBuffer.addAll(records);
+ }
+
+ private List flushDefragmenterBuffer() {
+ final List result = defragmentBuffer.stream()
+ .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
+ .collect(Collectors.toUnmodifiableList());
+ defragmentBuffer = new ArrayList<>();
+ return result;
+ }
+
+ private void bufferClusterOutput(final List records) {
+ outputBuffer.addAll(records);
+ }
+
+ private List flushClusterBuffer() {
+ final List result = outputBuffer.stream()
+ .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
+ .collect(Collectors.toUnmodifiableList());
+ outputBuffer = new ArrayList<>();
+ return result;
+ }
+
@Override
public Object onTraversalSuccess() {
processClusters();
@@ -305,11 +341,16 @@ public Object onTraversalSuccess() {
* new contig.
*/
private void processClusters() {
- final List defragmentedCalls = defragmenter.forceFlush();
- defragmentedCalls.stream().forEachOrdered(clusterEngine::add);
+ bufferDefragmenterOutput(defragmenter.flush());
//Jack and Isaac cluster first and then defragment
- final List clusteredCalls = clusterEngine.forceFlush();
- write(clusteredCalls);
+ bufferClusterOutput(
+ flushDefragmenterBuffer().stream()
+ .map(clusterEngine::add)
+ .flatMap(List::stream)
+ .collect(Collectors.toList())
+ );
+ bufferClusterOutput(clusterEngine.flush());
+ write(flushClusterBuffer());
}
private VariantContext buildAndSanitizeRecord(final SVCallRecord record) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
index 791befb79fb..3118f02e89b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
@@ -1,32 +1,13 @@
package org.broadinstitute.hellbender.tools.walkers.sv;
-import htsjdk.samtools.SAMSequenceDictionary;
-import htsjdk.samtools.reference.ReferenceSequenceFile;
-import htsjdk.variant.variantcontext.GenotypesContext;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.variantcontext.VariantContextBuilder;
-import htsjdk.variant.variantcontext.writer.VariantContextWriter;
-import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
-import org.broadinstitute.hellbender.engine.*;
-import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
-import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
-import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
-
-import java.util.List;
-import java.util.Set;
-
-import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME;
/**
* Clusters structural variants based on coordinates, event type, and supporting algorithms. Primary use cases include:
@@ -178,111 +159,10 @@
)
@BetaFeature
@DocumentedFeature
-public final class SVCluster extends MultiVariantWalker {
- public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table";
- public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix";
- public static final String ENABLE_CNV_LONG_NAME = "enable-cnv";
+public final class SVCluster extends SVClusterWalker {
+
public static final String DEFRAG_PADDING_FRACTION_LONG_NAME = "defrag-padding-fraction";
public static final String DEFRAG_SAMPLE_OVERLAP_LONG_NAME = "defrag-sample-overlap";
- public static final String CONVERT_INV_LONG_NAME = "convert-inv-to-bnd";
- public static final String ALGORITHM_LONG_NAME = "algorithm";
- public static final String FAST_MODE_LONG_NAME = "fast-mode";
- public static final String OMIT_MEMBERS_LONG_NAME = "omit-members";
- public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call";
-
- /**
- * The enum Cluster algorithm.
- */
- enum CLUSTER_ALGORITHM {
- /**
- * Defragment cnv cluster algorithm.
- */
- DEFRAGMENT_CNV,
- /**
- * Single linkage cluster algorithm.
- */
- SINGLE_LINKAGE,
- /**
- * Max clique cluster algorithm.
- */
- MAX_CLIQUE
- }
-
- @Argument(
- doc = "Output VCF",
- fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
- shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
- )
- private GATKPath outputFile;
-
- /**
- * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns
- * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy
- * integers in their respective columns.
- */
- @Argument(
- doc = "Sample ploidy table (.tsv)",
- fullName = PLOIDY_TABLE_LONG_NAME
- )
- private GATKPath ploidyTablePath;
-
- @Argument(
- doc = "If supplied, generate variant IDs with this prefix",
- fullName = VARIANT_PREFIX_LONG_NAME,
- optional = true
- )
- private String variantPrefix = null;
-
- /**
- * When enabled, DEL and DUP variants will be clustered together. The resulting records with have an SVTYPE of CNV.
- */
- @Argument(
- doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)",
- fullName = ENABLE_CNV_LONG_NAME,
- optional = true
- )
- private boolean enableCnv = false;
-
- /**
- * When enabled, INV records will be converted to a pairs of BNDs prior to clustering.
- */
- @Argument(
- doc = "Convert inversions to BND records",
- fullName = CONVERT_INV_LONG_NAME,
- optional = true
- )
- private boolean convertInversions = false;
-
- /**
- * Results in substantial space and time costs for large sample sets by clearing genotypes that are not needed for
- * clustering, but any associated annotation fields will be set to null in the output.
- */
- @Argument(
- doc = "Fast mode. Drops hom-ref and no-call genotype fields and emits them as no-calls.",
- fullName = FAST_MODE_LONG_NAME,
- optional = true
- )
- private boolean fastMode = false;
-
- @Argument(
- doc = "Omit cluster member ID annotations",
- fullName = OMIT_MEMBERS_LONG_NAME,
- optional = true
- )
- private boolean omitMembers = false;
-
- @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME,
- doc = "Strategy to use for choosing a representative value for a breakpoint cluster.",
- optional = true)
- private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy =
- CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE;
-
- @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME,
- doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " +
- "different subtypes.",
- optional = true)
- private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy =
- CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
@Argument(fullName = DEFRAG_PADDING_FRACTION_LONG_NAME,
doc = "Padding as a fraction of variant length for CNV defragmentation mode.",
@@ -296,51 +176,14 @@ enum CLUSTER_ALGORITHM {
)
private double defragSampleOverlapFraction = CNVLinkage.DEFAULT_SAMPLE_OVERLAP;
- @Argument(fullName = ALGORITHM_LONG_NAME,
- doc = "Clustering algorithm",
- optional = true
- )
- private CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE;
-
- /**
- * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different
- * variants and samples are provided.
- */
- @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME,
- doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" +
- " available",
- optional = true
- )
- private boolean defaultNoCall = false;
-
@ArgumentCollection
private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection();
- private SAMSequenceDictionary dictionary;
- private ReferenceSequenceFile reference;
- private PloidyTable ploidyTable;
- private VariantContextWriter writer;
- private VCFHeader header;
- private SVClusterEngine clusterEngine;
- private Set samples;
- private String currentContig;
- private int numVariantsBuilt = 0;
-
- @Override
- public boolean requiresReference() {
- return true;
- }
+ protected SVClusterEngine clusterEngine;
@Override
public void onTraversalStart() {
- reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
- dictionary = reference.getSequenceDictionary();
- if (dictionary == null) {
- throw new UserException("Reference sequence dictionary required");
- }
- ploidyTable = new PloidyTable(ploidyTablePath.toPath());
- samples = getSamplesForVariants();
-
+ super.onTraversalStart();
if (algorithm == CLUSTER_ALGORITHM.DEFRAGMENT_CNV) {
clusterEngine = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy,
reference, defragPaddingFraction, defragSampleOverlapFraction);
@@ -354,107 +197,21 @@ public void onTraversalStart() {
} else {
throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
}
-
- writer = createVCFWriter(outputFile);
- header = createHeader();
- writer.writeHeader(header);
- currentContig = null;
}
@Override
public Object onTraversalSuccess() {
- write(true);
+ clusterEngine.flush().stream().forEach(this::write);
return super.onTraversalSuccess();
}
@Override
public void closeTool() {
super.closeTool();
- if (writer != null) {
- writer.close();
- }
}
@Override
- public void apply(final VariantContext variant, final ReadsContext readsContext,
- final ReferenceContext referenceContext, final FeatureContext featureContext) {
- final SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
- final SVCallRecord filteredCall;
- if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
- // Strip out non-carrier genotypes to save memory and compute
- // Don't do for multi-allelic CNVs since carrier status can't be determined
- final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList());
- filteredCall = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes);
- } else {
- filteredCall = call;
- }
-
- // Update current contig
- if (!filteredCall.getContigA().equals(currentContig)) {
- currentContig = filteredCall.getContigA();
- logger.info("Processing contig " + currentContig + "...");
- }
-
- // Add to clustering buffer
- if (convertInversions) {
- SVCallRecordUtils.convertInversionsToBreakends(filteredCall, dictionary).forEachOrdered(clusterEngine::add);
- } else {
- clusterEngine.add(filteredCall);
- }
-
- write(false);
- }
-
- private void write(final boolean force) {
- final List records = force ? clusterEngine.forceFlush() : clusterEngine.flush();
- records.stream().map(this::buildVariantContext).forEachOrdered(writer::add);
+ public void applyRecord(final SVCallRecord record) {
+ clusterEngine.add(record).stream().forEach(this::write);
}
-
- private VCFHeader createHeader() {
- final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
- header.setSequenceDictionary(dictionary);
-
- // Required info lines
- header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
- header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
- header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1,
- VCFHeaderLineType.Integer, "Second position"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1,
- VCFHeaderLineType.String, "Second contig"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1,
- VCFHeaderLineType.String, "First and second strands"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
- VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms"));
- if (!omitMembers) {
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY,
- VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids"));
- }
-
- // Required format lines
- header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
-
- return header;
- }
-
- public VariantContext buildVariantContext(final SVCallRecord call) {
- // Add genotypes for missing samples
- final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(
- call, samples, !defaultNoCall, ploidyTable, header);
-
- // Assign new variant ID
- final String newId = variantPrefix == null ? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++);
-
- // Build new variant
- final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
- call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
- call.getComplexEventIntervals(), call.getLength(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
- call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
- final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
- if (omitMembers) {
- builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
- }
- return builder.make();
- }
-
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
index 56c389539c6..18f405cef06 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
@@ -199,7 +199,7 @@ protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) {
final List genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList());
return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(),
item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(),
- item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getAlgorithms(),
+ item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getEvidence(), item.getAlgorithms(),
item.getAlleles(), genotypes, item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java
new file mode 100644
index 00000000000..a0df7c43bb2
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java
@@ -0,0 +1,291 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFHeaderLineType;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.broadinstitute.hellbender.utils.*;
+
+import java.nio.file.Path;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Stratifies structural variants into separate groups according to the following customizable criteria:
+ *
+ * - SV type
+ * - Size range
+ * - Reference context
+ *
+ * Users should provide a stratification configuration .tsv file (tab-delimited table) with the following column
+ * header on the first line:
+ *
+ * - NAME
+ * - SVTYPE
+ * - MIN_SIZE
+ * - MAX_SIZE
+ * - CONTEXT
+ *
+ *
+ * For example:
+ *
+ *
+ * NAME | SVTYPE | MIN_SIZE | MAX_SIZE | CONTEXT |
+ *
+ *
+ * DEL_large_SD | DEL | 5000 | -1 | SD |
+ *
+ *
+ * DUP_large_SD | DUP | 5000 | -1 | SD |
+ *
+ *
+ * DEL_small_SR_RM | DEL | -1 | 5000 | SR,RM |
+ *
+ *
+ * DUP_small_SR_RM | DUP | -1 | 5000 | SR,RM |
+ *
+ *
+ * INS_SR | INS | -1 | -1 | SR |
+ *
+ *
+ *
+ * The "NAME" column is an arbitrary identifier, "SVTYPE" is the class of variant (DEL, DUP, INS, etc.), MIN_SIZE is an
+ * inclusive size lower-bound, MAX_SIZE is an exclusive size upper-bound, and CONTEXT is a comma-delimited list of
+ * reference contexts defined using the {@link SVStratificationEngineArgumentsCollection#contextFileList} and
+ * {@link SVStratificationEngineArgumentsCollection#contextNameList} parameters. For example,
+ *
+ *
+ * gatk SVStratify \
+ * --context-name RM \
+ * --context-intervals repeatmasker.bed \
+ * --context-name SD \
+ * --context-intervals segmental_duplications.bed \
+ * --context-name SR \
+ * --context-intervals simple_repeats.bed \
+ * ...
+ *
+ *
+ * The MIN_SIZE, MAX_SIZE, and CONTEXT columns may contain null values {"-1", "", "NULL", "NA"}. Null MIN_SIZE and
+ * MAX_SIZE correspond to negative and positive infinity, respectively, and a null CONTEXT value means that variants
+ * will not be matched based on context. Variants with undefined SVLEN will only match if both MIN_SIZE and MAX_SIZE
+ * are null.
+ *
+ *
+ * The {@link SVStratificationEngineArgumentsCollection#overlapFraction},
+ * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps}, and
+ * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom} can be used to modify the overlap
+ * criteria for assigning variants to each group based on overlap with the given reference context intervals. By
+ * default, only one endpoint of the variant needs to lie in a context interval in order to match. INS variants are
+ * treated as single points and only {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps} is used,
+ * ignoring {@link SVStratificationEngineArgumentsCollection#overlapFraction}. Similarly, CTX and BND variant
+ * overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}.
+ *
+ *
+ * By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to
+ * one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group.
+ * This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument.
+ * Furthermore, SVs that do not fall into any of the groups will be assigned to the
+ * {@link SVStratify#DEFAULT_STRATUM} group.
+ *
+ * The tool generates a set of VCFs as output, with each VCF containing the records of each group. In addition,
+ * records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO
+ * field.
+ *
+ * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
+ *
+ * Inputs
+ *
+ *
+ * -
+ * One or more SV VCFs
+ *
+ * -
+ * Stratification configuration TSV file
+ *
+ * -
+ * Reference dictionary
+ *
+ *
+ *
+ * Output
+ *
+ *
+ * -
+ * Set of stratified VCFs
+ *
+ *
+ *
+ * Usage example
+ *
+ *
+ * gatk SVStratify \
+ * -V variants.vcf.gz \
+ * -O out \
+ * --sequence-dictionary reference.dict \
+ * --context-name RM \
+ * --context-intervals repeatmasker.bed \
+ * --stratify-config strata.tsv
+ *
+ *
+ * @author Mark Walker <markw@broadinstitute.org>
+ */
+@CommandLineProgramProperties(
+ summary = "Splits VCFs by SV type, size, and reference context",
+ oneLineSummary = "Splits VCFs by SV type, size, and reference context",
+ programGroup = StructuralVariantDiscoveryProgramGroup.class
+)
+@BetaFeature
+@DocumentedFeature
+public final class SVStratify extends MultiVariantWalker {
+
+ public static final String ALLOW_MULTIPLE_MATCHES_LONG_NAME = "allow-multiple-matches";
+
+ // Default output group name for unmatched records
+ public static final String DEFAULT_STRATUM = "default";
+
+ @Argument(
+ doc = "Output directory",
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+ )
+ private GATKPath outputPath;
+
+ @Argument(
+ doc = "Prefix for output filenames",
+ fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME
+ )
+ private String outputPrefix;
+
+ @ArgumentCollection
+ private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();
+
+ @Argument(
+ doc = "Do not enforce mutual exclusivity for each stratification group",
+ fullName = ALLOW_MULTIPLE_MATCHES_LONG_NAME
+ )
+ private boolean allowMultipleMatches = false;
+
+ protected SAMSequenceDictionary dictionary;
+ protected Map writers;
+ protected SVStatificationEngine engine;
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+ dictionary = getMasterSequenceDictionary();
+ Utils.validateArg(dictionary != null, "Reference dictionary is required; please specify with --" +
+ StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME);
+ engine = loadStratificationConfig(stratArgs, dictionary);
+ logger.debug("Loaded stratification groups:");
+ for (final SVStatificationEngine.Stratum s : engine.getStrata()) {
+ logger.debug(s);
+ }
+ initializeWriters();
+ }
+
+ protected void createGroupWriter(final String name) {
+ final String filename = outputPrefix + "." + name + ".vcf.gz";
+ final Path path = outputPath.toPath().resolve(filename);
+ final VariantContextWriter writer = createVCFWriter(path);
+ final VCFHeader header = new VCFHeader(getHeaderForVariants());
+ addStratifyMetadata(header);
+ writer.writeHeader(header);
+ if (writers.containsKey(name)) {
+ throw new GATKException.ShouldNeverReachHereException("Stratification name already exists: " + name);
+ }
+ writers.put(name, writer);
+ }
+
+ public static void addStratifyMetadata(final VCFHeader header) {
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, 1,
+ VCFHeaderLineType.String, "Stratum ID"));
+ }
+
+ protected void initializeWriters() {
+ writers = new HashMap<>();
+ createGroupWriter(DEFAULT_STRATUM);
+ for (final SVStatificationEngine.Stratum s : engine.getStrata()) {
+ createGroupWriter(s.getName());
+ }
+ }
+
+ /**
+ * Reusable method for loading the stratification configuration table. See tool doc for the expected format.
+ */
+ public static SVStatificationEngine loadStratificationConfig(final SVStratificationEngineArgumentsCollection args,
+ final SAMSequenceDictionary dictionary) {
+ Utils.validateArg(args.contextNameList.size() == args.contextFileList.size(), "Arguments --" +
+ SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME + " and --" + SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME +
+ " must be specified the same number of times.");
+ final Map> map = new HashMap<>();
+ final Iterator nameIterator = args.contextNameList.iterator();
+ final Iterator pathIterator = args.contextFileList.iterator();
+ final GenomeLocParser genomeLocParser = new GenomeLocParser(dictionary);
+ while (nameIterator.hasNext() && pathIterator.hasNext()) {
+ final String name = nameIterator.next();
+ final GATKPath path = pathIterator.next();
+ final GenomeLocSortedSet genomeLocs = IntervalUtils.loadIntervals(Collections.singletonList(path.toString()), IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser);
+ final List intervals = Collections.unmodifiableList(genomeLocs.toList());
+ if (map.containsKey(name)) {
+ throw new UserException.BadInput("Duplicate context name was specified: " + name);
+ }
+ map.put(name, intervals);
+ }
+ final SVStatificationEngine engine = SVStatificationEngine.create(map, args.configFile, dictionary);
+ if (engine.getStrata().stream().anyMatch(s -> s.getName().equals(DEFAULT_STRATUM))) {
+ throw new UserException.BadInput("Stratification configuration contains entry with reserved " +
+ "ID \"" + DEFAULT_STRATUM + "\"");
+ }
+ return engine;
+ }
+
+ @Override
+ public void closeTool() {
+ for (final VariantContextWriter writer : writers.values()) {
+ writer.close();
+ }
+ super.closeTool();
+ }
+
+ @Override
+ public void apply(final VariantContext variant, final ReadsContext readsContext,
+ final ReferenceContext referenceContext, final FeatureContext featureContext) {
+ // Save a ton of compute by not copying genotypes into the new record
+ final VariantContext variantNoGenotypes = new VariantContextBuilder(variant).genotypes(Collections.emptyList()).make();
+ final SVCallRecord record = SVCallRecordUtils.create(variantNoGenotypes, dictionary);
+ final Collection stratifications = engine.getMatches(record,
+ stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
+ final VariantContextBuilder builder = new VariantContextBuilder(variant);
+ if (stratifications.isEmpty()) {
+ writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make());
+ } else {
+ if (!allowMultipleMatches && stratifications.size() > 1) {
+ final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
+ throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument");
+ }
+ for (final SVStatificationEngine.Stratum stratum : stratifications) {
+ writers.get(stratum.getName()).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make());
+ }
+ }
+ }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
index 35864d62f1a..f0c45c8f934 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
@@ -76,7 +76,7 @@ public Object[][] testCreateInvalidCoordinatesData() {
@Test(dataProvider="testCreateInvalidCoordinatesData", expectedExceptions = { IllegalArgumentException.class })
public void testCreateInvalidCoordinates(final String contigA, final int posA, final String contigB, final int posB) {
new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
- null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.fail("Expected exception not thrown");
}
@@ -93,14 +93,14 @@ public Object[][] testCreateValidCoordinatesData() {
@Test(dataProvider="testCreateValidCoordinatesData")
public void testCreateValidCoordinates(final String contigA, final int posA, final String contigB, final int posB) {
new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
- null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
}
@Test
public void testGetters() {
final SVCallRecord record = new SVCallRecord("var1", "chr1", 100, true, "chr1", 200, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
GenotypesContext.create(GenotypeBuilder.create("sample1", Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL))),
Collections.singletonMap("TEST_KEY", "TEST_VALUE"), Collections.singleton("TEST_FILTER"), Double.valueOf(30), SVTestUtils.hg38Dict);
Assert.assertEquals(record.getId(), "var1");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
index 2c814a45576..1ee6107f3bb 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
@@ -69,7 +69,7 @@ public Object[][] testGetVariantBuilderData() {
return new Object[][]{
// DEL
{
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), Collections.emptyMap(), Collections.singleton("TEST_FILTER"), Double.valueOf(-3)),
@@ -86,7 +86,7 @@ public Object[][] testGetVariantBuilderData() {
},
// DEL w/ null ref allele
{
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
Collections.singletonList(Allele.SV_SIMPLE_DEL),
Collections.singletonList(GENOTYPE_DEL_3),
@@ -102,7 +102,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS
{
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -119,7 +119,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS, flipped strands
{
- new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -136,7 +136,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS, undefined length
{
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -153,7 +153,7 @@ public Object[][] testGetVariantBuilderData() {
},
// BND
{
- new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_BND,
Lists.newArrayList(GENOTYPE_BND_1),
@@ -172,7 +172,7 @@ public Object[][] testGetVariantBuilderData() {
},
// CTX
{
- new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null,
+ new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_CTX,
Lists.newArrayList(GENOTYPE_CTX_1),
@@ -196,6 +196,7 @@ public Object[][] testGetVariantBuilderData() {
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL,
Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:5000-5100", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:100-200", SVTestUtils.hg38Dict)),
100,
+ Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_CPX,
Lists.newArrayList(GENOTYPE_CPX_1),
@@ -223,7 +224,7 @@ public void testGetVariantBuilder(final SVCallRecord record, final VariantContex
@Test
public void testGetVariantBuilderHasSanitizedNullAttributes() {
- final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_BND,
Lists.newArrayList(GENOTYPE_BND_1),
@@ -300,14 +301,14 @@ public void testFillMissingSamplesWithGenotypes() {
@Test
public void testCopyCallWithNewGenotypes() {
- final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
Collections.singletonMap(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList("sample")), Collections.emptySet(), null);
final GenotypesContext genotypes = GenotypesContext.copy(Collections.singletonList(GENOTYPE_DEL_3));
final SVCallRecord result = SVCallRecordUtils.copyCallWithNewGenotypes(record, genotypes);
- final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
genotypes,
@@ -449,7 +450,7 @@ public void testConvertInversionsToBreakends() {
Assert.assertNotNull(nonInversionResult.get(0));
SVTestUtils.assertEqualsExceptMembership(nonInversionResult.get(0), nonInversion);
- final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000,
+ final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(),
Collections.emptyList(),
@@ -524,7 +525,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
null, null,
TEST_ATTRIBUTES, -90.),
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
TEST_ATTRIBUTES, Collections.emptySet(), -90.)
},
@@ -534,7 +535,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
null, null,
TEST_ATTRIBUTES, null),
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -543,7 +544,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "+-",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -552,7 +553,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "-+",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -561,7 +562,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), -1, "-+",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null,
+ new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -570,7 +571,7 @@ public Object[][] testCreateData() {
ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES, null),
- new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -579,7 +580,7 @@ public Object[][] testCreateData() {
ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES, null),
- new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -588,7 +589,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -597,7 +598,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chr1", null, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -606,7 +607,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -622,6 +623,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL,
Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:300-400", SVTestUtils.hg38Dict)),
250,
+ Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -630,7 +632,7 @@ public Object[][] testCreateData() {
ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES_CTX, null),
- new SVCallRecord("var11", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null,
+ new SVCallRecord("var11", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -642,7 +644,7 @@ public Object[][] testCreateData() {
"chrX", 2000,
Map.of("TEST_KEY", "TEST_VAL", GATKSVVCFConstants.CPX_TYPE, "CTX_PP/QQ", GATKSVVCFConstants.CPX_INTERVALS, Collections.emptyList()),
null),
- new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null,
+ new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
index 5a8e97a05fd..110516ac935 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
@@ -10,7 +10,10 @@
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SimpleSVType;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
-import org.broadinstitute.hellbender.tools.sv.cluster.*;
+import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser;
+import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage;
+import org.broadinstitute.hellbender.tools.sv.cluster.ClusteringParameters;
+import org.broadinstitute.hellbender.tools.sv.cluster.SVClusterEngine;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
@@ -33,7 +36,8 @@ public class SVTestUtils {
new CanonicalSVCollapser(
hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
public static final String PESR_ALGORITHM = "pesr";
public static final List DEPTH_ONLY_ALGORITHM_LIST = Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM);
@@ -121,6 +125,21 @@ public static SVClusterEngine getNewDefaultMaxCliqueEngine() {
.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
public final static SVCallRecord makeRecord(final String id,
+ final String contigA,
+ final int positionA,
+ final Boolean strandA,
+ final String contigB,
+ final int positionB,
+ final Boolean strandB,
+ final GATKSVVCFConstants.StructuralVariantAnnotationType type,
+ final Integer length,
+ final List algorithms,
+ final List alleles,
+ final List genotypeBuilders) {
+ return makeRecordWithEvidenceAndQuality(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, Collections.emptyList(), algorithms, alleles, genotypeBuilders, null);
+ }
+
+ public final static SVCallRecord makeRecordWithEvidenceAndQuality(final String id,
final String contigA,
final int positionA,
final Boolean strandA,
@@ -129,17 +148,19 @@ public final static SVCallRecord makeRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
- final List genotypeBuilders) {
+ final List genotypeBuilders,
+ final Double log10PError) {
final Allele refAllele = Allele.create(ReferenceUtils.getRefBaseAtPosition(hg38Reference, contigA, positionA), true);
final List newAlleles = replaceRefAlleles(alleles, refAllele);
final List genotypes = new ArrayList<>(genotypeBuilders.size());
for (final GenotypeBuilder builder : genotypeBuilders) {
genotypes.add(makeGenotypeWithRefAllele(builder, refAllele));
}
- return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, algorithms,
- newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), null, hg38Dict);
+ return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, evidence, algorithms,
+ newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), log10PError, hg38Dict);
}
public static final Genotype makeGenotypeWithRefAllele(final GenotypeBuilder builder, final Allele refAllele) {
@@ -378,16 +399,16 @@ public static SVCallRecord newCallRecordWithAllelesAndSampleName(final String sa
builder = builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
}
return new SVCallRecord("", "chr1", 100, getValidTestStrandA(svtype), "chr1", 199, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ svtype, null, Collections.emptyList(), 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
variantAlleles,
Collections.singletonList(builder.make()),
Collections.emptyMap(), Collections.emptySet(), null);
}
- public static SVCallRecord newNamedDeletionRecordWithAttributes(final String id, final Map attributes) {
- return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false,
+ public static SVCallRecord newDeletionRecordWithAttributes(final Map attributes) {
+ return new SVCallRecord("", "chr1", 100, true, "chr1", 199, false,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
attributes, Collections.emptySet(), null);
@@ -398,7 +419,7 @@ public static SVCallRecord newNamedDeletionRecordWithAttributesAndGenotypes(fina
final Map attributes) {
return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
genotypes,
attributes, Collections.emptySet(), null);
@@ -416,40 +437,40 @@ public static final Map keyValueArraysToMap(final String[] keys,
public static SVCallRecord newCallRecordWithLengthAndType(final Integer length, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
final int positionB = length == null ? 1 : CoordMath.getEnd(1, length);
return new SVCallRecord("", "chr1", 1, getValidTestStrandA(svtype), "chr1", positionB, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ svtype, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null);
}
public static SVCallRecord newDeletionCallRecordWithIdAndAlgorithms(final String id, final List algorithms) {
return new SVCallRecord(id, "chr1", 1, true, "chr1", 100, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, algorithms, Collections.emptyList(),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, Collections.emptyList(), algorithms, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newPESRCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newInsertionWithPositionAndLength(final int start, final int length) {
return new SVCallRecord("", "chr1", start, true, "chr1", start + 1, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
public static SVCallRecord newDepthCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newCallRecordWithContigsIntervalAndType(final String startContig, final int start, final String endContig, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", startContig, start, getValidTestStrandA(svtype), endContig, end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
@@ -463,7 +484,7 @@ public static Integer getLength(final int start, final int end, final GATKSVVCFC
}
public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, final boolean strandB) {
- return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -471,7 +492,7 @@ public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, fi
}
public static SVCallRecord newCtxCallRecord() {
- return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null,
+ return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -479,7 +500,7 @@ public static SVCallRecord newCtxCallRecord() {
}
public static SVCallRecord newCpxCallRecordWithLength(final int length) {
- return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -487,7 +508,7 @@ public static SVCallRecord newCpxCallRecordWithLength(final int length) {
}
public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, final Boolean strandB) {
- return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000,
+ return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -495,7 +516,7 @@ public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, fi
}
public static SVCallRecord newCallRecordWithCoordinates(final String id, final String chrA, final int posA, final String chrB, final int posB) {
- return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList("peser"),
Collections.emptyList(),
Collections.emptyList(),
@@ -503,7 +524,7 @@ public static SVCallRecord newCallRecordWithCoordinates(final String id, final S
}
public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id, final String chrA, final int posA, final String chrB, final int posB, final GATKSVVCFConstants.StructuralVariantAnnotationType type) {
- return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type),
+ return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type), Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -511,7 +532,7 @@ public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id,
}
public static SVCallRecord newCallRecordWithAlgorithms(final List algorithms) {
- return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
algorithms,
Collections.emptyList(),
Collections.emptyList(),
@@ -519,7 +540,15 @@ public static SVCallRecord newCallRecordWithAlgorithms(final List algori
}
public static SVCallRecord newCallRecordInsertionWithLength(final Integer length) {
- return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
+ PESR_ONLY_ALGORITHM_LIST,
+ Collections.emptyList(),
+ Collections.emptyList(),
+ Collections.emptyMap(), Collections.emptySet(), null);
+ }
+
+ public static SVCallRecord newCallRecordInsertionWithLengthAndCoordinates(final String chrA, final int posA, final Integer length) {
+ return new SVCallRecord("", chrA, posA, true, chrA, posA, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(),
Collections.emptyList(),
@@ -597,4 +626,18 @@ public static GenotypeBuilder getDiploidCNVGenotypeBuilder(final String sample,
.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
}
+
+ public static Map buildMapFromArrays(final String[] keys, final Object[] values) {
+ if (keys.length != values.length) {
+ throw new TestException("Keys and values have different lengths: " + keys.length + " and " + values.length);
+ }
+ final Map map = new HashMap<>();
+ for (int i = 0; i < keys.length; i++) {
+ if (keys[i] == null) {
+ throw new TestException("Encountered null key");
+ }
+ map.put(keys[i], values[i]);
+ }
+ return map;
+ }
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
index 611763ff23c..0b6982ef5ce 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
@@ -9,6 +9,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -95,20 +96,22 @@ public void testGetMaxClusterableStartingPosition() {
public void testAdd() {
//single-sample merge case, ignoring sample sets
final SVClusterEngine temp1 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals);
- temp1.add(SVTestUtils.call1);
+ final List output1 = new ArrayList<>();
+ output1.addAll(temp1.add(SVTestUtils.call1));
//force new cluster by adding a non-overlapping event
- temp1.add(SVTestUtils.call3);
- final List output1 = temp1.forceFlush(); //flushes all clusters
+ output1.addAll(temp1.add(SVTestUtils.call3));
+ output1.addAll(temp1.flush()); //flushes all clusters
Assert.assertEquals(output1.size(), 2);
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0));
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1));
final SVClusterEngine temp2 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals);
- temp2.add(SVTestUtils.call1);
- temp2.add(SVTestUtils.call2); //should overlap after padding
+ final List output2 = new ArrayList<>();
+ output2.addAll(temp2.add(SVTestUtils.call1));
+ output2.addAll(temp2.add(SVTestUtils.call2)); //should overlap after padding
//force new cluster by adding a call on another contig
- temp2.add(SVTestUtils.call4_chr10);
- final List output2 = temp2.forceFlush();
+ output2.addAll(temp2.add(SVTestUtils.call4_chr10));
+ output2.addAll(temp2.flush());
Assert.assertEquals(output2.size(), 2);
Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
Assert.assertEquals(output2.get(0).getPositionB(), SVTestUtils.call2.getPositionB());
@@ -116,9 +119,10 @@ public void testAdd() {
//cohort case, checking sample set overlap
final SVClusterEngine temp3 = SVClusterEngineFactory.createCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, CNVLinkage.DEFAULT_PADDING_FRACTION, CNVLinkage.DEFAULT_SAMPLE_OVERLAP);
- temp3.add(SVTestUtils.call1);
- temp3.add(SVTestUtils.sameBoundsSampleMismatch);
- final List output3 = temp3.forceFlush();
+ final List output3 = new ArrayList<>();
+ output3.addAll(temp3.add(SVTestUtils.call1));
+ output3.addAll(temp3.add(SVTestUtils.sameBoundsSampleMismatch));
+ output3.addAll(temp3.flush());
Assert.assertEquals(output3.size(), 2);
}
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
index 012c302bd83..cb540d4988e 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
@@ -21,47 +21,47 @@ public class CNVDefragmenterTest {
@Test
public void testClusterTogether() {
final SVCallRecord deletion = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
final SVCallRecord duplication = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, duplication), "Different sv types should not cluster");
final SVCallRecord duplicationNonDepthOnly = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(),
- 1000, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM),
+ 1000, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(duplication, duplicationNonDepthOnly), "Clustered records must be depth-only");
final SVCallRecord cnv = new SVCallRecord("test_cnv", "chr1", 1000, null, "chr1", 1999, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, cnv), "Different sv types should not cluster");
final SVCallRecord insertion = new SVCallRecord("test_ins", "chr1", 1000, true, "chr1", 1001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(),
- 1000, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ 1000, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(insertion, insertion), "Only CNVs should be valid");
final SVCallRecord deletion2 = new SVCallRecord("test_del2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(deletion, deletion2), "Valid identical records should cluster");
final SVCallRecord deletion3 = new SVCallRecord("test_del3", "chr1", 2999, true, "chr1", 3998, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(deletion, deletion3), "Should cluster due to overlap");
final SVCallRecord deletion4 = new SVCallRecord("test_del3", "chr1", 3000, true, "chr1", 3999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, deletion4), "Should barely not cluster");
@@ -190,7 +190,7 @@ public Object[][] recordPairs() {
@Test(dataProvider= "maxPositionIntervals")
public void testGetMaxClusterableStartingPosition(final int start, final int end) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- end - start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ end - start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
final int maxClusterableStart = defragmenter.getMaxClusterableStartingPosition(call1);
@@ -198,7 +198,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
final int call2Start = maxClusterableStart;
final int call2End = dictionary.getSequence(call1.getContigA()).getSequenceLength();
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- call2End - call2Start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call2End - call2Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(call1, call2));
@@ -206,7 +206,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
final int call3Start = maxClusterableStart + 1;
final int call3End = dictionary.getSequence(call1.getContigA()).getSequenceLength();
final SVCallRecord call3 = new SVCallRecord("call3", "chr1", call3Start, true, "chr1", call3End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- call3End - call3Start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call3End - call3Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(call1, call3));
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
index f46e4a6e7c9..3a692ab4001 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
@@ -26,27 +26,43 @@ public class CanonicalSVCollapserUnitTest {
private static final CanonicalSVCollapser collapser = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMinMax = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMaxMin = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMean = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserRepresentative = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserSpecificAltAllele = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.MOST_SPECIFIC_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
+ private static final CanonicalSVCollapser collapserFlagAnd = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.AND);
+ private static final CanonicalSVCollapser collapserFlagAlwaysFalse = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE);
private static final Allele MEI_INSERTION_ALLELE = Allele.create("");
private static final Allele SVA_INSERTION_ALLELE = Allele.create("");
@@ -1234,15 +1250,160 @@ public void collapseLengthTest(final SVCallRecord record,
Assert.assertEquals(collapser.collapseLength(record, type, record.getPositionA(), record.getPositionB()), expectedLength);
}
- @DataProvider(name = "collapseIdsTestData")
- public Object[][] collapseIdsTestData() {
+
+ @DataProvider(name = "collapseAttributesTestData")
+ public Object[][] collapseAttributesTestData() {
return new Object[][]{
- {Collections.singletonList("var1"), "var1"},
- {Lists.newArrayList("var1", "var2"), "var1"},
- {Lists.newArrayList("var2", "var1"), "var1"},
+ // Empty case
+ {
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Use representative
+ {
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ new String[]{}, new Object[]{},
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE2"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ new String[]{"TEST_KEY1", "TEST_KEY2"}, new Object[]{"TEST_VALUE12", "TEST_VALUE22"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Reserved flags OR
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{}, // False results in non-assignment, implying false
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Reserved flags AND
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ // Reserved flags ALWAYS_FALSE
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE
+ },
};
}
+ @Test(dataProvider= "collapseAttributesTestData")
+ public void collapseAttributesTest(final String[] representativeKeys, final Object[] representativeValues,
+ final String[] secondKeys, final Object[] secondValues,
+ final String[] expectedKeys, final Object[] expectedValues,
+ final CanonicalSVCollapser.FlagFieldLogic flagLogic) {
+ final Map representativeMap = SVTestUtils.buildMapFromArrays(representativeKeys, representativeValues);
+ final Map secondMap = SVTestUtils.buildMapFromArrays(secondKeys, secondValues);
+ final Map expectedMap = SVTestUtils.buildMapFromArrays(expectedKeys, expectedValues);
+ final SVCallRecord representativeCall = SVTestUtils.newDeletionRecordWithAttributes(representativeMap);
+ final SVCallRecord secondCall = SVTestUtils.newDeletionRecordWithAttributes(secondMap);
+ final Collection collection = Lists.newArrayList(secondCall, representativeCall);
+ final CanonicalSVCollapser testCollapser = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ flagLogic);
+ final Map result = new HashMap<>(testCollapser.collapseAttributes(representativeCall, collection));
+ // Ignore MEMBERS field
+ result.remove(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
+ Assert.assertEquals(result, expectedMap);
+ }
+
@DataProvider(name = "getMostPreciseCallsTestData")
public Object[][] getMostPreciseCallsTestData() {
return new Object[][]{
@@ -1428,32 +1589,205 @@ public void collapseIntervalTest(final String[] contigs, final int[] starts, fin
collapseIntervalTestHelper(collapserMean, svtype, contigs, records, expectedMean);
}
- @Test
- public void collapseIntervalRepresentativeTest() {
+ @DataProvider(name = "collapseIntervalRepresentativeTestData")
+ public Object[][] collapseIntervalRepresentativeTestData() {
+ return new Object[][]{
+ // equal evidence, expect second with more carriers
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ 0.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ {
+ -99.,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ false
+ },
+ // quality based
+ {
+ -99.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ null,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ -10.,
+ -9.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ -10.,
+ -9.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ true
+ },
+ // SR > PE
+ {
+ -99.,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ // note quality null = 0
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ true
+ },
+ {
+ 0.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ true
+ },
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ // PE > others
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ // irrelevant evidence, expect second with more carriers
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ };
+ }
+
+ @Test(dataProvider = "collapseIntervalRepresentativeTestData")
+ public void collapseIntervalRepresentativeTest(final Double log10PErrorA,
+ final Double log10PErrorB,
+ final GATKSVVCFConstants.EvidenceTypes[] evidenceA,
+ final GATKSVVCFConstants.EvidenceTypes[] evidenceB,
+ final boolean expectFirst) {
// Choose second record with more carriers
final List records =
Lists.newArrayList(
- SVTestUtils.makeRecord("record1", "chr1", 1000, true,
+ SVTestUtils.makeRecordWithEvidenceAndQuality("record1", "chr1", 1000, true,
"chr1", 2000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ null, Arrays.asList(evidenceA), Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Lists.newArrayList(
new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2),
new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.REF_N)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
- )
+ ),
+ log10PErrorA
),
- SVTestUtils.makeRecord("record2", "chr1", 1001, true,
+ SVTestUtils.makeRecordWithEvidenceAndQuality("record2", "chr1", 1001, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ null, Arrays.asList(evidenceB), Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Lists.newArrayList(
new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2),
new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
- )
+ ),
+ log10PErrorB
)
);
final Pair result = collapserRepresentative.collapseInterval(records);
- Assert.assertEquals((int) result.getLeft(), 1001);
- Assert.assertEquals((int) result.getRight(), 2001);
+ if (expectFirst) {
+ Assert.assertEquals((int) result.getLeft(), 1000);
+ Assert.assertEquals((int) result.getRight(), 2000);
+ } else {
+ Assert.assertEquals((int) result.getLeft(), 1001);
+ Assert.assertEquals((int) result.getRight(), 2001);
+ }
+ }
+ @Test
+ public void collapseIntervalRepresentativeByCoordinatesTest() {
// record2 and record3 have the best carrier status, but choose second record which is closer to all others on average
final List records2 =
Lists.newArrayList(
@@ -1483,9 +1817,9 @@ public void collapseIntervalRepresentativeTest() {
)
);
final Pair result2 = collapserRepresentative.collapseInterval(records2);
- Assert.assertEquals((int) result2.getLeft(), 999);
- Assert.assertEquals((int) result2.getRight(), 2000);
- }
+ Assert.assertEquals((int) result2.getLeft(), 999);
+ Assert.assertEquals((int) result2.getRight(), 2000);
+}
@DataProvider(name = "distanceDataProvider")
public Object[][] distanceDataProvider() {
@@ -1583,7 +1917,7 @@ public void testComplexSubtypeAndIntervals() {
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord cpx2 = new SVCallRecord("cpx1", "chr1", 1000, null,
@@ -1591,7 +1925,7 @@ public void testComplexSubtypeAndIntervals() {
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord result = collapser.collapse(new SVClusterEngine.OutputCluster(Lists.newArrayList(cpx1, cpx2)));
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
index 3b51938e997..2144c30ed63 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.tools.sv.cluster;
import com.google.common.collect.Lists;
-import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.Genotype;
+import htsjdk.variant.variantcontext.GenotypeBuilder;
+import htsjdk.variant.variantcontext.GenotypesContext;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
@@ -167,12 +170,12 @@ public void testClusterTogetherInvalidInterval() {
// End position beyond contig end after padding
final SVCallRecord deletion1 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956423 + SVTestUtils.defaultEvidenceParameters.getWindow(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord deletion2 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956422, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
engine.getLinkage().areClusterable(deletion1, deletion2);
@@ -204,7 +207,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start, final int end, final String algorithm) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- end - start + 1, Collections.singletonList(algorithm),
+ end - start + 1, Collections.emptyList(), Collections.singletonList(algorithm),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final int maxClusterableStart = engine.getLinkage().getMaxClusterableStartingPosition(call1);
@@ -212,12 +215,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start,
final int call2Start = maxClusterableStart;
final SVCallRecord call2Depth = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2Pesr = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2Depth) || engine.getLinkage().areClusterable(call1, call2Pesr));
@@ -225,12 +228,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start,
final int call3Start = maxClusterableStart + 1;
final SVCallRecord call3Depth = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call3Pesr = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertFalse(engine.getLinkage().areClusterable(call1, call3Depth) || engine.getLinkage().areClusterable(call1, call3Pesr));
@@ -286,12 +289,12 @@ public Object[][] clusterTogetherVaryPositionsProvider() {
public void testClusterTogetherVaryPositions(final int start1, final int end1, final int start2, final int end2, final boolean result) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start1, true,
"chr1", end1, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", start2, true,
"chr1", end2, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), result);
@@ -303,12 +306,12 @@ public void testClusterTogetherVaryTypes() {
// Pass in null strands to let them be determined automatically
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, SVTestUtils.getValidTestStrandA(type1),
"chr1", 2001, SVTestUtils.getValidTestStrandB(type1), type1, null, Collections.emptyList(),
- SVTestUtils.getLength(1000, 2001, type1), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ SVTestUtils.getLength(1000, 2001, type1), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final GATKSVVCFConstants.StructuralVariantAnnotationType type2 : GATKSVVCFConstants.StructuralVariantAnnotationType.values()) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, SVTestUtils.getValidTestStrandA(type2),
"chr1", 2001, SVTestUtils.getValidTestStrandB(type2), type2, null, Collections.emptyList(),
- SVTestUtils.getLength(1000, 2001, type2), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ SVTestUtils.getLength(1000, 2001, type2), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster together if same type, except CNVs
if ((type1 == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV && call2.isSimpleCNV()) ||
@@ -328,13 +331,13 @@ public void testClusterTogetherVaryStrands() {
for (final Boolean strand1B : bools) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, strand1A,
"chr1", 2001, strand1B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final Boolean strand2A : bools) {
for (final Boolean strand2B : bools) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, strand2A,
"chr1", 2001, strand2B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster if strands match
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), strand1A == strand2A && strand1B == strand2B);
@@ -353,7 +356,7 @@ public void testClusterTogetherVaryContigs() {
final String contig1B = contigs.get(j);
final SVCallRecord call1 = new SVCallRecord("call1", contig1A, 1000, true,
contig1B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (int k = 0; k < contigs.size(); k++) {
final String contig2A = contigs.get(k);
@@ -361,7 +364,7 @@ public void testClusterTogetherVaryContigs() {
final String contig2B = contigs.get(m);
final SVCallRecord call2 = new SVCallRecord("call2", contig2A, 1000, true,
contig2B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster if contigs match
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), contig1A.equals(contig2A) && contig1B.equals(contig2B));
@@ -381,11 +384,11 @@ public void testClusterTogetherVaryAlgorithms() {
for (final List algorithms1 : algorithmsList) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final List algorithms2 : algorithmsList) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// All combinations should cluster
Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2));
}
@@ -424,6 +427,50 @@ public void testClusterTogetherCNVs() {
Assert.assertFalse(engine.getLinkage().areClusterable(del1, dup1));
}
+ @DataProvider(name = "testMatchCNVNoGTData")
+ public Object[][] testMatchCNVNoGTData() {
+ return new Object[][]{
+ // Empty
+ {0, new int[]{}, new int[]{}, true},
+ // Both equal
+ {0, new int[]{0}, new int[]{0}, true},
+ {1, new int[]{1}, new int[]{1}, true},
+ {2, new int[]{2}, new int[]{2}, true},
+ {2, new int[]{3}, new int[]{3}, true},
+ // Unequal
+ {2, new int[]{1}, new int[]{2}, false},
+ {2, new int[]{2}, new int[]{1}, false},
+ // Equal multiple
+ {2, new int[]{2, 2}, new int[]{2, 2}, true},
+ {2, new int[]{4, 2}, new int[]{4, 2}, true},
+ // Unequal multiple
+ {2, new int[]{2, 2}, new int[]{2, 1}, false},
+ {2, new int[]{0, 2}, new int[]{1, 1}, false},
+ {2, new int[]{3, 2}, new int[]{2, 2}, false},
+ {2, new int[]{6, 2}, new int[]{4, 2}, false},
+ };
+ }
+
+ @Test(dataProvider= "testMatchCNVNoGTData")
+ public void testMatchCNVNoGT(final int ploidy, final int[] copyNumbers1, final int[] copyNumbers2, final boolean expected) {
+ final List alleles = Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_CNV);
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svtype = GATKSVVCFConstants.StructuralVariantAnnotationType.CNV;
+ // Create genotypes with copy number attribute (and no GT)
+ final SVCallRecord recordCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+ final SVCallRecord recordCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+
+ // With sample overlap
+ final ClusteringParameters depthOnlyParams = ClusteringParameters.createDepthParameters(0.8, 0, 10000000, 1);
+ final CanonicalSVLinkage linkage = new CanonicalSVLinkage<>(SVTestUtils.hg38Dict, false);
+ linkage.setDepthOnlyParams(depthOnlyParams);
+
+ Assert.assertEquals(linkage.areClusterable(recordCN1, recordCN2), expected);
+
+ final SVCallRecord recordRDCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+ final SVCallRecord recordRDCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+ Assert.assertEquals(linkage.areClusterable(recordRDCN1, recordRDCN2), expected);
+ }
+
@DataProvider(name = "testClusterTogetherIntervaledComplexData")
public Object[][] testClusterTogetherIntervaledComplexData() {
return new Object[][]{
@@ -510,7 +557,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.delINV,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DEL_chr1:1100-1500", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("INV_chr1:1600-1900", SVTestUtils.hg38Dict)),
- 1000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Integer length2 = inferLength(contigA, posA, contigB, posB);
@@ -519,7 +566,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
subtype,
cpxIntervals,
- length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected);
@@ -588,7 +635,7 @@ public void testClusterTogetherInsertedComplex(final String contigA, final int p
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(new SVCallRecord.ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval("chr1", 6000, 8000))),
- 2000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ 2000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Integer length2 = cpxIntervals.get(0).getInterval().size();
@@ -597,7 +644,7 @@ public void testClusterTogetherInsertedComplex(final String contigA, final int p
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
subtype,
cpxIntervals,
- length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected);
@@ -608,10 +655,10 @@ public void testClusterTogetherVaryParameters() {
final SVClusterEngine testEngine1 = SVTestUtils.getNewDefaultSingleLinkageEngine();
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1100, true,
"chr1", 2101, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Cluster with default parameters
Assert.assertTrue(testEngine1.getLinkage().areClusterable(call1, call2));
final ClusteringParameters exactMatchParameters = ClusteringParameters.createDepthParameters(1.0, 0, 0, 1.0);
@@ -650,20 +697,22 @@ public void testAddVaryPositions(final int positionA1, final int positionB1,
}
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", positionA1, true,
"chr1", positionB1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB1 - positionA1 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB1 - positionA1 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call1", "chr1", positionA2, true,
"chr1", positionB2, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB2 - positionA2 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB2 - positionA2 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call3 = new SVCallRecord("call1", "chr1", positionA3, true,
"chr1", positionB3, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB3 - positionA3 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB3 - positionA3 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
- engine.add(call1);
- engine.add(call2);
- engine.add(call3);
- Assert.assertEquals(engine.forceFlush().size(), result);
+ final List<SVCallRecord> output = new ArrayList<>();
+ output.addAll(engine.add(call1));
+ output.addAll(engine.add(call2));
+ output.addAll(engine.add(call3));
+ output.addAll(engine.flush());
+ Assert.assertEquals(output.size(), result);
}
@Test
@@ -671,22 +720,24 @@ public void testAdd() {
//single-sample merge case, ignoring sample sets
final SVClusterEngine temp1 = SVTestUtils.getNewDefaultSingleLinkageEngine();
Assert.assertTrue(temp1.isEmpty());
- temp1.add(SVTestUtils.call1);
+ final List<SVCallRecord> output1 = new ArrayList<>();
+ output1.addAll(temp1.add(SVTestUtils.call1));
Assert.assertFalse(temp1.isEmpty());
//force new cluster by adding a non-overlapping event
- temp1.add(SVTestUtils.call3);
- final List output1 = temp1.forceFlush(); //flushes all clusters
+ output1.addAll(temp1.add(SVTestUtils.call3));
+ output1.addAll(temp1.flush()); //flushes all clusters
Assert.assertTrue(temp1.isEmpty());
Assert.assertEquals(output1.size(), 2);
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0));
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1));
final SVClusterEngine temp2 = SVTestUtils.getNewDefaultSingleLinkageEngine();
- temp2.add(SVTestUtils.call1);
- temp2.add(SVTestUtils.overlapsCall1);
+ final List<SVCallRecord> output2 = new ArrayList<>();
+ output2.addAll(temp2.add(SVTestUtils.call1));
+ output2.addAll(temp2.add(SVTestUtils.overlapsCall1));
//force new cluster by adding a call on another contig
- temp2.add(SVTestUtils.call4_chr10);
- final List output2 = temp2.forceFlush();
+ output2.addAll(temp2.add(SVTestUtils.call4_chr10));
+ output2.addAll(temp2.flush());
Assert.assertEquals(output2.size(), 2);
//median of two items ends up being the second item here
Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
@@ -695,9 +746,10 @@ public void testAdd() {
//checking insensitivity to sample set overlap
final SVClusterEngine temp3 = SVTestUtils.getNewDefaultSingleLinkageEngine();
- temp3.add(SVTestUtils.call1);
- temp3.add(SVTestUtils.sameBoundsSampleMismatch);
- final List output3 = temp3.forceFlush();
+ final List<SVCallRecord> output3 = new ArrayList<>();
+ output3.addAll(temp3.add(SVTestUtils.call1));
+ output3.addAll(temp3.add(SVTestUtils.sameBoundsSampleMismatch));
+ output3.addAll(temp3.flush());
Assert.assertEquals(output3.size(), 1);
Assert.assertEquals(output3.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
Assert.assertEquals(output3.get(0).getPositionB(), SVTestUtils.call1.getPositionB());
@@ -710,12 +762,13 @@ public void testAddMaxCliqueLarge() {
final int numRecords = 100;
final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine();
final int length = 5000;
+ final List<SVCallRecord> result = new ArrayList<>();
for (int i = 0; i < numRecords; i++) {
final int start = 1000 + 10 * i;
final int end = start + length - 1;
- engine.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL));
+ result.addAll(engine.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
}
- final List result = engine.forceFlush();
+ result.addAll(engine.flush());
Assert.assertEquals(result.size(), 50);
for (final SVCallRecord resultRecord : result) {
Assert.assertTrue(resultRecord.getAttributes().containsKey(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY));
@@ -778,21 +831,14 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll
}
// Create genotypes with copy number attribute (and no GT)
- final List genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length)
- .mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
- .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumbers[i])
- .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy)
- .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy))
- .make())
- .collect(Collectors.toList());
- final SVCallRecord recordWithCopyNumber = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
- "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
- alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
- final Set resultWithCopyNumber = recordWithCopyNumber.getCarrierSampleSet();
-
+ final SVCallRecord recordCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+ final Set<String> resultWithCopyNumber = recordCN.getCarrierSampleSet();
Assert.assertEquals(resultWithCopyNumber, expectedResult);
+ final SVCallRecord recordRDCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+ final Set<String> resultWithRDCopyNumber = recordRDCN.getCarrierSampleSet();
+ Assert.assertEquals(resultWithRDCopyNumber, expectedResult);
+
// Create genotypes with GT (and no copy number attribute)
final List genotypesWithGenotype = IntStream.range(0, copyNumbers.length)
.mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
@@ -802,13 +848,29 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll
.collect(Collectors.toList());
final SVCallRecord recordWithGenotype = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
"chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
alleles, GenotypesContext.copy(genotypesWithGenotype), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Set resultWithGenotype = recordWithGenotype.getCarrierSampleSet();
Assert.assertEquals(resultWithGenotype, expectedResult);
}
+ private SVCallRecord getCNVRecordWithCN(final int ploidy, List<Allele> alleles, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype,
+ final int[] copyNumbers, final String cnField) {
+ // Create genotypes with copy number attribute (and no GT)
+ final List<Genotype> genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length)
+ .mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
+ .attribute(cnField, copyNumbers[i])
+ .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy)
+ .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy))
+ .make())
+ .collect(Collectors.toList());
+ return new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
+ "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ }
+
@Test
public void testLargeRandom() {
final Random rand = new Random(42);
@@ -819,8 +881,14 @@ public void testLargeRandom() {
records.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(Math.min(pos1, pos2), Math.max(pos1, pos2), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL));
}
final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine();
- records.stream().sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict)).forEach(engine::add);
- final List output = engine.forceFlush();
+ final List<SVCallRecord> output = new ArrayList<>(
+ records.stream()
+ .sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict))
+ .map(engine::add)
+ .flatMap(List::stream)
+ .collect(Collectors.toUnmodifiableList())
+ );
+ output.addAll(engine.flush());
Assert.assertEquals(output.size(), 2926);
}
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java
new file mode 100644
index 00000000000..00b6c14e2e6
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java
@@ -0,0 +1,555 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import com.google.common.collect.Lists;
+import htsjdk.samtools.util.Locatable;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVTestUtils;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class SVStratificationEngineUnitTest extends GATKBaseTest {
+
+ private static final GATKPath CONFIG_FILE_PATH = new GATKPath(toolsTestDir + "/sv/sv_stratify_config.tsv");
+
+ private static final String CONTEXT_1_NAME = "context1";
+ private static final String CONTEXT_2_NAME = "context2";
+
+ private static final List<Locatable> CONTEXT_1_INTERVALS = Lists.newArrayList(new SimpleInterval("chr1", 1000, 2000));
+ private static final List<Locatable> CONTEXT_2_INTERVALS = Lists.newArrayList(new SimpleInterval("chr2", 1000, 2000));
+
+ private static SVStatificationEngine makeDefaultEngine() {
+ return new SVStatificationEngine(SVTestUtils.hg38Dict);
+ }
+
+ @Test
+ public void testAddContext() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ Assert.assertNotNull(engine.getContextIntervals(CONTEXT_1_NAME));
+ Assert.assertNull(engine.getContextIntervals(CONTEXT_2_NAME));
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddDuplicateContext() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_2_INTERVALS);
+ }
+
+ @Test
+ public void testNoContexts() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ Assert.assertTrue(engine.getStrata().isEmpty());
+ }
+
+ @Test
+ public void testAddStratification() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME));
+ final Collection<SVStatificationEngine.Stratum> stratificationCollection = engine.getStrata();
+ Assert.assertNotNull(stratificationCollection);
+ Assert.assertEquals(stratificationCollection.size(), 1);
+ final SVStatificationEngine.Stratum stratification = stratificationCollection.iterator().next();
+ Assert.assertNotNull(stratification);
+ Assert.assertEquals(stratification.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ Assert.assertNotNull(stratification.getMinSize());
+ Assert.assertEquals(stratification.getMinSize().intValue(), 50);
+ Assert.assertNotNull(stratification.getMaxSize());
+ Assert.assertEquals(stratification.getMaxSize().intValue(), 500);
+ Assert.assertEquals(stratification.getContextNames().size(), 1);
+ Assert.assertEquals(stratification.getContextNames().get(0), CONTEXT_1_NAME);
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddStratificationBadMinSize() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, -1, 500, Collections.emptySet());
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddStratificationBadMaxSize() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, -1, Collections.emptySet());
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddStratificationBadMaxSizeInfinity() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Integer.MAX_VALUE, Collections.emptySet());
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddStratificationMaxEqualToMin() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 50, Collections.emptySet());
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testAddStratificationMaxLessThanMin() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 49, Collections.emptySet());
+ }
+
+ @Test
+ public void testCreate() {
+ final Map<String, List<Locatable>> map = new HashMap<>();
+ map.put(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS);
+ final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict);
+ Assert.assertNotNull(engine);
+ Assert.assertNotNull(engine.getContextIntervals(CONTEXT_1_NAME));
+ Assert.assertEquals(engine.getStrata().size(), 7);
+ }
+
+ @DataProvider(name="testGetMatchVariantsData")
+ public Object[][] testGetMatchVariantsData() {
+ return new Object[][] {
+
+ // DEL
+
+ // Outside context interval
+ { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+ { "chr1", 2000, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+ // Simple match
+ { "chr1", 1100, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 900, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 900, "chr1", 1900, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 1100, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 999, "chr1", 2001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr2", 1100, "chr2", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ // Wrong contig
+ { "chr3", 1100, "chr3", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+ // Barely match
+ { "chr1", 1000, "chr1", 3001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 2, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ { "chr1", 500, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ // Barely miss overlap threshold
+ { "chr1", 1000, "chr1", 3002, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+ // Barely large enough
+ { "chr1", 1100, "chr1", 1149, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+ // Too small
+ { "chr1", 1100, "chr1", 1148, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+
+ // INV (null context)
+
+ // Right size
+ { "chr1", 1001, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+ { "chr1", 4001, "chr1", 5000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+ { "chr2", 10000, "chr2", 20000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+ // Too small
+ { "chr1", 1001, "chr1", 1999, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null },
+ { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null },
+
+ // INS
+
+ // In context
+ { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, "INS_context1" },
+ // SVLEN should not matter
+ { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1, "INS_context1" },
+ { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 10000, "INS_context1" },
+ // Out of context
+ { "chr1", 100, "chr1", 100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, null },
+ // Out of size range for context2
+ { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1000, null },
+ { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 400, null },
+
+ // BND
+
+ // Both ends
+ { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" },
+ { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" },
+ // One end only
+ { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+ { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+ // No ends
+ { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+
+ // CTX (same as BND)
+
+ // Both ends
+ { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" },
+ { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" },
+ // One end only
+ { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+ { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+ // No ends
+ { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+ };
+ }
+
+ @Test(dataProvider = "testGetMatchVariantsData")
+ public void testGetMatchVariants(final String chromA, final int posA, final String chromB, final int posB,
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final Integer svlen,
+ final String expectedStratName) {
+ final Map<String, List<Locatable>> map = new HashMap<>();
+ map.put(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS);
+ final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict);
+ final SVCallRecord record;
+ if (svType == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+ record = SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chromA, posA, svlen);
+ } else {
+ record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, svType);
+ }
+ final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+ if (expectedStratName == null) {
+ Assert.assertTrue(result.isEmpty());
+ } else {
+ Assert.assertFalse(result.isEmpty());
+ Assert.assertEquals(result.iterator().next().getName(), expectedStratName);
+ }
+ }
+
+ // Not supported
+ @Test(expectedExceptions = GATKException.class)
+ public void testGetMatchVariantsCpx() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ engine.addContext("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500)));
+ engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context1"));
+ engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context3"));
+ final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX);
+ // Should throw error
+ engine.getMatches(record, 0.5, 0, 2);
+ }
+
+ @Test
+ public void testGetMatchVariantsMultiple() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ engine.addContext("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500)));
+ engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context1"));
+ engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context3"));
+ final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+ final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+ Assert.assertTrue(names.contains("strat1"));
+ Assert.assertTrue(names.contains("strat2"));
+ }
+
+ @Test
+ public void testGetMatchVariantsNullContexts() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet());
+ final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+ final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+ Assert.assertEquals(names.size(), 1);
+ Assert.assertEquals(names.get(0), "strat1");
+ }
+
+ @Test
+ public void testGetMatchVariantsNoEngineContexts() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet());
+ final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+ final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+ Assert.assertEquals(names.size(), 1);
+ Assert.assertEquals(names.get(0), "strat1");
+ }
+
+ @Test
+ public void testTestAddStratificationInnerClass() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum stratification = engine.new Stratum("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME));
+ engine.addStratification(stratification);
+ final Collection<SVStatificationEngine.Stratum> stratificationCollection = engine.getStrata();
+ Assert.assertNotNull(stratificationCollection);
+ Assert.assertEquals(stratificationCollection.size(), 1);
+ final SVStatificationEngine.Stratum stratificationOut = stratificationCollection.iterator().next();
+ Assert.assertNotNull(stratificationOut);
+ Assert.assertEquals(stratificationOut.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ Assert.assertNotNull(stratificationOut.getMinSize());
+ Assert.assertEquals(stratificationOut.getMinSize().intValue(), 50);
+ Assert.assertNotNull(stratificationOut.getMaxSize());
+ Assert.assertEquals(stratificationOut.getMaxSize().intValue(), 500);
+ Assert.assertEquals(stratificationOut.getContextNames().size(), 1);
+ Assert.assertEquals(stratificationOut.getContextNames().get(0), CONTEXT_1_NAME);
+ }
+
+ @Test
+ public void testMatchesType() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ 100, 500,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertFalse(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP)));
+ }
+
+ @Test
+ public void testMatchesSizeSimple() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ 100, 500,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ }
+
+ @Test
+ public void testMatchesSizeNoMin() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ null, 500,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ }
+
+ @Test
+ public void testMatchesSizeNoMax() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ 50, null,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(49, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ }
+
+ @Test
+ public void testMatchesSizeNoMinOrMax() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ null, null,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+ }
+
+ @Test
+ public void testMatchesSizeInsertion() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+ 100, 500,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ }
+
+ @Test
+ public void testMatchesSizeInsertionNullLength() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+ 0, Integer.MAX_VALUE - 1,
+ Collections.emptySet()
+ );
+ Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ }
+
+ @Test
+ public void testMatchesSizeInsertionNullLength2() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+ null, null,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+ }
+
+ @Test
+ public void testMatchesSizeBnd() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+ null, null,
+ Collections.emptySet()
+ );
+ Assert.assertTrue(strat.matchesSize(SVTestUtils.newBndCallRecordWithStrands(true, false)));
+ }
+
+
+ @DataProvider(name="testMatchesContextDelData")
+ public Object[][] testMatchesContextDelData() {
+ return new Object[][] {
+ // Outside context interval
+ { "chr1", 1000, 1500, 0.5, 0, true },
+ { "chr1", 500, 1500, 0.5, 0, true },
+ { "chr1", 499, 1499, 0.5, 0, false },
+ { "chr1", 900, 1300, 0.5, 1, true },
+ { "chr1", 1999, 2000000, 0, 1, true },
+ { "chr1", 500, 600, 0, 2, false },
+ { "chr1", 500, 1100, 0, 2, false },
+ { "chr1", 1100, 1200, 0, 2, true },
+ { "chr1", 1100, 1200, 1, 2, true }
+ };
+ }
+
+ @Test(dataProvider = "testMatchesContextDelData")
+ public void testMatchesContextDel(final String chrom, final int start, final int end,
+ final double overlapFraction, final int numBreakpointOverlaps,
+ final boolean expected) {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+ null, null,
+ Collections.singleton(CONTEXT_1_NAME)
+ );
+ Assert.assertEquals(strat.matchesContext(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chrom, start, chrom, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL),
+ overlapFraction, numBreakpointOverlaps, 1), expected);
+ }
+
+ @DataProvider(name="testMatchesContextInsData")
+ public Object[][] testMatchesContextInsData() {
+ return new Object[][] {
+ // Outside context interval
+ { "chr1", 1100, 100, 0.1, 0, true },
+ { "chr1", 1100, 100000, 0.1, 0, true },
+ { "chr1", 999, 100, 0.1, 0, false }
+ };
+ }
+
+ @Test(dataProvider = "testMatchesContextInsData")
+ public void testMatchesContextIns(final String chrom, final int start, final int length,
+ final double overlapFraction, final int numBreakpointOverlaps,
+ final boolean expected) {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+ null, null,
+ Collections.singleton(CONTEXT_1_NAME)
+ );
+ Assert.assertEquals(strat.matchesContext(SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chrom, start, length),
+ overlapFraction, numBreakpointOverlaps, 1), expected);
+ }
+
+ @DataProvider(name="testMatchesContextBndData")
+ public Object[][] testMatchesContextBndData() {
+ return new Object[][] {
+ { "chr1", 999, "chr1", 2001, 1, false },
+ { "chr1", 1000, "chr1", 1200, 1, true },
+ { "chr1", 1000, "chr1", 50000, 1, true },
+ { "chr1", 1000, "chr1", 1000, 1, true },
+ { "chr1", 500, "chr1", 1000, 1, true },
+ { "chr1", 1000, "chr1", 1999, 2, true },
+ { "chr1", 1000, "chr1", 2000, 2, true },
+ { "chr1", 1000, "chr2", 1000, 2, false },
+ { "chr1", 1000, "chr1", 2001, 2, false },
+ { "chr1", 999, "chr1", 1000, 2, false }
+ };
+ }
+
+ @Test(dataProvider = "testMatchesContextBndData")
+ public void testMatchesContextBnd(final String chromA, final int posA, final String chromB, final int posB,
+ final int numBreakpointOverlapsInterchrom, final boolean expected) {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+ null, null,
+ Collections.singleton(CONTEXT_1_NAME)
+ );
+ Assert.assertEquals(strat.matchesContext(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND),
+ 0.5, 2, numBreakpointOverlapsInterchrom), expected);
+ }
+
+ @DataProvider(name="testCountAnyContextOverlapData")
+ public Object[][] testCountAnyContextOverlapData() {
+ return new Object[][] {
+ { "chr1", 500, 1500, 1 },
+ { "chr1", 1000, 2000, 1 },
+ { "chr1", 1500, 2500, 1 },
+ { "chr1", 500, 2500, 1 },
+ { "chr1", 1100, 1900, 1 },
+ { "chr1", 999, 999, 0 },
+ { "chr1", 999, 1000, 1 },
+ { "chr1", 1000, 1000, 1 },
+ { "chr1", 1000, 1001, 1 },
+ { "chr2", 1000, 1001, 0 },
+ { "chr1", 1999, 2000, 1 },
+ { "chr1", 2000, 2000, 1 },
+ { "chr1", 2001, 2001, 0 }
+ };
+ }
+
+ @Test(dataProvider = "testCountAnyContextOverlapData")
+ public void testCountAnyContextOverlap(final String chrom, final int start, final int end, final int expected) {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+ null, null,
+ Collections.singleton(CONTEXT_1_NAME)
+ );
+ Assert.assertEquals(strat.countAnyContextOverlap(new SimpleInterval(chrom, start, end)), expected);
+ }
+
+ @DataProvider(name="testIsMutuallyExclusiveData")
+ public Object[][] testIsMutuallyExclusiveData() {
+ return new Object[][] {
+ {GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null,
+ false},
+ };
+ }
+
+ @Test
+ public void testGetters() {
+ final SVStatificationEngine engine = makeDefaultEngine();
+ engine.addContext(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+ final SVStatificationEngine.Stratum strat = engine.new Stratum(
+ "strat",
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ 50, 500,
+ Collections.singleton(CONTEXT_1_NAME)
+ );
+ Assert.assertEquals(strat.getContextNames().size(), 1);
+ Assert.assertEquals(strat.getContextNames().get(0), CONTEXT_1_NAME);
+ Assert.assertEquals(strat.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+ Assert.assertEquals(strat.getMinSize(), Integer.valueOf(50));
+ Assert.assertEquals(strat.getMaxSize(), Integer.valueOf(500));
+ Assert.assertEquals(strat.getName(), "strat");
+ }
+}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java
new file mode 100644
index 00000000000..2590ae07f0c
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java
@@ -0,0 +1,112 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.StructuralVariantType;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeader;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.CommandLineProgramTest;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
+import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.List;
+
+public class GroupedSVClusterIntegrationTest extends CommandLineProgramTest {
+
+ @Test
+ public void testClusterStratified() {
+ final File output = createTempFile("single_linkage_cluster", ".vcf");
+
+ final String clusteringConfigFile = getToolTestDataDir() + "stratified_cluster_params.tsv";
+ final String stratifyConfigFile = getToolTestDataDir() + "stratified_cluster_strata.tsv";
+ final String segdupFile = getToolTestDataDir() + "../SVStratify/hg38.SegDup.chr22.bed";
+ final String segdupName = "SD";
+ final String repeatmaskerFile = getToolTestDataDir() + "../SVStratify/hg38.RM.chr22_subsampled.bed";
+ final String repeatmaskerName = "RM";
+
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(output)
+ .addVCF(getToolTestDataDir() + "../SVStratify/bwa_melt.chr22.vcf.gz")
+ .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "../SVCluster/1kgp.batch1.ploidy.tsv")
+ .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
+ .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE)
+ .add(GroupedSVCluster.CLUSTERING_CONFIG_FILE_LONG_NAME, clusteringConfigFile)
+ .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, stratifyConfigFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, segdupName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, segdupFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, repeatmaskerName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+ .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+ .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, GATKBaseTest.hg38Reference);
+
+ runCommandLine(args, GroupedSVCluster.class.getSimpleName());
+
+ final Pair<VCFHeader, List<VariantContext>> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
+ final List<VariantContext> records = vcf.getValue();
+
+ Assert.assertEquals(records.size(), 1437);
+
+ // Check for specific records
+ int expectedRecordsFound = 0;
+ for (final VariantContext variant : records) {
+ Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY));
+ Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY));
+ Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE));
+ if (variant.getID().equals("SVx00000032")) {
+ expectedRecordsFound++;
+ Assert.assertEquals(variant.getContig(), "chr22");
+ Assert.assertEquals(variant.getStart(), 11628747);
+ Assert.assertEquals(variant.getEnd(), 11629803);
+ final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+ Assert.assertEquals(algorithms.size(), 2);
+ Assert.assertTrue(algorithms.contains("manta"));
+ Assert.assertTrue(algorithms.contains("wham"));
+ final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+ Assert.assertEquals(members.size(), 2);
+ final List<Allele> alts = variant.getAlternateAlleles();
+ Assert.assertEquals(alts.size(), 1);
+ Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL);
+ Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL);
+ Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), "DEL_50_5k_SD_RM");
+ } else if (variant.getID().equals("SVx00000125")) {
+ expectedRecordsFound++;
+ Assert.assertEquals(variant.getContig(), "chr22");
+ Assert.assertEquals(variant.getStart(), 22563654);
+ Assert.assertEquals(variant.getEnd(), 22567049);
+ final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+ Assert.assertEquals(algorithms.size(), 1);
+ Assert.assertTrue(algorithms.contains("manta"));
+ final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+ Assert.assertEquals(members.size(), 1);
+ final List<Allele> alts = variant.getAlternateAlleles();
+ Assert.assertEquals(alts.size(), 1);
+ Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL);
+ Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL);
+ Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM);
+ } else if (variant.getID().equals("SVx000001dc")) {
+ expectedRecordsFound++;
+ Assert.assertEquals(variant.getContig(), "chr22");
+ Assert.assertEquals(variant.getStart(), 26060912);
+ Assert.assertEquals(variant.getEnd(), 26060989);
+ final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+ Assert.assertEquals(algorithms.size(), 1);
+ Assert.assertTrue(algorithms.contains("manta"));
+ final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+ Assert.assertEquals(members.size(), 1);
+ final List<Allele> alts = variant.getAlternateAlleles();
+ Assert.assertEquals(alts.size(), 1);
+ Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP);
+ Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP);
+ Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM);
+ }
+ }
+ Assert.assertEquals(expectedRecordsFound, 3);
+ }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
index 8f27d2a6389..136e0c36ff0 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
@@ -234,7 +234,6 @@ public void testOverlappingEvents(final List<File> inputVcfs) {
//in NA11829 variant events are not overlapping, so there should be a CN2 homRef in between
final List<String> samplesWithOverlaps = Arrays.asList("HG00365", "HG01789", "HG02221", "NA07357", "NA12005", "NA12873", "NA18997", "NA19428", "NA21120");
- final List<String> samplesWithGaps = Arrays.asList("NA11829");
//all of these samples have an event that overlaps the next event, which is not called in that sample
boolean sawVariant;
@@ -254,18 +253,18 @@ public void testOverlappingEvents(final List<File> inputVcfs) {
}
//these samples have a variant that doesn't overlap the next call
- for (final String sample : samplesWithGaps) {
- sawVariant = false;
- for (final VariantContext vc : overlappingEvents.getRight()) {
- if (!sawVariant && !vc.getGenotype(sample).isHomRef()) {
- sawVariant = true;
- continue;
- }
- if (sawVariant) {
- Assert.assertTrue(vc.getGenotype(sample).isHomRef()
- && (Integer.parseInt(vc.getGenotype(sample).getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 2));
- break;
- }
+ sawVariant = false;
+ for (final VariantContext vc : overlappingEvents.getRight()) {
+ final Genotype genotype = vc.getGenotype("NA11829");
+ if (!sawVariant && !genotype.isHomRef()) {
+ sawVariant = true;
+ continue;
+ }
+ if (sawVariant && vc.getEnd() == 23236095) {
+ // Smaller variant nested inside larger hom-var DEL: hom-ref genotype but CN is 0 since it overlaps
+ Assert.assertTrue(genotype.isHomRef()
+ && (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 0));
+ break;
}
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
index caff0aa9675..5908c6ff06c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
@@ -23,6 +23,7 @@
import org.testng.annotations.Test;
import java.io.File;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
@@ -43,7 +44,7 @@ public void testDefragmentation() {
.addVCF(inputVcfPath)
.add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv")
.add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
- .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.DEFRAGMENT_CNV)
+ .add(SVCluster.ALGORITHM_LONG_NAME, SVClusterWalker.CLUSTER_ALGORITHM.DEFRAGMENT_CNV)
.add(SVCluster.DEFRAG_PADDING_FRACTION_LONG_NAME, 0.25)
.add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0.5);
@@ -293,17 +294,23 @@ public void testAgainstSimpleImplementation() {
mixedParameters,
pesrParameters);
- vcfInputFilenames.stream()
- .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream())
- .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary()))
- .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict))
- .forEach(engine::add);
+ final List<SVCallRecord> expectedRecords = new ArrayList<>();
+ expectedRecords.addAll(
+ vcfInputFilenames.stream()
+ .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream())
+ .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary()))
+ .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict))
+ .map(engine::add)
+ .flatMap(List::stream)
+ .collect(Collectors.toList())
+ );
+ expectedRecords.addAll(engine.flush());
- final Comparator<SVCallRecord> recordComparator = SVCallRecordUtils.getCallComparator(referenceSequenceFile.getSequenceDictionary());
- final List<VariantContext> expectedVariants = engine.forceFlush().stream()
- .sorted(recordComparator)
+ final Comparator<VariantContext> recordComparator = testVcf.getLeft().getVCFRecordComparator();
+ final List<VariantContext> expectedVariants = expectedRecords.stream()
.map(SVCallRecordUtils::getVariantBuilder)
.map(VariantContextBuilder::make)
+ .sorted(recordComparator)
.collect(Collectors.toList());
final List<VariantContext> testVariants = testVcf.getValue();
@@ -533,5 +540,34 @@ public void testAllosome() {
}
Assert.assertEquals(expectedRecordsFound, 1);
}
+ @Test
+ public void testCleanedVcf() {
+ final File output = createTempFile("cleaned_vcf_cluster", ".vcf");
+ // Note we use very loose clustering criteria on a normal cleaned vcf to ensure some clustering happens
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(output)
+ .addVCF(getToolTestDataDir() + "bwa_melt.cleaned.chr22_chrY.vcf.gz")
+ .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv")
+ .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
+ .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE)
+ .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH)
+ .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+ .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+ .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 10000000)
+ .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+ .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+ .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 5000)
+ .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+ .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+ .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 5000);
+
+ runCommandLine(args, SVCluster.class.getSimpleName());
+
+ final Pair<VCFHeader, List<VariantContext>> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
+ final VCFHeader header = vcf.getKey();
+ Assert.assertEquals(header.getSampleNamesInOrder().size(), 161);
+ final List<VariantContext> records = vcf.getValue();
+ Assert.assertEquals(records.size(), 1227);
+ }
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java
new file mode 100644
index 00000000000..f808f7d0c19
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java
@@ -0,0 +1,169 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeader;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.CommandLineProgramTest;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
+import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.vcf.VcfUtils;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class SVStratifyIntegrationTest extends CommandLineProgramTest {
+
+ @Test
+ public void testBwaMeltCohort() {
+ final File outputDir = createTempDir("stratify");
+ final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+ final String configFile = getToolTestDataDir() + "test_config.tsv";
+
+ final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+ final String segdupName = "SD";
+ final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+ final String repeatmaskerName = "RM";
+
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(outputDir)
+ .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+ .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, segdupName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, segdupFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, repeatmaskerName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+ .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+ .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+ .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
+
+ runCommandLine(args, SVStratify.class.getSimpleName());
+
+ final File[] outputFiles = outputDir.listFiles();
+ Assert.assertEquals(outputFiles.length, 14);
+ final Map<String, Integer> expectedOutputSuffixes = new HashMap<>();
+ expectedOutputSuffixes.put("INS_small_SD", 46);
+ expectedOutputSuffixes.put("DEL_50_5k_both", 110);
+ expectedOutputSuffixes.put("DEL_5k_50k_SD", 2);
+ expectedOutputSuffixes.put("DUP_lt5kb_RM", 0);
+ expectedOutputSuffixes.put("INV_gt1kb", 26);
+ expectedOutputSuffixes.put("BND_SD", 77);
+ expectedOutputSuffixes.put(SVStratify.DEFAULT_STRATUM, 1196);
+ int numVcfs = 0;
+ int totalRecords = 0;
+ for (final File file : outputFiles) {
+ if (VcfUtils.isVariantFile(file)) {
+ ++numVcfs;
+ final Pair<VCFHeader, List<VariantContext>> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(file.getAbsolutePath());
+ boolean foundSuffix = false;
+ for (final String suffix : expectedOutputSuffixes.keySet()) {
+ if (file.toString().contains("." + suffix + ".")) {
+ foundSuffix = true;
+ for (final VariantContext variant : outputVcf.getRight()) {
+ Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY));
+ Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), suffix);
+ }
+ final int expectedSize = expectedOutputSuffixes.get(suffix).intValue();
+ final int actualSize = outputVcf.getRight().size();
+ Assert.assertEquals(actualSize, expectedSize,
+ "Expected " + expectedSize + " records but found " + actualSize + " in " + suffix);
+ totalRecords += actualSize;
+ break;
+ }
+ }
+ Assert.assertTrue(foundSuffix, "Unexpected file suffix: " + file.getAbsolutePath());
+ }
+ }
+ Assert.assertEquals(numVcfs, 7);
+ final int numInputRecords = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath).getRight().size();
+ Assert.assertEquals(totalRecords, numInputRecords);
+ }
+
+ @Test(expectedExceptions = GATKException.class)
+ public void testBwaMeltCohortRedundant() {
+ final File outputDir = createTempDir("stratify");
+ final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+ final String configFile = getToolTestDataDir() + "test_config_redundant.tsv";
+
+ final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+ final String segdupName = "SD";
+ final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+ final String repeatmaskerName = "RM";
+
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(outputDir)
+ .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+ .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, segdupName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, segdupFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, repeatmaskerName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+ .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+ .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+ .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
+
+ runCommandLine(args, SVStratify.class.getSimpleName());
+ }
+
+ @Test
+ public void testBwaMeltCohortBypassRedundant() {
+ final File outputDir = createTempDir("stratify");
+ final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+ final String configFile = getToolTestDataDir() + "test_config_redundant.tsv";
+
+ final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+ final String segdupName = "SD";
+ final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+ final String repeatmaskerName = "RM";
+
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(outputDir)
+ .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+ .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, segdupName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, segdupFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, repeatmaskerName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+ .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+ .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+ .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath)
+ .addFlag(SVStratify.ALLOW_MULTIPLE_MATCHES_LONG_NAME);
+
+ runCommandLine(args, SVStratify.class.getSimpleName());
+ }
+
+ @Test(expectedExceptions = {GATKException.class, IllegalArgumentException.class})
+ public void testBwaMeltCohortDuplicateContextName() {
+ final File outputDir = createTempDir("stratify");
+ final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+ final String configFile = getToolTestDataDir() + "test_config_duplicate.tsv";
+
+ final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+ final String segdupName = "SD";
+ final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+ final String repeatmaskerName = "RM";
+
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addOutput(outputDir)
+ .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+ .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, segdupName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, segdupFile)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_NAME_FILE_LONG_NAME, repeatmaskerName)
+ .add(SVStratificationEngineArgumentsCollection.CONTEXT_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+ .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+ .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+ .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
+
+ runCommandLine(args, SVStratify.class.getSimpleName());
+ }
+}
\ No newline at end of file
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv
new file mode 100644
index 00000000000..8422edb8c11
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv
@@ -0,0 +1,8 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE CONTEXT
+INS_context1 INS -1 -1 context1
+DEL_50_5k_both DEL 50 5000 context1,context2
+DEL_5k_50k_context1 DEL 5000 50000 context1
+DUP_lt5kb_context1 DUP -1 5000 context1
+INV_gt1kb INV 1000 -1 NULL
+BND_context1 BND -1 -1 context1
+CTX_context1 CTX -1 -1 context1
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv
new file mode 100644
index 00000000000..b94357460a6
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv
@@ -0,0 +1,3 @@
+NAME RECIPROCAL_OVERLAP SIZE_SIMILARITY BREAKEND_WINDOW SAMPLE_OVERLAP
+INS_small_SD 0.1 0.5 50 0
+DEL_50_5k_SD_RM 0.1 0.5 1000 0
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv
new file mode 100644
index 00000000000..38943b0807e
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv
@@ -0,0 +1,3 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE CONTEXT
+INS_small_SD INS -1 -1 SD
+DEL_50_5k_SD_RM DEL 50 5000 SD,RM
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ebf9af067ff08545821a2a2247a38b1c13e30575
GIT binary patch
literal 468997
zcmZs>Q;aAK4=p;jZQHhO+qP}nwr$(CZQI=Ane+Yk?Vg;zv@a`7)2^gx7k)GZ2*CgB
z4-7y^7ziK``<7Q*nYeRx{-)mHmhXfR*izE>HKf*PkXbksa14yfvxsMbp^i0U!;CFi
z|F4pC^Y(6yvwSt^!;O7TO{|3!SMt2Bcw3%-SGMe9*XEZ&?@k}L?+feh5#R6ga`$m^
z)S%<tSeogD_S@hnrN8_ZO|Ci<(Cr76OD!g2M
z^(nl$+`f1DJbeD6$DJ?R&pTr_zfT4YepT!IuV&rLBkSz%GPbVjl+Nlj>|^%ClCK+6
z-kV+jtT}ws=pVtK-rRV6bmey^g=(M!3rD38$5gpz^UA4b?aK4AixHr^kCyQK<1tbG
zcGN3BTooz*9G~>bo5?f$$8i(8rs~b?o!5MPG&yHex1`_p7sth}3~3Q9z!x}qy6Y7W
zJK>g99!@@-Mr++?rS9$Gs$gr~ccJL)^s@g-)#;z#9KOAi$4m3aiC+<+Bw1(Q`})U$
zR!~=-gpi#<_uIC~N2$BD#U{4>ku#&_nK!>0pw46wX}T<4zm%I^FrxqEuIzYxzZ~BE
zIzcu-Uj7y!1!pQtt8j>7KuQ^gvzt=IzPZM;Yv)qT3RTC$baxS^-&S#@oBw
zKaxpqeSbN$r-FL@`K4p%`d10z>eK+-rNd1p)D
z5_+82JyXaWSCx80-)g(Q{Z{AY|M~UxR|qz~KJzYsPHNtIIr8O6P~DsJV%Gqt$Ar-u#+8`RmCD3ff}k;I6Dkbu)DEaL#L4gHQc8e_Q!{`=
zWCRQKI7fDvp9<9&(pnA3?d8M9)ARY@v?9bT-gwfP8uIj}TbX~H7Q$7dD$<)tQ}V@c
z1*Z~~y(FZtZH{Lj-D_7=_vh&PVBq4zimewh*sH7Sk3ox;>{hPi
zEvM3;dv(2el6@EwhU!FqnQLCHmv5dpYTAB23@5ZBOep#>d(&jRI&yZ_3np;=y{lZU
zR|hAvRnVDeV*_(CzjNi$K~+Sbntar_EW|e2=)Xd|WyPX`m5HXO)=`JW3FGA8wHDqA
z#Qj>5(MCP;r0bWQ>CENZr_DP{x&U_rcExFT>sXxB4Xg*~NRDyv;&fQLb5?ie@}l@8
z(5F$iRm6S|KYqclD|z^Ha(s_}9d4RNLL}Vs$=vnZFuYrAXrnOIYJ5MxpRIp&qD*b?
z8R;1GyuYetx=XHw`!v3jX}P_aMIkONIU9gNQ%Qc7DpcrUM^OE&bTLciufNU(Xzy%kNP~fk^L4|
zCks-8Iw$e(Nf*+oOz-OY6?c2omnk_tfJ)Cjcl+qMbnXdB7g91oy`xyC+od#fq}{2i
z61CbXE}Vko_ELI$96rBn_et!_tPZ?+W|%4BIPKBQ-=1<#i;Ogzc^XpBH>H>>qG(Qq
z|9b*7FRWRh!PlHFr$M4@AB?X(O(uXXqG?Q#uQ@H232ExFNRzKU9gc`(L33>OF{7nd
zN(_aRrg?$7K@~|NRSROI4$X`RVu`CotszOHeIP*!1Qnf;Ge(QzK;dasX+Y+b2vQ)K
zQ5%#y4KT$tuB??-V=@m+h=8Tl8L2Um1C38&FiCq-V=51nI0>+Qs5OaO${=NIb^vWx
z;)FZ>)37;&vLkiEAQwz4MWZ`a!;(%ju*A@Yc#!W#84<*gkX@wtqo;lkc_uMY^c7m2
zOUC~`r=-XAG{%Phldd|d%KrK|v*XN`MPJt^|FBozGP-T%MZ0b6_tT^+OP=q3lD;YP
zdE=AA#NHR_ZcqMr((bLx`R13PZqonW-OhbuIf^=bb({a)-p+mH`(x|#n<$^|&9>z8
zo3VdWNxUwa*jvx;?)~~>i@w<3{}*?nV)A#yc^~^!jtB28^8$JwUNa7i_xq6Sf-%o`
zmy;js78map7VORqPL9ro2QHW}N(C!yr)0RKP>Og=hDyfT
zKb|OQoJ__g3!Kc9ob|a$#p=KWOAd#WTt+tQlEskhlzinCX+U;Lw%T9b7&M-7S1%JP
zIeRZ@voumt#XCl9xU^7`C@0I=N0naT$B4-EKfouER%xLmD~{m-1h6;;MNtw%eZ3IEAiZuF8(RW!e=@qSN%c+%Oj4QX+M|C
zsCo-w5#(O^O{)wd>V%NR%3xiyteke90cT!43B8GHJq?t6zAw~Mcbmuu~6Ye@UE`^U$#T$(9&w%|N2XKRX|SWYO?j`r;?
zI<%F2^kolQiT!GgP*+Zq#6ufQ*iE&-;`VBblfOsH^Ka4Lud@DV-6z$>iaI0X>wS4&
zR&J#~|0}qOxcupz6sADH!8rHh=O9Rq5$
zneCIkuLNQUsGwV&q{3pT`!ZcBQ-gbtXn(bmx=->>v?SWvvzFKn5Mm9!ns$+^(1N)S
zbNM@*c7vmC56F-(W)Ngx0opY&I4D(F$QT$M5qOv@QH7h3UmD7wj8Jd_pU+wgLk}oN
zrk&ZOyxdUpq9bwq`V$+$+DBz5UcKOPNtwk0ubp6N*G>v&0%kL;OK>J)G8{6nxL&Q+
zgaqOMC+--{kVFJu?OT0_`ue|Wz6dO0)culcVFCokB7`gvSXm=sa5igFf-EV-*eXs%
zh^A#QE6>&?GyJPx^$j@mh%3wMr4-uLu5sxb)iDx&X;>X)WNfXXIy33E5L*zK`bfu&
zt5VqDkUcUsMh$8K3&3Y6_c9UgV*qNr$yHPm)P2N>^bBZKBLp?|BC3}sW{{84P*ee2
zaGCYB=B;;+h~iBV-nDp`N_8d`P{o>;P{wtzC#9Xu$uhD7PZ%Ct3Beq22vq4%1$9T-
z8vq{a5K_dv-eE0lJQ_2DVJxHQdnK?YDBI|J)Mk-YK=FwH;)ZLa3}uM2jqWzYpu~JG
z5ak<3w!(CgK+RA+lSs3RQ1HR&FukY@>4)}$0h3($&N&3wwk1qW;q$8}!(?t3v0%s3
z`B0E3G7L3~r7o^jgCMZ}qg|P+D)m1c_J~-)cq|wIcu0^TY%;uY9tFIMP74d2mh+lQ
zIqpLCig_@BGXsC+HJX}k!z}2gun{ai1X%pF7m9_f`b0!F98(xu0CeCDkW6fWY;45<
zx}aDfVo=M|F$IHW$K9f$-pQ+Gp&&rbg$!d0xrl2#ct))RK@5A1R8{rN+G>F4V1od}
zikN{cz{3f?7NLB|Y&m04{Y{s$YNQ4;iVsuR;%FvDZl;Ij0DUB0M>PPK*Iq8ap{IoRaEjdb5_dyO++c!36Nd2kO*54n1znzys}8pzeHb*~kE~U7M^E2ErhY*pw$b
zTvvPo-ogvENYg14A+m}TCff1HOiF}K3#I)fwfT47$?43wGa->3ls}lsE-4y_CJ{mhILYbBQQ6jJt1j<
zok$q82KWLA*{QDq`Max31E6!hEx8vYIV0X1)X1g?F+Ih&P+kqS?=33
zC!f~jh)bd&Ha`tYnv(iDJUz=HMa^n@jvNypUTc0Fj5HBvor54X=2
z-|U)QT3IT;tm0qgKM!UvI)oO=aLOB)83s+w94=;S^+6p3wMG|o(<J7fj>Dn
z>uFfFg0ZYzYaCcC6dI{|y3))JK215;e9#m@fq-x&?j;{!hOy}pHrLvl>___Gcmf$S
z7oMP(U~$pNF;eJsVmLMcEtoLJfZxXKX2Cwl6(5M=pBwhr##(+QXum+0f(&zts7R~=
z|@5ad`Wb|qVy@q#sio4SnWI<
z+Xc7p$2bwd;8F^z?XTo=R;fD{%J#iDdOM_u^@
zW;2$UMGz7b;V^f!=h=bW85ax-RvTr()@}`nmX*PESj9-Ga9Ohnd52(Vhf+u*KRY_D
z2}E#+O+p}x1kF$cqvS?$TPnFo;S_+#Jw(72?GekQwkmMCVcCFuA00jAr2+Emfqe(n
zpgkZ56$0`PZo0#~7e1n>EKn-POha=a3hIgqtTy>!q{o$fR>6Vg?k-4y8{<*g2+W=<
z)Pus;v^iY(3WeGl5vc%DT?vh1PfFqqN!M75M4*