From 5a10c0cb9b0b3bd93862e390f2bc98bb3acab6fa Mon Sep 17 00:00:00 2001
From: Chris Norman
Date: Tue, 27 Aug 2024 17:09:48 -0400
Subject: [PATCH] More tests and javadoc.
---
.../cmdline/CommandLineProgram.java | 2 +-
.../hellbender/tools/CreateBundle.java | 198 ++++++++++++------
.../engine/BundleSupportIntegrationTest.java | 34 +++
.../MultiVariantWalkerIntegrationTest.java | 3 +-
.../tools/CreateBundleIntegrationTest.java | 79 +++++--
.../tools/PrintReadsIntegrationTest.java | 57 ++++-
.../engine/print_reads_bundle_do_not_use.json | 10 +
7 files changed, 287 insertions(+), 96 deletions(-)
create mode 100644 src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java
create mode 100644 src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
index 8835707c556..2056b676870 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
@@ -156,7 +156,7 @@ public Object instanceMainPostParseArgs() {
tmpDir = new GATKPath(System.getProperty("java.io.tmpdir"));
}
- // Build the default headers
+ // Build the defauNlt headers
final ZonedDateTime startDateTime = ZonedDateTime.now();
this.defaultHeaders.add(new StringHeader(commandLine));
this.defaultHeaders.add(new StringHeader("Started on: " + Utils.getDateTimeForDisplay(startDateTime)));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java
index 1cdc73374f7..9e65720b896 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java
@@ -20,115 +20,173 @@
import java.util.*;
/**
- * Create a bundle (JSON) file for use with a GATK tool.
- *
- * Since most bundles need to contain a primary resource plus at least one secondary resource (typically an index),
- * the tool will attempt to infer standard secondary resources(s) for a given primary resource if no secondary resource
- * is explicitly provided on the command line. Inferred secondary resources are automatically added to the resulting
- * bundle. Secondary resource inference can be suppressed by using the --suppress-resource-resolution argument.
- *
- * Each resource in a bundle must have an associated content type tag. Content types for each resource are either
- * specified on the command line via argument tags, or inferred by the tool. For the primary and secondary resources,
- * when no content type argument tag is provided, the tool will attempt to infer the content type from the file
- * extension. However, the content type for "other" resources (resources that are nether primary nor secondary
- * resources) are NEVER inferred, and must always include a content type argument tag.
+ *
+ * Create a single bundle (JSON) file from one or more related inputs, for use as input to a GATK tool.
+ *
Bundle Structure
+ * A bundle is a JSON file that contains references to one or more related resources (i.e., a VCF file
+ * and it's associated index file, or a .fasta reference file and it's associated index and dictionary files).
+ * Bundle files can be supplied as inputs for many GATK tools. The primary advantage to using a bundle input
+ * over an individual file input is that a bundle allows the individual resources to be located in different
+ * directories (ie., a VCF input and it's corresponding index file generally need to be siblings in the same
+ * parent directory when the VCF is specified as an individual file, whereas using a bundle file,
+ * the index can be located in a different directory).
+ *
+ * Each resource in a bundle has an associated content type, which is a string that identifies the type of data
+ * in that resource. One resource in the bundle is always designated as the "primary" resource, which determines
+ * the type of the bundle.
*
+ *
An example VCF bundle JSON file is show below. The bundle has 3 resources, with content types
+ * "HAPLOID_REFERENCE", "REFERENCE_DICTIONARY", and "REFERENCE_INDEX" respectively, with the "HAPLOID_REFERENCE"
+ * resource designated as the primary resource. This bundle is a reference bundle, and can be used as an
+ * input where ever a reference argument is to be provided.
+ *
+ * {
+ * "schema": {
+ * "schemaVersion": "0.1.0",
+ * "schemaName": "htsbundle"
+ * },
+ * "HAPLOID_REFERENCE": {"path": "file:///projects/print_reads.fasta"},
+ * "REFERENCE_DICTIONARY": {"path": "file:///projects/print_reads.dict"},
+ * "REFERENCE_INDEX": {"path": "file:///projects/print_reads.fasta.fai"},
+ * "primary": "HAPLOID_REFERENCE"
+ * }
+ *
+ * Using CreateBundle
+ *
+ * CreateBundle requires at least one primary resource input. One or more optional secondary resource inputs may also
+ * be supplied.
+ *
+ * The simplest way to use CreateBundle is to specify only the primary resource. In this case (no secondary
+ * resources are explicitly provided), CreateBundle will attempt to locate and infer "standard" secondary
+ * resources, as long as the primary resource content type is a well known type (see "Standard Secondary
+ * Resources" below), unless the "--suppress-resource-resolution" argument is used to suppress this behavior.
+ * However, secondary resource inference requires that the primary resource has a well known type (variants
+ * or a fasta file), and the secondary resources must be siblings (in the same parent directory) as the
+ * primary resource. If the type of the primary resource is not supplied, and cannot be determined, or if the
+ * standard secondary resources for a standard primary resource cannot be found in the same director as the
+ * primary, an exception will be thrown.
+ *
+ * Another way to is to explicitly specify all of the resources and their content types. This is useful when
+ * the resources are not in the same directory, or when the content types are not standard.
+ *
+ * For the primary resource, if the content type is not specified on the command line (content types are
+ * supplied on the command line using argument tags - see "Standard Content Types" and the examples below),
+ * CreateBundle will attempt to determine the content type of the primary resource. Content types for explicitly
+ * supplied secondary resources are never inferred, and must always be supplied explicitly.
+ *
* Bundle output file names must end with the suffix ".json".
- *
- * In general, content types can be any string, but there are well known content types that must be used when creating
- * bundles for tools that expect well known resources types, such as a VCF, a VCF index, a .fasta file, or a reference
+ *
+ *
Standard Content Types
+ * In general, bundle content types can be any string, but many tools expect bundles to use standard, well known
+ * content types that are pre-defined, such as content types for a VCF, a VCF index, a .fasta file, or a reference
* dictionary file. The common well known content types are:
- *
- * - "CT_VARIANT_CONTEXTS": a VCF file
- * - "CT_VARIANTS_INDEX: VCF" index file
- *
- * - "CT_HAPLOID_REFERENCE": fasta reference file
- * - "CT_HAPLOID_REFERENCE_INDEX": fasta index file
- * - "CT_HAPLOID_REFERENCE_DICTIONARY": fasta dictionary file
- *
- * Common bundle creation examples:
- *
- * VCF Bundles:
- *
+ * Standard VCF Content Types
+ *
+ * - "CT_VARIANT_CONTEXTS": a VCF file
+ * - "CT_VARIANTS_INDEX": a VCF index file
+ *
+ * Standard Reference Content Types
+ *
+ * - "CT_HAPLOID_REFERENCE": a fasta reference file
+ * - "CT_HAPLOID_REFERENCE_INDEX": a fasta index file
+ * - "CT_HAPLOID_REFERENCE_DICTIONARY": a fasta dictionary file
+ *
+ *
Standard Secondary Resources
+ * For VCFS, the standard (inferred) secondary resources are:
+ *
+ * For references, the standard secondary resources are:
+ *
+ * - an index file
+ *- a dictionary file
+ *
+ *
+ *
Common bundle creation examples:
+ *
+ * VCF Bundle Examples
+ *
* 1) Create a resource bundle for a VCF from just the VCF, letting the tool resolve the secondary (index) resource by
* automatically finding the sibling index file, and letting the tool determine the content types. If the sibling index
- * file cannot be found, an exception will be thrown. Resulting bundle contains the VCF and associated index.
- *
+ * file cannot be found, an exception will be thrown. The resulting bundle contains the VCF and associated index.
+ *
* CreateBundle \
* --primary path/to/my.vcf \
* --output mybundle.json
- *
- * The exact same bundle could be created manually by specifying both the resources and the content types explicitly:
- *
+ *
+ * The exact same bundle could be created manually by specifying both the resources and the content types explicitly:
+ *
* CreateBundle \
* --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
* --secondary:CT_VARIANTS_INDEX path/to/my.vcf.idx \
* --output mybundle.json
- *
+ *
+ *
* 2) Create a resource bundle for a VCF from just the VCF, but suppress automatic resolution of the secondary
- * resources. Let the tool determine the content types. The resulting bundle will contain only the vcf resource:
- *
+ * resources. Let the tool determine the content type. The resulting bundle will contain only the VCF resource:
+ *
* CreateBundle \
* --primary path/to/my.vcf \
* --suppress-resource-resolution \
* --output mybundle.json
- *
+ *
* 3) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (which
* suppresses automatic secondary resolution). This is useful when the VCF and index are not in the same directory.
- * Let the tool determine the content types. The resulting bundle will contain the VCF and index resources:
- *
+ * Let the tool determine the primary content type. The resulting bundle will contain the VCF and index resources:
+ *
* CreateBundle \
* --primary path/to/my.vcf \
- * --secondary some/other/path/to/vcd.idx \
+ * --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
* --output mybundle.json
- *
+ *
* 4) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (this
* is useful when the VCF and index are not in the same directory), and specify the content types explicitly via
* command line argument tags. The resulting bundle will contain the VCF and index resources.
- *
+ *
* CreateBundle \
* --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
* --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
* --output mybundle.json
- *
- * Reference bundles:
- *
+ *
+ *
Reference Bundle Examples
+ *
* 1) Create a resource bundle for a reference from just the .fasta, letting the tool resolve the secondary
* (index and dictionary) resource by automatically finding the sibling files, and determining the content types.
* If the sibling index file cannot be found, an exception will be thrown. The resulting bundle will contain the
* reference, index, and dictionary.
- *
+ *
* CreateBundle \
* --primary path/to/my.fasta \
* --output mybundle.json
- *
+ *
* 2) Create a resource bundle for a reference from just the .fasta, but suppress resolution of the secondary index and
- * dictionary resources). Let the tool determine the content type. The resulting bundle will contain only the .fasta
+ * dictionary resources. Let the tool determine the content type. The resulting bundle will contain only the .fasta
* resource:
- *
+ *
* CreateBundle \
* --primary path/to/my.fasta \
* --suppress-resource-resolution \
* --output mybundle.json
- *
+ *
* 3) Create a resource bundle for a fasta, but specify the fasta AND the secondary index and dictionary resources
* explicitly (which suppresses automatic secondary resolution). Let the tool determine the content types. The
* resulting bundle will contain the fasta, index and dictionary resources:
- *
+ *
* CreateBundle \
* --primary path/to/my.fasta \
- * --secondary some/other/path/to/my.fai \
- * --secondary some/other/path/to/my.dict \
+ * --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
+ * --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
* --output mybundle.json
- *
+ *
* 4) Create a resource bundle for a fasta, but specify the fasta, index and dictionary resources and the content
* types explicitly. The resulting bundle will contain the fasta, index and dictionary resources:
- *
+ *
* CreateBundle \
* --primary:CT_HAPLOID_REFERENCE path/to/my.fasta \
* --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
* --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
* --output mybundle.json
+ *
*/
@DocumentedFeature
@CommandLineProgramProperties(
@@ -141,15 +199,15 @@ public class CreateBundle extends CommandLineProgram {
@Argument(fullName = StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME,
shortName = StandardArgumentDefinitions.PRIMARY_RESOURCE_SHORT_NAME,
- doc="Path to the primary bundle resource (content type will be inferred if no content type tag is specified)")
+ doc="Path to the primary resource (content type will be inferred if no content type tag is specified)")
GATKPath primaryResource;
@Argument(fullName = StandardArgumentDefinitions.SECONDARY_RESOURCE_LONG_NAME,
shortName = StandardArgumentDefinitions.SECONDARY_RESOURCE_SHORT_NAME,
- doc = "Path to a secondary bundle resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME +
- "(usually an index). The content type will be inferred if no content type tag is specified. If no "+
- " secondary resource is specified, standard secondary resources for the primary bundle resource " +
- " will also automatically be inferred unless " + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME +
+ doc = "Path to a secondary resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME +
+ " (usually an index). If no secondary resource is specified, standard secondary resources" +
+ " for the primary bundle resource will be automatically inferred unless " +
+ SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME +
" is specified.",
optional = true)
GATKPath secondaryResource;
@@ -212,7 +270,7 @@ private BundleType determinePrimaryContentType() {
// determine the type of bundle to create; consult the tag attributes if any, otherwise try to infer from the
// primary resource's file extension
final String primaryContentTag = primaryResource.getTag();
- if (primaryContentTag != null && !primaryContentTag.isEmpty()) {
+ if (primaryContentTag != null) {
if (primaryContentTag.equals(BundleResourceType.CT_VARIANT_CONTEXTS)) {
bundleType = BundleType.VCF;
} else if (primaryContentTag.equals(BundleResourceType.CT_HAPLOID_REFERENCE)) {
@@ -224,7 +282,7 @@ private BundleType determinePrimaryContentType() {
bundleType = BundleType.CUSTOM;
}
} else {
- logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the content type from the %s extension.", primaryResource));
+ logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the primary content type from the %s extension.", primaryResource));
bundleType = inferPrimaryContentType(primaryResource);
}
return bundleType;
@@ -248,8 +306,8 @@ private Bundle createVCFBundle() {
if (secondaryResource != null) {
final String secondaryContentType = secondaryResource.getTag();
if (secondaryContentType == null) {
- logger.info(String.format("A content type for the secondary resource was not provided. Assuming %s is an index.", secondaryResource));
- bundleResources.add(new IOPathResource(secondaryResource, BundleResourceType.CT_VARIANTS_INDEX));
+ throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.",
+ secondaryResource.getRawInputString()));
} else {
bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType));
}
@@ -265,7 +323,7 @@ private Bundle createVCFBundle() {
}
bundleResources.add(new IOPathResource(indexPath.get(), BundleResourceType.CT_VARIANTS_INDEX));
}
- if (otherResources != null) {
+ if (otherResources != null && !otherResources.isEmpty()) {
for (final GATKPath otherInput : otherResources) {
final String otherContentType = otherInput.getTag();
if (otherContentType == null) {
@@ -283,7 +341,7 @@ private Bundle createVCFBundle() {
private Bundle createHaploidReferenceBundle() {
final Collection bundleResources = new ArrayList<>();
- if (secondaryResource == null && otherResources == null) {
+ if (secondaryResource == null && (otherResources == null || otherResources.isEmpty())) {
// infer dictionary and index
return HaploidReferenceResolver.referenceBundleFromFastaPath(primaryResource, GATKPath::new);
}
@@ -292,12 +350,13 @@ private Bundle createHaploidReferenceBundle() {
if (secondaryResource != null) {
final String secondaryContentType = secondaryResource.getTag();
if (secondaryContentType == null) {
- throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided."));
+ throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.",
+ secondaryResource.getRawInputString()));
} else {
bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType));
}
}
- if (otherResources != null) {
+ if (otherResources != null && !otherResources.isEmpty()) {
for (final GATKPath otherInput : otherResources) {
final String otherContentType = otherInput.getTag();
if (otherContentType == null) {
@@ -319,12 +378,13 @@ private Bundle createOtherBundle() {
if (secondaryResource != null) {
final String secondaryContentType = secondaryResource.getTag();
if (secondaryContentType == null) {
- throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided."));
+ throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.",
+ secondaryResource.getRawInputString()));
} else {
bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType));
}
}
- if (otherResources != null) {
+ if (otherResources != null && !otherResources.isEmpty()) {
for (final GATKPath otherInput : otherResources) {
final String otherContentType = otherInput.getTag();
if (otherContentType == null) {
diff --git a/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java
new file mode 100644
index 00000000000..04566869e0d
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java
@@ -0,0 +1,34 @@
+package org.broadinstitute.hellbender.engine;
+
+import htsjdk.beta.io.IOPathUtils;
+import htsjdk.beta.io.bundle.Bundle;
+import htsjdk.beta.io.bundle.BundleJSON;
+import htsjdk.io.IOPath;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+
+public class BundleSupportIntegrationTest extends GATKBaseTest {
+
+ // this test uses a serialized bundle file to ensure that we don't unintentionally pick up any
+ // code (like, from htsjdk) that introduces backward compatibility issues
+ @Test
+ public void testReadWriteSerializedReferenceBundle() throws IOException {
+ // This test file contains absolute paths to files on a local dev machine, so it shouldn't really be used
+ // for anything other than this test, since the absolute paths are unlikely to work on any other machine.
+ // But here we just want to make sure we can consume and roundtrip it without error
+ final IOPath testBundleFilePath = new GATKPath("src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json");
+
+ // get our test bundle from the file (ensure we canparse it), then write it out to a temp file, read it back
+ // in, and compare
+ final Bundle testBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath));
+ final IOPath roundTrippedBundleFilePath = new GATKPath(
+ createTempPath("testReadWriteSerializedReferenceBundle", ".json").toString());
+ IOPathUtils.writeStringToPath(roundTrippedBundleFilePath, BundleJSON.toJSON(testBundle));
+ final Bundle roundTrippedBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath));
+ Assert.assertTrue(Bundle.equalsIgnoreOrder(roundTrippedBundle, testBundle));
+ }
+
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java
index 315fbe3458d..99f135f273a 100644
--- a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java
@@ -167,8 +167,7 @@ private static IOPath createRemoteBundleForFile(
final File index1,
final File vcf2,
final File index2) throws IOException {
- //TODO: replace this path with getGCPTestStaging()
- final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir");
+ final String remotePath = BucketUtils.randomRemotePath(getGCPTestStaging() + "remoteBundles", "remote_bundle_test", "dir");
final Path remoteDirPath = IOUtils.getPath(remotePath + "/");
Files.createDirectory(remoteDirPath);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java
index 93f31d60e5e..d9904cdeaab 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java
@@ -3,6 +3,7 @@
import htsjdk.beta.io.bundle.*;
import htsjdk.beta.plugin.IOUtils;
import htsjdk.beta.plugin.variants.VariantsBundle;
+import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
@@ -13,24 +14,28 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.stream.Collectors;
public class CreateBundleIntegrationTest extends CommandLineProgramTest {
// force our local paths to use absolute path names to make BundleResource and IOPath equality checks easier,
// since once a bundle is round-tripped/serialized to JSON, the resources will always contain absolute path names
// for local files
+
+ //NOTE: These variables are Strings, but they are initialized to Strings obtained by first creating a GATKPath,
+ // and then calling getURIString on the resulting object. This is just shortcut to normalize them so they
+ // match the strings that will be embedded in the bundles created by the CreateBundle tool (i.e., to have
+ // full/absolute paths and protocol schemes).
private final static String LOCAL_VCF = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf").getURIString();
private final static String LOCAL_VCF_IDX = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf.idx").getURIString();
private final static String LOCAL_VCF_GZIP = new GATKPath("src/test/resources/large/NA24385.vcf.gz").getURIString();
private final static String LOCAL_VCF_TBI = new GATKPath("src/test/resources/large/NA24385.vcf.gz.tbi").getURIString();
private final static String LOCAL_VCF_WITH_NO_INDEX = new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/count_variants_withSequenceDict_noIndex.vcf").getURIString();
- private final static String CLOUD_VCF = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf";
- private final static String CLOUD_VCF_IDX = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx";
+ private final static String CLOUD_VCF = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf";
+ private final static String CLOUD_VCF_IDX = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx";
- private final static String LOCAL_FASTA = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta";
- private final static String LOCAL_FASTA_INDEX = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.fai";
- private final static String LOCAL_FASTA_DICT = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.dict";
+ private final static String LOCAL_FASTA = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta").getURIString();
+ private final static String LOCAL_FASTA_INDEX = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.fai").getURIString();
+ private final static String LOCAL_FASTA_DICT = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.dict").getURIString();
private final static String CUSTOM_PRIMARY_CT = "primary_ct";
private final static String CUSTOM_SECONDARY_CT = "secondary_ct";
@@ -41,22 +46,21 @@ public Object[][] bundleCases() {
return new Object[][] {
// primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressResourceResolution, expectedBundle
- // VCF bundle cases, with AUTOMATIC secondary resolution, and INFERRED content types
+ // VCF bundle cases, with AUTOMATIC secondary resolution, and INFERRED primary content types
{LOCAL_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
- {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
+ {LOCAL_VCF, null, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
{LOCAL_VCF_GZIP, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))},
+ {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))},
{CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
- {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
+ {CLOUD_VCF, null, CLOUD_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
- // VCF bundle cases, with SUPPRESSED secondary resolution, and INFERRED content types
+ // VCF bundle cases, with SUPPRESSED secondary resolution, and INFERRED primary content types
{LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))},
- {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
+ // local vcf that has no index, but since suppressSecondaryResourceResolution is true, we don't throw since we don't try to infer the index
{LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))},
- {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))},
{CLOUD_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF))},
- {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
- // VCF bundle cases, with AUTOMATIC secondary resolution, and EXPLICIT content types
+ // VCF bundle cases, with AUTOMATIC secondary resolution, and EXPLICIT primary content types
{LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
{LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
@@ -70,9 +74,24 @@ public Object[][] bundleCases() {
.addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), BundleResourceType.CT_VARIANT_CONTEXTS))
.addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), BundleResourceType.CT_VARIANTS_INDEX))
.addSecondary(new IOPathResource(new GATKPath(new GATKPath("someVariantsCompanion.txt").getURIString()), "someVariantsCT"))
- .build()},
+ .build()
+ },
// reference bundles
+ { LOCAL_FASTA, null, null, null, null, null, false,
+ new BundleBuilder()
+ .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE))
+ .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX))
+ .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY))
+ .build()
+ },
+ { LOCAL_FASTA, BundleResourceType.CT_HAPLOID_REFERENCE, LOCAL_FASTA_INDEX, BundleResourceType.CT_REFERENCE_INDEX, Arrays.asList(LOCAL_FASTA_DICT), Arrays.asList(BundleResourceType.CT_REFERENCE_DICTIONARY), false,
+ new BundleBuilder()
+ .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE))
+ .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX))
+ .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY))
+ .build()
+ },
// "custom" bundles
{
@@ -105,17 +124,26 @@ public Object[][] negativeBundleCases() {
return new Object[][] {
// primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle
- // no index file can be inferred
+ // no vcf index file can be inferred
{LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))},
+ // vcf bundle with secondary/other content type not explicitly provided
+ {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null},
+ {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
+ {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))},
+ {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))},
+ {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
+ {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))},
// primary content type not provided, and cannot be inferred from the extension
{"primaryFile.ext", null, null, null, null, null, false, null},
// secondary content type not provided
{"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null},
+ // reference input with unknown content type specified
+ { LOCAL_FASTA, null, LOCAL_FASTA_INDEX, "unknown", null, null, false, null},
+
// other bundle with other content type not provided
{"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", CUSTOM_SECONDARY_CT, Arrays.asList("other.txt"), null, false, null},
- // vcf bundle with other content type not provided
- {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null},
+
};
}
@@ -145,6 +173,17 @@ public void testNegativeBundleCases(
doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, suppressResourceResolution, expectedBundle);
}
+ @Test(expectedExceptions={CommandLineException.class})
+ public void testRequireBundleExtension() {
+ final GATKPath outputPath = new GATKPath(createTempFile("test", ".bundle.BOGUS").getAbsolutePath().toString());
+ final List args = new ArrayList<>();
+ args.add("--" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME);
+ args.add(LOCAL_FASTA);
+ args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME);
+ args.add(outputPath.toString());
+ runCommandLine(args);
+ }
+
private void doCreateBundleTest(
final String primaryInput,
final String primaryInputTag,
@@ -176,10 +215,6 @@ private void doCreateBundleTest(
args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME);
args.add(outputPath.toString());
- System.out.println();
- System.out.println(args.stream().collect(Collectors.joining("\n ")));
- System.out.println();
-
runCommandLine(args);
final Bundle actualBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(outputPath), GATKPath::new);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java
index 028b5965efc..5fb7702b797 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java
@@ -1,9 +1,13 @@
package org.broadinstitute.hellbender.tools;
-import htsjdk.samtools.SamReaderFactory;
-import htsjdk.samtools.ValidationStringency;
+import htsjdk.beta.io.IOPathUtils;
+import htsjdk.beta.io.bundle.*;
+import htsjdk.io.IOPath;
+import htsjdk.samtools.*;
+import htsjdk.samtools.cram.ref.ReferenceSource;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.engine.ReadsPathDataSource;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
@@ -11,12 +15,15 @@
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
@@ -104,4 +111,50 @@ public void testHttpPaths(String reads, String index, String nonHttpReads, Strin
SamAssertionUtils.assertEqualBamFiles(out, out2, false, ValidationStringency.DEFAULT_STRINGENCY);
}
+ @Test(groups = {"cloud"})
+ public void testPrintReadsWithReferenceBundle() throws IOException {
+ // test that both reading and writing a cram work when the reference is specified via a bundle where the
+ // reference, reference index, and reference dictionary are all in different buckets
+ final IOPath testFastaFile = new GATKPath(getTestDataDir() + "/print_reads.fasta");
+ final IOPath testIndexFile = new GATKPath(getTestDataDir() + "/print_reads.fasta.fai");
+ final IOPath testDictFile = new GATKPath(getTestDataDir() + "/print_reads.dict");
+
+ final String targetBucketName = BucketUtils.randomRemotePath(getGCPTestStaging(), "testPrintReadsWithReferenceBundle", "") + "/";
+ final IOPath targetBucket = new GATKPath(targetBucketName);
+ IOUtils.deleteOnExit(targetBucket.toPath());
+
+ final Path remoteFasta = Files.copy(testFastaFile.toPath(), new GATKPath(targetBucketName + "print_reads.fasta").toPath());
+ final IOPath targetIndex = new GATKPath(targetBucketName + "refindex/print_reads.fasta.fai");
+ final Path remoteFastaIndex = Files.copy(testIndexFile.toPath(), targetIndex.toPath());
+ final IOPath targetDict = new GATKPath(targetBucketName + "refdict/print_reads.dict");
+ final Path remoteFastaDict = Files.copy(testDictFile.toPath(), targetDict.toPath());
+
+ // create a bundle with the remote reference, index, and dict files
+ final Bundle refBundle = new BundleBuilder()
+ .addPrimary(new IOPathResource(new GATKPath(remoteFasta.toUri().toString()), BundleResourceType.CT_HAPLOID_REFERENCE))
+ .addSecondary(new IOPathResource(new GATKPath(remoteFastaIndex.toUri().toString()), BundleResourceType.CT_REFERENCE_INDEX))
+ .addSecondary(new IOPathResource(new GATKPath(remoteFastaDict.toUri().toString()), BundleResourceType.CT_REFERENCE_DICTIONARY))
+ .build();
+ final IOPath bundleFilePath = new GATKPath(targetBucketName + "refBundle.json");
+ IOPathUtils.writeStringToPath(bundleFilePath, BundleJSON.toJSON(refBundle));
+
+ final IOPath targetOutCRAM = new GATKPath(IOUtils.createTempFile("testReferenceSequenceForNioBundle", ".cram").getAbsolutePath());
+ final ArgumentsBuilder args = new ArgumentsBuilder()
+ .addInput(getTestDataDir() + "/print_reads.cram")
+ .addReference(bundleFilePath.toString())
+ .addOutput(targetOutCRAM.toString());
+ runCommandLine(args);
+
+ int count = 0;
+ try (final SamReader in = SamReaderFactory.makeDefault()
+ .validationStringency(ValidationStringency.SILENT)
+ .referenceSource(new ReferenceSource(bundleFilePath.toPath()))
+ .open(targetOutCRAM.toPath())) {
+ for (@SuppressWarnings("unused") final SAMRecord rec : in) {
+ count++;
+ }
+ }
+ Assert.assertEquals(count, 8);
+ }
+
}
\ No newline at end of file
diff --git a/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json
new file mode 100644
index 00000000000..66fe7bf0ecb
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json
@@ -0,0 +1,10 @@
+{
+ "schema": {
+ "schemaVersion": "0.1.0",
+ "schemaName": "htsbundle"
+ },
+ "HAPLOID_REFERENCE": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta"},
+ "REFERENCE_DICTIONARY": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.dict"},
+ "REFERENCE_INDEX": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta.fai"},
+ "primary": "HAPLOID_REFERENCE"
+}