From 5a10c0cb9b0b3bd93862e390f2bc98bb3acab6fa Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 27 Aug 2024 17:09:48 -0400 Subject: [PATCH] More tests and javadoc. --- .../cmdline/CommandLineProgram.java | 2 +- .../hellbender/tools/CreateBundle.java | 198 ++++++++++++------ .../engine/BundleSupportIntegrationTest.java | 34 +++ .../MultiVariantWalkerIntegrationTest.java | 3 +- .../tools/CreateBundleIntegrationTest.java | 79 +++++-- .../tools/PrintReadsIntegrationTest.java | 57 ++++- .../engine/print_reads_bundle_do_not_use.json | 10 + 7 files changed, 287 insertions(+), 96 deletions(-) create mode 100644 src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java index 8835707c556..2056b676870 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java @@ -156,7 +156,7 @@ public Object instanceMainPostParseArgs() { tmpDir = new GATKPath(System.getProperty("java.io.tmpdir")); } - // Build the default headers + // Build the defauNlt headers final ZonedDateTime startDateTime = ZonedDateTime.now(); this.defaultHeaders.add(new StringHeader(commandLine)); this.defaultHeaders.add(new StringHeader("Started on: " + Utils.getDateTimeForDisplay(startDateTime))); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java index 1cdc73374f7..9e65720b896 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -20,115 +20,173 @@ import java.util.*; /** - * Create a bundle (JSON) file for use with a GATK tool. - * - * Since most bundles need to contain a primary resource plus at least one secondary resource (typically an index), - * the tool will attempt to infer standard secondary resources(s) for a given primary resource if no secondary resource - * is explicitly provided on the command line. Inferred secondary resources are automatically added to the resulting - * bundle. Secondary resource inference can be suppressed by using the --suppress-resource-resolution argument. - * - * Each resource in a bundle must have an associated content type tag. Content types for each resource are either - * specified on the command line via argument tags, or inferred by the tool. For the primary and secondary resources, - * when no content type argument tag is provided, the tool will attempt to infer the content type from the file - * extension. However, the content type for "other" resources (resources that are nether primary nor secondary - * resources) are NEVER inferred, and must always include a content type argument tag. + *

+ * Create a single bundle (JSON) file from one or more related inputs, for use as input to a GATK tool. + *

Bundle Structure

+ * A bundle is a JSON file that contains references to one or more related resources (i.e., a VCF file + * and it's associated index file, or a .fasta reference file and it's associated index and dictionary files). + * Bundle files can be supplied as inputs for many GATK tools. The primary advantage to using a bundle input + * over an individual file input is that a bundle allows the individual resources to be located in different + * directories (ie., a VCF input and it's corresponding index file generally need to be siblings in the same + * parent directory when the VCF is specified as an individual file, whereas using a bundle file, + * the index can be located in a different directory). + *

+ * Each resource in a bundle has an associated content type, which is a string that identifies the type of data + * in that resource. One resource in the bundle is always designated as the "primary" resource, which determines + * the type of the bundle. * + *

An example VCF bundle JSON file is show below. The bundle has 3 resources, with content types + * "HAPLOID_REFERENCE", "REFERENCE_DICTIONARY", and "REFERENCE_INDEX" respectively, with the "HAPLOID_REFERENCE" + * resource designated as the primary resource. This bundle is a reference bundle, and can be used as an + * input where ever a reference argument is to be provided. + *

+ *  {
+ *    "schema": {
+ *     "schemaVersion": "0.1.0",
+ *     "schemaName": "htsbundle"
+ *    },
+ *    "HAPLOID_REFERENCE": {"path": "file:///projects/print_reads.fasta"},
+ *    "REFERENCE_DICTIONARY": {"path": "file:///projects/print_reads.dict"},
+ *    "REFERENCE_INDEX": {"path": "file:///projects/print_reads.fasta.fai"},
+ *    "primary": "HAPLOID_REFERENCE"
+ *  }
+ * 
+ *

Using CreateBundle

+ *

+ * CreateBundle requires at least one primary resource input. One or more optional secondary resource inputs may also + * be supplied. + *

+ * The simplest way to use CreateBundle is to specify only the primary resource. In this case (no secondary + * resources are explicitly provided), CreateBundle will attempt to locate and infer "standard" secondary + * resources, as long as the primary resource content type is a well known type (see "Standard Secondary + * Resources" below), unless the "--suppress-resource-resolution" argument is used to suppress this behavior. + * However, secondary resource inference requires that the primary resource has a well known type (variants + * or a fasta file), and the secondary resources must be siblings (in the same parent directory) as the + * primary resource. If the type of the primary resource is not supplied, and cannot be determined, or if the + * standard secondary resources for a standard primary resource cannot be found in the same director as the + * primary, an exception will be thrown. + *

+ * Another way to is to explicitly specify all of the resources and their content types. This is useful when + * the resources are not in the same directory, or when the content types are not standard. + *

+ * For the primary resource, if the content type is not specified on the command line (content types are + * supplied on the command line using argument tags - see "Standard Content Types" and the examples below), + * CreateBundle will attempt to determine the content type of the primary resource. Content types for explicitly + * supplied secondary resources are never inferred, and must always be supplied explicitly. + *

* Bundle output file names must end with the suffix ".json". - * - * In general, content types can be any string, but there are well known content types that must be used when creating - * bundles for tools that expect well known resources types, such as a VCF, a VCF index, a .fasta file, or a reference + *

+ *

Standard Content Types

+ * In general, bundle content types can be any string, but many tools expect bundles to use standard, well known + * content types that are pre-defined, such as content types for a VCF, a VCF index, a .fasta file, or a reference * dictionary file. The common well known content types are: - * - * - "CT_VARIANT_CONTEXTS": a VCF file - * - "CT_VARIANTS_INDEX: VCF" index file - * - * - "CT_HAPLOID_REFERENCE": fasta reference file - * - "CT_HAPLOID_REFERENCE_INDEX": fasta index file - * - "CT_HAPLOID_REFERENCE_DICTIONARY": fasta dictionary file - * - * Common bundle creation examples: - * - * VCF Bundles: - * + *

Standard VCF Content Types

+ * + *

Standard Reference Content Types

+ *

+ *

Standard Secondary Resources

+ * For VCFS, the standard (inferred) secondary resources are: + * + * For references, the standard secondary resources are: + * + *

+ *

Common bundle creation examples:

+ *

+ *

VCF Bundle Examples

+ *

* 1) Create a resource bundle for a VCF from just the VCF, letting the tool resolve the secondary (index) resource by * automatically finding the sibling index file, and letting the tool determine the content types. If the sibling index - * file cannot be found, an exception will be thrown. Resulting bundle contains the VCF and associated index. - * + * file cannot be found, an exception will be thrown. The resulting bundle contains the VCF and associated index. + *

  *    CreateBundle \
  *      --primary path/to/my.vcf \
  *      --output mybundle.json
- *
- *  The exact same bundle could be created manually by specifying both the resources and the content types explicitly:
- *
+ * 

+ * The exact same bundle could be created manually by specifying both the resources and the content types explicitly: + *

  *     CreateBundle \
  *      --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
  *      --secondary:CT_VARIANTS_INDEX path/to/my.vcf.idx \
  *      --output mybundle.json
- *
+ * 

+ *

* 2) Create a resource bundle for a VCF from just the VCF, but suppress automatic resolution of the secondary - * resources. Let the tool determine the content types. The resulting bundle will contain only the vcf resource: - * + * resources. Let the tool determine the content type. The resulting bundle will contain only the VCF resource: + *

  *    CreateBundle \
  *      --primary path/to/my.vcf \
  *      --suppress-resource-resolution \
  *      --output mybundle.json
- *
+ * 

* 3) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (which * suppresses automatic secondary resolution). This is useful when the VCF and index are not in the same directory. - * Let the tool determine the content types. The resulting bundle will contain the VCF and index resources: - * + * Let the tool determine the primary content type. The resulting bundle will contain the VCF and index resources: + *

  *     CreateBundle \
  *      --primary path/to/my.vcf \
- *      --secondary some/other/path/to/vcd.idx \
+ *      --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
  *      --output mybundle.json
- *
+ * 

* 4) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (this * is useful when the VCF and index are not in the same directory), and specify the content types explicitly via * command line argument tags. The resulting bundle will contain the VCF and index resources. - * + *

  *     CreateBundle \
  *      --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
  *      --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
  *      --output mybundle.json
- *
- * Reference bundles:
- *
+ * 

+ *

Reference Bundle Examples

+ *

* 1) Create a resource bundle for a reference from just the .fasta, letting the tool resolve the secondary * (index and dictionary) resource by automatically finding the sibling files, and determining the content types. * If the sibling index file cannot be found, an exception will be thrown. The resulting bundle will contain the * reference, index, and dictionary. - * + *

  *    CreateBundle \
  *      --primary path/to/my.fasta \
  *      --output mybundle.json
- *
+ * 

* 2) Create a resource bundle for a reference from just the .fasta, but suppress resolution of the secondary index and - * dictionary resources). Let the tool determine the content type. The resulting bundle will contain only the .fasta + * dictionary resources. Let the tool determine the content type. The resulting bundle will contain only the .fasta * resource: - * + *

  *    CreateBundle \
  *      --primary path/to/my.fasta \
  *      --suppress-resource-resolution \
  *      --output mybundle.json
- *
+ * 

* 3) Create a resource bundle for a fasta, but specify the fasta AND the secondary index and dictionary resources * explicitly (which suppresses automatic secondary resolution). Let the tool determine the content types. The * resulting bundle will contain the fasta, index and dictionary resources: - * + *

  *     CreateBundle \
  *      --primary path/to/my.fasta \
- *      --secondary some/other/path/to/my.fai \
- *      --secondary some/other/path/to/my.dict \
+ *      --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
+ *      --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
  *      --output mybundle.json
- *
+ * 

* 4) Create a resource bundle for a fasta, but specify the fasta, index and dictionary resources and the content * types explicitly. The resulting bundle will contain the fasta, index and dictionary resources: - * + *

  *     CreateBundle \
  *      --primary:CT_HAPLOID_REFERENCE path/to/my.fasta \
  *      --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
  *      --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
  *      --output mybundle.json
+ * 

*/ @DocumentedFeature @CommandLineProgramProperties( @@ -141,15 +199,15 @@ public class CreateBundle extends CommandLineProgram { @Argument(fullName = StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME, shortName = StandardArgumentDefinitions.PRIMARY_RESOURCE_SHORT_NAME, - doc="Path to the primary bundle resource (content type will be inferred if no content type tag is specified)") + doc="Path to the primary resource (content type will be inferred if no content type tag is specified)") GATKPath primaryResource; @Argument(fullName = StandardArgumentDefinitions.SECONDARY_RESOURCE_LONG_NAME, shortName = StandardArgumentDefinitions.SECONDARY_RESOURCE_SHORT_NAME, - doc = "Path to a secondary bundle resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + - "(usually an index). The content type will be inferred if no content type tag is specified. If no "+ - " secondary resource is specified, standard secondary resources for the primary bundle resource " + - " will also automatically be inferred unless " + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME + + doc = "Path to a secondary resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + + " (usually an index). If no secondary resource is specified, standard secondary resources" + + " for the primary bundle resource will be automatically inferred unless " + + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME + " is specified.", optional = true) GATKPath secondaryResource; @@ -212,7 +270,7 @@ private BundleType determinePrimaryContentType() { // determine the type of bundle to create; consult the tag attributes if any, otherwise try to infer from the // primary resource's file extension final String primaryContentTag = primaryResource.getTag(); - if (primaryContentTag != null && !primaryContentTag.isEmpty()) { + if (primaryContentTag != null) { if (primaryContentTag.equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { bundleType = BundleType.VCF; } else if (primaryContentTag.equals(BundleResourceType.CT_HAPLOID_REFERENCE)) { @@ -224,7 +282,7 @@ private BundleType determinePrimaryContentType() { bundleType = BundleType.CUSTOM; } } else { - logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the content type from the %s extension.", primaryResource)); + logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the primary content type from the %s extension.", primaryResource)); bundleType = inferPrimaryContentType(primaryResource); } return bundleType; @@ -248,8 +306,8 @@ private Bundle createVCFBundle() { if (secondaryResource != null) { final String secondaryContentType = secondaryResource.getTag(); if (secondaryContentType == null) { - logger.info(String.format("A content type for the secondary resource was not provided. Assuming %s is an index.", secondaryResource)); - bundleResources.add(new IOPathResource(secondaryResource, BundleResourceType.CT_VARIANTS_INDEX)); + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); } else { bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); } @@ -265,7 +323,7 @@ private Bundle createVCFBundle() { } bundleResources.add(new IOPathResource(indexPath.get(), BundleResourceType.CT_VARIANTS_INDEX)); } - if (otherResources != null) { + if (otherResources != null && !otherResources.isEmpty()) { for (final GATKPath otherInput : otherResources) { final String otherContentType = otherInput.getTag(); if (otherContentType == null) { @@ -283,7 +341,7 @@ private Bundle createVCFBundle() { private Bundle createHaploidReferenceBundle() { final Collection bundleResources = new ArrayList<>(); - if (secondaryResource == null && otherResources == null) { + if (secondaryResource == null && (otherResources == null || otherResources.isEmpty())) { // infer dictionary and index return HaploidReferenceResolver.referenceBundleFromFastaPath(primaryResource, GATKPath::new); } @@ -292,12 +350,13 @@ private Bundle createHaploidReferenceBundle() { if (secondaryResource != null) { final String secondaryContentType = secondaryResource.getTag(); if (secondaryContentType == null) { - throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided.")); + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); } else { bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); } } - if (otherResources != null) { + if (otherResources != null && !otherResources.isEmpty()) { for (final GATKPath otherInput : otherResources) { final String otherContentType = otherInput.getTag(); if (otherContentType == null) { @@ -319,12 +378,13 @@ private Bundle createOtherBundle() { if (secondaryResource != null) { final String secondaryContentType = secondaryResource.getTag(); if (secondaryContentType == null) { - throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided.")); + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); } else { bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); } } - if (otherResources != null) { + if (otherResources != null && !otherResources.isEmpty()) { for (final GATKPath otherInput : otherResources) { final String otherContentType = otherInput.getTag(); if (otherContentType == null) { diff --git a/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java new file mode 100644 index 00000000000..04566869e0d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java @@ -0,0 +1,34 @@ +package org.broadinstitute.hellbender.engine; + +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.io.IOPath; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.IOException; + +public class BundleSupportIntegrationTest extends GATKBaseTest { + + // this test uses a serialized bundle file to ensure that we don't unintentionally pick up any + // code (like, from htsjdk) that introduces backward compatibility issues + @Test + public void testReadWriteSerializedReferenceBundle() throws IOException { + // This test file contains absolute paths to files on a local dev machine, so it shouldn't really be used + // for anything other than this test, since the absolute paths are unlikely to work on any other machine. + // But here we just want to make sure we can consume and roundtrip it without error + final IOPath testBundleFilePath = new GATKPath("src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json"); + + // get our test bundle from the file (ensure we canparse it), then write it out to a temp file, read it back + // in, and compare + final Bundle testBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath)); + final IOPath roundTrippedBundleFilePath = new GATKPath( + createTempPath("testReadWriteSerializedReferenceBundle", ".json").toString()); + IOPathUtils.writeStringToPath(roundTrippedBundleFilePath, BundleJSON.toJSON(testBundle)); + final Bundle roundTrippedBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath)); + Assert.assertTrue(Bundle.equalsIgnoreOrder(roundTrippedBundle, testBundle)); + } + +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java index 315fbe3458d..99f135f273a 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java @@ -167,8 +167,7 @@ private static IOPath createRemoteBundleForFile( final File index1, final File vcf2, final File index2) throws IOException { - //TODO: replace this path with getGCPTestStaging() - final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir"); + final String remotePath = BucketUtils.randomRemotePath(getGCPTestStaging() + "remoteBundles", "remote_bundle_test", "dir"); final Path remoteDirPath = IOUtils.getPath(remotePath + "/"); Files.createDirectory(remoteDirPath); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java index 93f31d60e5e..d9904cdeaab 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java @@ -3,6 +3,7 @@ import htsjdk.beta.io.bundle.*; import htsjdk.beta.plugin.IOUtils; import htsjdk.beta.plugin.variants.VariantsBundle; +import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.GATKPath; @@ -13,24 +14,28 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.stream.Collectors; public class CreateBundleIntegrationTest extends CommandLineProgramTest { // force our local paths to use absolute path names to make BundleResource and IOPath equality checks easier, // since once a bundle is round-tripped/serialized to JSON, the resources will always contain absolute path names // for local files + + //NOTE: These variables are Strings, but they are initialized to Strings obtained by first creating a GATKPath, + // and then calling getURIString on the resulting object. This is just shortcut to normalize them so they + // match the strings that will be embedded in the bundles created by the CreateBundle tool (i.e., to have + // full/absolute paths and protocol schemes). private final static String LOCAL_VCF = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf").getURIString(); private final static String LOCAL_VCF_IDX = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf.idx").getURIString(); private final static String LOCAL_VCF_GZIP = new GATKPath("src/test/resources/large/NA24385.vcf.gz").getURIString(); private final static String LOCAL_VCF_TBI = new GATKPath("src/test/resources/large/NA24385.vcf.gz.tbi").getURIString(); private final static String LOCAL_VCF_WITH_NO_INDEX = new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/count_variants_withSequenceDict_noIndex.vcf").getURIString(); - private final static String CLOUD_VCF = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; - private final static String CLOUD_VCF_IDX = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; + private final static String CLOUD_VCF = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; + private final static String CLOUD_VCF_IDX = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; - private final static String LOCAL_FASTA = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta"; - private final static String LOCAL_FASTA_INDEX = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.fai"; - private final static String LOCAL_FASTA_DICT = "src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.dict"; + private final static String LOCAL_FASTA = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta").getURIString(); + private final static String LOCAL_FASTA_INDEX = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.fai").getURIString(); + private final static String LOCAL_FASTA_DICT = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.dict").getURIString(); private final static String CUSTOM_PRIMARY_CT = "primary_ct"; private final static String CUSTOM_SECONDARY_CT = "secondary_ct"; @@ -41,22 +46,21 @@ public Object[][] bundleCases() { return new Object[][] { // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressResourceResolution, expectedBundle - // VCF bundle cases, with AUTOMATIC secondary resolution, and INFERRED content types + // VCF bundle cases, with AUTOMATIC secondary resolution, and INFERRED primary content types {LOCAL_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, {LOCAL_VCF_GZIP, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, {CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, - {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, - // VCF bundle cases, with SUPPRESSED secondary resolution, and INFERRED content types + // VCF bundle cases, with SUPPRESSED secondary resolution, and INFERRED primary content types {LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, - {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + // local vcf that has no index, but since suppressSecondaryResourceResolution is true, we don't throw since we don't try to infer the index {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, - {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, {CLOUD_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF))}, - {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, - // VCF bundle cases, with AUTOMATIC secondary resolution, and EXPLICIT content types + // VCF bundle cases, with AUTOMATIC secondary resolution, and EXPLICIT primary content types {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, @@ -70,9 +74,24 @@ public Object[][] bundleCases() { .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), BundleResourceType.CT_VARIANT_CONTEXTS)) .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), BundleResourceType.CT_VARIANTS_INDEX)) .addSecondary(new IOPathResource(new GATKPath(new GATKPath("someVariantsCompanion.txt").getURIString()), "someVariantsCT")) - .build()}, + .build() + }, // reference bundles + { LOCAL_FASTA, null, null, null, null, null, false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build() + }, + { LOCAL_FASTA, BundleResourceType.CT_HAPLOID_REFERENCE, LOCAL_FASTA_INDEX, BundleResourceType.CT_REFERENCE_INDEX, Arrays.asList(LOCAL_FASTA_DICT), Arrays.asList(BundleResourceType.CT_REFERENCE_DICTIONARY), false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build() + }, // "custom" bundles { @@ -105,17 +124,26 @@ public Object[][] negativeBundleCases() { return new Object[][] { // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle - // no index file can be inferred + // no vcf index file can be inferred {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + // vcf bundle with secondary/other content type not explicitly provided + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, // primary content type not provided, and cannot be inferred from the extension {"primaryFile.ext", null, null, null, null, null, false, null}, // secondary content type not provided {"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null}, + // reference input with unknown content type specified + { LOCAL_FASTA, null, LOCAL_FASTA_INDEX, "unknown", null, null, false, null}, + // other bundle with other content type not provided {"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", CUSTOM_SECONDARY_CT, Arrays.asList("other.txt"), null, false, null}, - // vcf bundle with other content type not provided - {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + }; } @@ -145,6 +173,17 @@ public void testNegativeBundleCases( doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, suppressResourceResolution, expectedBundle); } + @Test(expectedExceptions={CommandLineException.class}) + public void testRequireBundleExtension() { + final GATKPath outputPath = new GATKPath(createTempFile("test", ".bundle.BOGUS").getAbsolutePath().toString()); + final List args = new ArrayList<>(); + args.add("--" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME); + args.add(LOCAL_FASTA); + args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); + args.add(outputPath.toString()); + runCommandLine(args); + } + private void doCreateBundleTest( final String primaryInput, final String primaryInputTag, @@ -176,10 +215,6 @@ private void doCreateBundleTest( args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); args.add(outputPath.toString()); - System.out.println(); - System.out.println(args.stream().collect(Collectors.joining("\n "))); - System.out.println(); - runCommandLine(args); final Bundle actualBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(outputPath), GATKPath::new); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java index 028b5965efc..5fb7702b797 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java @@ -1,9 +1,13 @@ package org.broadinstitute.hellbender.tools; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.*; +import htsjdk.io.IOPath; +import htsjdk.samtools.*; +import htsjdk.samtools.cram.ref.ReferenceSource; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.ReadsPathDataSource; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; @@ -11,12 +15,15 @@ import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Arrays; import java.util.List; @@ -104,4 +111,50 @@ public void testHttpPaths(String reads, String index, String nonHttpReads, Strin SamAssertionUtils.assertEqualBamFiles(out, out2, false, ValidationStringency.DEFAULT_STRINGENCY); } + @Test(groups = {"cloud"}) + public void testPrintReadsWithReferenceBundle() throws IOException { + // test that both reading and writing a cram work when the reference is specified via a bundle where the + // reference, reference index, and reference dictionary are all in different buckets + final IOPath testFastaFile = new GATKPath(getTestDataDir() + "/print_reads.fasta"); + final IOPath testIndexFile = new GATKPath(getTestDataDir() + "/print_reads.fasta.fai"); + final IOPath testDictFile = new GATKPath(getTestDataDir() + "/print_reads.dict"); + + final String targetBucketName = BucketUtils.randomRemotePath(getGCPTestStaging(), "testPrintReadsWithReferenceBundle", "") + "/"; + final IOPath targetBucket = new GATKPath(targetBucketName); + IOUtils.deleteOnExit(targetBucket.toPath()); + + final Path remoteFasta = Files.copy(testFastaFile.toPath(), new GATKPath(targetBucketName + "print_reads.fasta").toPath()); + final IOPath targetIndex = new GATKPath(targetBucketName + "refindex/print_reads.fasta.fai"); + final Path remoteFastaIndex = Files.copy(testIndexFile.toPath(), targetIndex.toPath()); + final IOPath targetDict = new GATKPath(targetBucketName + "refdict/print_reads.dict"); + final Path remoteFastaDict = Files.copy(testDictFile.toPath(), targetDict.toPath()); + + // create a bundle with the remote reference, index, and dict files + final Bundle refBundle = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteFasta.toUri().toString()), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(remoteFastaIndex.toUri().toString()), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(remoteFastaDict.toUri().toString()), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build(); + final IOPath bundleFilePath = new GATKPath(targetBucketName + "refBundle.json"); + IOPathUtils.writeStringToPath(bundleFilePath, BundleJSON.toJSON(refBundle)); + + final IOPath targetOutCRAM = new GATKPath(IOUtils.createTempFile("testReferenceSequenceForNioBundle", ".cram").getAbsolutePath()); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addInput(getTestDataDir() + "/print_reads.cram") + .addReference(bundleFilePath.toString()) + .addOutput(targetOutCRAM.toString()); + runCommandLine(args); + + int count = 0; + try (final SamReader in = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .referenceSource(new ReferenceSource(bundleFilePath.toPath())) + .open(targetOutCRAM.toPath())) { + for (@SuppressWarnings("unused") final SAMRecord rec : in) { + count++; + } + } + Assert.assertEquals(count, 8); + } + } \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json new file mode 100644 index 00000000000..66fe7bf0ecb --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json @@ -0,0 +1,10 @@ +{ + "schema": { + "schemaVersion": "0.1.0", + "schemaName": "htsbundle" + }, + "HAPLOID_REFERENCE": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta"}, + "REFERENCE_DICTIONARY": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.dict"}, + "REFERENCE_INDEX": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta.fai"}, + "primary": "HAPLOID_REFERENCE" +}