From 20e34e87414cb75ba44500d5ca35be5b796a5a14 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Thu, 19 Jan 2023 17:33:17 -0500 Subject: [PATCH] Add a --numeric-gt option to VariantsToTable * add a new option to VariantsToTable to allow outputting VCF-style numeric GT fields; previously it always output the actual bases of the Allele in the GT spot * resolves https://github.com/broadinstitute/gatk/issues/8160 * updates htsjdk to 3.0.5 --- .../walkers/variantutils/VariantsToTable.java | 32 +++++++++++++------ .../VariantsToTableIntegrationTest.java | 16 ++++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java index db5d4e2e63c..0eaa843dcc4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.variantutils; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.*; import org.apache.logging.log4j.LogManager; @@ -122,6 +123,7 @@ public final class VariantsToTable extends VariantWalker { public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic"; public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA"; + public static final String NUMERIC_GT_FULLNAME = "numeric-gt"; static final Logger logger = LogManager.getLogger(VariantsToTable.class); @@ -204,6 +206,11 @@ public final class VariantsToTable extends VariantWalker { doc="Fail on missing data", optional=true) public boolean errorIfMissingData = false; + @Argument(fullName = NUMERIC_GT_FULLNAME, + doc = "write the GT field the way it appears in a VCF ( ex. 
0/1 instead of A/T )", + optional = true) + public boolean useNumericGT = false; + private static final String MISSING_DATA = "NA"; private SortedSet samples; @@ -341,7 +348,7 @@ private void emitMoltenizedOutput(final List record) { * @param vc the VariantContext whose field values we can to capture * @return List of lists of field values */ - protected List> extractFields(final VariantContext vc) { + private List> extractFields(final VariantContext vc) { final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; final List> records = new ArrayList<>(numRecordsToProduce); @@ -395,18 +402,23 @@ protected List> extractFields(final VariantContext vc) { private void addGenotypeFieldsToRecords(final VariantContext vc, final List> records, final boolean errorIfMissingData) { for ( final String sample : samples ) { + final Genotype genotype = vc.getGenotype(sample); for ( final String gf : genotypeFieldsToTake ) { - if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) { + if ( vc.hasGenotype(sample) && genotype.hasAnyAttribute(gf) ) { if (VCFConstants.GENOTYPE_KEY.equals(gf)) { - addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records); + if(useNumericGT) { + addFieldValue(VCFEncoder.encodeGtField(vc, genotype), records); + } else { + addFieldValue(genotype.getGenotypeString(true), records); + } } else { /** * TODO - If gf == "FT" and the GT record is not filtered, Genotype.getAnyAttribute == null. Genotype.hasAnyAttribute should be changed so it * returns false for this condition. Presently, it always returns true. Once this is fixed, then only the "addFieldValue" statement will * remain in the following logic block. 
*/ - if (vc.getGenotype(sample).getAnyAttribute(gf) != null) { - addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records); + if (genotype.getAnyAttribute(gf) != null) { + addFieldValue(genotype.getAnyAttribute(gf), records); } else { handleMissingData(errorIfMissingData, gf, records, vc); } } @@ -416,21 +428,21 @@ private void addGenotypeFieldsToRecords(final VariantContext vc, final List altDepths = new ArrayList<>(); - int[] allDepths = vc.getGenotype(sample).getAD(); + final List altDepths = new ArrayList<>(); + int[] allDepths = genotype.getAD(); for (int i = 1; i < allDepths.length; i++) { altDepths.add(allDepths[0] + "," + allDepths[i]); } addFieldValue(altDepths, records); } else { - addAlleleSpecificFieldValue(split(vc.getGenotype(sample).getExtendedAttribute(field).toString(), ','), + addAlleleSpecificFieldValue(split(genotype.getExtendedAttribute(field).toString(), ','), records, inputHeader.getFormatHeaderLine(field).getCountType()); } } else { - final String value = vc.getGenotype(sample).getAnyAttribute(field).toString(); + final String value = genotype.getAnyAttribute(field).toString(); if (field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { addFieldValue(value.replace("[","").replace("]","").replaceAll("\\s",""),records); } else { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java index 10c922a267a..b1e4b96a084 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -2,6 +2,7 @@ import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import 
org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.testng.annotations.Test; @@ -276,5 +277,20 @@ public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOExcepti IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); } + + @Test + public void testNumericGTFlag() throws IOException { + final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf"); + final File outputFile = createTempFile("numericGT", ".table"); + final File expectedFile = new File(getToolTestDataDir(), "expected.numericGT.table"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addVCF(inputFile) + .addOutput(outputFile) + .addFlag(VariantsToTable.NUMERIC_GT_FULLNAME); + runCommandLine(args); + + IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); + } }