Skip to content

Commit

Permalink
Fixed support for newer Gencode GTF versions by making the GencodeGTF…
Browse files Browse the repository at this point in the history
…Field parsing more permissive (#8351)
  • Loading branch information
jamesemery authored Jun 28, 2023
1 parent daeb3e2 commit 01e45a2
Show file tree
Hide file tree
Showing 17 changed files with 2,886 additions and 3,012 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.funcotator;

import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGTFFieldConstants;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;

import java.util.Comparator;
Expand Down Expand Up @@ -246,8 +247,8 @@ public ComparatorByProteinCodingStatus(){}
@Override
public int compare( final GencodeFuncotation a, final GencodeFuncotation b ) {
// Is it protein coding?
final boolean isAProteinCoding = GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(a.getGeneTranscriptType());
final boolean isBProteinCoding = GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(b.getGeneTranscriptType());
final boolean isAProteinCoding = GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString().equals(a.getGeneTranscriptType());
final boolean isBProteinCoding = GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString().equals(b.getGeneTranscriptType());
if ( isAProteinCoding != isBProteinCoding ) {
if ( isAProteinCoding ) {
return -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.broadinstitute.hellbender.tools.funcotator.Funcotation;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGTFFieldConstants;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfGeneFeature;

import java.util.Arrays;
Expand Down Expand Up @@ -74,7 +75,7 @@ public class GencodeFuncotation implements Funcotation {

// These are included because they help determine the transcript selection
private Integer locusLevel;
private GencodeGtfGeneFeature.FeatureTag apprisRank;
private GencodeGTFFieldConstants.FeatureTag apprisRank;
private Integer transcriptLength;
private String version;
private String geneTranscriptType;
Expand Down Expand Up @@ -378,7 +379,7 @@ public boolean equals(final Object o) {
if (transcriptLength != null ? !transcriptLength.equals(that.transcriptLength) : that.transcriptLength != null)
return false;
if (version != null ? !version.equals(that.version) : that.version != null) return false;
if (geneTranscriptType != that.geneTranscriptType) return false;
if (geneTranscriptType != that.geneTranscriptType) return false; //TODO this is a problem string equality comparison.... it breaks tests to fix it though...
if (hugoSymbolSerializedOverride != null ? !hugoSymbolSerializedOverride.equals(that.hugoSymbolSerializedOverride) : that.hugoSymbolSerializedOverride != null)
return false;
if (ncbiBuildSerializedOverride != null ? !ncbiBuildSerializedOverride.equals(that.ncbiBuildSerializedOverride) : that.ncbiBuildSerializedOverride != null)
Expand Down Expand Up @@ -660,11 +661,11 @@ public void setLocusLevel(final Integer locusLevel) {
this.locusLevel = locusLevel;
}

public GencodeGtfGeneFeature.FeatureTag getApprisRank() {
public GencodeGTFFieldConstants.FeatureTag getApprisRank() {
return apprisRank;
}

public void setApprisRank(final GencodeGtfGeneFeature.FeatureTag apprisRank) {
public void setApprisRank(final GencodeGTFFieldConstants.FeatureTag apprisRank) {
this.apprisRank = apprisRank;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadataUtils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGTFFieldConstants;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfGeneFeature;

import java.util.ArrayList;
Expand Down Expand Up @@ -260,10 +261,10 @@ public GencodeFuncotationBuilder setLocusLevel( final Integer locusLevel ) {

/**
* Set the Appris Rank in the {@link GencodeFuncotation}.
* @param apprisRank The {@link GencodeGtfGeneFeature.FeatureTag} containing the Appris Rank for the {@link GencodeFuncotation}.
* @param apprisRank The {@link GencodeGTFFieldConstants.FeatureTag} containing the Appris Rank for the {@link GencodeFuncotation}.
* @return {@code this} {@link GencodeFuncotationBuilder}
*/
public GencodeFuncotationBuilder setApprisRank( final GencodeGtfGeneFeature.FeatureTag apprisRank ) {
public GencodeFuncotationBuilder setApprisRank( final GencodeGTFFieldConstants.FeatureTag apprisRank ) {
gencodeFuncotation.setApprisRank( apprisRank );
return this;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.FeatureTag.*;
import static org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGTFFieldConstants.FeatureTag.*;

/**
* A factory to create {@link GencodeFuncotation}s.
Expand Down Expand Up @@ -111,24 +111,22 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
/**
* List of valid Appris Ranks used for sorting funcotations to get the "best" one.z
*/
private static final HashSet<GencodeGtfGeneFeature.FeatureTag> apprisRanks = new HashSet<>(
Arrays.asList(
APPRIS_PRINCIPAL,
APPRIS_PRINCIPAL_1,
APPRIS_PRINCIPAL_2,
APPRIS_PRINCIPAL_3,
APPRIS_PRINCIPAL_4,
APPRIS_PRINCIPAL_5,
APPRIS_ALTERNATIVE_1,
APPRIS_ALTERNATIVE_2,
APPRIS_CANDIDATE_HIGHEST_SCORE,
APPRIS_CANDIDATE_LONGEST_CCDS,
APPRIS_CANDIDATE_CCDS,
APPRIS_CANDIDATE_LONGEST_SEQ,
APPRIS_CANDIDATE_LONGEST,
APPRIS_CANDIDATE
)
);
private static final LinkedHashMap<String, GencodeGTFFieldConstants.FeatureTag> apprisRanks = new LinkedHashMap<>() {{
put(APPRIS_PRINCIPAL.toString(), APPRIS_PRINCIPAL);
put(APPRIS_PRINCIPAL_1.toString(), APPRIS_PRINCIPAL_1);
put(APPRIS_PRINCIPAL_2.toString(), APPRIS_PRINCIPAL_2);
put(APPRIS_PRINCIPAL_3.toString(), APPRIS_PRINCIPAL_3);
put(APPRIS_PRINCIPAL_4.toString(), APPRIS_PRINCIPAL_4);
put(APPRIS_PRINCIPAL_5.toString(), APPRIS_PRINCIPAL_5);
put(APPRIS_ALTERNATIVE_1.toString(), APPRIS_ALTERNATIVE_1);
put(APPRIS_ALTERNATIVE_2.toString(), APPRIS_ALTERNATIVE_2);
put(APPRIS_CANDIDATE_HIGHEST_SCORE.toString(), APPRIS_CANDIDATE_HIGHEST_SCORE);
put(APPRIS_CANDIDATE_LONGEST_CCDS.toString(), APPRIS_CANDIDATE_LONGEST_CCDS);
put(APPRIS_CANDIDATE_CCDS.toString(), APPRIS_CANDIDATE_CCDS);
put(APPRIS_CANDIDATE_LONGEST_SEQ.toString(), APPRIS_CANDIDATE_LONGEST_SEQ);
put(APPRIS_CANDIDATE_LONGEST.toString(), APPRIS_CANDIDATE_LONGEST);
put(APPRIS_CANDIDATE.toString(), APPRIS_CANDIDATE);
}};

/**
* The set of {@link GencodeFuncotation.VariantClassification} types that are valid for coding regions.
Expand Down Expand Up @@ -946,8 +944,7 @@ private static boolean isBasic(final GencodeGtfTranscriptFeature transcript) {
// Check if this transcript has the `basic` tag:
return transcript.getOptionalFields().stream()
.filter( f -> f.getName().equals("tag") )
.filter( f -> f.getValue() instanceof GencodeGtfFeature.FeatureTag )
.filter( f -> f.getValue().equals(GencodeGtfFeature.FeatureTag.BASIC) )
.filter( f -> f.getValue().equals(GencodeGTFFieldConstants.FeatureTag.BASIC.toString()) )
.count() > 0;
}

Expand Down Expand Up @@ -1079,7 +1076,7 @@ private GencodeFuncotation createExonFuncotation(final VariantContext variant,

// Before we get started, check to see if this is a non-protein-coding feature.
// If it is, we must handle it differently:
if ( GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
if ( GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
return createCodingRegionFuncotationForProteinCodingFeature(variant, altAllele, reference, transcript, exon);
}
else {
Expand Down Expand Up @@ -1700,7 +1697,7 @@ private GencodeFuncotation createIntronFuncotation(final VariantContext variant,
gencodeFuncotationBuilder.setReferenceContext(referenceBases.getBaseString(Strand.POSITIVE));

// Set the VariantClassification:
if ( GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
if ( GencodeGTFFieldConstants.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.INTRON);
}
else {
Expand Down Expand Up @@ -2708,19 +2705,18 @@ else if (altAllele.length() < refAllele.length()) {

/**
* Get the Appris Rank from the given {@link GencodeGtfGeneFeature}.
* Appris ranks are specified as annotations using {@link org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.FeatureTag}s.
* Appris ranks are specified as annotations using {@link GencodeGTFFieldConstants.FeatureTag}s.
* @param gtfFeature The {@link GencodeGtfTranscriptFeature} from which to get the Appris Rank.
* @return The highest Appris Rank found in the given {@code gtfFeature}; if no Appris Rank exists, {@code null}.
*/
@VisibleForTesting
static GencodeGtfFeature.FeatureTag getApprisRank( final GencodeGtfTranscriptFeature gtfFeature ) {
static GencodeGTFFieldConstants.FeatureTag getApprisRank(final GencodeGtfTranscriptFeature gtfFeature ) {

// Get our appris tag(s) if it/they exist(s):
final List<GencodeGtfFeature.FeatureTag> gtfApprisTags = gtfFeature.getOptionalFields().stream()
// Get the Appris Rank tags and convert them to Sortable Enums:
final List<GencodeGTFFieldConstants.FeatureTag> gtfApprisTags = gtfFeature.getOptionalFields().stream()
.filter( f -> f.getName().equals("tag") )
.filter( f -> f.getValue() instanceof GencodeGtfFeature.FeatureTag )
.filter( f -> apprisRanks.contains( f.getValue() ) )
.map( f -> (GencodeGtfFeature.FeatureTag)f.getValue() ).collect(Collectors.toList());
.filter( f -> apprisRanks.containsKey( f.getValue() ) )
.map( f -> apprisRanks.get(f.getValue()) ).collect(Collectors.toList());

if ( gtfApprisTags.isEmpty() ) {
return null;
Expand All @@ -2737,16 +2733,16 @@ else if ( gtfApprisTags.size() == 1 ) {

/**
* Converts a given GeneTranscriptType {@link String} to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* Assumes the given {@code type} is not {@link GencodeGtfFeature.KnownGeneBiotype#PROTEIN_CODING}.
* Assumes the given {@code type} is not {@link GencodeGTFFieldConstants.KnownGeneBiotype#PROTEIN_CODING}.
* If no type can be assessed, returns {@code null}.
* @param type A {@link String} representing a GeneTranscriptType to convert to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* @return A {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification} representing the given GeneTranscriptType {@link String}, or {@code null}.
*/
private static GencodeFuncotation.VariantClassification convertGeneTranscriptTypeToVariantClassification (final String type ) {

//TODO: This all needs to be fixed so there is a 1:1 mapping of GencodeGtfFeature.KnownGeneBiotype->VariantClassification - Issue #4405
if (GencodeGtfFeature.KnownGeneBiotype.LINCRNA.toString().equals(type) ||
GencodeGtfFeature.KnownGeneBiotype.MACRO_LNCRNA.toString().equals(type)) {
if (GencodeGTFFieldConstants.KnownGeneBiotype.LINCRNA.toString().equals(type) ||
GencodeGTFFieldConstants.KnownGeneBiotype.MACRO_LNCRNA.toString().equals(type)) {
return GencodeFuncotation.VariantClassification.LINCRNA;
}
return GencodeFuncotation.VariantClassification.RNA;
Expand Down
Loading

0 comments on commit 01e45a2

Please sign in to comment.