Skip to content

Commit

Permalink
Issue #7159 Create tool for producing genomic regions (as a BED file) (
Browse files Browse the repository at this point in the history
#8942)

* Initial commit and basic code to read gtf

* add: code to write to bed & integration test

* fix: make getAllFeatures public and use the nesting of features to get to transcripts

* add: filtering transcripts by basic tag

* add: sorts by contig and start (need to fix - sorting lexicographically)

* fix: now sorts by contig then start & output is correct

* fix: make dictionary an arg

* add: comments + simplified CompareGtfInfo

* refactor: apply method
test: add separate tests for gene and transcript

* refactor: onTraversalSuccess and writeToBed

* add: more tests

* fix: test files in correct dir pt1. (files are too large)

* fix: test files in correct dir pt2.

* add: compareFiles and ground truth bed files

* fix: runGtfToBed assert

* add: comments to GtfToBed

* fix: error handling for different versions of gtf and dictionary

* fix: edited some bad conventions

* fix: remove spaces from input file fullName

* add: gtf file with MYT1L and MAPK1

* add: many transcripts unit test and refactoring

* add: tiebreaker sorting by id

* add: make sort by basic optional

* add: html doc comment

* fix: dictionary arg

* fix: add "Gencode" to description

* add: sample mouse gencode testing

* fix: Remove arg shortnames

* fix: rename and move CompareGtfInfo

* fix: kebab-case args

* fix: update html doc

* fix: use IntegrationTestSpec.assertEqualTextFiles()

* fix: remove unnecessary test of pik3ca

* fix: remove set functions in GtfInfo

* fix: style of comparator

* fix: style of comparator

* fix: use Files.newOutputStream() to write and logger for errors

* fix: use getBestAvailableSequenceDictionary()

* fix: use dataProvider for integration tests

* fix: better encapsulation

* fix: move mapk1.gtf to large dir

* fix: arg names

* fix: rename reference dict.

* fix: sequence-dictionary arg javadoc

* add: javadoc to GtfInfo

* add: dictionary exception and corresponding test

* add: test with fasta file as reference arg

* add: javadoc for fasta file

* fix: javadoc and onTraversalStart exception
  • Loading branch information
sanashah007 authored Aug 29, 2024
1 parent 9f2fbb5 commit 3c1430d
Show file tree
Hide file tree
Showing 22 changed files with 8,675 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.broadinstitute.hellbender.tools.walkers.conversion;

import htsjdk.samtools.util.Interval;

/**
* A class that represents information extracted from a feature in a GTF file.
* The {@code GtfInfo} object encapsulates details about a specific gene or transcript,
* including its type, gene name, and interval.
*
* <p>The {@code GtfInfo.Type} enum specifies whether the features is a
* {@code GENE} or a {@code TRANSCRIPT}.</p>
*
* <p>The interval specifies the feature's contig (chromosome), start position,
* and end position, providing the precise location of the gene or transcript
* on the genome.</p>
*
* <p>Example usage:</p>
* <pre>
* Interval interval = new Interval("chr1", 1000, 2000);
* GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, "MAPK1");
* </pre>
**/

public class GtfInfo {

public enum Type {
GENE,
TRANSCRIPT
}

private final Type type;
private final String geneName;
private final Interval interval;

public GtfInfo(Interval interval, Type type, String geneName) {
this.interval = interval;
this.type = type;
this.geneName = geneName;
}

public Type getType() {
return type;
}

public String getGeneName() {
return geneName;
}

public Interval getInterval() {
return interval;
}

public Integer getStart() {
return interval.getStart();
}

public Integer getEnd() {
return interval.getEnd();
}

@Override
public String toString() {
return "GtfInfo{ " + "type = " + type + " geneName = " + geneName + "interval = " + interval;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
package org.broadinstitute.hellbender.tools.walkers.conversion;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.*;
import htsjdk.tribble.Feature;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.*;

/**
* <p>Convert Gencode GTF files to BED format with options for gene and transcript level processing.
* This tool allows for the extraction of gene and transcript information from Gencode GTF files and
* outputs the data in BED format.</p>
*
*
* <p>The conversion process includes sorting entries
* by karyotype, providing flexibility in the selection of either gene or transcript level
* data, and an option to only use basic tags. It ensures that the BED output is sorted and formatted correctly for subsequent use.
* Note that it has been tested for both human and mouse Gencode GTFs. </p>
*
* <h3>Usage examples</h3>
* <p>Example commands to run GtfToBed for typical scenarios:</p>
*
* <h4>(i) Convert GTF to BED with gene level data</h4>
* <p>This mode extracts and converts gene data from the input GTF file to BED format:</p>
*
* <pre>
* java -jar GtfToBed.jar \
* -gtf-path input.gtf \
* -sequence-dictionary dictionary.dict \
= * -output output.bed \
* </pre>
*
* <h4>(ii) Convert GTF to BED with transcript level data</h4>
* <p>This mode extracts and converts transcript data from the input GTF file to BED format:</p>
*
* <pre>
* java -jar GtfToBed.jar \
* -gtf-path input.gtf \
* -sequence-dictionary dictionary.dict \
* -sort-by-transcript \
* -output output.bed \
* </pre>
*
* <h4>(iii) Convert GTF to BED with transcript level data, filtering for only those with the basic tag</h4>
* <p>This mode extracts and converts basic transcript data from the input GTF file to BED format:</p>
*
* <pre>
* java -jar GtfToBed.jar \
* -gtf-path input.gtf \
* -sequence-dictionary dictionary.dict \
* -sort-by-transcript \
* -use-basic-transcript \
* -output output.bed \
* </pre>
*
* <h4>(iiii) </h4> Convert GTF to BED with transcript level data using a reference FASTA instead of DICT file.
* <p>This mode extracts and converts transcript data from the input GTF file to BED format using a FASTA file:</p>
*
* <pre>
* java -jar GtfToBed.jar \
* -gtf-path input.gtf \
* -reference ref.fa \
* -sort-by-transcript \
* -output output.bed \
* </pre>
*/

@CommandLineProgramProperties(
summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.",
oneLineSummary = "Gencode GTF to BED",
programGroup = ShortVariantDiscoveryProgramGroup.class
)

@DocumentedFeature
@WorkflowProperties
public class GtfToBed extends FeatureWalker<GencodeGtfFeature> {
public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript";
public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript";
public static final String INPUT_LONG_NAME = "gtf-path";
protected final Logger logger = LogManager.getLogger(this.getClass());

@Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file")
public GATKPath inputFile;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file")
public GATKPath outputFile;

@Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true)
public boolean sortByTranscript = false;

@Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts")
public boolean sortByBasic = false;

//stores either gene or transcript ID and summary information about the feature
private final Map<String, GtfInfo> featureInfoMap = new HashMap<>();

//Sequence Dictionary
private SAMSequenceDictionary sequenceDictionary = null;

@Override
protected boolean isAcceptableFeatureType(Class<? extends Feature> featureType) {
return featureType.isAssignableFrom(GencodeGtfFeature.class);
}

// runs per line of gtf file
@Override
public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
// list of all features of the gene
List<GencodeGtfFeature> geneFeatures = feature.getAllFeatures();

// process each gtf feature in the list of gene features
for (GencodeGtfFeature gtfFeature : geneFeatures) {
// the basic tag is in optional fields
List<GencodeGtfFeature.OptionalField<?>> optionalFields = getOptionalFields(gtfFeature);

// if the gtf feature is a Gene
if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) {
processGeneFeature(gtfFeature);
}

// check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag
else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) {
if (sortByBasic) {
for (GencodeGtfFeature.OptionalField<?> field : optionalFields) {
if ("basic".equals(field.getValue())) {
processTranscriptFeature(gtfFeature);
}
}
} else {
processTranscriptFeature(gtfFeature);
}
}
}
}

// gets the tag out of the list of optional fields
private List<GencodeGtfFeature.OptionalField<?>> getOptionalFields(GencodeGtfFeature gtfFeature) {
List<GencodeGtfFeature.OptionalField<?>> optionalFields = null;
try {
optionalFields = gtfFeature.getOptionalField("tag");
} catch (Exception e) {
logger.error("Could not retrieve optional fields: ", e);
}
return optionalFields;
}

// stores the gene ID and Interval info in hashmap
private void processGeneFeature(GencodeGtfFeature gtfFeature) {
final int geneStart = gtfFeature.getStart();
final int geneEnd = gtfFeature.getEnd();
final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);

// put the interval, type as gene, and the name of gene
final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName());

// store in hashmap with key as geneId
featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo);
}

// stores the transcript ID and Interval info in hashmap
private void processTranscriptFeature(GencodeGtfFeature gtfFeature) {
//get interval and put the interval, type as transcript, and the name of the gene it's in
final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd());
final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName());

//store in hashmap with key as transcriptId
featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo);

//update start/end of corresponding gene if needed
updateGeneStart(gtfFeature);
updateGeneEnd(gtfFeature);
}

// update the gene interval start position based on the transcript
private void updateGeneStart(GencodeGtfFeature gtfFeature) {
// get the start value of the gene
int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart();

// if the transcript start is less than the gene start
if (gtfFeature.getStart() < geneStart) {
// set the gene start to be the transcript start
geneStart = gtfFeature.getStart();
updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd());
}
}

// update the gene interval end position based on the transcript
private void updateGeneEnd(GencodeGtfFeature gtfFeature) {
// get the end value of the gene
int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd();

// if the transcript end is greater than the gene end
if (gtfFeature.getEnd() > geneEnd) {
// set the gene end to be the transcript end
geneEnd = gtfFeature.getEnd();
updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd);
}
}

// updates an interval of the gene if it needs to be changed
private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) {
Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo);
}

@Override
public void onTraversalStart() {
sequenceDictionary = getBestAvailableSequenceDictionary();
if(sequenceDictionary == null){
throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ").");
}
}

// runs immediately after it has gone through each line of gtf (apply method)
@Override
public Object onTraversalSuccess() {
// create linked hash map to store sorted values of idToInfo
LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = getSortedMap(sequenceDictionary);

// if user wants to sort by transcript only use transcripts else only use genes
GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE;
writeToBed(selectedType, karyotypeIdToInfo);

return null;
}

//Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId
public static class GtfInfoComparator implements Comparator<Map.Entry<String, GtfInfo>> {

private final SAMSequenceDictionary dictionary;

GtfInfoComparator(SAMSequenceDictionary dictionary) {
this.dictionary = dictionary;
}

// compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object
@Override
public int compare(Map.Entry<String, GtfInfo> e1, Map.Entry<String, GtfInfo> e2) {
final Interval e1Interval = e1.getValue().getInterval();
final Interval e2Interval = e2.getValue().getInterval();

Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig());
Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig());

//compare by contig, then start, then by key
return Comparator
.comparingInt((Map.Entry<String, GtfInfo> e) ->
dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex())
.thenComparingInt(e -> e.getValue().getInterval().getStart())
.thenComparing(Map.Entry::getKey)
.compare(e1,e2);


}
}

// sorts the map containing the features based on contig and start position
private LinkedHashMap<String, GtfInfo> getSortedMap(SAMSequenceDictionary sequenceDictionary) {
// create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator
List<Map.Entry<String, GtfInfo>> entries = new ArrayList<>(featureInfoMap.entrySet());
entries.sort(new GtfInfoComparator(sequenceDictionary));

// put each (sorted) entry in the list into a linked hashmap
LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = new LinkedHashMap<>();
for (Map.Entry<String, GtfInfo> entry : entries) {
karyotypeIdToInfo.put(entry.getKey(), entry.getValue());
}

return karyotypeIdToInfo;
}

// writes to bed file
private void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) {
for (Map.Entry<String, GtfInfo> entry : sortedMap.entrySet()) {
if (entry.getValue().getType() == type) {
String line = formatBedLine(entry, type);
writer.write((line + System.lineSeparator()).getBytes());
}
}
} catch (IOException e) {
throw new GATKException("Error writing to BED file", e);
}
}

// formats each line of the bed file depending on whether user has selected gene or transcript
private String formatBedLine(Map.Entry<String, GtfInfo> entry, GtfInfo.Type type) {
GtfInfo info = entry.getValue();
String line = info.getInterval().getContig() + "\t" +
info.getInterval().getStart() + "\t" +
info.getInterval().getEnd() + "\t" +
info.getGeneName();

if (type == GtfInfo.Type.TRANSCRIPT) {
line += "," + entry.getKey();
}

return line;
}

@Override
public GATKPath getDrivingFeaturePath() {
return inputFile;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public void setStopCodon(final GencodeGtfStopCodonFeature stopCodon) {


@Override
protected List<GencodeGtfFeature> getAllFeatures() {
public List<GencodeGtfFeature> getAllFeatures() {
final ArrayList<GencodeGtfFeature> list = new ArrayList<>();
list.add(this);

Expand Down
Loading

0 comments on commit 3c1430d

Please sign in to comment.