diff --git a/src/main/java/au/edu/wehi/idsv/vcf/UntemplatedSequenceAnnotator.java b/src/main/java/au/edu/wehi/idsv/vcf/UntemplatedSequenceAnnotator.java index 53a03e385..136ed71f9 100644 --- a/src/main/java/au/edu/wehi/idsv/vcf/UntemplatedSequenceAnnotator.java +++ b/src/main/java/au/edu/wehi/idsv/vcf/UntemplatedSequenceAnnotator.java @@ -1,39 +1,44 @@ package au.edu.wehi.idsv.vcf; -import au.edu.wehi.idsv.GenomicProcessingContext; -import au.edu.wehi.idsv.IdsvVariantContext; -import au.edu.wehi.idsv.IdsvVariantContextBuilder; -import au.edu.wehi.idsv.VariantContextDirectedEvidence; +import au.edu.wehi.idsv.*; import au.edu.wehi.idsv.alignment.ExternalProcessStreamingAligner; import au.edu.wehi.idsv.util.AutoClosingIterator; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.Log; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFFileReader; +import joptsimple.internal.Strings; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -public class UntemplatedSequenceAnnotator implements CloseableIterator { +public class UntemplatedSequenceAnnotator implements CloseableIterator { public static final byte DEFAULT_QUAL_SCORE = 20; private static final Log log = Log.getInstance(UntemplatedSequenceAnnotator.class); - private final GenomicProcessingContext context; + private static final Pattern breakendRegex = Pattern.compile("^(.(?.*))?[\\[\\]].*[\\[\\]]((?.*).)?$"); + private final File referenceFile; private final File vcf; private final boolean overwrite; private final List cmd; private final int threads; - private CloseableIterator vcfStream; + private CloseableIterator vcfStream; private PeekingIterator samStream; private ExternalProcessStreamingAligner aligner; private Thread feedingAligner; - private IdsvVariantContext nextRecord = null; - public UntemplatedSequenceAnnotator(GenomicProcessingContext context, File vcf, boolean overwrite, List aligner_command_line, int threads) { - this.context = context; + private VariantContext nextRecord = null; + public UntemplatedSequenceAnnotator(File referenceFile, File vcf, boolean overwrite, List aligner_command_line, int threads) { + this.referenceFile = referenceFile; this.vcf = vcf; this.overwrite = overwrite; this.cmd = aligner_command_line; @@ -42,15 +47,14 @@ public UntemplatedSequenceAnnotator(GenomicProcessingContext context, File vcf, createRecordAlignmentStream(); this.samStream = Iterators.peekingIterator(this.aligner); } - private CloseableIterator getVcf() { + private CloseableIterator getVcf() { VCFFileReader vcfReader = new VCFFileReader(vcf, false); CloseableIterator it = vcfReader.iterator(); - Iterator idsvIt = Iterators.transform(it, variant -> IdsvVariantContext.create(context, null, variant)); - return new AutoClosingIterator(idsvIt, it, vcfReader); + return new AutoClosingIterator(it, vcfReader); } private ExternalProcessStreamingAligner createRecordAlignmentStream() { log.debug("Started async external alignment feeder thread."); - aligner = new ExternalProcessStreamingAligner(context.getSamReaderFactory(), cmd, context.getReferenceFile(), threads); + aligner = new ExternalProcessStreamingAligner(SamReaderFactory.make(), cmd, referenceFile, threads); feedingAligner = new Thread(() -> feedExternalAligner()); feedingAligner.setName("async-feedExternalAligner"); feedingAligner.start(); @@ -59,20 +63,37 @@ private ExternalProcessStreamingAligner createRecordAlignmentStream() { private boolean shouldAttemptAlignment(VariantContext v) { return overwrite || !v.hasAttribute(VcfInfoAttributes.BREAKEND_ALIGNMENTS.attribute()); } + private static String getBreakendSequence(VariantContext seq) { + if (seq.getAlternateAlleles().size() != 1) return null; + Allele allele = seq.getAlternateAllele(0); + String alt = allele.getDisplayString(); + if (alt.charAt(0) == '.' || alt.charAt(alt.length() - 1) == '.') { + return alt.substring(1, alt.length() - 1); + } else if (alt.charAt(0) == '[' || alt.charAt(0) == ']' || alt.charAt(alt.length() - 1) == '[' || alt.charAt(alt.length() - 1) == ']') { + Matcher matcher = breakendRegex.matcher(alt); + if (matcher.matches()) { + String leftIns = matcher.group("leftins"); + if (!Strings.isNullOrEmpty(leftIns)) { + return leftIns; + } + String rightIns = matcher.group("rightins"); + return rightIns; + } + } + return null; + } + private void feedExternalAligner() { - try (CloseableIterator it = getVcf()) { + try (CloseableIterator it = getVcf()) { while (it.hasNext()) { - IdsvVariantContext vc = it.next(); - if (vc instanceof VariantContextDirectedEvidence && shouldAttemptAlignment(vc)) { - VariantContextDirectedEvidence e = (VariantContextDirectedEvidence)vc; - byte[] seq = e.getBreakendSequence(); - byte[] qual = e.getBreakendQuality(); - if (seq != null && seq.length > 0) { - if (qual == null) { - qual = new byte[seq.length]; - Arrays.fill(qual, DEFAULT_QUAL_SCORE); - } - FastqRecord fq = new FastqRecord(e.getID(), seq, null, qual); + VariantContext vc = it.next(); + if (shouldAttemptAlignment(vc)) { + String seqstr = getBreakendSequence(vc); + if (!Strings.isNullOrEmpty(seqstr)) { + byte[] seq = seqstr.getBytes(StandardCharsets.UTF_8); + byte[] qual = new byte[seq.length]; + Arrays.fill(qual, DEFAULT_QUAL_SCORE); + FastqRecord fq = new FastqRecord(vc.getID(), seq, null, qual); aligner.asyncAlign(fq); } } @@ -94,10 +115,10 @@ public boolean hasNext() { return nextRecord != null; } @Override - public IdsvVariantContext next() { + public VariantContext next() { ensureNext(); if (!hasNext()) throw new NoSuchElementException(); - IdsvVariantContext result = nextRecord; + VariantContext result = nextRecord; nextRecord = null; return result; } @@ -128,7 +149,7 @@ private void annotateNextRecord() { } } if (alignments.size() > 0) { - IdsvVariantContextBuilder builder = new IdsvVariantContextBuilder(context, nextRecord); + VariantContextBuilder builder = new VariantContextBuilder(nextRecord); builder.attribute(VcfInfoAttributes.BREAKEND_ALIGNMENTS.attribute(), writeAlignmentAnnotation(alignments)); nextRecord = builder.make(); } diff --git a/src/main/java/gridss/AnnotateUntemplatedSequence.java b/src/main/java/gridss/AnnotateUntemplatedSequence.java index cf880ccec..6d3d67171 100644 --- a/src/main/java/gridss/AnnotateUntemplatedSequence.java +++ b/src/main/java/gridss/AnnotateUntemplatedSequence.java @@ -10,6 +10,7 @@ import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; +import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFFileReader; @@ -35,9 +36,11 @@ public class AnnotateUntemplatedSequence extends ReferenceCommandLineProgram { @Argument(doc="Overwrite existing annotation. Setting to 'true' will replace any existing BEALN annotations. " + "Setting to 'false' will generate annotations only for records without an existing BEALN annotation. ") public boolean OVERWRITE = false; - @Argument(doc="Directly pipe the input and output of the aligner instead of writing to intermediate files." + @Argument(doc="Command line arguments to run external aligner. " + + "Aligner output must be written to stdout and the records MUST match the input fastq order." + " The aligner must support using \"-\" as the input filename when reading from stdin." - + " The sort order of the input file will not be retained.", optional=true) + + "Java argument formatting is used with %1$s being the fastq file to align, " + + "%2$s the reference genome, and %3$d the number of threads to use.", optional=true) public List ALIGNER_COMMAND_LINE = new SoftClipsToSplitReads().ALIGNER_COMMAND_LINE; public static void main(String[] argv) { System.exit(new AnnotateUntemplatedSequence().instanceMain(argv)); @@ -46,10 +49,10 @@ public static void main(String[] argv) { public int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); + IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); log.info("Annotating variant untemplated sequence in " + INPUT); - GenomicProcessingContext context = new GenomicProcessingContext(getFileSystemContext(), REFERENCE_SEQUENCE, getReference()); - try (UntemplatedSequenceAnnotator ann = new UntemplatedSequenceAnnotator(context, INPUT, OVERWRITE, ALIGNER_COMMAND_LINE, WORKER_THREADS)) { - saveVcf(context, INPUT, OUTPUT, ann); + try (UntemplatedSequenceAnnotator ann = new UntemplatedSequenceAnnotator(REFERENCE_SEQUENCE, INPUT, OVERWRITE, ALIGNER_COMMAND_LINE, WORKER_THREADS)) { + saveVcf(INPUT, OUTPUT, ann); log.info("Annotated variants written to " + OUTPUT); } catch (IOException e) { log.error(e); @@ -57,8 +60,8 @@ public int doWork() { } return 0; } - protected void saveVcf(GenomicProcessingContext context, File input, File output, Iterator calls) throws IOException { - VCFHeader header = null; + protected void saveVcf(File input, File output, Iterator calls) throws IOException { + VCFHeader header; try (VCFFileReader vcfReader = new VCFFileReader(input, false)) { header = vcfReader.getFileHeader(); } @@ -72,7 +75,7 @@ protected void saveVcf(GenomicProcessingContext context, File input, File output try (VariantContextWriter vcfWriter = builder.build()) { vcfWriter.writeHeader(header); while (calls.hasNext()) { - IdsvVariantContext record = calls.next(); + VariantContext record = calls.next(); vcfWriter.add(record); writeProgress.record(record.getContig(), record.getStart()); }