From a203d8709898daa2f5215cddcc16adadd4246224 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Tue, 20 Sep 2022 15:02:04 -0400
Subject: [PATCH] Incorporate 2D model fix from NVIDIA

Port the patch from
https://github.com/NVIDIA-Genomics-Research/nvscorevariants/commit/937ffafb78b0f3e7df9b1edc3b08d11e3ebee35a

Review fixes applied to the ported code:

* filter_read: every rejection branch now does `return True`. The ported
  lines were bare `True` expression statements, so none of the new
  well-formedness checks ever rejected a read, and a read without an RG
  tag fell through the `except KeyError` with `read_group` unbound.
* filter_read: the CIGAR/sequence consistency check now compares the
  query length implied by the CIGAR (pysam `infer_query_length()`) with
  the sequence length, instead of the character count of the CIGAR text.
---
Notes (not part of the commit message):
  Leading whitespace of the hunk lines was rebuilt from Python block
  structure; if the context does not match the target file exactly, apply
  with `git apply --ignore-whitespace`. The `index` blob ids are those of
  the original patch and do not describe the corrected post-image.

 .../hellbender/scorevariants/encoders.py      | 43 +++++++++++++------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/main/python/org/broadinstitute/hellbender/scorevariants/encoders.py b/src/main/python/org/broadinstitute/hellbender/scorevariants/encoders.py
index a25d4c40c4b..0c9f259fdbb 100644
--- a/src/main/python/org/broadinstitute/hellbender/scorevariants/encoders.py
+++ b/src/main/python/org/broadinstitute/hellbender/scorevariants/encoders.py
@@ -471,17 +471,32 @@ def _fill_rows(self, alignment_number, read, sequence, quality, tensor):
             ] = mq

     def filter_read(self, read, window):
+        if not read:
+            return True

-        # filters alignment entries that do not have a cigarstring
-        if (
-            not read
-            or not hasattr(read, "cigarstring")
-            or read.cigarstring is None
-        ):
-            return True
+        if (not read.is_unmapped) and (read.query_alignment_start < 0):
+            return True
+
+        if (not read.is_unmapped) and (read.query_alignment_length + 1 < 0):
+            return True
+
+        if len(read.query_qualities) != len(read.query_sequence):
+            return True
+
+        if (not read.is_unmapped) and (read.infer_query_length() != len(read.query_sequence)):
+            return True
+
+        if len(read.query_sequence) <= 0:
+            return True
+
+        if read.cigarstring and "N" in read.cigarstring:
+            return True

         # filters HaplotypeCaller artificial haplotypes
-        read_group = read.get_tag("RG")
+        try:
+            read_group = read.get_tag("RG")
+        except KeyError:
+            return True
         if "artificial" in read_group.lower():
             return True

@@ -524,16 +539,20 @@ def get_reads(self, variant, interval):
         start = variant.start
         stop = variant.stop

-        for alignment_number, read in enumerate(self.alignment_file.fetch(
+        alignment_number = 0
+        for read in self.alignment_file.fetch(
             variant.contig,
             start,
             stop,
             multiple_iterators=False,
-        )):
-            if alignment_number < self.read_limit:
+        ):
+            if self.filter_read(read, interval):
+                continue
+            alignment_number += 1
+            if alignment_number <= self.read_limit:
                 reads.append(read)
             else:
-                randomSlot = mRandomGenerator.nextInt(alignment_number+1)
+                randomSlot = mRandomGenerator.nextInt(alignment_number)
                 if randomSlot < self.read_limit:
                     reads[randomSlot] = read
         insertions = self.get_insertions(reads, interval, self.window_size)