From d332e0e8bf0fee84b3d6c3f4feba61f4938979b8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 6 Sep 2024 11:36:23 -0400 Subject: [PATCH 1/2] Initial commit --- src/svtk/svtk/cli/baf_test.py | 116 ++++++++++++---------------------- 1 file changed, 41 insertions(+), 75 deletions(-) diff --git a/src/svtk/svtk/cli/baf_test.py b/src/svtk/svtk/cli/baf_test.py index a8e3edda7..0ccaa46b4 100644 --- a/src/svtk/svtk/cli/baf_test.py +++ b/src/svtk/svtk/cli/baf_test.py @@ -1,13 +1,10 @@ #!usr/bin/env python from svtk.baf.BAFpysam import * import argparse -from collections import deque import numpy as np import pandas as pd import pysam import sys -########## - def preprocess(chrom, start, end, tbx, samples, window=None): """ @@ -34,56 +31,50 @@ def preprocess(chrom, start, end, tbx, samples, window=None): BAF of each called SNP within CNV """ - # Load and filter SNP sites if window is None: window = end - start - # records = tbx.fetch(chrom, start - window, end + window,parser=pysam.asTuple()) - # sites = deque() + bafs = deque() + if window < 1000000: for record in tbx.fetch(chrom, max(1, start - window), end + window, parser=pysam.asTuple()): bafs.append(np.array(record)) else: - for record in tbx.fetch(chrom, max(1, start - 1000000), start, parser=pysam.asTuple()): bafs.append(np.array(record)) for record in tbx.fetch(chrom, (start + end) // 2 - 500000, (start + end) // 2 + 500000, parser=pysam.asTuple()): bafs.append(np.array(record)) for record in tbx.fetch(chrom, end, end + 1000000, parser=pysam.asTuple()): bafs.append(np.array(record)) - bafs = np.array(bafs) - # if bafs.shape[0] == 0: - # return 0,0 - bafs = pd.DataFrame(bafs) + + bafs = pd.DataFrame(np.array(bafs)) + if bafs.empty: return bafs, bafs + bafs.columns = ['chr', 'pos', 'baf', 'sample'] bafs = bafs[bafs['sample'].isin(samples)] - # print(bafs) + if bafs.empty: return bafs, bafs + bafs['pos'] = bafs.pos.astype(int) bafs['baf'] = bafs.baf.astype(float) - # print(bafqs) + bafs.loc[bafs.pos <= start, 'region'] = 'before' bafs.loc[bafs.pos >= end, 'region'] = 'after' bafs.loc[(bafs.pos > start) & (bafs.pos < end), 'region'] = 'inside' - het_counts = bafs.groupby('sample region'.split()).size( - ).reset_index().rename(columns={0: 'count'}) - het_counts = het_counts.pivot_table( - values='count', index='sample', columns='region', fill_value=0) - het_counts = het_counts.reindex(samples).fillna(0).astype(int) - cols = 'before inside after sample'.split() - het_counts = het_counts.reset_index() - for colname in cols: + + het_counts = bafs.groupby(['sample', 'region']).size().unstack(fill_value=0) + het_counts = het_counts.reindex(samples, fill_value=0).astype(int).reset_index() + + for colname in ['before', 'inside', 'after']: if colname not in het_counts.columns: het_counts[colname] = 0 - # het_counts = het_counts.reset_index()[cols] - # Report BAF for variants inside CNV + called_bafs = bafs.loc[bafs.region == 'inside'].copy() return het_counts, called_bafs - def main(argv): parser = argparse.ArgumentParser( description=__doc__, @@ -93,30 +84,18 @@ def main(argv): parser.add_argument('file', help='Compiled snp file') parser.add_argument('-b', '--batch',) parser.add_argument('--index', help='Tabix index for remote bed') - # help='Samples') - # Print help if no arguments specified if len(argv) == 0: parser.print_help() sys.exit(1) args = parser.parse_args(argv) - # fi = args.file - # if args.vcf.startswith('s3://'): - # vcf_path = args.vcf[5:] - # bucket = vcf_path.split('/')[0] - # vcf_path = '/'.join(vcf_path.split('/')[1:]) - - # vcf = load_s3vcf(bucket, vcf_path, args.tbi) - - # else: - # vcf = pysam.VariantFile(args.vcf) splist = [] with open(args.batch, 'r') as f: f.readline() for line in f: splist.append(line.split('\t')[0]) - total_sample = sum(1 for line in open(args.batch)) - 1 + total_sample = len(splist) if args.index is not None: tbx = pysam.TabixFile(args.file, index=args.index) @@ -125,7 +104,6 @@ def main(argv): raise Exception('Must provide tabix index with remote URL') tbx = pysam.TabixFile(args.file) - # this is necessary to avoid stochasticity in calculation of KS statistic np.random.seed(0) random_state = np.random.RandomState(0) @@ -133,20 +111,16 @@ def main(argv): for line in f: if line[0] != "#": dat = line.rstrip().split('\t') - chrom = dat[0] - start = int(dat[1]) - end = int(dat[2]) - id = dat[3] - samples = dat[4] + chrom, start, end, id, samples, type = dat[:6] + start, end = int(start), int(end) samplelist = samples.split(',') - type = dat[5] + try: het_counts, called_bafs = preprocess( chrom, start, end, tbx, samples=splist) except ValueError: - het_counts = pd.DataFrame() - called_bafs = pd.DataFrame() - # Running BAF testing + het_counts, called_bafs = pd.DataFrame(), pd.DataFrame() + if not het_counts.empty: Del = DeletionTest(het_counts, samplelist, min(end - start, 1000000), random_state=random_state) @@ -154,50 +128,42 @@ def main(argv): ks, ksp = KS.test(samplelist) mean, delp = Del.Ttest(samplelist) statis = Del.stats(samplelist) - line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + type + '\t' + str( - mean) + ',' + str(delp) + "\t" + str(ks) + ',' + str(ksp) + '\t' + statis + line = f"{chrom}\t{start}\t{end}\t{id}\t{samples}\t{type}\t{mean},{delp}\t{ks},{ksp}\t{statis}" else: - line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + \ - type + '\t' + 'NoSNP' + ',' + 'NoSNP' + "\t" + \ - 'NoSNP' + ',' + 'NoSNP' + '\t' + 'NoSNP' - line = line.rstrip() - dat = line.split('\t') + line = f"{chrom}\t{start}\t{end}\t{id}\t{samples}\t{type}\tNoSNP,NoSNP\tNoSNP,NoSNP\tNoSNP" + + dat = line.rstrip().split('\t') + if len(dat) > 11: - dat = dat[0:11] + dat = dat[:11] dat[-1] = dat[-1][0:-len(dat[0])] line = '\t'.join(dat) - if len(dat) < 11: - dat = line.split('\t') - # samp = dat[4] + elif len(dat) < 11: nsamp = len(dat[4].split(',')) ncontrol = total_sample - nsamp - dat = dat[0:-1] + ['0,0\t0,' + - str(nsamp) + '\t0,0,' + str(ncontrol)] + dat = dat[:-1] + [f"0,0\t0,{nsamp}\t0,0,{ncontrol}"] line = '\t'.join(dat) + dat = line.split('\t') - delp = dat[6] - dupp = dat[7] + delp, dupp = dat[6], dat[7] + try: - dp = float(delp.split(',')[0]) - dstat = float(delp.split(',')[1]) + dp, dstat = map(float, delp.split(',')) derr = 'delOK' except: - dstat = 'NA' - dp = 'NA' - derr = delp.split(',')[1] + dstat, dp, derr = 'NA', 'NA', delp.split(',')[1] + try: - up = float(dupp.split(',')[0]) - ustat = float(dupp.split(',')[1]) + up, ustat = map(float, dupp.split(',')) uerr = 'dupOK' except: - ustat = 'NA' - up = 'NA' - uerr = dupp.split(',')[1] - dat = dat[0:6] + [derr, str(dp), str(dstat), uerr, str(up), str( + ustat, up, uerr = 'NA', 'NA', dupp.split(',')[1] + + dat = dat[:6] + [derr, str(dp), str(dstat), uerr, str(up), str( ustat)] + dat[8].split(',') + dat[9].split(',') + dat[10].split(',') + print('\t'.join(dat)) sys.stdout.flush() - if __name__ == '__main__': - main() + main(sys.argv[1:]) From 2b6d6e333ef61b7ecfa7bc8a2c10185e11602ec4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 6 Sep 2024 11:52:24 -0400 Subject: [PATCH 2/2] Added deque import --- src/svtk/svtk/cli/baf_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/svtk/svtk/cli/baf_test.py b/src/svtk/svtk/cli/baf_test.py index 0ccaa46b4..e776da5fe 100644 --- a/src/svtk/svtk/cli/baf_test.py +++ b/src/svtk/svtk/cli/baf_test.py @@ -1,6 +1,7 @@ #!usr/bin/env python from svtk.baf.BAFpysam import * import argparse +from collections import deque import numpy as np import pandas as pd import pysam