broadinstitute · kjaisingh · Sep 6, 2024 · Sep 6, 2024
diff --git a/src/svtk/svtk/cli/baf_test.py b/src/svtk/svtk/cli/baf_test.py
@@ -6,8 +6,6 @@
 import pandas as pd
 import pysam
 import sys
-##########
-
 
 def preprocess(chrom, start, end, tbx, samples, window=None):
     """
@@ -34,56 +32,50 @@ def preprocess(chrom, start, end, tbx, samples, window=None):
         BAF of each called SNP within CNV
     """
 
-    # Load and filter SNP sites
     if window is None:
         window = end - start
-    # records = tbx.fetch(chrom, start - window, end + window,parser=pysam.asTuple())
-    # sites = deque()
+
     bafs = deque()
+
     if window < 1000000:
         for record in tbx.fetch(chrom, max(1, start - window), end + window, parser=pysam.asTuple()):
             bafs.append(np.array(record))
     else:
-
         for record in tbx.fetch(chrom, max(1, start - 1000000), start, parser=pysam.asTuple()):
             bafs.append(np.array(record))
         for record in tbx.fetch(chrom, (start + end) // 2 - 500000, (start + end) // 2 + 500000, parser=pysam.asTuple()):
             bafs.append(np.array(record))
         for record in tbx.fetch(chrom, end, end + 1000000, parser=pysam.asTuple()):
             bafs.append(np.array(record))
-    bafs = np.array(bafs)
-    # if bafs.shape[0] == 0:
-    # return 0,0
-    bafs = pd.DataFrame(bafs)
+
+    bafs = pd.DataFrame(np.array(bafs))
+
     if bafs.empty:
         return bafs, bafs
+
     bafs.columns = ['chr', 'pos', 'baf', 'sample']
     bafs = bafs[bafs['sample'].isin(samples)]
-    # print(bafs)
+
     if bafs.empty:
         return bafs, bafs
+
     bafs['pos'] = bafs.pos.astype(int)
     bafs['baf'] = bafs.baf.astype(float)
-    # print(bafqs)
+
     bafs.loc[bafs.pos <= start, 'region'] = 'before'
     bafs.loc[bafs.pos >= end, 'region'] = 'after'
     bafs.loc[(bafs.pos > start) & (bafs.pos < end), 'region'] = 'inside'
-    het_counts = bafs.groupby('sample region'.split()).size(
-    ).reset_index().rename(columns={0: 'count'})
-    het_counts = het_counts.pivot_table(
-        values='count', index='sample', columns='region', fill_value=0)
-    het_counts = het_counts.reindex(samples).fillna(0).astype(int)
-    cols = 'before inside after sample'.split()
-    het_counts = het_counts.reset_index()
-    for colname in cols:
+
+    het_counts = bafs.groupby(['sample', 'region']).size().unstack(fill_value=0)
+    het_counts = het_counts.reindex(samples, fill_value=0).astype(int).reset_index()
+
+    for colname in ['before', 'inside', 'after']:
         if colname not in het_counts.columns:
             het_counts[colname] = 0
-    # het_counts = het_counts.reset_index()[cols]
-    # Report BAF for variants inside CNV
+
     called_bafs = bafs.loc[bafs.region == 'inside'].copy()
     return het_counts, called_bafs
 
-
 def main(argv):
     parser = argparse.ArgumentParser(
         description=__doc__,
@@ -93,30 +85,18 @@ def main(argv):
     parser.add_argument('file', help='Compiled snp file')
     parser.add_argument('-b', '--batch',)
     parser.add_argument('--index', help='Tabix index for remote bed')
-    # help='Samples')
 
-    # Print help if no arguments specified
     if len(argv) == 0:
         parser.print_help()
         sys.exit(1)
     args = parser.parse_args(argv)
 
-    # fi = args.file
-    # if args.vcf.startswith('s3://'):
-    # vcf_path = args.vcf[5:]
-    # bucket = vcf_path.split('/')[0]
-    # vcf_path = '/'.join(vcf_path.split('/')[1:])
-
-    # vcf = load_s3vcf(bucket, vcf_path, args.tbi)
-
-    # else:
-    # vcf = pysam.VariantFile(args.vcf)
     splist = []
     with open(args.batch, 'r') as f:
         f.readline()
         for line in f:
             splist.append(line.split('\t')[0])
-    total_sample = sum(1 for line in open(args.batch)) - 1
+    total_sample = len(splist)
 
     if args.index is not None:
         tbx = pysam.TabixFile(args.file, index=args.index)
@@ -125,79 +105,66 @@ def main(argv):
             raise Exception('Must provide tabix index with remote URL')
         tbx = pysam.TabixFile(args.file)
 
-    # this is necessary to avoid stochasticity in calculation of KS statistic
     np.random.seed(0)
     random_state = np.random.RandomState(0)
 
     with open(args.bed, 'r') as f:
         for line in f:
             if line[0] != "#":
                 dat = line.rstrip().split('\t')
-                chrom = dat[0]
-                start = int(dat[1])
-                end = int(dat[2])
-                id = dat[3]
-                samples = dat[4]
+                chrom, start, end, id, samples, type = dat[:6]
+                start, end = int(start), int(end)
                 samplelist = samples.split(',')
-                type = dat[5]
+
                 try:
                     het_counts, called_bafs = preprocess(
                         chrom, start, end, tbx, samples=splist)
                 except ValueError:
-                    het_counts = pd.DataFrame()
-                    called_bafs = pd.DataFrame()
-                # Running BAF testing
+                    het_counts, called_bafs = pd.DataFrame(), pd.DataFrame()
+
                 if not het_counts.empty:
                     Del = DeletionTest(het_counts, samplelist,
                                        min(end - start, 1000000), random_state=random_state)
                     KS = KS2sample(called_bafs, samplelist)
                     ks, ksp = KS.test(samplelist)
                     mean, delp = Del.Ttest(samplelist)
                     statis = Del.stats(samplelist)
-                    line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + type + '\t' + str(
-                        mean) + ',' + str(delp) + "\t" + str(ks) + ',' + str(ksp) + '\t' + statis
+                    line = f"{chrom}\t{start}\t{end}\t{id}\t{samples}\t{type}\t{mean},{delp}\t{ks},{ksp}\t{statis}"
                 else:
-                    line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + \
-                        type + '\t' + 'NoSNP' + ',' + 'NoSNP' + "\t" + \
-                        'NoSNP' + ',' + 'NoSNP' + '\t' + 'NoSNP'
-                line = line.rstrip()
-                dat = line.split('\t')
+                    line = f"{chrom}\t{start}\t{end}\t{id}\t{samples}\t{type}\tNoSNP,NoSNP\tNoSNP,NoSNP\tNoSNP"
+
+                dat = line.rstrip().split('\t')
+
                 if len(dat) > 11:
-                    dat = dat[0:11]
+                    dat = dat[:11]
                     dat[-1] = dat[-1][0:-len(dat[0])]
                     line = '\t'.join(dat)
-                if len(dat) < 11:
-                    dat = line.split('\t')
-                    # samp = dat[4]
+                elif len(dat) < 11:
                     nsamp = len(dat[4].split(','))
                     ncontrol = total_sample - nsamp
-                    dat = dat[0:-1] + ['0,0\t0,' +
-                                       str(nsamp) + '\t0,0,' + str(ncontrol)]
+                    dat = dat[:-1] + [f"0,0\t0,{nsamp}\t0,0,{ncontrol}"]
                     line = '\t'.join(dat)
+
                 dat = line.split('\t')
-                delp = dat[6]
-                dupp = dat[7]
+                delp, dupp = dat[6], dat[7]
+
                 try:
-                    dp = float(delp.split(',')[0])
-                    dstat = float(delp.split(',')[1])
+                    dp, dstat = map(float, delp.split(','))
                     derr = 'delOK'
                 except:
-                    dstat = 'NA'
-                    dp = 'NA'
-                    derr = delp.split(',')[1]
+                    dstat, dp, derr = 'NA', 'NA', delp.split(',')[1]
+
                 try:
-                    up = float(dupp.split(',')[0])
-                    ustat = float(dupp.split(',')[1])
+                    up, ustat = map(float, dupp.split(','))
                     uerr = 'dupOK'
                 except:
-                    ustat = 'NA'
-                    up = 'NA'
-                    uerr = dupp.split(',')[1]
-                dat = dat[0:6] + [derr, str(dp), str(dstat), uerr, str(up), str(
+                    ustat, up, uerr = 'NA', 'NA', dupp.split(',')[1]
+
+                dat = dat[:6] + [derr, str(dp), str(dstat), uerr, str(up), str(
                     ustat)] + dat[8].split(',') + dat[9].split(',') + dat[10].split(',')
+
                 print('\t'.join(dat))
                 sys.stdout.flush()
 
-
 if __name__ == '__main__':
-    main()
+    main(sys.argv[1:])