Add pipeline tests for pVACsplice

griffithlab · May 23, 2024 · 5b7f999 · 5b7f999
1 parent c2b73d3
commit 5b7f999
Show file tree

Hide file tree

Showing 40 changed files with 82,327 additions and 33 deletions.
diff --git a/pvactools/lib/__init__.py b/pvactools/lib/__init__.py
@@ -18,6 +18,12 @@
     "run_utils",
     "post_processor",
     "vector_visualization",
+    'filter_regtools_results',
+    'junction_to_fasta',
+    'fasta_to_kmers',
+    'combine_inputs',
+    'load_gtf_data',
+    'splice_pipeline',
 ]
 
 from . import *
diff --git a/pvactools/lib/splice_pipeline.py b/pvactools/lib/splice_pipeline.py
@@ -1,12 +1,11 @@
 import shutil
-from pvactools.lib.filter_regtools_results import *
-from pvactools.lib.junction_to_fasta import *
-from pvactools.lib.fasta_to_kmers import *
-from pvactools.lib.combine_inputs import *
-from pvactools.lib.run_argument_parser import *
+import os
+from pvactools.lib.filter_regtools_results import FilterRegtoolsResults
+from pvactools.lib.junction_to_fasta import JunctionToFasta
+from pvactools.lib.fasta_to_kmers import FastaToKmers
+from pvactools.lib.combine_inputs import CombineInputs
 from pvactools.lib.input_file_converter import PvacspliceVcfConverter
-from pvactools.lib.load_gtf_data import *
-
+from pvactools.lib.load_gtf_data import LoadGtfData
 
 class JunctionPipeline:
     def __init__(self, **kwargs):

diff --git a/pvactools/tools/pvacsplice/generate_protein_fasta.py b/pvactools/tools/pvacsplice/generate_protein_fasta.py
@@ -0,0 +1,192 @@
+import sys
+import argparse
+import tempfile
+import os
+import shutil
+import yaml
+import csv
+import re
+from collections import OrderedDict
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+from pvactools.lib.splice_pipeline import JunctionPipeline
+from pvactools.lib.calculate_manufacturability import CalculateManufacturability
+from pvactools.lib.run_utils import *
+
+def define_parser():
+    parser = argparse.ArgumentParser(
+        "pvacsplice generate_protein_fasta",
+        description="Generate an annotated fasta file from a RegTools junctions output TSV file with protein sequences of mutations",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "input_file",
+        help="RegTools junctions output TSV file"
+    )
+    parser.add_argument(
+        "flanking_sequence_length", type=int,
+        help="Number of amino acids to add on each side of the splice site when creating the FASTA.",
+    )
+    parser.add_argument(
+        "output_file",
+        help="The output fasta file."
+    )
+    parser.add_argument(
+        "annotated_vcf",
+        help="A VEP-annotated single- or multi-sample VCF containing genotype and transcript information."
+        + "The VCF ma be gzipped (requires tabix index)."
+    )
+    parser.add_argument(
+        "ref_fasta",
+        help="A reference FASTA file. Note: this input should be the same as the RegTools vcf input."
+    )
+    parser.add_argument(
+        "gtf_file",
+        help="A reference GTF file. Note: this input should be the same as the RegTools gtf input."
+    )
+    parser.add_argument(
+        "--input-tsv",
+        help = "A pVACsplice all_epitopes, filtered, or aggregated TSV file with epitopes to use for subsetting the inputs to peptides of interest. Only the peptide sequences for the epitopes in the TSV will be used when creating the FASTA."
+    )
+    parser.add_argument(
+        '--pass-only',
+        help="Only process VCF entries with a PASS status.",
+        default=False,
+        action='store_true',
+    )
+    parser.add_argument(
+        "--biotypes", type=lambda s:[a for a in s.split(',')],
+        help="A list of biotypes to use for pre-filtering transcripts for processing in the pipeline.",
+        default=['protein_coding']
+    )
+    parser.add_argument(
+        "-j", "--junction-score", type=int,
+        help="Junction Coverage Cutoff. Only sites above this read depth cutoff will be considered.",
+        default=10
+    )
+    parser.add_argument(
+        "-v", "--variant-distance", type=int,
+        help="Regulatory variants can lie inside or outside of splicing junction."
+        + "Maximum distance window (upstream and downstream) for a variant outside the junction.",
+        default=100
+    )
+    parser.add_argument(
+        "--anchor-types", nargs="*",
+        help="The anchor types of junctions to use. Multiple anchors can be specified using a comma-separated list."
+        + "Choices: A, D, NDA, DA, N",
+        default=['A', 'D', 'NDA'],
+        choices=['A', 'D', 'NDA', 'DA', 'N']
+    )
+    parser.add_argument(
+        "--aggregate-report-evaluation",
+        help="When running with an aggregate report input TSV, only include variants with this evaluation. Valid values for this field are Accept, Reject, Pending, and Review. Specifiy multiple values as a comma-separated list to include multiple evaluation states.",
+        default='Accept',
+        type=lambda s:[e for e in s.split(',')],
+    )
+    parser.add_argument(
+        "-s", "--sample-name",
+        help="The name of the sample being processed. Required when processing a multi-sample VCF and must be a sample ID in the input VCF #CHROM header line."
+    )
+    return parser
+
+def parse_input_tsv(input_tsv):
+    if input_tsv is None:
+        return (None, None)
+    with open(input_tsv, 'r') as fh:
+        reader = csv.DictReader(fh, delimiter = "\t")
+        if 'Index' in reader.fieldnames:
+            indexes = parse_full_input_tsv(reader)
+            file_type = 'full'
+        else:
+            indexes = parse_aggregated_input_tsv(reader)
+            file_type = 'aggregated'
+    return (indexes, file_type)
+
+def parse_full_input_tsv(reader):
+    indexes = []
+    for line in reader:
+        indexes.append(line['Index'])
+    return indexes
+
+def parse_aggregated_input_tsv(reader):
+    indexes = []
+    for line in reader:
+        indexes.append(line)
+    return indexes
+
+def main(args_input = sys.argv[1:]):
+    parser = define_parser()
+    args = parser.parse_args(args_input)
+
+    if not (set(args.aggregate_report_evaluation)).issubset(set(['Accept', 'Reject', 'Review', 'Pending'])):
+        sys.exit("Aggregate report evaluation ({}) contains invalid values.".format(args.aggregate_report_evaluation))
+
+    temp_dir = tempfile.mkdtemp()
+
+    junction_arguments = {
+        'input_file_type'                  : 'junctions',
+        'junctions_dir'                    : temp_dir,
+        'input_file'                       : args.input_file,
+        'gtf_file'                         : args.gtf_file,
+        'save_gtf'                         : False,
+        'sample_name'                      : args.sample_name,
+        'ref_fasta'                        : args.ref_fasta,
+        'annotated_vcf'                    : args.annotated_vcf,
+        'pass_only'                        : args.pass_only,
+        'biotypes'                         : args.biotypes,
+        'junction_score'                   : args.junction_score,
+        'variant_distance'                 : args.variant_distance,
+        'anchor_types'                     : args.anchor_types,
+        'normal_sample_name'               : None,
+        'keep_tmp_files'                   : False,
+        'class_i_epitope_length'           : [],
+        'class_ii_epitope_length'          : [],
+        'class_i_hla'                      : [],
+        'class_ii_hla'                     : [],
+    }
+
+    pipeline = JunctionPipeline(**junction_arguments)
+    pipeline.vcf_to_tsv()
+    pipeline.junction_to_fasta()
+
+    transcript_fasta = pipeline.create_file_path('fasta')
+    mt_sequences = {}
+    wt_sequences = {}
+    for record in SeqIO.parse(transcript_fasta, "fasta"):
+        if record.id.startswith('WT.'):
+            wt_sequences[record.id.split('.', 1)[1]] = record.seq
+        if record.id.startswith('ALT.'):
+            mt_sequences[record.id.split('.', 1)[1]] = record.seq
+
+    final_sequences = {}
+    for (index, mt_sequence) in mt_sequences.items():
+        wt_sequence = wt_sequences[index]
+        final_sequences[index] = get_mutated_peptide_with_flanking_sequence(wt_sequence, mt_sequence, args.flanking_sequence_length)
+
+    (tsv_indexes, tsv_file_type) = parse_input_tsv(args.input_tsv)
+    output_records = []
+    for (index, sequence) in final_sequences.items():
+        if tsv_indexes is not None:
+            if tsv_file_type == 'full':
+                if index not in tsv_indexes:
+                    continue
+            else:
+                matches = [i for i in tsv_indexes if i['ID'] == index and i['Evaluation'] in args.aggregate_report_evaluation]
+                if len(matches) == 0:
+                    continue
+        new_record = SeqRecord(sequence, id=index, description=index)
+        output_records.append(new_record)
+
+    SeqIO.write(output_records, args.output_file, "fasta")
+    print("Completed")
+
+    shutil.rmtree(temp_dir, ignore_errors=True)
+    manufacturability_file = "{}.manufacturability.tsv".format(args.output_file)
+    print("Calculating Manufacturability Metrics")
+    CalculateManufacturability(args.output_file, manufacturability_file, 'fasta').execute()
+    print("Completed")
+
+if __name__ == '__main__':
+    main()
diff --git a/pvactools/tools/pvacsplice/run.py b/pvactools/tools/pvacsplice/run.py
@@ -1,6 +1,7 @@
 import sys
 import os
 import pandas as pd
+import shutil
 from pathlib import Path
 from pvactools.lib.splice_pipeline import *
 from pvactools.lib.prediction_class import *
@@ -19,7 +20,7 @@ def combine_epitope_len_reports(file_list, file_final_name):
     if len(file_list) > 1:
         combine_reports(file_list, file_final_name)
     elif len(file_list) == 1:
-        os.rename(file_list[0], file_final_name)
+        shutil.copy(file_list[0], file_final_name)
 
 def create_combined_report(junctions_dir, args):
     output_dir = os.path.join(junctions_dir, 'combined')
@@ -278,7 +279,5 @@ def main(args_input = sys.argv[1:]):
 
     change_permissions_recursive(junctions_dir, 0o755, 0o644)
 
-    #TODO: add tumor purity
-
 if __name__ == '__main__':
     main()
diff --git a/tests/test_data/pvacsplice/mock_files/Netmhcstab.html b/tests/test_data/pvacsplice/mock_files/Netmhcstab.html
@@ -0,0 +1,41 @@
+<BODY LINK="#0000ff" BGCOLOR="#ffffff">
+
+<script language=JavaScript type = "text/javascript">
+  var loc = window.top.location.search.substring(1);
+  if (loc != null && loc !== "" && loc.length > 1) {
+      document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');}
+  </script> 
+<hr>
+<pre>
+
+# NetMHCstabpan version 1.0
+
+# Input is in FSA format
+
+# Peptide length 9
+
+HLA-G01:09 : Distance to traning data  0.372 (using nearest neighbor HLA-A24:03)
+
+# Rank Threshold for Strong binding peptides   0.500
+# Rank Threshold for Weak binding peptides   2.000
+-----------------------------------------------------------------------------------------------------
+ pos      HLA         peptide         Identity       Pred     Thalf(h) %Rank_Stab BindLevel
+-----------------------------------------------------------------------------------------------------
+    0  HLA-G*01:09    KVYCLQKEL      0000000000      0.008       0.14       5.50
+-----------------------------------------------------------------------------------------------------
+
+Protein 0000000000. Allele HLA-G*01:09. Number of high binders 0. Number of weak binders 0. Number of peptides 1
+
+Link to Allele Frequencies in Worldwide Populations <a href="http://www.allelefrequencies.net/hla6006a.asp?hla_selection=G*01:09" target=_blank >HLA-G*01:09</a>
+-----------------------------------------------------------------------------------------------------
+</pre>
+<hr>
+Go <a href="javascript:history.back()"><b>back</b></a>.
+<!--
+<button type="button"
+onclick="window.top.location.reload(true);")>
+New Submission</button>
+-->
+</BODY>
+</HTML>
+
diff --git a/tests/test_data/pvacsplice/mock_files/net_chop.html b/tests/test_data/pvacsplice/mock_files/net_chop.html
@@ -0,0 +1,47 @@
+<BODY LINK="#0000ff" BGCOLOR="#ffffff">
+
+<script language=JavaScript type = "text/javascript">
+  var loc = window.top.location.search.substring(1);
+  if (loc != null && loc !== "" && loc.length > 1) {
+      document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');}
+  </script> 
+<hr>
+<pre>
+NetChop 3.0 predictions using version C-term. Threshold 0.500000
+
+--------------------------------------
+ pos  AA  C      score      Ident
+--------------------------------------
+   1   Q  .   0.030908 0000000000
+   2   V  S   0.596502 0000000000
+   3   S  .   0.032148 0000000000
+   4   E  .   0.026136 0000000000
+   5   E  .   0.023295 0000000000
+   6   T  .   0.031866 0000000000
+   7   I  .   0.108056 0000000000
+   8   K  .   0.244313 0000000000
+   9   V  S   0.962786 0000000000
+  10   Y  S   0.975373 0000000000
+  11   C  .   0.028444 0000000000
+  12   L  S   0.960379 0000000000
+  13   Q  .   0.169144 0000000000
+  14   K  S   0.960541 0000000000
+  15   E  .   0.032194 0000000000
+  16   L  S   0.959280 0000000000
+  17   K  .   0.424314 0000000000
+--------------------------------------
+
+Number of cleavage sites 6. Number of amino acids 17. Protein name 0000000000
+
+--------------------------------------
+</pre>
+<hr>
+Go <a href="javascript:history.back()"><b>back</b></a>.
+<!--
+<button type="button"
+onclick="window.top.location.reload(true);")>
+New Submission</button>
+-->
+</BODY>
+</HTML>
+
diff --git a/tests/test_data/pvacsplice/mock_files/response_DRB1*11:01_15_nn_align.tsv b/tests/test_data/pvacsplice/mock_files/response_DRB1*11:01_15_nn_align.tsv
@@ -0,0 +1,52 @@
+allele	seq_num	start	end	length	core_peptide	peptide	ic50	rank	adjusted_rank
+HLA-DRB1*11:01	14	1	15	15	FEKQLKKKS	KEKFEKQLKKKSEEK	31.70	3.2	3.20
+HLA-DRB1*11:01	15	1	15	15	FEKQLKKKS	EKFEKQLKKKSEEKE	39.10	4.0	4.00
+HLA-DRB1*11:01	16	1	15	15	FEKQLKKKS	KFEKQLKKKSEEKEL	56.10	5.4	5.40
+HLA-DRB1*11:01	17	1	15	15	FEKQLKKKS	FEKQLKKKSEEKELK	140.60	12.0	12.00
+HLA-DRB1*11:01	13	1	15	15	LQKELKIKN	KVYCLQKELKIKNHS	142.90	12.0	12.00
+HLA-DRB1*11:01	12	1	15	15	LQKELKIKN	IKVYCLQKELKIKNH	171.90	14.0	14.00
+HLA-DRB1*11:01	11	1	15	15	LQKELKIKN	TIKVYCLQKELKIKN	191.00	15.0	15.00
+HLA-DRB1*11:01	36	1	15	15	VKVKCKLTQ	SADVVKVKCKLTQSF	278.20	18.0	18.00
+HLA-DRB1*11:01	10	1	15	15	VYCLQKELK	ETIKVYCLQKELKIK	302.50	19.0	19.00
+HLA-DRB1*11:01	35	1	15	15	VKVKCKLTQ	KSADVVKVKCKLTQS	314.80	19.0	19.00
+HLA-DRB1*11:01	34	1	15	15	VKVKCKLTQ	MKSADVVKVKCKLTQ	474.30	24.0	24.00
+HLA-DRB1*11:01	27	1	15	15	LKIKNHSLQ	EKELKIKNHSLQETS	489.20	25.0	25.00
+HLA-DRB1*11:01	26	1	15	15	LKIKNHSLQ	EEKELKIKNHSLQET	514.20	25.0	25.00
+HLA-DRB1*11:01	25	1	15	15	LKIKNHSLQ	SEEKELKIKNHSLQE	589.10	27.0	27.00
+HLA-DRB1*11:01	24	1	15	15	KELKIKNHS	KSEEKELKIKNHSLQ	726.30	30.0	30.00
+HLA-DRB1*11:01	43	1	15	15	MKSADVVKQ	HIMKSADVVKQRFKN	742.40	30.0	30.00
+HLA-DRB1*11:01	9	1	15	15	VYCLQKELK	EETIKVYCLQKELKI	747.80	30.0	30.00
+HLA-DRB1*11:01	33	1	15	15	DVVKVKCKL	IMKSADVVKVKCKLT	754.90	30.0	30.00
+HLA-DRB1*11:01	41	1	15	15	MKSADVVKQ	KTHIMKSADVVKQRF	822.50	32.0	32.00
+HLA-DRB1*11:01	32	1	15	15	MKSADVVKV	HIMKSADVVKVKCKL	827.90	32.0	32.00
+HLA-DRB1*11:01	42	1	15	15	MKSADVVKQ	THIMKSADVVKQRFK	831.60	32.0	32.00
+HLA-DRB1*11:01	49	1	15	15	VKQRFKNPA	DVVKQRFKNPAWVWL	855.80	32.0	32.00
+HLA-DRB1*11:01	40	1	15	15	MKSADVVKQ	SKTHIMKSADVVKQR	889.80	33.0	33.00
+HLA-DRB1*11:01	45	1	15	15	DVVKQRFKN	MKSADVVKQRFKNPA	916.90	34.0	34.00
+HLA-DRB1*11:01	46	1	15	15	DVVKQRFKN	KSADVVKQRFKNPAW	918.40	34.0	34.00
+HLA-DRB1*11:01	48	1	15	15	VKQRFKNPA	ADVVKQRFKNPAWVW	918.40	34.0	34.00
+HLA-DRB1*11:01	50	1	15	15	RFKNPAWVW	VVKQRFKNPAWVWLW	938.60	34.0	34.00
+HLA-DRB1*11:01	47	1	15	15	DVVKQRFKN	SADVVKQRFKNPAWV	963.60	34.0	34.00
+HLA-DRB1*11:01	44	1	15	15	DVVKQRFKN	IMKSADVVKQRFKNP	988.30	35.0	35.00
+HLA-DRB1*11:01	29	1	15	15	MKSADVVKV	SKTHIMKSADVVKVK	1076.40	36.0	36.00
+HLA-DRB1*11:01	51	1	15	15	RFKNPAWVW	VKQRFKNPAWVWLWN	1113.40	37.0	37.00
+HLA-DRB1*11:01	31	1	15	15	MKSADVVKV	THIMKSADVVKVKCK	1115.30	37.0	37.00
+HLA-DRB1*11:01	30	1	15	15	MKSADVVKV	KTHIMKSADVVKVKC	1184.30	38.0	38.00
+HLA-DRB1*11:01	39	1	15	15	KTHIMKSAD	ASKTHIMKSADVVKQ	1244.00	39.0	39.00
+HLA-DRB1*11:01	23	1	15	15	KELKIKNHS	KKSEEKELKIKNHSL	1296.00	39.0	39.00
+HLA-DRB1*11:01	38	1	15	15	KTHIMKSAD	ASKTHIMKSADVVNR	1345.20	40.0	40.00
+HLA-DRB1*11:01	28	1	15	15	KTHIMKSAD	ASKTHIMKSADVVKV	1372.00	41.0	41.00
+HLA-DRB1*11:01	8	1	15	15	VYCLQKELK	SEETIKVYCLQKELK	1550.20	43.0	43.00
+HLA-DRB1*11:01	37	1	15	15	KTHIMKSAD	TASKTHIMKSADVVN	2075.10	48.0	48.00
+HLA-DRB1*11:01	22	1	15	15	KELKIKNHS	KKKSEEKELKIKNHS	2591.40	53.0	53.00
+HLA-DRB1*11:01	7	1	15	15	TIKVYCLQK	VSEETIKVYCLQKEL	3255.70	58.0	58.00
+HLA-DRB1*11:01	6	1	15	15	TIKVYCLQK	QVSEETIKVYCLQKE	4843.60	67.0	67.00
+HLA-DRB1*11:01	5	1	15	15	EETIKVYCL	NQVSEETIKVYCLQK	6064.80	72.0	72.00
+HLA-DRB1*11:01	21	1	15	15	EEKELKIKN	LKKKSEEKELKIKNH	6442.50	73.0	73.00
+HLA-DRB1*11:01	18	1	15	15	LKKKSEEKE	EKQLKKKSEEKELKI	7314.10	76.0	76.00
+HLA-DRB1*11:01	19	1	15	15	LKKKSEEKE	KQLKKKSEEKELKIK	7702.70	77.0	77.00
+HLA-DRB1*11:01	20	1	15	15	EEKELKIKN	QLKKKSEEKELKIKN	8201.00	78.0	78.00
+HLA-DRB1*11:01	4	1	15	15	EETIKVYCL	QNQVSEETIKVYCLQ	11060.20	84.0	84.00
+HLA-DRB1*11:01	3	1	15	15	EETIKVYCL	LQNQVSEETIKVYCL	13819.50	88.0	88.00
+HLA-DRB1*11:01	1	1	15	15	LQNQVSEET	KALQNQVSEETIKVY	16019.10	91.0	91.00
+HLA-DRB1*11:01	2	1	15	15	VSEETIKVY	ALQNQVSEETIKVYC	17978.50	93.0	93.00