Skip to content

Commit

Permalink
Add pipeline tests for pVACsplice
Browse files Browse the repository at this point in the history
  • Loading branch information
susannasiebert committed May 23, 2024
1 parent c2b73d3 commit 5b7f999
Show file tree
Hide file tree
Showing 40 changed files with 82,327 additions and 33 deletions.
6 changes: 6 additions & 0 deletions pvactools/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
"run_utils",
"post_processor",
"vector_visualization",
'filter_regtools_results',
'junction_to_fasta',
'fasta_to_kmers',
'combine_inputs',
'load_gtf_data',
'splice_pipeline',
]

from . import *
13 changes: 6 additions & 7 deletions pvactools/lib/splice_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import shutil
from pvactools.lib.filter_regtools_results import *
from pvactools.lib.junction_to_fasta import *
from pvactools.lib.fasta_to_kmers import *
from pvactools.lib.combine_inputs import *
from pvactools.lib.run_argument_parser import *
import os
from pvactools.lib.filter_regtools_results import FilterRegtoolsResults
from pvactools.lib.junction_to_fasta import JunctionToFasta
from pvactools.lib.fasta_to_kmers import FastaToKmers
from pvactools.lib.combine_inputs import CombineInputs
from pvactools.lib.input_file_converter import PvacspliceVcfConverter
from pvactools.lib.load_gtf_data import *

from pvactools.lib.load_gtf_data import LoadGtfData

class JunctionPipeline:
def __init__(self, **kwargs):
Expand Down
192 changes: 192 additions & 0 deletions pvactools/tools/pvacsplice/generate_protein_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import sys
import argparse
import tempfile
import os
import shutil
import yaml
import csv
import re
from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from pvactools.lib.splice_pipeline import JunctionPipeline
from pvactools.lib.calculate_manufacturability import CalculateManufacturability
from pvactools.lib.run_utils import *

def define_parser():
parser = argparse.ArgumentParser(
"pvacsplice generate_protein_fasta",
description="Generate an annotated fasta file from a RegTools junctions output TSV file with protein sequences of mutations",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"input_file",
help="RegTools junctions output TSV file"
)
parser.add_argument(
"flanking_sequence_length", type=int,
help="Number of amino acids to add on each side of the splice site when creating the FASTA.",
)
parser.add_argument(
"output_file",
help="The output fasta file."
)
parser.add_argument(
"annotated_vcf",
help="A VEP-annotated single- or multi-sample VCF containing genotype and transcript information."
+ "The VCF ma be gzipped (requires tabix index)."
)
parser.add_argument(
"ref_fasta",
help="A reference FASTA file. Note: this input should be the same as the RegTools vcf input."
)
parser.add_argument(
"gtf_file",
help="A reference GTF file. Note: this input should be the same as the RegTools gtf input."
)
parser.add_argument(
"--input-tsv",
help = "A pVACsplice all_epitopes, filtered, or aggregated TSV file with epitopes to use for subsetting the inputs to peptides of interest. Only the peptide sequences for the epitopes in the TSV will be used when creating the FASTA."
)
parser.add_argument(
'--pass-only',
help="Only process VCF entries with a PASS status.",
default=False,
action='store_true',
)
parser.add_argument(
"--biotypes", type=lambda s:[a for a in s.split(',')],
help="A list of biotypes to use for pre-filtering transcripts for processing in the pipeline.",
default=['protein_coding']
)
parser.add_argument(
"-j", "--junction-score", type=int,
help="Junction Coverage Cutoff. Only sites above this read depth cutoff will be considered.",
default=10
)
parser.add_argument(
"-v", "--variant-distance", type=int,
help="Regulatory variants can lie inside or outside of splicing junction."
+ "Maximum distance window (upstream and downstream) for a variant outside the junction.",
default=100
)
parser.add_argument(
"--anchor-types", nargs="*",
help="The anchor types of junctions to use. Multiple anchors can be specified using a comma-separated list."
+ "Choices: A, D, NDA, DA, N",
default=['A', 'D', 'NDA'],
choices=['A', 'D', 'NDA', 'DA', 'N']
)
parser.add_argument(
"--aggregate-report-evaluation",
help="When running with an aggregate report input TSV, only include variants with this evaluation. Valid values for this field are Accept, Reject, Pending, and Review. Specifiy multiple values as a comma-separated list to include multiple evaluation states.",
default='Accept',
type=lambda s:[e for e in s.split(',')],
)
parser.add_argument(
"-s", "--sample-name",
help="The name of the sample being processed. Required when processing a multi-sample VCF and must be a sample ID in the input VCF #CHROM header line."
)
return parser

def parse_input_tsv(input_tsv):
if input_tsv is None:
return (None, None)
with open(input_tsv, 'r') as fh:
reader = csv.DictReader(fh, delimiter = "\t")
if 'Index' in reader.fieldnames:
indexes = parse_full_input_tsv(reader)
file_type = 'full'
else:
indexes = parse_aggregated_input_tsv(reader)
file_type = 'aggregated'
return (indexes, file_type)

def parse_full_input_tsv(reader):
indexes = []
for line in reader:
indexes.append(line['Index'])
return indexes

def parse_aggregated_input_tsv(reader):
indexes = []
for line in reader:
indexes.append(line)
return indexes

def main(args_input = sys.argv[1:]):
parser = define_parser()
args = parser.parse_args(args_input)

if not (set(args.aggregate_report_evaluation)).issubset(set(['Accept', 'Reject', 'Review', 'Pending'])):
sys.exit("Aggregate report evaluation ({}) contains invalid values.".format(args.aggregate_report_evaluation))

temp_dir = tempfile.mkdtemp()

junction_arguments = {
'input_file_type' : 'junctions',
'junctions_dir' : temp_dir,
'input_file' : args.input_file,
'gtf_file' : args.gtf_file,
'save_gtf' : False,
'sample_name' : args.sample_name,
'ref_fasta' : args.ref_fasta,
'annotated_vcf' : args.annotated_vcf,
'pass_only' : args.pass_only,
'biotypes' : args.biotypes,
'junction_score' : args.junction_score,
'variant_distance' : args.variant_distance,
'anchor_types' : args.anchor_types,
'normal_sample_name' : None,
'keep_tmp_files' : False,
'class_i_epitope_length' : [],
'class_ii_epitope_length' : [],
'class_i_hla' : [],
'class_ii_hla' : [],
}

pipeline = JunctionPipeline(**junction_arguments)
pipeline.vcf_to_tsv()
pipeline.junction_to_fasta()

transcript_fasta = pipeline.create_file_path('fasta')
mt_sequences = {}
wt_sequences = {}
for record in SeqIO.parse(transcript_fasta, "fasta"):
if record.id.startswith('WT.'):
wt_sequences[record.id.split('.', 1)[1]] = record.seq
if record.id.startswith('ALT.'):
mt_sequences[record.id.split('.', 1)[1]] = record.seq

final_sequences = {}
for (index, mt_sequence) in mt_sequences.items():
wt_sequence = wt_sequences[index]
final_sequences[index] = get_mutated_peptide_with_flanking_sequence(wt_sequence, mt_sequence, args.flanking_sequence_length)

(tsv_indexes, tsv_file_type) = parse_input_tsv(args.input_tsv)
output_records = []
for (index, sequence) in final_sequences.items():
if tsv_indexes is not None:
if tsv_file_type == 'full':
if index not in tsv_indexes:
continue
else:
matches = [i for i in tsv_indexes if i['ID'] == index and i['Evaluation'] in args.aggregate_report_evaluation]
if len(matches) == 0:
continue
new_record = SeqRecord(sequence, id=index, description=index)
output_records.append(new_record)

SeqIO.write(output_records, args.output_file, "fasta")
print("Completed")

shutil.rmtree(temp_dir, ignore_errors=True)
manufacturability_file = "{}.manufacturability.tsv".format(args.output_file)
print("Calculating Manufacturability Metrics")
CalculateManufacturability(args.output_file, manufacturability_file, 'fasta').execute()
print("Completed")

if __name__ == '__main__':
main()
5 changes: 2 additions & 3 deletions pvactools/tools/pvacsplice/run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import os
import pandas as pd
import shutil
from pathlib import Path
from pvactools.lib.splice_pipeline import *
from pvactools.lib.prediction_class import *
Expand All @@ -19,7 +20,7 @@ def combine_epitope_len_reports(file_list, file_final_name):
if len(file_list) > 1:
combine_reports(file_list, file_final_name)
elif len(file_list) == 1:
os.rename(file_list[0], file_final_name)
shutil.copy(file_list[0], file_final_name)

def create_combined_report(junctions_dir, args):
output_dir = os.path.join(junctions_dir, 'combined')
Expand Down Expand Up @@ -278,7 +279,5 @@ def main(args_input = sys.argv[1:]):

change_permissions_recursive(junctions_dir, 0o755, 0o644)

#TODO: add tumor purity

if __name__ == '__main__':
main()
41 changes: 41 additions & 0 deletions tests/test_data/pvacsplice/mock_files/Netmhcstab.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<BODY LINK="#0000ff" BGCOLOR="#ffffff">

<script language=JavaScript type = "text/javascript">
var loc = window.top.location.search.substring(1);
if (loc != null && loc !== "" && loc.length > 1) {
document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');}
</script>
<hr>
<pre>

# NetMHCstabpan version 1.0

# Input is in FSA format

# Peptide length 9

HLA-G01:09 : Distance to traning data 0.372 (using nearest neighbor HLA-A24:03)

# Rank Threshold for Strong binding peptides 0.500
# Rank Threshold for Weak binding peptides 2.000
-----------------------------------------------------------------------------------------------------
pos HLA peptide Identity Pred Thalf(h) %Rank_Stab BindLevel
-----------------------------------------------------------------------------------------------------
0 HLA-G*01:09 KVYCLQKEL 0000000000 0.008 0.14 5.50
-----------------------------------------------------------------------------------------------------

Protein 0000000000. Allele HLA-G*01:09. Number of high binders 0. Number of weak binders 0. Number of peptides 1

Link to Allele Frequencies in Worldwide Populations <a href="http://www.allelefrequencies.net/hla6006a.asp?hla_selection=G*01:09" target=_blank >HLA-G*01:09</a>
-----------------------------------------------------------------------------------------------------
</pre>
<hr>
Go <a href="javascript:history.back()"><b>back</b></a>.
<!--
<button type="button"
onclick="window.top.location.reload(true);")>
New Submission</button>
-->
</BODY>
</HTML>

47 changes: 47 additions & 0 deletions tests/test_data/pvacsplice/mock_files/net_chop.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<BODY LINK="#0000ff" BGCOLOR="#ffffff">

<script language=JavaScript type = "text/javascript">
var loc = window.top.location.search.substring(1);
if (loc != null && loc !== "" && loc.length > 1) {
document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');}
</script>
<hr>
<pre>
NetChop 3.0 predictions using version C-term. Threshold 0.500000

--------------------------------------
pos AA C score Ident
--------------------------------------
1 Q . 0.030908 0000000000
2 V S 0.596502 0000000000
3 S . 0.032148 0000000000
4 E . 0.026136 0000000000
5 E . 0.023295 0000000000
6 T . 0.031866 0000000000
7 I . 0.108056 0000000000
8 K . 0.244313 0000000000
9 V S 0.962786 0000000000
10 Y S 0.975373 0000000000
11 C . 0.028444 0000000000
12 L S 0.960379 0000000000
13 Q . 0.169144 0000000000
14 K S 0.960541 0000000000
15 E . 0.032194 0000000000
16 L S 0.959280 0000000000
17 K . 0.424314 0000000000
--------------------------------------

Number of cleavage sites 6. Number of amino acids 17. Protein name 0000000000

--------------------------------------
</pre>
<hr>
Go <a href="javascript:history.back()"><b>back</b></a>.
<!--
<button type="button"
onclick="window.top.location.reload(true);")>
New Submission</button>
-->
</BODY>
</HTML>

Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
allele seq_num start end length core_peptide peptide ic50 rank adjusted_rank
HLA-DRB1*11:01 14 1 15 15 FEKQLKKKS KEKFEKQLKKKSEEK 31.70 3.2 3.20
HLA-DRB1*11:01 15 1 15 15 FEKQLKKKS EKFEKQLKKKSEEKE 39.10 4.0 4.00
HLA-DRB1*11:01 16 1 15 15 FEKQLKKKS KFEKQLKKKSEEKEL 56.10 5.4 5.40
HLA-DRB1*11:01 17 1 15 15 FEKQLKKKS FEKQLKKKSEEKELK 140.60 12.0 12.00
HLA-DRB1*11:01 13 1 15 15 LQKELKIKN KVYCLQKELKIKNHS 142.90 12.0 12.00
HLA-DRB1*11:01 12 1 15 15 LQKELKIKN IKVYCLQKELKIKNH 171.90 14.0 14.00
HLA-DRB1*11:01 11 1 15 15 LQKELKIKN TIKVYCLQKELKIKN 191.00 15.0 15.00
HLA-DRB1*11:01 36 1 15 15 VKVKCKLTQ SADVVKVKCKLTQSF 278.20 18.0 18.00
HLA-DRB1*11:01 10 1 15 15 VYCLQKELK ETIKVYCLQKELKIK 302.50 19.0 19.00
HLA-DRB1*11:01 35 1 15 15 VKVKCKLTQ KSADVVKVKCKLTQS 314.80 19.0 19.00
HLA-DRB1*11:01 34 1 15 15 VKVKCKLTQ MKSADVVKVKCKLTQ 474.30 24.0 24.00
HLA-DRB1*11:01 27 1 15 15 LKIKNHSLQ EKELKIKNHSLQETS 489.20 25.0 25.00
HLA-DRB1*11:01 26 1 15 15 LKIKNHSLQ EEKELKIKNHSLQET 514.20 25.0 25.00
HLA-DRB1*11:01 25 1 15 15 LKIKNHSLQ SEEKELKIKNHSLQE 589.10 27.0 27.00
HLA-DRB1*11:01 24 1 15 15 KELKIKNHS KSEEKELKIKNHSLQ 726.30 30.0 30.00
HLA-DRB1*11:01 43 1 15 15 MKSADVVKQ HIMKSADVVKQRFKN 742.40 30.0 30.00
HLA-DRB1*11:01 9 1 15 15 VYCLQKELK EETIKVYCLQKELKI 747.80 30.0 30.00
HLA-DRB1*11:01 33 1 15 15 DVVKVKCKL IMKSADVVKVKCKLT 754.90 30.0 30.00
HLA-DRB1*11:01 41 1 15 15 MKSADVVKQ KTHIMKSADVVKQRF 822.50 32.0 32.00
HLA-DRB1*11:01 32 1 15 15 MKSADVVKV HIMKSADVVKVKCKL 827.90 32.0 32.00
HLA-DRB1*11:01 42 1 15 15 MKSADVVKQ THIMKSADVVKQRFK 831.60 32.0 32.00
HLA-DRB1*11:01 49 1 15 15 VKQRFKNPA DVVKQRFKNPAWVWL 855.80 32.0 32.00
HLA-DRB1*11:01 40 1 15 15 MKSADVVKQ SKTHIMKSADVVKQR 889.80 33.0 33.00
HLA-DRB1*11:01 45 1 15 15 DVVKQRFKN MKSADVVKQRFKNPA 916.90 34.0 34.00
HLA-DRB1*11:01 46 1 15 15 DVVKQRFKN KSADVVKQRFKNPAW 918.40 34.0 34.00
HLA-DRB1*11:01 48 1 15 15 VKQRFKNPA ADVVKQRFKNPAWVW 918.40 34.0 34.00
HLA-DRB1*11:01 50 1 15 15 RFKNPAWVW VVKQRFKNPAWVWLW 938.60 34.0 34.00
HLA-DRB1*11:01 47 1 15 15 DVVKQRFKN SADVVKQRFKNPAWV 963.60 34.0 34.00
HLA-DRB1*11:01 44 1 15 15 DVVKQRFKN IMKSADVVKQRFKNP 988.30 35.0 35.00
HLA-DRB1*11:01 29 1 15 15 MKSADVVKV SKTHIMKSADVVKVK 1076.40 36.0 36.00
HLA-DRB1*11:01 51 1 15 15 RFKNPAWVW VKQRFKNPAWVWLWN 1113.40 37.0 37.00
HLA-DRB1*11:01 31 1 15 15 MKSADVVKV THIMKSADVVKVKCK 1115.30 37.0 37.00
HLA-DRB1*11:01 30 1 15 15 MKSADVVKV KTHIMKSADVVKVKC 1184.30 38.0 38.00
HLA-DRB1*11:01 39 1 15 15 KTHIMKSAD ASKTHIMKSADVVKQ 1244.00 39.0 39.00
HLA-DRB1*11:01 23 1 15 15 KELKIKNHS KKSEEKELKIKNHSL 1296.00 39.0 39.00
HLA-DRB1*11:01 38 1 15 15 KTHIMKSAD ASKTHIMKSADVVNR 1345.20 40.0 40.00
HLA-DRB1*11:01 28 1 15 15 KTHIMKSAD ASKTHIMKSADVVKV 1372.00 41.0 41.00
HLA-DRB1*11:01 8 1 15 15 VYCLQKELK SEETIKVYCLQKELK 1550.20 43.0 43.00
HLA-DRB1*11:01 37 1 15 15 KTHIMKSAD TASKTHIMKSADVVN 2075.10 48.0 48.00
HLA-DRB1*11:01 22 1 15 15 KELKIKNHS KKKSEEKELKIKNHS 2591.40 53.0 53.00
HLA-DRB1*11:01 7 1 15 15 TIKVYCLQK VSEETIKVYCLQKEL 3255.70 58.0 58.00
HLA-DRB1*11:01 6 1 15 15 TIKVYCLQK QVSEETIKVYCLQKE 4843.60 67.0 67.00
HLA-DRB1*11:01 5 1 15 15 EETIKVYCL NQVSEETIKVYCLQK 6064.80 72.0 72.00
HLA-DRB1*11:01 21 1 15 15 EEKELKIKN LKKKSEEKELKIKNH 6442.50 73.0 73.00
HLA-DRB1*11:01 18 1 15 15 LKKKSEEKE EKQLKKKSEEKELKI 7314.10 76.0 76.00
HLA-DRB1*11:01 19 1 15 15 LKKKSEEKE KQLKKKSEEKELKIK 7702.70 77.0 77.00
HLA-DRB1*11:01 20 1 15 15 EEKELKIKN QLKKKSEEKELKIKN 8201.00 78.0 78.00
HLA-DRB1*11:01 4 1 15 15 EETIKVYCL QNQVSEETIKVYCLQ 11060.20 84.0 84.00
HLA-DRB1*11:01 3 1 15 15 EETIKVYCL LQNQVSEETIKVYCL 13819.50 88.0 88.00
HLA-DRB1*11:01 1 1 15 15 LQNQVSEET KALQNQVSEETIKVY 16019.10 91.0 91.00
HLA-DRB1*11:01 2 1 15 15 VSEETIKVY ALQNQVSEETIKVYC 17978.50 93.0 93.00
Loading

0 comments on commit 5b7f999

Please sign in to comment.