-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c2b73d3
commit 5b7f999
Showing
40 changed files
with
82,327 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
import sys | ||
import argparse | ||
import tempfile | ||
import os | ||
import shutil | ||
import yaml | ||
import csv | ||
import re | ||
from collections import OrderedDict | ||
from Bio import SeqIO | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
|
||
from pvactools.lib.splice_pipeline import JunctionPipeline | ||
from pvactools.lib.calculate_manufacturability import CalculateManufacturability | ||
from pvactools.lib.run_utils import * | ||
|
||
def define_parser(): | ||
parser = argparse.ArgumentParser( | ||
"pvacsplice generate_protein_fasta", | ||
description="Generate an annotated fasta file from a RegTools junctions output TSV file with protein sequences of mutations", | ||
formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
) | ||
parser.add_argument( | ||
"input_file", | ||
help="RegTools junctions output TSV file" | ||
) | ||
parser.add_argument( | ||
"flanking_sequence_length", type=int, | ||
help="Number of amino acids to add on each side of the splice site when creating the FASTA.", | ||
) | ||
parser.add_argument( | ||
"output_file", | ||
help="The output fasta file." | ||
) | ||
parser.add_argument( | ||
"annotated_vcf", | ||
help="A VEP-annotated single- or multi-sample VCF containing genotype and transcript information." | ||
+ "The VCF ma be gzipped (requires tabix index)." | ||
) | ||
parser.add_argument( | ||
"ref_fasta", | ||
help="A reference FASTA file. Note: this input should be the same as the RegTools vcf input." | ||
) | ||
parser.add_argument( | ||
"gtf_file", | ||
help="A reference GTF file. Note: this input should be the same as the RegTools gtf input." | ||
) | ||
parser.add_argument( | ||
"--input-tsv", | ||
help = "A pVACsplice all_epitopes, filtered, or aggregated TSV file with epitopes to use for subsetting the inputs to peptides of interest. Only the peptide sequences for the epitopes in the TSV will be used when creating the FASTA." | ||
) | ||
parser.add_argument( | ||
'--pass-only', | ||
help="Only process VCF entries with a PASS status.", | ||
default=False, | ||
action='store_true', | ||
) | ||
parser.add_argument( | ||
"--biotypes", type=lambda s:[a for a in s.split(',')], | ||
help="A list of biotypes to use for pre-filtering transcripts for processing in the pipeline.", | ||
default=['protein_coding'] | ||
) | ||
parser.add_argument( | ||
"-j", "--junction-score", type=int, | ||
help="Junction Coverage Cutoff. Only sites above this read depth cutoff will be considered.", | ||
default=10 | ||
) | ||
parser.add_argument( | ||
"-v", "--variant-distance", type=int, | ||
help="Regulatory variants can lie inside or outside of splicing junction." | ||
+ "Maximum distance window (upstream and downstream) for a variant outside the junction.", | ||
default=100 | ||
) | ||
parser.add_argument( | ||
"--anchor-types", nargs="*", | ||
help="The anchor types of junctions to use. Multiple anchors can be specified using a comma-separated list." | ||
+ "Choices: A, D, NDA, DA, N", | ||
default=['A', 'D', 'NDA'], | ||
choices=['A', 'D', 'NDA', 'DA', 'N'] | ||
) | ||
parser.add_argument( | ||
"--aggregate-report-evaluation", | ||
help="When running with an aggregate report input TSV, only include variants with this evaluation. Valid values for this field are Accept, Reject, Pending, and Review. Specifiy multiple values as a comma-separated list to include multiple evaluation states.", | ||
default='Accept', | ||
type=lambda s:[e for e in s.split(',')], | ||
) | ||
parser.add_argument( | ||
"-s", "--sample-name", | ||
help="The name of the sample being processed. Required when processing a multi-sample VCF and must be a sample ID in the input VCF #CHROM header line." | ||
) | ||
return parser | ||
|
||
def parse_input_tsv(input_tsv): | ||
if input_tsv is None: | ||
return (None, None) | ||
with open(input_tsv, 'r') as fh: | ||
reader = csv.DictReader(fh, delimiter = "\t") | ||
if 'Index' in reader.fieldnames: | ||
indexes = parse_full_input_tsv(reader) | ||
file_type = 'full' | ||
else: | ||
indexes = parse_aggregated_input_tsv(reader) | ||
file_type = 'aggregated' | ||
return (indexes, file_type) | ||
|
||
def parse_full_input_tsv(reader): | ||
indexes = [] | ||
for line in reader: | ||
indexes.append(line['Index']) | ||
return indexes | ||
|
||
def parse_aggregated_input_tsv(reader): | ||
indexes = [] | ||
for line in reader: | ||
indexes.append(line) | ||
return indexes | ||
|
||
def main(args_input = sys.argv[1:]): | ||
parser = define_parser() | ||
args = parser.parse_args(args_input) | ||
|
||
if not (set(args.aggregate_report_evaluation)).issubset(set(['Accept', 'Reject', 'Review', 'Pending'])): | ||
sys.exit("Aggregate report evaluation ({}) contains invalid values.".format(args.aggregate_report_evaluation)) | ||
|
||
temp_dir = tempfile.mkdtemp() | ||
|
||
junction_arguments = { | ||
'input_file_type' : 'junctions', | ||
'junctions_dir' : temp_dir, | ||
'input_file' : args.input_file, | ||
'gtf_file' : args.gtf_file, | ||
'save_gtf' : False, | ||
'sample_name' : args.sample_name, | ||
'ref_fasta' : args.ref_fasta, | ||
'annotated_vcf' : args.annotated_vcf, | ||
'pass_only' : args.pass_only, | ||
'biotypes' : args.biotypes, | ||
'junction_score' : args.junction_score, | ||
'variant_distance' : args.variant_distance, | ||
'anchor_types' : args.anchor_types, | ||
'normal_sample_name' : None, | ||
'keep_tmp_files' : False, | ||
'class_i_epitope_length' : [], | ||
'class_ii_epitope_length' : [], | ||
'class_i_hla' : [], | ||
'class_ii_hla' : [], | ||
} | ||
|
||
pipeline = JunctionPipeline(**junction_arguments) | ||
pipeline.vcf_to_tsv() | ||
pipeline.junction_to_fasta() | ||
|
||
transcript_fasta = pipeline.create_file_path('fasta') | ||
mt_sequences = {} | ||
wt_sequences = {} | ||
for record in SeqIO.parse(transcript_fasta, "fasta"): | ||
if record.id.startswith('WT.'): | ||
wt_sequences[record.id.split('.', 1)[1]] = record.seq | ||
if record.id.startswith('ALT.'): | ||
mt_sequences[record.id.split('.', 1)[1]] = record.seq | ||
|
||
final_sequences = {} | ||
for (index, mt_sequence) in mt_sequences.items(): | ||
wt_sequence = wt_sequences[index] | ||
final_sequences[index] = get_mutated_peptide_with_flanking_sequence(wt_sequence, mt_sequence, args.flanking_sequence_length) | ||
|
||
(tsv_indexes, tsv_file_type) = parse_input_tsv(args.input_tsv) | ||
output_records = [] | ||
for (index, sequence) in final_sequences.items(): | ||
if tsv_indexes is not None: | ||
if tsv_file_type == 'full': | ||
if index not in tsv_indexes: | ||
continue | ||
else: | ||
matches = [i for i in tsv_indexes if i['ID'] == index and i['Evaluation'] in args.aggregate_report_evaluation] | ||
if len(matches) == 0: | ||
continue | ||
new_record = SeqRecord(sequence, id=index, description=index) | ||
output_records.append(new_record) | ||
|
||
SeqIO.write(output_records, args.output_file, "fasta") | ||
print("Completed") | ||
|
||
shutil.rmtree(temp_dir, ignore_errors=True) | ||
manufacturability_file = "{}.manufacturability.tsv".format(args.output_file) | ||
print("Calculating Manufacturability Metrics") | ||
CalculateManufacturability(args.output_file, manufacturability_file, 'fasta').execute() | ||
print("Completed") | ||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
<BODY LINK="#0000ff" BGCOLOR="#ffffff"> | ||
|
||
<script language=JavaScript type = "text/javascript"> | ||
var loc = window.top.location.search.substring(1); | ||
if (loc != null && loc !== "" && loc.length > 1) { | ||
document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');} | ||
</script> | ||
<hr> | ||
<pre> | ||
|
||
# NetMHCstabpan version 1.0 | ||
|
||
# Input is in FSA format | ||
|
||
# Peptide length 9 | ||
|
||
HLA-G01:09 : Distance to traning data 0.372 (using nearest neighbor HLA-A24:03) | ||
|
||
# Rank Threshold for Strong binding peptides 0.500 | ||
# Rank Threshold for Weak binding peptides 2.000 | ||
----------------------------------------------------------------------------------------------------- | ||
pos HLA peptide Identity Pred Thalf(h) %Rank_Stab BindLevel | ||
----------------------------------------------------------------------------------------------------- | ||
0 HLA-G*01:09 KVYCLQKEL 0000000000 0.008 0.14 5.50 | ||
----------------------------------------------------------------------------------------------------- | ||
|
||
Protein 0000000000. Allele HLA-G*01:09. Number of high binders 0. Number of weak binders 0. Number of peptides 1 | ||
|
||
Link to Allele Frequencies in Worldwide Populations <a href="http://www.allelefrequencies.net/hla6006a.asp?hla_selection=G*01:09" target=_blank >HLA-G*01:09</a> | ||
----------------------------------------------------------------------------------------------------- | ||
</pre> | ||
<hr> | ||
Go <a href="javascript:history.back()"><b>back</b></a>. | ||
<!-- | ||
<button type="button" | ||
onclick="window.top.location.reload(true);")> | ||
New Submission</button> | ||
--> | ||
</BODY> | ||
</HTML> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
<BODY LINK="#0000ff" BGCOLOR="#ffffff"> | ||
|
||
<script language=JavaScript type = "text/javascript"> | ||
var loc = window.top.location.search.substring(1); | ||
if (loc != null && loc !== "" && loc.length > 1) { | ||
document.write('<center><h2>',loc,' Server Output - DTU Health Tech</h2></center>');} | ||
</script> | ||
<hr> | ||
<pre> | ||
NetChop 3.0 predictions using version C-term. Threshold 0.500000 | ||
|
||
-------------------------------------- | ||
pos AA C score Ident | ||
-------------------------------------- | ||
1 Q . 0.030908 0000000000 | ||
2 V S 0.596502 0000000000 | ||
3 S . 0.032148 0000000000 | ||
4 E . 0.026136 0000000000 | ||
5 E . 0.023295 0000000000 | ||
6 T . 0.031866 0000000000 | ||
7 I . 0.108056 0000000000 | ||
8 K . 0.244313 0000000000 | ||
9 V S 0.962786 0000000000 | ||
10 Y S 0.975373 0000000000 | ||
11 C . 0.028444 0000000000 | ||
12 L S 0.960379 0000000000 | ||
13 Q . 0.169144 0000000000 | ||
14 K S 0.960541 0000000000 | ||
15 E . 0.032194 0000000000 | ||
16 L S 0.959280 0000000000 | ||
17 K . 0.424314 0000000000 | ||
-------------------------------------- | ||
|
||
Number of cleavage sites 6. Number of amino acids 17. Protein name 0000000000 | ||
|
||
-------------------------------------- | ||
</pre> | ||
<hr> | ||
Go <a href="javascript:history.back()"><b>back</b></a>. | ||
<!-- | ||
<button type="button" | ||
onclick="window.top.location.reload(true);")> | ||
New Submission</button> | ||
--> | ||
</BODY> | ||
</HTML> | ||
|
52 changes: 52 additions & 0 deletions
52
tests/test_data/pvacsplice/mock_files/response_DRB1*11:01_15_nn_align.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
allele seq_num start end length core_peptide peptide ic50 rank adjusted_rank | ||
HLA-DRB1*11:01 14 1 15 15 FEKQLKKKS KEKFEKQLKKKSEEK 31.70 3.2 3.20 | ||
HLA-DRB1*11:01 15 1 15 15 FEKQLKKKS EKFEKQLKKKSEEKE 39.10 4.0 4.00 | ||
HLA-DRB1*11:01 16 1 15 15 FEKQLKKKS KFEKQLKKKSEEKEL 56.10 5.4 5.40 | ||
HLA-DRB1*11:01 17 1 15 15 FEKQLKKKS FEKQLKKKSEEKELK 140.60 12.0 12.00 | ||
HLA-DRB1*11:01 13 1 15 15 LQKELKIKN KVYCLQKELKIKNHS 142.90 12.0 12.00 | ||
HLA-DRB1*11:01 12 1 15 15 LQKELKIKN IKVYCLQKELKIKNH 171.90 14.0 14.00 | ||
HLA-DRB1*11:01 11 1 15 15 LQKELKIKN TIKVYCLQKELKIKN 191.00 15.0 15.00 | ||
HLA-DRB1*11:01 36 1 15 15 VKVKCKLTQ SADVVKVKCKLTQSF 278.20 18.0 18.00 | ||
HLA-DRB1*11:01 10 1 15 15 VYCLQKELK ETIKVYCLQKELKIK 302.50 19.0 19.00 | ||
HLA-DRB1*11:01 35 1 15 15 VKVKCKLTQ KSADVVKVKCKLTQS 314.80 19.0 19.00 | ||
HLA-DRB1*11:01 34 1 15 15 VKVKCKLTQ MKSADVVKVKCKLTQ 474.30 24.0 24.00 | ||
HLA-DRB1*11:01 27 1 15 15 LKIKNHSLQ EKELKIKNHSLQETS 489.20 25.0 25.00 | ||
HLA-DRB1*11:01 26 1 15 15 LKIKNHSLQ EEKELKIKNHSLQET 514.20 25.0 25.00 | ||
HLA-DRB1*11:01 25 1 15 15 LKIKNHSLQ SEEKELKIKNHSLQE 589.10 27.0 27.00 | ||
HLA-DRB1*11:01 24 1 15 15 KELKIKNHS KSEEKELKIKNHSLQ 726.30 30.0 30.00 | ||
HLA-DRB1*11:01 43 1 15 15 MKSADVVKQ HIMKSADVVKQRFKN 742.40 30.0 30.00 | ||
HLA-DRB1*11:01 9 1 15 15 VYCLQKELK EETIKVYCLQKELKI 747.80 30.0 30.00 | ||
HLA-DRB1*11:01 33 1 15 15 DVVKVKCKL IMKSADVVKVKCKLT 754.90 30.0 30.00 | ||
HLA-DRB1*11:01 41 1 15 15 MKSADVVKQ KTHIMKSADVVKQRF 822.50 32.0 32.00 | ||
HLA-DRB1*11:01 32 1 15 15 MKSADVVKV HIMKSADVVKVKCKL 827.90 32.0 32.00 | ||
HLA-DRB1*11:01 42 1 15 15 MKSADVVKQ THIMKSADVVKQRFK 831.60 32.0 32.00 | ||
HLA-DRB1*11:01 49 1 15 15 VKQRFKNPA DVVKQRFKNPAWVWL 855.80 32.0 32.00 | ||
HLA-DRB1*11:01 40 1 15 15 MKSADVVKQ SKTHIMKSADVVKQR 889.80 33.0 33.00 | ||
HLA-DRB1*11:01 45 1 15 15 DVVKQRFKN MKSADVVKQRFKNPA 916.90 34.0 34.00 | ||
HLA-DRB1*11:01 46 1 15 15 DVVKQRFKN KSADVVKQRFKNPAW 918.40 34.0 34.00 | ||
HLA-DRB1*11:01 48 1 15 15 VKQRFKNPA ADVVKQRFKNPAWVW 918.40 34.0 34.00 | ||
HLA-DRB1*11:01 50 1 15 15 RFKNPAWVW VVKQRFKNPAWVWLW 938.60 34.0 34.00 | ||
HLA-DRB1*11:01 47 1 15 15 DVVKQRFKN SADVVKQRFKNPAWV 963.60 34.0 34.00 | ||
HLA-DRB1*11:01 44 1 15 15 DVVKQRFKN IMKSADVVKQRFKNP 988.30 35.0 35.00 | ||
HLA-DRB1*11:01 29 1 15 15 MKSADVVKV SKTHIMKSADVVKVK 1076.40 36.0 36.00 | ||
HLA-DRB1*11:01 51 1 15 15 RFKNPAWVW VKQRFKNPAWVWLWN 1113.40 37.0 37.00 | ||
HLA-DRB1*11:01 31 1 15 15 MKSADVVKV THIMKSADVVKVKCK 1115.30 37.0 37.00 | ||
HLA-DRB1*11:01 30 1 15 15 MKSADVVKV KTHIMKSADVVKVKC 1184.30 38.0 38.00 | ||
HLA-DRB1*11:01 39 1 15 15 KTHIMKSAD ASKTHIMKSADVVKQ 1244.00 39.0 39.00 | ||
HLA-DRB1*11:01 23 1 15 15 KELKIKNHS KKSEEKELKIKNHSL 1296.00 39.0 39.00 | ||
HLA-DRB1*11:01 38 1 15 15 KTHIMKSAD ASKTHIMKSADVVNR 1345.20 40.0 40.00 | ||
HLA-DRB1*11:01 28 1 15 15 KTHIMKSAD ASKTHIMKSADVVKV 1372.00 41.0 41.00 | ||
HLA-DRB1*11:01 8 1 15 15 VYCLQKELK SEETIKVYCLQKELK 1550.20 43.0 43.00 | ||
HLA-DRB1*11:01 37 1 15 15 KTHIMKSAD TASKTHIMKSADVVN 2075.10 48.0 48.00 | ||
HLA-DRB1*11:01 22 1 15 15 KELKIKNHS KKKSEEKELKIKNHS 2591.40 53.0 53.00 | ||
HLA-DRB1*11:01 7 1 15 15 TIKVYCLQK VSEETIKVYCLQKEL 3255.70 58.0 58.00 | ||
HLA-DRB1*11:01 6 1 15 15 TIKVYCLQK QVSEETIKVYCLQKE 4843.60 67.0 67.00 | ||
HLA-DRB1*11:01 5 1 15 15 EETIKVYCL NQVSEETIKVYCLQK 6064.80 72.0 72.00 | ||
HLA-DRB1*11:01 21 1 15 15 EEKELKIKN LKKKSEEKELKIKNH 6442.50 73.0 73.00 | ||
HLA-DRB1*11:01 18 1 15 15 LKKKSEEKE EKQLKKKSEEKELKI 7314.10 76.0 76.00 | ||
HLA-DRB1*11:01 19 1 15 15 LKKKSEEKE KQLKKKSEEKELKIK 7702.70 77.0 77.00 | ||
HLA-DRB1*11:01 20 1 15 15 EEKELKIKN QLKKKSEEKELKIKN 8201.00 78.0 78.00 | ||
HLA-DRB1*11:01 4 1 15 15 EETIKVYCL QNQVSEETIKVYCLQ 11060.20 84.0 84.00 | ||
HLA-DRB1*11:01 3 1 15 15 EETIKVYCL LQNQVSEETIKVYCL 13819.50 88.0 88.00 | ||
HLA-DRB1*11:01 1 1 15 15 LQNQVSEET KALQNQVSEETIKVY 16019.10 91.0 91.00 | ||
HLA-DRB1*11:01 2 1 15 15 VSEETIKVY ALQNQVSEETIKVYC 17978.50 93.0 93.00 |
Oops, something went wrong.