-
Notifications
You must be signed in to change notification settings - Fork 0
/
hiv_vb_variant_analyse_snp_distribution.R
219 lines (187 loc) · 8.79 KB
/
hiv_vb_variant_analyse_snp_distribution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Author: Chris Wymant
# Acknowledgment: I wrote this while funded by ERC Advanced Grant PBDR-339251
# and a Li Ka Shing Foundation grant, both awarded to Christophe Fraser.
# This script analyses a whole-genome nucleotide alignment with the VB variant
# consensus, the consensus of other Dutch subtype-B in BEEHIVE, and HXB2. It
# splits the genome into equally sized segments, calculates how many positions
# in each segment are SNPs (single nucleotide polymorphisms) between the VB variant and the
# consensus of other Dutch subtype-B in BEEHIVE, and compares this to a null
# model. The null distribution is constructed as follows. Conditioning on the
# total number of nucleotide changes, the number in each segment is taken to be
# multinomially distributed, with the probability for each bin being the mean
# evolutionary rate over positions inside that segment, normalised to 1.
library(tidyverse)
library(seqinr)
# WARNING: the command below deletes all objects in memory in your current R
# session. Convenient when repeatedly re-running this script.
rm(list = ls())
# INPUT ------------------------------------------------------------------------
# Set seed for random number generation, for reproducibility
set.seed(12345)
# Set the working directory to where this code lives on your machine
# setwd()
# The whole-genome nucleotide alignment with the VB variant consensus, the
# consensus of other Dutch subtype-B in BEEHIVE, and HXB2, provided as
# supplementary data with the associated paper.
file_in_aln <- "nucleotide_VB_vs_DutchB_wHXB2.fasta"
# The 'normalisation' file, measuring between-host evolutionary rate as a
# function of position in the HIV genome (HXB2 coordinates).
file_in_normalisation <- "data_external/HIV_DistanceNormalisationOverGenome.csv"
# Files we'll create
file_out_snps <- "SNPs_genome_distribution.pdf"
file_out_snps_raw <- "SNPs_genome_distribution_raw.pdf"
num_segments <- 10L
num_multinomial_draws <- 200000L
plot <- FALSE
# MAIN -------------------------------------------------------------------------
stopifnot(file.exists(file_in_aln))
stopifnot(file.exists(file_in_normalisation))
# positions (wrt HXB2) between which we have sequence for both the lineage and
# the Dutch subtype B consensus (defined by the Gall et al 2012 primers)
pos_start <- 497L
pos_end <- 9496L
segment_boundaries <- seq(pos_start, pos_end, length.out = num_segments + 1L)
segment_boundaries[[1]] <- pos_start - 1L # Since bins are defined as (min,max]
# Percentiles of the null distribution that we'll calculate. Subsequent code
# expects these specific values.
percentiles <- c(0.5, 2.5, 25, 50, 75, 97.5, 99.5)
# Read in the aln, convert bases to upper case
aln <- read.fasta(file_in_aln, seqtype = "DNA") %>%
map(toupper)
# Check the three expected seq names are present
name_hxb2 <- "B.FR.83.HXB2_LAI_IIIB_BRU.K03455"
name_dutch_b <- "consensus_Dutch_subtypeB_inBEEHIVE"
name_lineage <- "consensus_VB_variant"
stopifnot(name_hxb2 %in% names(aln))
stopifnot(name_dutch_b %in% names(aln))
stopifnot(name_lineage %in% names(aln))
seq_hxb2 <- aln[[name_hxb2]]
seq_dutch_b <- aln[[name_dutch_b]]
seq_lineage <- aln[[name_lineage]]
# Check all seqs have the same length
length_aln <- length(seq_hxb2)
stopifnot(length_aln == length(seq_dutch_b))
stopifnot(length_aln == length(seq_lineage))
# Get the HXB2 position of aln positions where the lineage and dutch B seqs
# disagree. Check the bases are what we expect.
pos_snps <- integer()
bases_ok <- c("A", "C", "G", "T")
hxb2_pos_current <- 0L
for (pos in 1:length_aln) {
if (seq_hxb2[[pos]] != "-") hxb2_pos_current <- hxb2_pos_current + 1L
if (seq_dutch_b[[pos]] == seq_lineage[[pos]] ||
seq_dutch_b[[pos]] == "N" ||
seq_lineage[[pos]] == "N" ||
seq_dutch_b[[pos]] == "-" ||
seq_lineage[[pos]] == "-") {
next
}
# Manual hack: R means A or G so that's not a SNP
if (seq_dutch_b[[pos]] == "R" && seq_lineage[[pos]] == "A") next
if (! seq_dutch_b[[pos]] %in% bases_ok) {
stop(paste("Unexpected base", seq_dutch_b[[pos]], "in seq", name_dutch_b,
"at position", pos))
}
if (! seq_lineage[[pos]] %in% bases_ok) {
# Manual hack, Y means C or T so that's a SNP
if (seq_lineage[[pos]] != "Y" && seq_dutch_b[[pos]] != "A") {
stop(paste("Unexpected base", seq_lineage[[pos]], "in seq", name_lineage,
"at position", pos))
}
}
pos_snps[[length(pos_snps) + 1L]] <- hxb2_pos_current
}
stopifnot(all(pos_snps >= pos_start))
stopifnot(all(pos_snps <= pos_end))
num_snps <- length(pos_snps)
# Read in the normalisation file
df_normalisation <- read_csv(file_in_normalisation, col_types = cols(
POSITION_WRT_HXB2 = col_integer(),
MEDIAN_PAIRWISE_DISTANCE_BETWEEN_STANDARD_REFS = col_double()
)) %>%
rename(pos = POSITION_WRT_HXB2,
distance = MEDIAN_PAIRWISE_DISTANCE_BETWEEN_STANDARD_REFS)
# Add extra rows before the start and after the end of the normalisation file,
# which are within the [pos_start, pos_end] range we consider but where
# normalisation values are missing. Then check we have all the positions needed.
df_normalisation <- df_normalisation %>%
bind_rows(tibble(pos = c(seq(pos_start, min(df_normalisation$pos) - 1L),
seq(max(df_normalisation$pos) + 1L, pos_end)),
distance = NA)) %>%
arrange(pos)
stopifnot(identical(df_normalisation$pos,
pos_start:pos_end))
# Group the normalisation file into segments; take the mean normalisation in
# each segment, and then re-normalise so the segment values sum to 1.
df_normalisation <- df_normalisation %>%
mutate(segment = as.character(cut(pos, breaks = segment_boundaries)))
df_segments <- df_normalisation %>%
group_by(segment) %>%
summarise(distance = mean(distance, na.rm = TRUE),
segment_start = min(pos),
segment_end = max(pos),
.groups = "drop") %>%
arrange(segment_start) %>%
mutate(fraction_snps_expected = distance / sum(distance))
# Make a set of multinomial draws, with probabilities given by the normalisation
# value for each segment. i.e. simulate how many SNPs expected in each segment
# by chance: our null model.
snps_over_segments_simulation <- rmultinom(
n = num_multinomial_draws,
size = num_snps,
prob = df_segments$fraction_snps_expected
)
# Calculate the desired percentiles for the null model.
df_segments[paste0("num_snps_percentile_", percentiles)] <-
apply(snps_over_segments_simulation, 1, function(snp_counts) {
quantile(snp_counts, probs = percentiles / 100)
}) %>%
t()
# Merge the actual SNP counts with the null model percentiles
snp_segment_counts <- table(cut(pos_snps, segment_boundaries))
stopifnot(identical(names(snp_segment_counts),
df_segments$segment))
df_segments <- full_join(df_segments,
tibble(segment = names(snp_segment_counts),
snp_count = snp_segment_counts),
by = "segment")
# Plot!
if (plot) {
p <- ggplot(df_segments) +
theme_classic() +
scale_y_continuous(limits = c(0, NA), expand = c(0, 0)) +
scale_x_continuous(limits = c(pos_start, pos_end), breaks = 1:9 * 1000, expand = c(0, 0)) +
geom_rect(aes(xmin = segment_start,
xmax = segment_end,
ymin = num_snps_percentile_25,
ymax = num_snps_percentile_75), fill = "grey", alpha = 1) +
geom_rect(aes(xmin = segment_start,
xmax = segment_end,
ymin = num_snps_percentile_2.5,
ymax = num_snps_percentile_97.5), fill = "grey", alpha = 0.7) +
geom_rect(aes(xmin = segment_start,
xmax = segment_end,
ymin = num_snps_percentile_0.5,
ymax = num_snps_percentile_99.5), fill = "grey", alpha = 0.4) +
geom_segment(aes(x = segment_start, xend = segment_end,
y = snp_count, yend = snp_count), colour = "black") +
labs(x = "genome position (HXB2)",
y = "number of nucleotide changes")
ggsave(file_out_snps,
p, height = 4.5, width = 5.5)
p <- ggplot() +
theme_classic() +
scale_y_continuous(limits = c(0, NA), expand = c(0, 0)) +
scale_x_continuous(limits = c(pos_start, pos_end), breaks = 1:9 * 1000, expand = c(0, 0)) +
geom_point(data = tibble(x = pos_snps, y = 0), aes(x, y), shape = "|", size = 1.5) +
geom_line(data = df_normalisation, aes(x = pos, y = distance)) +
labs(x = "genome position (HXB2)",
y = "evolutionary rate (substitutions per site)")
ggsave(file_out_snps_raw,
p, height = 4.5, width = 5.5)
}
# After visual inspection, when 10 segments are used, segment 5 (with 32 snps)
# deviates most from the null. Calculate its two-tailed p value, unadjusted for
# multiple testing:
2 * sum(snps_over_segments_simulation[5,] >= 32) / num_multinomial_draws
# 2 * sum(snps_over_segments_simulation[9,] >= 21) / num_multinomial_draws # <- for when 20 segments are used