-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodelling_substitutions.py
1381 lines (1255 loc) · 58.9 KB
/
modelling_substitutions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# code by [email protected]
# last updated July 2020
# standard python libraries
from itertools import combinations
import math
import os
import re
# 3rd party python libraries
from Bio import SeqIO
import dendropy as ddp
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import poisson
# permissive matching, so long as there is at least one nucleotide in common
# between two bases, they are considered to match
permissive_matches = {
# Adenine
'A': {'A', 'D', 'H', 'M', 'N', 'R', 'V', 'W'},
# Cytosine
'C': {'B', 'C', 'H', 'M', 'N', 'S', 'V', 'Y'},
# Guanine
'G': {'B', 'D', 'G', 'K', 'N', 'R', 'S', 'V'},
# Thymine
'T': {'B', 'D', 'H', 'K', 'N', 'T', 'W', 'Y'},
# A or G
'R': {'A', 'B', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W'},
# C or T
'Y': {'B', 'C', 'D', 'H', 'K', 'M', 'N', 'S', 'T', 'V', 'W', 'Y'},
# G or C
'S': {'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'Y'},
# A or T
'W': {'A', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'T', 'V', 'W', 'Y'},
# G or T
'K': {'B', 'D', 'G', 'H', 'K', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'},
# A or C
'M': {'A', 'B', 'C', 'D', 'H', 'M', 'N', 'R', 'S', 'V', 'W', 'Y'},
# C or G or T
'B': {'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W',
'Y'},
# A or G or T
'D': {'A', 'B', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W',
'Y'},
# A or C or T
'H': {'A', 'B', 'C', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W',
'Y'},
# A or C or G
'V': {'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W',
'Y'},
# A or C or G or T
'N': {'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V',
'W', 'Y'},
}
file_names_style = {
# g1, genotype; g2, region
'style1': r'(B3|D4|D8).*?(N450_MFNCR|N450|MFNCR)',
# g1, dataset; g2, model settings (for BEAST runs)
'style2': r'([^-]+)-([A-Z])\..*',
# g1, genotype; g2, dataset; g3, region
'style3': r'(B3|D4|D8)_([A-Z_]*)(?:_|-).*?(N450_MFNCR|N450|MFNCR)'
}
def nucleotide_match(base1, base2, partial_matches_allowed=True):
"""
Compares two bases and returns True if there is a match and False
otherwise.
Parameters
----------
base1, base2 : str
A string of len 1 representing a nucleotide.
partial_matches_allowed : bool
True if, for example, R and A or G (and vice-versa) should be
considered a match, False if not.
Returns
-------
bool
True if:
* identical strs
* partial_matches_allowed and base1 and base2 are a permitted match
False if:
* different bases and base2 is not in the permitted matches for base1
"""
base1_upper = base1.upper()
base2_upper = base2.upper()
for base in [base1_upper, base2_upper]:
if base not in [*permissive_matches.keys(), '-']:
raise ValueError(f'base ({base}) character not known IUPAC symbol')
return (base1_upper == base2_upper or
(partial_matches_allowed and
base2_upper in permissive_matches.get(base1_upper, set())))
def num_diff_seq_pair(seq1, seq2, partial_matches_allowed=True):
"""
Takes in two aligned sequences (with '-') and returns absolute number of
differences between them. If partial_matches_allowed, ambiguous bases will
considered a match to any of the nucleotides they represent (i.e., 'R' will
be counted as match to either 'A' or 'G' and 'A'/'G' will be counted as a
match to 'R'). Counts gaps and difference in length between sequences as
differences.
"""
# add one each time sequences of each pair differ
# adds the difference in length of the sequences
return sum(
1 for a, b in zip(seq1, seq2)
if not nucleotide_match(a, b, partial_matches_allowed)
) + abs(len(seq1) - len(seq2))
# for editable text within Inkscape
rcParams['svg.fonttype'] = 'none'
def year_fraction_from_id(identifier):
"""
Takes sample record id like 'H123456789_2017.1489' and returns year
fraction (float following '_').
"""
return float(identifier.split('_')[-1])
def sample_pairs_times(tree_path, tree_type='nexus'):
"""
Given a Bayesian-inferred phylogenetic tree `tree_path` of type tree_type
returns a `pandas.DataFrame` containing time information for each pair of
samples in the tree.
Parameters
----------
tree_path : str
A path to a tree file with trip tip labels as "SampleID_year.float".
tree_type : str, default 'nexus'
The tree-encoding format. Any permitted by Dendropy's `get_from_path`
method.
Returns
-------
pd.DataFrame
Pandas data frame containing the time information gathered from the
tree file for each pair of samples.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks between the pair of samples (int)
DtMRCA: weeks between most recent sample and the Bayesian-predicted
MRCA (float)
tMRCA: time of Bayesian-predicted MRCA in weeks (float)
"""
# read tree from file
tree = ddp.Tree().get_from_path(tree_path, tree_type,
preserve_underscores=True)
# produce tree distance matrix
distance_matrix = tree.node_distance_matrix()
# get pairs of sample nodes
node_pairs = list(combinations(tree.leaf_nodes(), 2))
# data for plotting and validation
pairs_data = []
for (s1_node, s2_node) in node_pairs:
# for (sample1, sample2) in sample_pairs:
# (year+wk/52)*52 is int
sample1 = s1_node.taxon.label
sample2 = s2_node.taxon.label
time1 = int(year_fraction_from_id(sample1) * 52)
time2 = int(year_fraction_from_id(sample2) * 52)
# time between samples in weeks (originally in years)
pair_delta = abs(time1 - time2)
# get time of the MRCA as calculated by BEAST
# * find MRCA node
s1_s2_mrca = distance_matrix.mrca(s1_node, s2_node)
# * calculate the distance of each node from the root (cannot use
# distance_from_tip as this calculates time between node and most
# recent child of the node (not most recent tip in the tree);
# tree distances in years, so converted to weeks
node_root_dist = {node: node.distance_from_root() * 52
for node in [s1_node, s2_node, s1_s2_mrca]}
most_recent_sample_node = max(node_root_dist,
key=lambda node: node_root_dist[node])
# * number of weeks between the most recent sample and the MRCA
Dt_mrca = (node_root_dist[most_recent_sample_node] -
node_root_dist[s1_s2_mrca])
# * get time of MRCA (may not be needed)
t_mrca = max(time1, time2) - Dt_mrca
# add info to list of data
pairs_data.append(
[sample1, sample2, time1, time2, pair_delta, Dt_mrca, t_mrca])
return pd.DataFrame(
columns='Sample1, Sample2, ts1, ts2, Dt, DtMRCA, '
'tMRCA'.split(', '),
data=pairs_data)
def sample_pairs_distances(pair_data, fasta_path):
"""
Calculates the pairwise distances for the samples in the `pair_data`
pandas.DataFrame from the sequences in the `fasta_path`.
Parameters
----------
pairs_info : pd.DataFrame
Pandas data frame containing the time information for each pair of
samples gathered from a tree file.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
fasta_path : str
Path to the fasta file to analyse.
Returns
-------
pd.DataFrame
Pandas data frame like the input one, with an extra column for the
Hamming distance calculated for each sample pair.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
"""
record_dict = SeqIO.to_dict(SeqIO.parse(fasta_path, 'fasta'))
def get_distance(row):
"""
Calculates distance between sequences of the two samples in the df row.
Ambiguous bases and gaps are counted as differences.
"""
seq1 = record_dict[row['Sample1']].seq
seq2 = record_dict[row['Sample2']].seq
return num_diff_seq_pair(seq1, seq2, partial_matches_allowed=True)
pair_data['Dist'] = pair_data.apply(get_distance, axis=1)
return pair_data
def poisson_mode_and_alpha(expected, alpha):
"""
Returns mode number of expected substitutions and upper & lower limits
within which falls `alpha` fraction of the occurrences for a poisson
distribution with lambda=`expected` value.
Parameters
----------
expected : float
Expected substitutions.
alpha : float
Probability interval containing alpha fraction of the Poisson
distribution.
Returns
-------
tuple of int
mode: Typical number of substitutions expected.
lower_limit, upper_limit: Upper and lower number of substitutions
between which alpha fraction of the Poisson distribution is
contained.
"""
mean, var = poisson.stats(expected, moments='mv')
mode = math.floor(mean) # round down to closest integer
lower_limit, upper_limit = poisson.interval(alpha, expected, loc=0)
return mode, lower_limit, upper_limit
def expected_substitutions(rate, sites, alpha, time_range):
"""
Given a substitution `rate`, the number of `sites`, the `alpha`probability
interval `alpha` and the `time_range` to use returns expected, lower and
upper values for the expected substitutions for a Poisson distribution with
lambda=rate*time.
Parameters
----------
rate : float
Substitution rate in substitutions/(site*year).
sites : int
Number of sites in the genomic region to which the rate applies.
alpha : float
Fraction of the Poisson distribution to fall within interval (0-1).
e.g., 0.95: 95% of the Poisson distribution at a given time will fall
within the lower-upper range of number of substitutions.
time_range : tuple
Time range for which expected values are calculated (ints or floats),
(min_time, max_time)
Returns
-------
expected_data : pd.DataFrame
Pandas dataframe containing expected substitution range for each time
in the interval given.
Time: time for which mode, lower and upper are calculated, in weeks
(int)
Typical: typical number of substitutions expected at each time (int)
Lower: lower # of substitutions at the confidence (alpha) interval
given (int)
Upper: upper # of substitutions for the confidence (alpha) interval
given (int)
"""
min_time, max_time = time_range
expected_data = pd.DataFrame({'Time': range(math.floor(min_time),
math.ceil(max_time) + 1)})
def poisson_values(time):
"""Expected substitutions for time with rate (poisson lambda)."""
# rate in subs/(site.year), convert to subs/week; multiply by each time
# point to obtain expected substitutions at that time
poisson_rate = (rate / 52) * sites * time
mode, lower, upper = poisson_mode_and_alpha(poisson_rate, alpha)
return {'Typical': mode, 'Lower': lower, 'Upper': upper}
expected_data[['Typical', 'Lower', 'Upper']] = expected_data['Time'].apply(
lambda t: pd.Series(poisson_values(t)))
return expected_data
def get_cumulative_evolution_times(pairs_data, relative_times_to_ca,
possible_margin=0.1):
"""
Calculate the time each pair of samples had to evolve since the presumed
common ancestor for pairs of samples collected within a smaller time frame
than that between the most recent sample and the presumed ancestor.
Parameters
----------
pairs_data : pd.DataFrame
Pandas data frame containing the information for each pair of samples
gathered from fasta and tree files.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: number of differences between Sample1 and Sample2 sequences (int)
relative_times_to_ca : iter
An iterable of times from a putative common ancestor (DtCA; int) for
which model is being assessed.
possible_margin : float, default 0.1
The level of tolerance for the MRCA time cutoff. 0.1 means that when
the MRCA of two samples has a time larger than the threshold by up to
10%, it is considered possible that the MRCA happened within the time
considered (accounting for errors in the BEAST summary tree or sample
dates).
Returns
-------
pairs_to_classify : pd.DataFrame
Pandas data frame with each pair info line repeated for each relative
time to common ancestor (ignoring sample pairs spanning a period larger
than that between the most recent sample and the presumed ancestor.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
DtCA: times since a presumed CA (float)
max_DtCA: DtCA * (1 + possible_margin) (float)
tCE: cumulative evolution time (tCE) for the pair since the tCA (float)
"""
pairs_to_classify = pd.DataFrame(
columns='Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, '
'DtCA, max_DtCA, tCE'.split(', '))
for relative_t_to_ca in relative_times_to_ca:
pairs_info_for_DtCA = pairs_data.copy()
pairs_info_for_DtCA['DtCA'] = relative_t_to_ca
pairs_info_for_DtCA['max_DtCA'] = (
relative_t_to_ca * (1 + possible_margin))
# relative_t_to_ca is the number of weeks from most recent sample
# to the presumed common ancestor (DtCA)
# if the difference in time between samples 1 and 2 is longer than the
# time predicted for the common ancestor, ignore the pair, e.g.:
# /--- Sample 1
# \--------- Sample 2
# |<-->| DtCA would not make sense to consider these pairs
pairs_to_consider = pairs_info_for_DtCA[
pairs_info_for_DtCA['Dt'] <= relative_t_to_ca].reset_index(
drop=True)
# calculate cumulative evolution time (tCE)
# 2 samples may have accumulated differences over the sum of times
# between each sample and the putative common ancestor
pairs_to_consider['tCE'] = (
2 * relative_t_to_ca - pairs_to_consider['Dt'])
pairs_to_classify = pd.concat([pairs_to_classify, pairs_to_consider],
ignore_index=True)
print(f'Calculated tCEs for DtCA {relative_t_to_ca}.')
return pairs_to_classify
def observed_and_predicted_for_dataset(
pairs_info, relative_ts_to_ca, substitution_rate, num_sites,
possible_margin=0.1, alpha=0.95):
"""
Given a pandas.DataFrame containing time and distance information for pairs
of samples, classifies each pair at each tCA as per the Bayesian estimate
and the Poisson prediction.
Parameters
----------
pairs_info : pd.DataFrame
Pandas data frame containing the information for each pair of samples
gathered from fasta and tree files.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
relative_ts_to_ca : iter
An iterable object containing ints or floats, each representing a time
between the most recent sample and a presumed common ancestor (DtCA).
substitution_rate : float
Substitution rate in substitutions/(site.year)
num_sites : int
The number of sites in the MSA.
possible_margin: float; default 0.1
The level of tolerance for the MRCA time cutoff. 0.1 means that when
the MRCA of two samples has a time larger than the threshold by up to
10%, it is considered possible that the MRCA happened within the time
considered (accounting for errors in the BEAST summary tree or sample
dates).
alpha : float, default 0.95
Fraction of the Poisson distribution to fall within interval. Between
0 and 1.
e.g., 0.95: 95% of the Poisson distribution at a given time will fall
within the lower-upper range of number of substitutions.
Returns
-------
pairs_results : pd.DataFrame
Pandas data frame containing the observed and predicted results for
each file.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample time in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
DtCA: times since a presumed CA (float)
max_DtCA: DtCA * (1 + possible_margin) (float)
tCE: cumulative evolution time (tCE) for the pair since the tCA (float)
Observed: pair category based on time tree - likely, possible or
unlikely (str)
Predicted: predicted category based on model - below, typical or above
(str)
Result: result classification based on observed and predicted (str)
poisson_values : pd.DataFrame
Pandas data frame containing expected substitution range for each time
in the interval given.
Time: time for which mode, lower and upper are calculated, in weeks
(int)
Typical: typical number of substitutions expected at each time (int)
Lower: lower # of substitutions at the confidence (alpha) interval
given (int)
Upper: upper # of substitutions for the confidence (alpha) interval
given (int)
"""
# tCE for each DtCA
# pairs_results columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, DtCA, max_DtCA,
# tCE
pairs_results = get_cumulative_evolution_times(
pairs_info, relative_ts_to_ca, possible_margin)
# calculate expected substitution range for the full range of tCE
min_tce = pairs_results['tCE'].min()
max_tce = pairs_results['tCE'].max()
# data frame columns: Time, Typical, Lower, Upper
poisson_values = expected_substitutions(
substitution_rate, num_sites, alpha, (min_tce, max_tce))
# classify based on time tree
pairs_results['Observed'] = np.nan
# DtMRCA <= DtCA: likely
likely_pairs = pairs_results['DtMRCA'] <= pairs_results['DtCA']
pairs_results.loc[likely_pairs, 'Observed'] = 'likely'
# DtMRCA > DtCA: unlikely
unlikely_pairs = pairs_results['DtMRCA'] > pairs_results['max_DtCA']
pairs_results.loc[unlikely_pairs, 'Observed'] = 'unlikely'
# remaining (else): possible
pairs_results.loc[pairs_results['Observed'].isna(),
'Observed'] = 'possible'
print('Classified pairs based on time tree.')
# classify based on Poisson predictions and model tCE
# add subs ranges to df
pairs_results = pairs_results.join(poisson_values.set_index('Time'),
on='tCE')
pairs_results['Predicted'] = np.nan
# distance < low end of expected substitution range for tCE: below
below_pairs = pairs_results['Dist'] < pairs_results['Lower']
pairs_results.loc[below_pairs, 'Predicted'] = 'below'
# distance > high end of expected substitution range for tCE: above
above_pairs = pairs_results['Dist'] > pairs_results['Upper']
pairs_results.loc[above_pairs, 'Predicted'] = 'above'
# remaining (else): expected
pairs_results.loc[pairs_results['Predicted'].isna(),
'Predicted'] = 'typical'
# drop unnecessary columns
pairs_results.drop(columns='Typical, Lower, Upper'.split(', '),
inplace=True)
print('Classified pairs based on model prediction.')
# classify result
obs_likely = pairs_results['Observed'] == 'likely'
obs_possible = pairs_results['Observed'] == 'possible'
obs_unlikely = pairs_results['Observed'] == 'unlikely'
pre_below = pairs_results['Predicted'] == 'below'
pre_above = pairs_results['Predicted'] == 'above'
pre_typical = pairs_results['Predicted'] == 'typical'
correct_pairs = (
# likely pairs below max exp subs
obs_likely & (pre_typical | pre_below)) | (
# unlikely pairs above exp subs
obs_unlikely & pre_above) | (
# possible always correct (borderline)
obs_possible)
incorrect_pairs = (
# false positives
obs_likely & pre_above) | (
# false negatives
obs_unlikely & (pre_typical | pre_below))
pairs_results['Result'] = np.nan
pairs_results.loc[correct_pairs, 'Result'] = 'correct'
pairs_results.loc[incorrect_pairs, 'Result'] = 'incorrect'
print('Classified model results.\n')
# pairs_results columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, DtCA, max_DtCA,
# tCE, Observed, Predicted, Result
# poisson_values columns: Time, Typical, Lower, Upper
return pairs_results, poisson_values
def observed_and_predicted(
tree_fastas, relative_times_to_ca, rates, possible_margin=0.1,
alpha=0.95, tree_encoding='nexus'):
"""
Given combinations of trees and fasta files, calculates distance tMRCA, and
cumulative evolutionary time for combinations of pairs of samples for times
in relative_times_to_ca and analyses observed and predicted results.
Parameters
----------
tree_fastas : dict
A dictionary mapping a time-scaled phylogenetic tree to a list of fasta
files containing MSAs for the region(s) used to infer the tree.
e.g., 'B3_N450_MFCR.nex': ['B3_N450.fas', 'B3_MFNCR.fas']
relative_times_to_ca : iter
An iterable object containing ints or floats, each representing a time
between the most recent sample and a presumed common ancestor (DtCA).
rates : pd.DataFrame
Pandas data frame containing substitution rates.
Genotype, Region: Genotype and region for which the rate was calculated
(str)
Sites: number of sites in the MSA for the region analysed (int)
Rate: substitution rate in substitutions/(site.year) (float)
possible_margin : float; default 0.1
The level of tolerance for the MRCA time cutoff. 0.1 means that when
the time of the MRCA in the time tree for a pair of samples has a time
larger than the relative time to a CA being checked by up to 10%, it is
considered possible that the MRCA happened within the time considered
(accounting for errors in the BEAST summary tree or sample dates).
alpha : float
Fraction of the Poisson distribution to fall within interval. Between
0 and 1.
e.g., 0.95: 95% of the Poisson distribution at a given time will fall
within the lower-upper range of number of substitutions.
tree_encoding : str
The format of the tree file, e.g., 'newick', 'nexus'.
Returns
-------
results : pd.DataFrame
Pandas data frame containing the observed and predicted results for
all files.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample times in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
DtCA: times since a presumed CA (float)
max_DtCA: DtCA * (1 + possible_margin) (float)
tCE: cumulative evolution time (tCE) for the pair since the tCA (float)
Observed: pair category based on time tree - likely, possible or
unlikely (str)
Predicted: predicted category based on model - below, typical or above
(str)
Result: result classification based on observed and predicted (str)
Genotype, Dataset, Region: Genotype, set of data and region for which
the rate was calculated (str)
predicted : pd.DataFrame
Pandas dataframe containing expected substitution range for each time
in the interval given for each dataset, genotype and region.
Time: time for which mode, lower and upper are calculated, in weeks
(int)
Typical: typical number of substitutions expected at each time (int)
Lower: lower # of substitutions at the confidence (alpha) interval
given (int)
Upper: upper # of substitutions for the confidence (alpha) interval
given (int)
Genotype, Dataset, Region: Genotype, set of data and region for which
the substitutions were calculated (str)
"""
results = pd.DataFrame(
columns='Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, '
'DtCA, max_DtCA, tCE, Observed, Predicted, Result, '
'Genotype, Dataset, Region'.split(', '))
predicted = pd.DataFrame(
columns='Time, Typical, Lower, Upper, Genotype, Dataset, '
'Region'.split(', '))
in_dir = None
num_trees = len(tree_fastas)
trees_processed = 1
for tree_path, fastas in tree_fastas.items():
print(f'\nProcessing tree {trees_processed}/{num_trees}\n{tree_path}')
# get tree info
# pairs_time_info columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA
pairs_time_info = sample_pairs_times(
tree_path, tree_type=tree_encoding)
for fasta_path in fastas:
print(f'Processing sequences in {fasta_path}')
# obtain information about the data
base_dir, filename = os.path.split(fasta_path)
if not in_dir:
in_dir = base_dir
basename, file_ext = os.path.splitext(filename)
match = re.search(file_names_style['style3'], basename)
genotype = match.group(1)
dataset = match.group(2).replace('_', ' ')
region = match.group(3).replace(
'N450', 'N-450').replace('MFNCR', 'MF-NCR')
# get distances
# pairs_info columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist
pairs_info = sample_pairs_distances(pairs_time_info, fasta_path)
print('Obtained time and distance information for pairs in tree.')
# rate and number of sites in MSA from rates.csv
rate = rates.loc[(genotype, region), 'Rate']
num_sites = rates.loc[(genotype, region), 'Sites']
# get observed and predicted values
# dataset_results columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, DtCA,
# max_DtCA, tCE, Observed, Predicted, Result
# poisson_values columns:
# Time, Mode, Lower, Upper
(dataset_results,
poisson_values) = observed_and_predicted_for_dataset(
pairs_info, relative_times_to_ca, rate, num_sites,
possible_margin=possible_margin, alpha=alpha)
dataset_results['Genotype'] = genotype
dataset_results['Dataset'] = dataset
dataset_results['Region'] = region
poisson_values['Genotype'] = genotype
poisson_values['Dataset'] = dataset
poisson_values['Region'] = region
# results columns:
# Sample1, Sample2, ts1, ts2, Dt, DtMRCA, tMRCA, Dist, DtCA,
# max_DtCA, tCE, Observed, Predicted, Result, Genotype, Dataset,
# Region
results = pd.concat([results, dataset_results],
ignore_index=True)
# predicted columns:
# Time, Typical, Lower, Upper, Genotype, Dataset, Region
predicted = pd.concat([predicted, poisson_values],
ignore_index=True)
trees_processed += 1
out_dir = in_dir.replace('input', 'output')
out_results = os.path.join(out_dir, 'pair_results.csv')
results.to_csv(out_results, index=False)
out_predicted = os.path.join(out_dir, 'predicted.csv')
predicted.to_csv(out_predicted, index=False)
print(f'Completed processing trees and fasta files. Results in '
f'{out_results}')
return results, predicted
def get_axes_mid_points(figure):
"""
Given a matplotlib figure containing one or multiple sets of axes, returns
the horizontal and vertical mid points of the axes.
Parameters
----------
figure : matplotlib.figure.Figure
A matplotlib figure object containing one or multiple axes.
Returns
-------
(x_centre, y_centre) : tuple
Length 2 tuple containing two floats. The first is the mid point of the
x axes and the second is the mid point of the y axes.
"""
# positions of the first and last subplots
top_left_subplot_pos = figure.get_axes()[0].get_position()
bottom_right_subplot_pos = figure.get_axes()[-1].get_position()
# x and y limits of the subplots
x_min = top_left_subplot_pos.xmin
x_max = bottom_right_subplot_pos.xmax
y_max = top_left_subplot_pos.ymax
y_min = bottom_right_subplot_pos.ymin
x_centre = x_min + (x_max - x_min) / 2
y_centre = y_min + (y_max - y_min) / 2
return x_centre, y_centre
def add_axes_labels_to_figure(figure, x_label='', y_label=''):
"""
Takes in a matplotlib figure containing one or multiple axes and returns
the figure with the requested x/y labels added at the centre of the set of
axes.
Parameters
----------
figure : matplotlib.figure.Figure
A matplotlib figure object.
x_label : str, default ''
The text to use to label the x axes. If an empty string, no horizontal
text is added.
y_label : str, default ''
The text to use to label the y axes. If an empty string, no vertical
text is added.
Returns
-------
figure : matplotlib.figure.Figure
The figure object with the centered labels added.
"""
(x_centre, y_centre) = get_axes_mid_points(figure)
# add labels centered to the ax(es)
if x_label:
figure.text(x=x_centre, y=0, s=x_label,
ha='center', weight='bold')
if y_label:
figure.text(x=0, y=y_centre, s=y_label,
va='center', rotation='vertical', weight='bold')
return figure
def make_plots(results, expected, out_dir,
sets_to_analyse=tuple(), times_to_analyse=tuple(),
alpha=0.95, sns_style='ticks', sns_context='paper',
save_img_as=('.png',)):
"""
Plots results from a model predicting sequence pair relatedness.
Parameters
----------
results : pd.DataFrame
Pandas data frame containing observed and predicted results for
multiple trees and regions.
Sample1, Sample2: sample ids (str)
ts1, ts2: sample times in weeks (float)
Dt: weeks difference between the pair of samples (int)
DtMRCA: weeks between most recent sample and MRCA (float)
tMRCA: time of MRCA in weeks (float)
Dist: Hamming distance between Sample1 and Sample2 (int)
DtCA: times since a presumed CA (float)
max_DtCA: DtCA * (1 + possible_margin) (float)
tCE: cumulative evolution time (tCE) for the pair since the tCA (float)
Observed: pair category based on time tree - likely, possible or
unlikely (str)
Predicted: predicted category based on model - below, typical or above
(str)
Result: result classification based on observed and predicted (str)
Genotype, Dataset, Region: Genotype, set of data and region for which
the rate was calculated (str)
expected : pd.DataFrame
Pandas dataframe containing expected substitution range for each time
in the interval given for each dataset, genotype and region.
Time: time for which mode, lower and upper are calculated, in weeks
(int)
Typical: typical number of substitutions expected at each time (int)
Lower: lower # of substitutions at the confidence (alpha) interval
given (int)
Upper: upper # of substitutions for the confidence (alpha) interval
given (int)
Genotype, Dataset, Region: Genotype, set of data and region for which
the substitutions were calculated (str)
out_dir : str
Path to a directory where figures will be saved.
sets_to_analyse : tuple, default tuple()
The data sets to plot as entered in the results df 'Datasets' column.
times_to_analyse : tuple or list, default tuple()
A list or tuple containing the times (int) for which each dataset is
plotted.
alpha : float, default 0.95
Fraction of the Poisson distribution to fall within interval. Between
0 and 1. 0.95: 95% of the Poisson distribution at a given time will
fall within the lower-upper range of number of substitutions.
sns_style : str, default 'ticks'
Argument passed to seaborn set_style method, which adjusts plot
settings. Any permitted bt sns.set_style.
sns_context : str, default 'paper'
Argument passed to seaborn set_context method, which adjusts plot
settings. Any permitted bt sns.set_context.
save_img_as : tuple, default ('.png')
Tuple containing figure formats to save matplotlib plot as. Any file
type supported by matplolib.
Returns
-------
None
"""
# slice df based on the times to analyse
if times_to_analyse:
subset = results[results['DtCA'].isin(times_to_analyse)
].reset_index(drop=True)
ts_to_ca = times_to_analyse
# use full df if no DtCA subset is given
else:
subset = results
ts_to_ca = results['DtCA'].unique()
# if no data set subset given, analyse all data sets
if not sets_to_analyse:
sets_to_analyse = results['Dataset'].unique()
# set up the plot style
sns.set_style(sns_style)
sns.set_context(sns_context)
typical_label = 'typical'
exp_range_label = f'{alpha} of Poisson'
below_exp_label = 'over-corrected'
for dataset in sets_to_analyse:
ds_subset = subset[subset['Dataset'] == dataset].reset_index(drop=True)
dse_subset = expected[expected['Dataset'] == dataset
].reset_index(drop=True)
for genotype in ds_subset['Genotype'].unique():
gen_subset = ds_subset[ds_subset['Genotype'] == genotype
].reset_index(drop=True)
ge_subset = dse_subset[dse_subset['Genotype'] == genotype
].reset_index(drop=True)
for t_to_ca in ts_to_ca:
t_subset = gen_subset[gen_subset['DtCA'] == t_to_ca
].reset_index(drop=True)
(unlikely_label, possible_label, likely_label) = (
'unlikely', 'possible', 'likely')
# set up facet grid
num_regions = ge_subset['Region'].nunique()
facet_grid_args = dict(
hue='Observed', hue_order=[unlikely_label,
possible_label, likely_label],
hue_kws=dict(marker=['^', '.', 'v']),
palette={unlikely_label: '#7570b3',
possible_label: '#d95f02',
likely_label: '#1b9e77'},
height=2.5, aspect=1.3, despine=True,
sharex=True, sharey=False, margin_titles=True)
if num_regions > 1:
facet_grid_args.update(dict(
row='Region', row_order=['N-450', 'MF-NCR']))
g = sns.FacetGrid(t_subset, **facet_grid_args)
g.map(plt.scatter, 'tCE', 'Dist', zorder=4)
region_order = ['N-450', 'MF-NCR']
regions = [reg
for reg in region_order
if reg in ge_subset['Region'].unique()]
# for each facet, add the expected substitutions line and range
for i in range(len(regions)):
ax = g.axes.flat[i]
region = regions[i]
xmin, xmax, ymin, ymax = ax.axis()
# adjust axis limits
tmin = max(math.floor(xmin), 2)
tmax = math.ceil(xmax)
smax = max(ymax, 1)
ax.set(xlim=(tmin, tmax), ylim=(0, smax))
ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
# get the relevant expected substitutions for the facet
exp = ge_subset[(
ge_subset['Time'].between(tmin, tmax)) & (
ge_subset['Region'] == region)
].reset_index(drop=True)
tce_vals = exp['Time'].values
typical_vals = exp['Typical'].values
upper_limit = exp['Upper'].values
lower_limit = exp['Lower'].values
# plot typical values
ax.plot(tce_vals, typical_vals,
linewidth=3, color='#4575b4', zorder=3,
label=typical_label)
# plot expected range
ax.fill_between(
tce_vals, upper_limit, lower_limit,
facecolor='#91bfdb', edgecolor='#91bfdb',
alpha=1, zorder=2,
label=exp_range_label)
# plot over-corrected range if min limit is 1 or more
if max(lower_limit) > 0:
# plot range below expected substitutions
ax.fill_between(
tce_vals, lower_limit, 0,
facecolor='#D0D0D0', edgecolor='#D0D0D0',
alpha=1, zorder=1,
label='over-corrected')
# set up labels
g.set_axis_labels('', '')
[plt.setp(ax.texts, text='') for ax in g.axes.flat]
g.set_titles(row_template='{row_name}')
g.fig.subplots_adjust(hspace=0.1)
# add a figure title
g.fig.text(0, 1,
s=fr'{genotype} - {dataset} '
fr'$(\Delta t_{{CA}} = {t_to_ca}\ weeks)$',
weight='bold')
g.fig = add_axes_labels_to_figure(
g.fig,
x_label=r'$\mathbf{\Delta t_{CE}}$ (weeks)',
y_label='number of substitutions')
g.fig.tight_layout()
# configure legend
current_handles, current_labels = plt.gca(
).get_legend_handles_labels()
lab_han = dict(zip(current_labels, current_handles))
exp_legend_order = [
typical_label, exp_range_label, below_exp_label]
obs_legend_order = [
unlikely_label, possible_label, likely_label]
ordered_exp_lab_han = [(label, lab_han[label])
for label in exp_legend_order
if label in lab_han]
exp_ordered_labels, exp_ordered_handles = list(
zip(*ordered_exp_lab_han))
ordered_obs_lab_han = [(label, lab_han[label])
for label in obs_legend_order
if label in lab_han]
obs_ordered_labels, obs_ordered_handles = list(
zip(*ordered_obs_lab_han))
# Create a legend for the expected series
exp_legend = plt.legend(
handles=exp_ordered_handles, labels=exp_ordered_labels,
bbox_to_anchor=(0.57, 0), loc='upper right', ncol=1,
bbox_transform=g.fig.transFigure, frameon=False)
# Add the legend manually to the current Axes.
plt.gca().add_artist(exp_legend)
# Create another legend for the observed series
plt.legend(
handles=obs_ordered_handles, labels=obs_ordered_labels,
bbox_to_anchor=(0.57, 0), loc='upper left', ncol=1,
bbox_transform=g.fig.transFigure, frameon=False)
# save image
for fig_type in save_img_as:
fig_ext = (f'.{fig_type}'
if fig_type[0] != '.' else fig_type)
file_name = f'{genotype}-{dataset}-{t_to_ca:03d}'.replace(
' ', '_')
g.savefig(os.path.join(out_dir, f'{file_name}{fig_ext}'),
dpi=300)
plt.close()
print(f'Plot produced for {genotype} {dataset} '
f'DtCA={t_to_ca:03d} weeks.')
# --------------------------------------------------------------------------- #
# Ignoring possibles
# --------------------------------------------------------------------------- #
# Ignoring possibles because tMRCA too close to tMRKCA to know whether null
# hypothesis is True or False
#
# Predicted
# a | r
# B E A
# Observed L/H0 true TN TN FP
# P/H0 u/k - - - ignored
# U/H0 false FN FN TP
#