-
Notifications
You must be signed in to change notification settings - Fork 25
/
rfmix.h
196 lines (166 loc) · 7.2 KB
/
rfmix.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* RFMIX v2.XX - Local Ancestry and Admixture Analysis
Bustamante Lab - Stanford School of Medicine
(c) 2016 Mark Hamilton Wright
This program is licensed for academic research use only
unless otherwise stated. Contact [email protected] for
commercial licensing options.
Academic and research users should cite Brian Maples'
paper describing RFMIX in any publication using RFMIX
results. Citation is printed when the program is started. */
#ifndef RFMIX_H
#define RFMIX_H
/* From GNU autoconf/automake */
#include "config.h"
#include <vector>
#include "genetic-map.h"
#include "hash-table.h"
#include <math.h>
/* As a critical feature to trim memory usage, we are going to use a log odds
encoding of a floating point number restricted to range 0.0 - 1.0 by
8-bit integers in range -127 to 127. This effectively creates 8 bit floats.
The error for representation is at maximum about 2.0% and peaks in the
center of the 0.0 to 1.0 range. The minimum possible and maximum possible
floating point numbers are 0.000025334 and 0.9999747 respectively. The
macro DF8(x) decodes the 8-bit integer to a double, and the inline function
ef8 encodes a number in range min to max above to an int8_t. */
#define DF8(x) ( 1.0/(1.0+exp(((double) (x))/-25.0)) )
#define EF8(p) ( (int) ( -25.0*log( (1.0-(p))/(p) ) ) )
static inline int8_t ef8(double p) {
int tmp = (int) ( -25.0*log( (1.0-p)/p ) );
if (tmp < -127) tmp = -127;
if (tmp > 127) tmp = 127;
return tmp;
}
/* Likewise with int8_t float encodings above, but with int16_t giving much
higher precision and range closer to 0.0 and 1.0. The maximum error is
0.024% peaking at 0.5 and the range is 1.26765e-14 to what rounds to 1.0
in R. This level of precision is needed for current_p[2][] because these
arrays store the results of the forward-backward algorithm both for
feedback to EM and for output of results. The forward-backward calculations
can produce a much finer degree of precision than random forest does with
a typical 100 trees or so */
#define DF16(x) ( 1.0/( 1.0 + exp((double) (x)/-1024.0) ))
#define EF16(p) ( (int) ( -1024*log( (1.0-(p))/(p) ) ) )
static inline int16_t ef16(double p) {
if (p <= 0.) return -32767;
if (p >= 1.) return +32767;
int tmp = (int) ( -1024.0 * log( (1.0 - p)/p ) );
if (tmp < -32767) tmp = -32767;
if (tmp > 32767) tmp = 32767;
return tmp;
}
/* Alternative with uniform rounding error over the range, and full range 0.0
to 1.0. Also less expensive to encode, decode. The log-odds formulation
above can be tweaked to give better accuracy at the tails at the expense
of rounding error in the middle of the range, or vice-versa. */
//#define DF8(x) ((x)/255.0)
//#define EF8(p) ((p)*255.0)
/* Program command line and configuration options - see rfmix.c for option definitions
and default values set in init_options(). The global variable rfmix_opts, declared
and set in rfmix.c is referenced all over the program for these values where needed */
typedef struct {
char *qvcf_fname;
char *rvcf_fname;
char *genetic_fname;
char *class_fname;
char *output_basename;
double maximum_missing_data_freq;
double n_generations;
double rf_window_size;
double crf_spacing;
int n_trees;
int node_size;
int reanalyze_reference;
int em_iterations;
int bootstrap_mode;
int minimum_snps;
int analyze_range[2];
char *analyze_str;
double crf_weight;
int debug;
int n_threads;
char *chromosome;
char *random_seed_str;
int random_seed; /* set by parsing random_seed_str which might be "clock" or a hex number */
} rfmix_opts_t;
/* I am using AF_TYPE to mean either float or double, depending on how set here, so
that it is simple to change the program to operate either entirely in floats for
the large arrays of floating point numbers, or in doubles. float (32 bit) are
preferred for memory conservation, but doubles (64 bit) are usually faster in
execution on modern systems */
#define AF_TYPE float
/* The chromosome is broken up into discrete segments, possibly one at each input SNP, on
which the conditional random field is defined. For training the random forests estimating
the probabilities at each CRF point/window, the SNPs used may come from a larger region
overlapping more than one CRF window. rf_start_idx and rf_end_idx indicate the first and
last SNP (inclusive) to be included in training the random forest */
typedef struct {
int snp_idx;
int rf_start_idx;
int rf_end_idx;
double genetic_pos;
} crf_window_t;
typedef struct {
int pos;
AF_TYPE genetic_pos;
int crf_index; // CRF window index for this snp
char *ref;
char *alt;
char *snp_id;
} snp_t;
/* IMPORTANT: Because pointers take 8 bytes and the number of subpops is often small but
number of windows large, the array of pointers for windows to arrays of
values for subpops per each window takes up as much or more memory than
the values themselves. So the marco IDX(w,s) performs the translation from
two dimensional indexing to one dimensional so we do not need the array
of pointers. The variable n_subpops must be defined and set appropriately
in the function that uses IDX().
An alternative is to have the outer dimension be subpops and the inner one
be windows, but since most or all loops will access all subpops in a row
for each window, looping subpops within the window loop, doing it this way
produces better L2 cache performance by keeping the subpop values together */
#define IDX(w,s) ( (w)*n_subpops + (s) )
/* IMPORTANT: the current_p and est_p arrays are using the 8 bit float scheme discussed above */
typedef struct {
char *sample_id;
int apriori_subpop; // 0 means query/admixed/unknown sample. 1 through K, reference sample
int8_t *haplotype[2];
int8_t *msp[4];
int8_t *ksp[2]; /* known state path, allocated and set only for internal simulated samples */
double logl[4];
int16_t *current_p[2]; // current estimate of probability of subpop [hap][ IDX(crf_window,subpop) ]
int16_t *est_p[4]; // new estimate of probability of subpop estimate [hap][ IDX(crf_window,subpop) ]
float *sis_p[2]; // Suyash stay-in-state forward-backward probability [hap][ crf_window ]
int column_idx;
int sample_idx;
int s_parent;
int s_sample;
} sample_t;
typedef struct {
int n_subpops;
char **reference_subpops; // string names of the reference subpops
int n_samples;
sample_t *samples;
HashTable *sample_hash;
int n_snps;
snp_t *snps;
int n_windows;
crf_window_t *crf_windows;
GeneticMap *genetic_map;
} input_t;
/* This can be anything. The value I put here I pulled out of my backside. */
#define RFOREST_RNG_KEY 0x949FC1AD
enum { RF_BOOTSTRAP_FLAT=0, RF_BOOTSTRAP_HIERARCHICAL, RF_BOOTSTRAP_STRATIFIED, N_RF_BOOTSTRAP };
#define MINIMUM_GENETIC_DISTANCE (0.00001)
#define P_MINIMUM_FOR_REF (0.0)
#define RF_THREAD_WINDOW_CHUNK_SIZE (3)
#define CRF_SAMPLES_PER_BLOCK (32)
#define SIM_PARENT_PROPORTION (0.10)
#define SIM_GROWTH_RATE (1.20)
#define SIM_SAMPLES_PER_SUBPOP (200)
double crf(input_t *input, double w);
void msp_output(input_t *input);
void fb_output(input_t *input);
void fb_stay_in_state_output(input_t *input);
void output_Q(input_t *input);
#endif