-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathf_infomeasures_from_samples.m
204 lines (161 loc) · 10.5 KB
/
f_infomeasures_from_samples.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
function [H_x, DKL_x, HPQ_x, H_xgy, DKL_xgy, HPQ_xgy] = f_infomeasures_from_samples(data, edges, data_binned, data_histcounts, sample_sizes, num_rep, samplingstrategy)
% For a target X and an any-dimensional set of predictors Y, returns information measures as a function of the size of samples taken from the data set
% Note: f_infomeasures_from_samples can also be applied to a target only (no predictors passed to the function.
% In this case, H_xgy, DKL_xgy and HPQ_xgy are not computed and will be 'NaN'
% Input
% - data: [num_data,num_dim] array, where each row is a set of related target (col=1) and predictors (col= 2:end) values
% Note: data must be NaN-free
% - edges: [1,num_dim] cell array, with a [1,num_edges] array of bin edges for each dimension inside
% Note: For each dimension, the edges must completely cover the entire value range of the respective data
% - data_binned: [num_data,num_dim] array, with values in data classified, separately for each dimension, into its respective bins
% Note:
% - data_binned(:,1) is the target, data_binned(:,2:end) are the predictors
% - data_binned must be NaN-free
% - data_histcounts: [num_bins of dim 1, num_bins of dim 2, ... , num_bins of dim end]
% matrix, with 'dim 1' being the target, and 'dim 2' ... 'dim end' being the predictors
% with counts of occurrency of the particular target/predictors combination
% - sample_sizes: [num_sasi,1] vector with the sample sizes to be evaluated
% - num_rep: [1,1] number of repetitions to average over for each sample size
% - samplingstrategy: [1,1] how to take samples from the full data set:
% - 'random': points are randomly sampled
% - 'continuous': from a randomly chosen starting point the points are continuosly sampled
% Output
% - H_x: [1,1] unconditional entropy of the target, full data set
% - DKL_x: [num_sasi,1] unconditional Kullback-Leibler divergence of target samples, against full data set as reference
% This quantifies the additional uncertainty about the target if its distribution
% is estimated from a sample
% - HPQ_x: [num_sasi,1] unconditional cross entropy of target samples, against full data set as reference
% This quantifies the total uncertainty about the target if its distribution
% is estimated from a sample. HPQ_x = H_x + DKL_x
% - H_xgy: [1,1] conditional entropy of target given the predictors, full data set
% This is the expected (mean) H of all target conditional distributions
% given a particular combination of predictor values
% - DKL_xgy: [num_sasi,1] conditional Kullback-Leibler divergence of target samples, against full data set conditional
% distribtutions as reference.
% This quantifies the additional uncertainty about the target if the predictors are known, but if its
% conditional distributions are estimated from a sample
% This is the expected (mean) DKL of all sample target conditional distributions vs. the target conditional
% distribution from the full data set
% - HPQ_xgy: [num_sasi,1] conditional cross entropies of target samples, against full data set conditional
% distributions as reference
% This quantifies the total uncertainty about the target if predcitors are known and its
% conditional distributions are estimated from a sample. HPQ_xgy = H_xgy + DKL_xgy
% Version
% - 2018/07/23 Uwe Ehret: updated some function names
% - 2017/11/24 Uwe Ehret: added some more comments
% - 2017/10/22 Uwe Ehret: initial version
%% preparations
% get dimensionality of data set
num_data = size(data_binned,1); % length of the data set (rows of data_binned)
num_dim = size(data_binned,2); % target + number of predictors (cols of data_binned, and dimensionality of data_histcounts
num_bins = size(data_histcounts); % [num_dim,1] with number of bins of target and every predictor
num_sasi = size(sample_sizes,1); % number of different sample sizes to test
% check data for NaN
if ~isempty(find(isnan(data)))
error('data contains NaN');
end
% check data_binned for NaN
if ~isempty(find(isnan(data_binned)))
error('data_binned contains NaN');
end
% check data_histcounts for NaN
if ~isempty(find(isnan(data_histcounts)))
error('data_histcounts contains NaN');
end
% check if input data fall outside the bin edges
mins = min(data); % smallest value in each dimension
maxs = max(data); % largest value in each dimension
% loop over all dimensions
for d = 1 : num_dim
if mins(d) < edges{d}(1) % smallest value is < leftmost edge
error('input data < leftmost edge');
elseif maxs(d) > edges{d}(end) % largest value is > rightmost edge
error('input data > rightmost edge');
end
end
% initialize output variables
H_x = NaN(1,1);
DKL_x = NaN(num_sasi,1);
HPQ_x = NaN(num_sasi,1);
H_xgy = NaN(1,1);
DKL_xgy = NaN(num_sasi,1);
HPQ_xgy = NaN(num_sasi,1);
% create variables needed in the script
% edges of the target bins
edges_target = edges{1};
% array of all possible predictor bin combinations
% This is needed to derive conditional distributions of the target
% Note that the input to f_all_predictor_bincombs includes the target dimension, but the output does not!
[p_combis] = f_all_predictor_bincombs(num_bins); % set of all possible predictor combinations
num_p_combis = size(p_combis,1); % the number of all possible predictor combinations
%% compute H_x
% find the number of bins of the target (size of data_histcounts along the 1st dimension)
num_bins_target = size(data_histcounts,1);
% reshape data_histcounts such that it resolves the target (= 1st dimension)
% and all other dimensions are collapsed to one (= 2nd dimension)
dummy = reshape(data_histcounts,[num_bins_target,numel(data_histcounts)/num_bins_target]);
% get overall counts of each target bin by summing over the 2nd dimension
% this is the target marginal histogram
target_hist = sum(dummy,2)';
% get target marginal pdf by dividing its histogram with the total number of target counts
target_pdf = target_hist / sum(target_hist);
% compute the entropy
H_x = f_entropy(target_pdf);
%% compute H_xgy (but only if at least one predictor is available)
if num_dim > 1
[H_xgy, target_histograms_fulldataset] = f_conditionalentropy_anyd(data_histcounts, p_combis);
end
%% compute statistics for samples: DKL_x, HPQ_x, DKL_xgy, HPQ_xgy
% loop over all sample sizes to be evaluated
for s = 1 : num_sasi
% the current sample size
num_data_sasi = sample_sizes(s);
% initialize vectors for results of all repetitions
DKL_x_rep = NaN(num_rep,1);
HPQ_x_rep = NaN(num_rep,1);
DKL_xgy_rep = NaN(num_rep,1);
HPQ_xgy_rep = NaN(num_rep,1);
% initialize conditional entropies of the samples, only needed as intermediate result
H_xgy_rep = NaN(num_rep,1);
% loop over repetitions of one sample size
for r = 1 : num_rep
% take a sample of the full data set
sample = f_sample_data(data, num_data_sasi, samplingstrategy);
% bin the sample with the same edges as the full data set
[sample_binned, sample_histcounts] = f_histcounts_anyd(sample, edges);
% compute the DKL between the unconditional target pdf of the sample and the target pdf of the full data set as a reference
% reshape sample_histcounts such that it resolves the target (= 1st dimension)
% and all other dimensions are collapsed to one (= 2nd dimension)
dummy = reshape(sample_histcounts,[num_bins_target,numel(sample_histcounts)/num_bins_target]);
% get overall counts of each target bin by summing over the 2nd dimension
% this is the target marginal histogram of the sample
target_hist_sample = sum(dummy,2)';
% get target marginal pdf of the sample by dividing its histogram with the total number of target counts
target_pdf_sample = target_hist_sample / sum(target_hist_sample);
% Note: The case that target_pdf_sample is completely empty (sum(target_hist_sample = 0))
% will never occur here (unless we put the sample size to 0)
% so we do not have to take care of it
% check for zero values in target_pdf_sample_hat where pdf is non-zero
% If this is the case, replace pdf_hat with its non-zero estimate
if ~isempty(intersect(find(target_pdf_sample == 0), find(target_pdf ~= 0)))
target_pdf_sample = f_NonZeroPDF(target_hist_sample);
end
% finally, compute DKL
DKL_x_rep(r) = f_kld_anyd(target_pdf,target_pdf_sample);
% if at least one predictor exists, compute conditional entropy and conditional DKL
if num_dim > 1
% compute the conditional entropy and target histograms of the sample for all predictor combinations of the full data set
[H_xgy_rep(r), target_histograms_sample] = f_conditionalentropy_anyd(sample_histcounts, p_combis);
% compute the conditional DKL between the conditional target pdfs of the sample and the conditional target pdf of the full data set as a reference
% Note: Here the case that the condtional pdf of target or predictor is completely empty
% can occur, but will be handeled in the function
DKL_xgy_rep(r) = f_conditionalkld_anyd_OLD(target_histograms_fulldataset, target_histograms_sample);
end
end % of loop over repetitions of one subsample size
% take the mean over all repetitions. This is the expected value of the respective statistic
DKL_x(s) = mean(DKL_x_rep);
HPQ_x(s) = H_x + DKL_x(s); % cross entropy can be computed from the mean/expected values
DKL_xgy(s) = mean(DKL_xgy_rep );
HPQ_xgy(s) = H_xgy + DKL_xgy(s); % cross entropy can be computed from the mean/expected values
end % of loop over all subsample sizes
end