\mainmatter%
\setcounter{page}{1}
\lectureseries[\course]{\course}
\auth[\lecAuth]{Lecturer: \lecAuth\\ Scribe: \scribe}
\date{November 12, 2009}
\setaddress%
% the following hack starts the lecture numbering at 14
\setcounter{lecture}{13}
\setcounter{chapter}{13}
\lecture{Asymptotic Distribution of Parameters}
\section{Convergence Review}
In \S\ref{sec:convergence} we saw that
\begin{align*}
\lim_{N\to\infty}V_N(\theta,Z^N) &= \bar{V}(\theta) \text{~w.p.~} 1 \\
V_N(\theta,Z^N) &= \onen\sumt\epsilon^2(t,\theta) \\
\bar{V}(\theta) &= \bar{E}\{\epsilon^2(t,\theta)\}
\end{align*}
where $\text{w.p. } 1$ (with probability 1) means the limit holds almost surely, so the limiting criterion $\bar{V}(\theta)$ is deterministic (zero variance).
We also saw that
\begin{equation*}
\lim_{N\to\infty}\thetan = \arg\min_\theta \bar{V}(\theta) = \theta^\ast \text{~w.p.~} 1
\end{equation*}
In Example~\ref{ex:convergence} we saw that if we have an ARMAX system but use an ARX model, then we get
\begin{equation*}
\theta^\ast = \left[\begin{array}{c} b_0 \\ a_0-\frac{c_0}{R_0} \end{array}\right], \qquad R_0=R_y (0)
\end{equation*}
To make $\theta^\ast\to\theta_0$ we would need to make $R_0\to\infty$.
We can make the variance of the output signal $y (t)=G_0u (t)+H_0e (t)$ larger by making the input signal larger.
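The effect can be checked numerically. Below is a minimal simulation sketch (not part of the original notes), assuming a first-order ARMAX system with illustrative coefficients; as the input power grows, the ARX least squares estimate of the $A$-polynomial coefficient approaches its true value.
\begin{verbatim}
import numpy as np

# Sketch: fit an ARX(1,1) model by least squares to data generated by the
# ARMAX system y(t) = -a0*y(t-1) + b0*u(t-1) + e(t) + c0*e(t-1).  With
# c0 != 0 the estimate of a0 is biased, and the bias shrinks as the input
# power (and hence R_y(0)) grows.  All values are illustrative.
rng = np.random.default_rng(0)
a0, b0, c0, lam = -0.8, 1.0, 0.5, 1.0
N = 100_000

for u_std in (1.0, 10.0, 100.0):
    u = u_std * rng.standard_normal(N)
    e = np.sqrt(lam) * rng.standard_normal(N)
    y = np.zeros(N)
    for t in range(1, N):
        y[t] = -a0 * y[t - 1] + b0 * u[t - 1] + e[t] + c0 * e[t - 1]

    # ARX regressor phi(t) = [-y(t-1), u(t-1)], theta = [a, b]
    Phi = np.column_stack((-y[:-1], u[:-1]))
    a_hat, b_hat = np.linalg.lstsq(Phi, y[1:], rcond=None)[0]
    print(f"input std {u_std:6.1f}: a_hat = {a_hat:+.4f} (a0 = {a0}), "
          f"b_hat = {b_hat:+.4f} (b0 = {b0})")
\end{verbatim}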
\section{Consistency Review}
We want to have $\mathcal{S}\in\mathcal{M}$, but that is not always possible.
When it is not, a useful fallback is to have the system transfer function in the model set, $\mathcal{G}\in\mathcal{M}$, even if the transfer function of the noise is not.
This can be thought of as
\begin{equation*}
\mathcal{S}\in\mathcal{M}:~\exists~\theta_0 \text{~s.t.~} \mathcal{M} (\theta_0)=\mathcal{S}
\end{equation*}
Note that $\mathcal{M}$ involves both $G_\theta$ and $H_\theta$.
Both $G_\theta$ and $H_\theta$ need to be parameterized richly enough to capture the true system and noise dynamics.
Under the assumptions on the data listed in \S\ref{sec:14data} we found that $\theta^\ast=\theta_0$.
For the case where the transfer function of the noise is not in the model set we have
\begin{equation*}
\mathcal{G}\in\mathcal{M}:~\exists~\rho_0 \text{~s.t.~} G_{\rho_0}=G_0
\end{equation*}
We set the parameter to
\begin{equation*}
\theta= \left[\begin{array}{c c} \rho & \eta \end{array}\right]^T
\end{equation*}
and use $G_\rho$ and $H_\eta$.
We call this the independent parameterization of $G_\rho$ and $H_\eta$.
The model structures that have this independent parameterization are OE, BJ, and FIR\@.
However, OE and BJ require non-linear optimization, while FIR is linear in the parameters but assumes $G_\rho$ has a finite impulse response.
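For concreteness, the standard independently parameterized structures can be summarized as follows (this is just the usual definition of each structure, added here for reference):
\begin{align*}
\text{OE:}\quad & G_\rho=\frac{B(q,\rho)}{F(q,\rho)}, & H_\eta&=1 \\
\text{BJ:}\quad & G_\rho=\frac{B(q,\rho)}{F(q,\rho)}, & H_\eta&=\frac{C(q,\eta)}{D(q,\eta)} \\
\text{FIR:}\quad & G_\rho=b_0+b_1q^{-1}+\cdots+b_nq^{-n}, & H_\eta&=1
\end{align*}
In each case $G$ and $H$ share no parameters, so errors in the noise model do not bias the estimate of $G_\rho$.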
Using all of this we find that
\begin{equation*}
\left[\begin{array}{c} \hat{\rho}_N \\ \hat{\eta}_N \end{array}\right] = \arg\min_{\rho,\eta} V_N\left(\left[\begin{array}{c} \rho \\ \eta \end{array}\right],Z^N\right) \Rightarrow \lim_{N\to\infty}\hat{\rho}_N = \rho_0 \text{~w.p.~} 1 \text{~if~} \mathcal{G}\in\mathcal{M}
\end{equation*}
\section{Asymptotic Distribution of Parameters}
We already know that $\lim_{N\to\infty}\thetan=\theta_0 \text{ w.p. } 1$.
What is the probability distribution of the parameters when $N<\infty$? We can get a sense of this from looking back at least squares estimation where $Y_N=\Phi_N\theta_0+E_N$ and the covariance of the parameter estimate was given by
\begin{align*}
&E\left\lbrace (\thn-\theta_0){(\thn-\theta_0)}^T\right\rbrace = \\
&\qquad E\left\lbrace {\left(\tonen\Phi_N^T\Phi_N\right)}^{-1} \left(\tonen\Phi_N^T E_N\right) \left(E_N^T\Phi_N\tonen\right) {\left(\tonen\Phi_N^T\Phi_N\right)}^{-1} \right\rbrace
\end{align*}
The term $E_N E_N^T$ is an $N\times N$ matrix whose expectation is
\begin{equation*}
E\{E_N E_N^T\} = E\left\lbrace \left[\begin{array}{c} e(1) \\ e(2) \\ \vdots \\ e(N) \end{array}\right] \left[\begin{array}{c c c c} e(1) & e(2) & \cdots & e(N) \end{array}\right]\right\rbrace% chktex 1
\end{equation*}
Since we have $\mathcal{S}\in\mathcal{M}$, $\{e(t)\}$ is white noise with variance $\lambda$.
This makes $E\{E_N E_N^T\}$ a diagonal matrix with $\lambda$ on the diagonal, i.e.\ $E\{E_N E_N^T\}=\lambda I$, so the $\lambda$ term can be moved outside of the expectation operator.
Following that we can cancel out terms and the covariance simplifies to
\begin{align*}
\text{cov}\left(\thn\right) &= \lambda\cdot\tonen\underbrace{{\left(\tonen\Phi_N^T\Phi_N\right)}^{-1}}_{R^{-1}(N)} \\
&= \frac{\lambda}{N}R^{-1}(N)
\end{align*}
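As a quick sanity check (an added sketch, not from the lecture), the following script compares the Monte Carlo covariance of the least squares estimate with $\frac{\lambda}{N}R^{-1}(N)$ for a simple two-parameter FIR regression; all names and values are illustrative.
\begin{verbatim}
import numpy as np

# Monte Carlo sketch: for Y_N = Phi_N theta0 + E_N with white noise E_N of
# variance lam, compare the sample covariance of the least squares estimates
# with the theoretical (lam/N) * R^{-1}(N), where R(N) = (1/N) Phi_N^T Phi_N.
rng = np.random.default_rng(1)
theta0 = np.array([1.0, 0.5])           # true FIR coefficients (illustrative)
lam, N, runs = 0.5, 200, 5000

u = rng.standard_normal(N + 1)
Phi = np.column_stack((u[1:], u[:-1]))  # regressor phi(t) = [u(t), u(t-1)]
R = Phi.T @ Phi / N

est = np.empty((runs, 2))
for k in range(runs):
    E = np.sqrt(lam) * rng.standard_normal(N)
    Y = Phi @ theta0 + E
    est[k] = np.linalg.lstsq(Phi, Y, rcond=None)[0]

print("Monte Carlo cov:\n", np.cov(est.T))
print("theory (lam/N) R^{-1}:\n", lam / N * np.linalg.inv(R))
\end{verbatim}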
\subsection{Optimality Condition}
Define
\begin{align*}
\epsilon(t,\theta) &= H_\theta^{-1}(y(t)-G_\theta u(t)) \\
\thetan &= \arg\min_\theta\onen\sumt\epsilon^2(t,\theta) \\
&= \arg\min_\theta V_N(\theta,Z^N) \\
\psi(t,\theta) &= -\frac{\partial}{\partial\theta}\epsilon(t,\theta)
\end{align*}
For now, let $\dim(\epsilon)=1\times1$, $\dim(\theta)=p\times1$ leading to $\dim(\psi)=p\times1$.
We want to show that
\begin{align*}
\onen\sumt\psi(t,\thetan)\epsilon(t,\thetan) = 0
\end{align*}
This follows from differentiating $V_N$ with the chain rule:
\begin{align*}
\frac{\partial}{\partial\theta}V_N(\theta,Z^N) &= \onen\sumt 2\epsilon(t,\theta)\ptheta\epsilon(t,\theta) \\
&= -\frac{2}{N}\sumt\psi(t,\theta)\epsilon(t,\theta)
\end{align*}
Since $\thetan$ minimizes $V_N(\theta,Z^N)$, the gradient vanishes at $\theta=\thetan$:
\begin{align*}
\left.\frac{\partial}{\partial\theta}V_N(\theta,Z^N)\right|_{\theta=\thetan} = 0
\end{align*}
\begin{align}
\label{eq:14optimality}
\Rightarrow \boxed{\onen\sumt\psi(t,\thetan)\epsilon(t,\thetan) = 0}
\end{align}
This is known as the \textit{optimality condition} on $\thetan$.
Note that the $0$ term in (\ref{eq:14optimality}) is a $p\times1$ vector of zeros.
\subsection{Optimality Condition and ARX}
Applying the optimality condition to an ARX model shows that
\begin{align*}
\mathcal{M}: G_\theta&=\frac{B_\theta}{A_\theta}, \qquad H_\theta=\frac{1}{A_\theta} \\
y(t)&= \vp^T(t)\theta+\epsilon(t,\theta) \\
\epsilon(t,\theta) &= y(t)-\vp^T(t)\theta \\
\psi(t,\theta) &= -\ptheta\epsilon(t,\theta) = \vp(t)
\end{align*}
In least squares estimation we performed an orthogonal projection to get
\begin{align*}
\onen\sumt\vp(t)\epsilon(t,\theta) = 0
\end{align*}
This shows that for a least squares estimate $\psi(t,\theta) = \vp(t)$!
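Written out for the ARX case, the optimality condition is exactly the least squares normal equations:
\begin{align*}
\onen\sumt\vp(t)\left[y(t)-\vp^T(t)\thetan\right] &= 0 \\
\Rightarrow\quad \thetan &= {\left[\onen\sumt\vp(t)\vp^T(t)\right]}^{-1}\onen\sumt\vp(t)y(t)
\end{align*}
so for ARX the optimality condition can be solved in closed form, whereas for OE or BJ it only defines $\thetan$ implicitly and must be solved by iterative optimization.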
\subsection{Approximation Result}
We want to show that
\begin{equation*}
\epsilon(t,\thetan) \approx e(t) - \psi^T(t,\thetan)[\thetan-\theta_0]
\end{equation*}
This is the form of a Taylor series expansion so we use
\begin{align*}
e(t) &= \epsilon(t,\thetan) + \frac{1}{1!}\underbrace{{\left.\ptheta\epsilon(t,\theta)\right|_{\theta=\thetan}}^T}_{-\psi^T(t,\thetan)}[\theta_0-\thetan] + \frac{1}{2!}{[\theta_0-\thetan]}^T\left.\frac{\partial^2}{\partial\theta^2}\epsilon(t,\theta)\right|_{\theta=\thetan}[\theta_0-\thetan]
\end{align*}
The Taylor series is an infinite sum so there are some higher order terms that are not shown here.
By ignoring even the last term shown we get the \textit{approximation result} of
\begin{align}
\label{eq:14approximation}
\boxed{\epsilon(t,\thetan) \approx e(t) - \psi^T(t,\thetan)[\thetan-\theta_0]}
\end{align}
Note that $\{e(t)\}$ is still white noise here and that for an ARX model this is an equality because all of the higher order terms do drop out.
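To see why the ARX case gives an equality, note that $\epsilon(t,\theta)=y(t)-\vp^T(t)\theta$ is affine in $\theta$, so with $\mathcal{S}\in\mathcal{M}$
\begin{align*}
\epsilon(t,\thetan) &= y(t)-\vp^T(t)\thetan \\
&= \underbrace{y(t)-\vp^T(t)\theta_0}_{e(t)} - \vp^T(t)\left[\thetan-\theta_0\right] \\
&= e(t)-\psi^T(t,\thetan)\left[\thetan-\theta_0\right]
\end{align*}
since $\psi(t,\theta)=\vp(t)$ does not depend on $\theta$ and all higher order terms vanish.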
\subsection{Parameter Error}
We want to show that the normalized parameter error, $\btheta$, satisfies
\begin{equation*}
\btheta = \frac{1}{\sqrt{\lambda}}\,V^T\Psi\left[\thetan-\theta_0\right] \sim \mathcal{N}(0,I)
\end{equation*}
Note that for a Gaussian distribution we have $c\tilde{\theta}\sim\mathcal{N}(0,1)\Rightarrow \text{var}(\tilde{\theta})=1/c^2$.
We have used that
\begin{equation*}
\Psi = \left[\begin{array}{c c c} \psi(1,\thetan) & \cdots & \psi(N,\thetan) \end{array}\right]^T
\end{equation*}
and $V$ (an $N\times p$ matrix with orthonormal columns, $V^TV=I$) comes from the singular value decomposition $\Psi^T=U\Sigma V^T$.
Then, let
\begin{align*}
\mathcal{E} &= \left[\begin{array}{c c c} \epsilon(1,\thetan) & \cdots & \epsilon(N,\thetan) \end{array}\right]^T \\
E &= \left[\begin{array}{c c c} e(1) & \cdots & e(N) \end{array}\right]^T
\end{align*}
This gives $\Psi^T\mathcal{E} = 0$ using the optimality condition on $\thetan$ from (\ref{eq:14optimality}) and $\mathcal{E} \approx E-\Psi[\thetan-\theta_0]$ using the approximation result from (\ref{eq:14approximation}).
This shows that
\begin{align*}
\Psi^T\mathcal{E} &\approx \Psi^T E-\Psi^T\Psi[\thetan-\theta_0] \\
\Psi^T E &\approx \Psi^T\Psi[\thetan-\theta_0] \\
&[\text{left multiply both sides by } \Sigma^{-1}U^T] \\
V^T E &\approx V^T\Psi[\thetan-\theta_0]
\end{align*}
Since $V^TV=I$ and $\{e(t)\}$ is white noise with variance $\lambda$ (so that $\onen E^TE\to\lambda$), the covariance of $V^TE$ is
\begin{equation*}
E\{V^TEE^TV\} = \lambda V^TV = \lambda I
\end{equation*}
so from
\begin{equation*}
V^TE \approx V^T\Psi[\thetan-\theta_0]
\end{equation*}
we see that the right hand side also has covariance $\lambda I$.
We also have that the mean is zero because, as $N\to\infty$,
\begin{equation*}
\thetan\to\theta_0 \text{~w.p.~} 1 \Rightarrow \thetan-\theta_0 \to 0 \Rightarrow \btheta \to 0
\end{equation*}
This leads to
\begin{align}
\begin{split}
&\sqrt{N}(\thetan-\theta_0) \sim \mathcal{N}(0,P_\theta) \\
&P_\theta = {\left[\bar{V}''(\theta^\ast)\right]}^{-1}Q{\left[\bar{V}''(\theta^\ast)\right]}^{-T} \\
&Q = \lim_{N\to\infty}N\cdot E\left\lbrace V'(\theta^\ast,Z^N)V'{(\theta^\ast,Z^N)}^T \right\rbrace
\end{split}
\end{align}
where $'$ and $''$ denote first and second derivatives with respect to $\theta$.
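The claim about $\btheta$ can also be checked numerically. The sketch below (added here; the system, its order, and all values are illustrative) uses an ARX model, for which $\psi(t,\theta)=\vp(t)$, and verifies over many Monte Carlo runs that $\frac{1}{\sqrt{\lambda}}V^T\Psi[\thetan-\theta_0]$ has roughly zero mean and identity covariance.
\begin{verbatim}
import numpy as np

# Monte Carlo sketch: for an ARX(1,1) system (so psi = phi and S in M),
# check that (1/sqrt(lam)) V^T Psi (theta_hat - theta0) has approximately
# zero mean and identity covariance.  Illustrative values only.
rng = np.random.default_rng(3)
a0, b0, lam, N, runs = -0.7, 1.0, 1.0, 500, 2000
theta0 = np.array([a0, b0])

samples = np.empty((runs, 2))
for k in range(runs):
    u = rng.standard_normal(N)
    e = np.sqrt(lam) * rng.standard_normal(N)
    y = np.zeros(N)
    for t in range(1, N):
        y[t] = -a0 * y[t - 1] + b0 * u[t - 1] + e[t]
    Psi = np.column_stack((-y[:-1], u[:-1]))            # psi(t) = phi(t)
    theta_hat = np.linalg.lstsq(Psi, y[1:], rcond=None)[0]
    V = np.linalg.svd(Psi.T, full_matrices=False)[2].T  # Psi^T = U Sigma V^T
    samples[k] = V.T @ Psi @ (theta_hat - theta0) / np.sqrt(lam)

print("mean:", samples.mean(axis=0))   # should be close to [0, 0]
print("cov:\n", np.cov(samples.T))     # should be close to the identity
\end{verbatim}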
\subsection{Covariance with System in Model Set}%
\label{sec:varsinm}
Let $\mathcal{S}\in\mathcal{M}$, then we know $\theta^\ast=\theta_0$.
The covariance of $\sqrt{N}(\thetan-\theta_0)$ is found by
\begin{align}
\label{eq:14covariance}
\boxed{P_\theta = \lambda{\left[E\left\lbrace \psi(t,\theta_0)\psi{(t,\theta_0)}^T\right\rbrace\right]}^{-1}}
\end{align}
where
\begin{equation*}
\psi(t,\theta_0) = -\left.\ptheta\epsilon(t,\theta)\right|_{\theta=\theta_0}
\end{equation*}
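This follows from the general expressions above: when $\mathcal{S}\in\mathcal{M}$ we have $\epsilon(t,\theta_0)=e(t)$, which is white and independent of $\psi(t,\theta_0)$, so (a standard computation)
\begin{align*}
Q &= \lim_{N\to\infty}N\cdot E\left\lbrace V'(\theta_0,Z^N)V'{(\theta_0,Z^N)}^T\right\rbrace = 4\lambda\,E\left\lbrace\psi(t,\theta_0)\psi^T(t,\theta_0)\right\rbrace \\
\bar{V}''(\theta_0) &= 2\,E\left\lbrace\psi(t,\theta_0)\psi^T(t,\theta_0)\right\rbrace
\end{align*}
and the factors of two cancel in $P_\theta$, leaving (\ref{eq:14covariance}).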
Again, this form is very similar to least squares but can be applied to more general situations.
Note that the derivative depends on the model structure, and since $\theta_0$ and $\lambda$ are unknown the quantities in (\ref{eq:14covariance}) cannot be computed directly; instead they are estimated from the data by
\begin{align*}
\boxed{\hat{\lambda}^N = \onen\sumt\epsilon^2(t,\thetan)}
\end{align*}
\begin{align*}
\boxed{\hat{P}_\theta = \hat{\lambda}^N\cdot{\left[\onen\sumt\psi(t,\thetan)\psi^T(t,\thetan)\right]}^{-1}}
\end{align*}
It is very important to look at the covariance of the parameter estimates because it tells us how sensitive our parameter estimate is to the model structure being used.
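As an illustration (an added sketch, not from the notes; the system and all values are made up), the script below estimates an ARX model, forms $\hat{\lambda}^N$ and $\hat{P}_\theta$ as in the boxed expressions above (for ARX, $\psi(t,\thetan)=\vp(t)$), and converts them into approximate confidence intervals using $\text{cov}(\thetan)\approx\hat{P}_\theta/N$.
\begin{verbatim}
import numpy as np

# Sketch: estimate an ARX(1,1) model y(t) = -a*y(t-1) + b*u(t-1) + e(t) by
# least squares, then form lambda_hat and P_hat as in the boxed estimates
# above (for ARX, psi(t, theta_hat) = phi(t)).  Illustrative values only.
rng = np.random.default_rng(2)
a0, b0, lam, N = -0.7, 2.0, 1.0, 2000

u = rng.standard_normal(N)
e = np.sqrt(lam) * rng.standard_normal(N)
y = np.zeros(N)
for t in range(1, N):
    y[t] = -a0 * y[t - 1] + b0 * u[t - 1] + e[t]

Phi = np.column_stack((-y[:-1], u[:-1]))        # phi(t) = [-y(t-1), u(t-1)]
theta_hat = np.linalg.lstsq(Phi, y[1:], rcond=None)[0]

eps = y[1:] - Phi @ theta_hat                   # epsilon(t, theta_hat)
Neff = len(eps)
lam_hat = np.mean(eps**2)                       # lambda_hat
P_hat = lam_hat * np.linalg.inv(Phi.T @ Phi / Neff)

# cov(theta_hat) is approximately P_hat / N, so the standard errors are:
std_err = np.sqrt(np.diag(P_hat) / Neff)
print("theta_hat:", theta_hat)
print("approx. 95% confidence half-widths:", 2 * std_err)
\end{verbatim}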
\begin{example}
Let
\begin{align*}
\mathcal{S}: y(t) &= \frac{b_1q^{-1}}{1+f_1q^{-1}}u(t) + e(t) \\
\mathcal{M}: y(t) &= \frac{\hat{b}_0 + \hat{b}_1q^{-1}}{1+\hat{f}_1q^{-1}}u(t) + \epsilon(t,\thetan)
\end{align*}
In the estimate, $\hat{b}_0$ will have a large confidence interval because it is not needed to describe the system.
Another model structure that would have large confidence intervals for this system would be
\begin{equation*}
\mathcal{M}: y(t) = \frac{\hat{b}_0+\hat{b}_1q^{-1}+\hat{b}_2q^{-2}}{1+\hat{f}_1q^{-1}+\hat{f}_2q^{-2}}u(t)+\epsilon(t,\thetan)
\end{equation*}
$\lozenge$
\end{example}
Large confidence intervals indicate that the model is over-parameterized or that the wrong model structure is being used.