\mainmatter%
\setcounter{page}{1}
\lectureseries[\course]{\course}
\auth[\lecAuth]{Lecturer: \lecAuth\\ Scribe: \scribe}
\date{October 20, 2009}
\setaddress%
% the following hack starts the lecture numbering at 8
\setcounter{lecture}{7}
\setcounter{chapter}{7}
\lecture{Properties of Least Squares Estimates}
\section{Least Squares/Linear Regression Review}
These notes correspond to Chapter 7.3 in Ljung.
The least squares estimate is our first attempt to generate a parametric model.
The system and model are given by
\begin{align*}
\mathcal{S}: y(t) &= \varphi^T(t)\theta_0 \\
\mathcal{M}: y(t) &= \varphi^T(t)\theta + e(t,\theta)
\end{align*}
where $\varphi(t)$ is known as the ``regressor'' and is the same for the model and the system.
Also, $\theta_0$ is the ``real'' parameter and $\theta$ is the model parameter.
Then the least squares estimate of the parameter, $\theta$, is
\begin{align*}
\hat{\theta}_{LS} &= \arg\min_\theta ||e(t,\theta)|| \\
\thn &= \arg\min_\theta \frac{1}{N}\sum_{t=1}^N e^2(t,\theta) = \arg\min_\theta \hat{R}_e^N(0) = \arg\min_\theta\hat{E}\{e^2(t,\theta)\}
\end{align*}
where $\hat{E}\{e^2(t,\theta)\}$ is the sample estimate of the variance (mean square) of the prediction error.
Recall that the regressor, $\varphi(t)$, is all of the past inputs and outputs such that
\begin{align}
\label{eq:regressor}
\varphi^T(t) &= \left[\begin{array}{c c c c c c c}
u(t) & u(t-1) & \cdots & u(t-n_b) & -y(t-1) & \cdots & -y(t-n_a) \end{array}\right]
\end{align}
and the parameter contains
\begin{align}
\label{eq:parameter}
\theta &= \left[\begin{array}{c c c c c c c c}
b_0 & b_1 & \cdots & b_{n_b} & a_1 & a_2 & \cdots & a_{n_a} \end{array}\right]^T
\end{align}
The model given by
\begin{equation*}
y(t) = \varphi^T(t)\theta + e(t,\theta)
\end{equation*}
can be accumulated over $N$ samples and written in matrix form as
\begin{align*}
\underbrace{\left[\begin{array}{c} y(1) \\ y(2) \\ \vdots \\ y(N) \end{array}\right]}_{\mathbf{y}} &= \underbrace{\left[\begin{array}{c} \varphi^T(1) \\ \varphi^T(2) \\ \vdots \\ \varphi^T(N) \end{array}\right]}_{\mathbf{\varphi}^T} \theta
+ \underbrace{\left[\begin{array}{c} e(1,0) \\ e(2,0) \\ \vdots \\ e(N,0) \end{array}\right]}_{\mathbf{e}} \\
\Rightarrow \tfrac{1}{N}\mathbf{\varphi}\mathbf{y} &= \tfrac{1}{N}\mathbf{\varphi}\mathbf{\varphi}^T\theta + \tfrac{1}{N}\mathbf{\varphi}\mathbf{e} \\
\hat{\theta} &= \arg\min_\theta \tfrac{1}{N}\mathbf{e}^T\mathbf{e}
\end{align*}
The least squares solution orthogonally projects $\mathbf{y}$ onto the column space of $\mathbf{\varphi}^T$, which makes the residual $\mathbf{e}$ as small as possible.
Rearranging to get the parameter estimate yields
\begin{equation*}
\thn = {\left(\tfrac{1}{N}\mathbf{\varphi\varphi}^T\right)}^{-1} \left(\tfrac{1}{N}\mathbf{\varphi y}\right)
\end{equation*}
Note that this is the solution of the normal equations; explicitly forming and inverting $\tfrac{1}{N}\mathbf{\varphi\varphi}^T$ is not the best way to compute the least squares estimate from a numerical methods standpoint.
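As a numerical illustration (a minimal sketch, assuming NumPy; the signals, dimensions, and parameter values below are hypothetical), the normal-equation solve and a QR/SVD-based solver such as \texttt{np.linalg.lstsq} return the same estimate on well-conditioned data, but the latter is preferred in practice:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Simulated linear regression y = Phi^T theta_0 + e (hypothetical data)
N, p = 1000, 4
theta_0 = np.array([1.0, -0.5, 0.25, 2.0])
PhiT = rng.standard_normal((N, p))        # rows are phi^T(t)
y = PhiT @ theta_0 + 0.1 * rng.standard_normal(N)

# Normal-equation form: (1/N Phi Phi^T)^{-1} (1/N Phi y)
R_N = PhiT.T @ PhiT / N
f_N = PhiT.T @ y / N
theta_normal = np.linalg.solve(R_N, f_N)

# Numerically preferred: QR/SVD-based least squares solver
theta_lstsq, *_ = np.linalg.lstsq(PhiT, y, rcond=None)

print(theta_normal)
print(theta_lstsq)    # the two agree closely on well-conditioned data
\end{verbatim}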
\section{Applications}
\subsection{ARX Model}
From the perspective of modeling discrete-time systems, the least squares estimate can be used when the model is given by
\begin{align}
\mathcal{M}: y(t) = \varphi^T(t)\theta + e(t,\theta)
\end{align}
using (\ref{eq:regressor}) and (\ref{eq:parameter}).
This is equivalent to writing
\begin{align*}
y(t) &= b_0u(t) + b_1u(t-1) + \cdots + b_{n_b}u(t-n_b) \\
&\quad- a_1y(t-1) - a_2y(t-2) - \cdots - a_{n_a}y(t-n_a) + e(t,\theta) \\
\end{align*}
\begin{align*}
\Rightarrow &(1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a})y(t) \\
&\qquad = (b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b})u(t) + e(t,\theta)
\end{align*}
\begin{align*}
\Rightarrow y(t) &= G(q,\theta)u(t) + H(q,\theta)e(t,\theta) \\
&= \frac{b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b}}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}u(t) \\
&\quad + \frac{1}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}e(t,\theta)
\end{align*}
Here we can see that the denominators for $G(q,\theta)$ and $H(q,\theta)$ are the same.
This is referred to as an ``AutoRegressive with eXogenous input'' (ARX) model.
The ARX model has shared poles between $G(q,\theta)$ and $H(q,\theta)$, and the numerator of $H(q,\theta)$ is $1$.
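A minimal simulation sketch of an ARX least squares fit (assuming NumPy; the helper \texttt{arx\_regressor}, the orders, and the coefficient values below are hypothetical) builds the regressor of (\ref{eq:regressor}) and solves for the parameter of (\ref{eq:parameter}):
\begin{verbatim}
import numpy as np

def arx_regressor(u, y, nb, na):
    # Rows are [u(t), ..., u(t-nb), -y(t-1), ..., -y(t-na)]
    n = max(na, nb)
    rows = [[u[t - k] for k in range(nb + 1)]
            + [-y[t - k] for k in range(1, na + 1)]
            for t in range(n, len(y))]
    return np.array(rows), y[n:]

# Hypothetical second-order ARX system: A(q) y = B(q) u + e, with e white
rng = np.random.default_rng(1)
N = 2000
u = rng.standard_normal(N)
e = 0.05 * rng.standard_normal(N)
a1, a2, b0, b1 = -1.5, 0.7, 1.0, 0.5
y = np.zeros(N)
for t in range(2, N):
    y[t] = -a1*y[t-1] - a2*y[t-2] + b0*u[t] + b1*u[t-1] + e[t]

Phi, Y = arx_regressor(u, y, nb=1, na=2)
theta_hat, *_ = np.linalg.lstsq(Phi, Y, rcond=None)
print(theta_hat)    # approximately [b0, b1, a1, a2] = [1.0, 0.5, -1.5, 0.7]
\end{verbatim}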
\subsection{Linear Regression and Noise}
If we look at a system with white noise such as
\begin{equation*}
\mathcal{S}: y(t) = \varphi^T(t)\theta_0 + e(t)
\end{equation*}
where $\{e(t)\}$ is white noise and the parameter $\theta_0$ is unknown, we can rewrite this as
\begin{equation*}
y(t) = G_0(q)u(t) + v(t)
\end{equation*}
where $v(t)=H_0(q)e(t)$.
How do we find the transfer functions of this system? First, note that $G(q,\theta_0)=G_0(q)$.
Next we need to find the regressor, $\varphi(t)$.
We can assume that the regressor is of the form (\ref{eq:regressor}) and contains all past inputs and outputs.
Then we know
\begin{align}
\label{eq:g}
G_0(q) = \frac{b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b}}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}
\end{align}
and
\begin{align}
\label{eq:h}
H_0(q) = \frac{1}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}
\end{align}
This means that we are implicitly assuming that the noise on the system is filtered such that it has exactly the same poles as $G_0(q)$.
That is a very restrictive assumption that does not hold for many real-world systems.
If we use $\mathcal{S}: y(t) = G_0(q)u(t) + e(t)$ where $\{e(t)\}$ is white noise, then we cannot write it as a linear regression because the noise and the system transfer functions do not have the same poles.
An exception is a Finite Impulse Response (FIR) system.
In that case the denominators of both transfer functions are $1$ and the current output has no dependence on past outputs.
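For example, in the FIR case the model reduces to
\begin{equation*}
y(t) = b_0u(t) + b_1u(t-1) + \cdots + b_{n_b}u(t-n_b) + e(t)
\end{equation*}
so that $G_0(q) = b_0+b_1q^{-1}+\cdots+b_{n_b}q^{-n_b}$, $H_0(q)=1$, and the regressor contains only input samples.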
\subsection{State Space}
Given a state space model
\begin{align*}
\mathcal{M}: \begin{cases} x(t+1)&=Ax(t)+Bu(t)+v(t) \\ y(t)&=Cx(t)+Du(t)+w(t) \end{cases}
\end{align*}
where $\{v(t)\}$ and $\{w(t)\}$ are white noise and assuming that we have access to measurements of $\{u(t)\}$, $\{y(t)\}$, and the state $\{x(t)\}$, then the estimates of the system matrices can be written as
\begin{align}
\left[\begin{array}{c} x(t+1) \\ y(t) \end{array}\right] &=
\left[\begin{array}{c c} A & B \\ C & D \end{array}\right] \left[\begin{array}{c} x(t) \\ u(t) \end{array}\right] +
\left[\begin{array}{c} v(t) \\ w(t) \end{array}\right] \nonumber\\
\bar{y} (t) &= \bar{\theta}^T\bar{\varphi} (t) + \bar{e} (t) \nonumber\\
\bar{y}^T (t) &= \bar{\varphi}^T (t)\bar{\theta} + \bar{e}^T (t)
\end{align}
Conclusion: if $\{u(t)\}$, $\{y(t)\}$, and $\{x(t)\}$ are all available, then the estimation of $A,B,C,D$ can be written as a least squares problem.
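A minimal sketch of this least squares problem (assuming NumPy, full state measurements, and a hypothetical two-state system; all matrices below are made up for illustration) stacks $[x(t+1);\,y(t)]$ and $[x(t);\,u(t)]$ over time and solves one regression for the block matrix:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)

# Hypothetical two-state system with measured states
A = np.array([[0.9, 0.1], [0.0, 0.8]])
B = np.array([[0.0], [1.0]])
C = np.array([[1.0, 0.0]])
D = np.array([[0.0]])

N = 500
u = rng.standard_normal((N, 1))
x = np.zeros((N + 1, 2))
y = np.zeros((N, 1))
for t in range(N):
    y[t] = C @ x[t] + D @ u[t] + 0.01 * rng.standard_normal(1)
    x[t + 1] = A @ x[t] + B @ u[t] + 0.01 * rng.standard_normal(2)

# Regression: [x(t+1); y(t)] = [A B; C D] [x(t); u(t)] + noise
Z   = np.hstack([x[1:], y])      # stacked "outputs", one row per t
Phi = np.hstack([x[:-1], u])     # stacked regressors [x(t), u(t)]
Theta, *_ = np.linalg.lstsq(Phi, Z, rcond=None)
print(Theta.T)                   # estimate of the block matrix [A B; C D]
\end{verbatim}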
\section{Properties of Least Squares Estimate}
Some important properties of the least squares estimate, $\thn$, are given below.
\subsection{Relation to the Covariance Functions}%
\label{sec:lsecovfns}
\begin{equation*}
\thn = {(\tfrac{1}{N}\mathbf{\varphi\varphi}^T)}^{-1}(\tfrac{1}{N}\mathbf{\varphi y}) \triangleq \underbrace{R^{-1}(N)}_{p\times p} \underbrace{f(N)}_{p\times 1}
\end{equation*}
Looking more closely at $R(N)$ we can see that it is a $p\times p$ matrix built from elements of (\ref{eq:regressor}).
\begin{equation*}
R(N) = \frac{1}{N}\left[\begin{array}{c c c} \varphi(1) & \cdots & \varphi(N) \end{array}\right]
\left[\begin{array}{c} \varphi^T(1) \\ \vdots \\ \varphi^T(N) \end{array}\right]
\end{equation*}
Calculating a few of the elements of $R (N)$ gives
\begin{align*}
R{(N)}_{1,1} &= \frac{1}{N}\sum_{t=1}^N u(t)u(t) = \hat{R}_u^N(0) \\
R{(N)}_{1,2} &= \frac{1}{N}\sum_{t=1}^N u(t)u(t-1) = \hat{R}_u^N(1) \\
R{(N)}_{2,1} &= \frac{1}{N}\sum_{t=1}^N u(t-1)u(t) = \hat{R}_u^N(1)
\end{align*}
The last two equalities show that $R(N)$ is symmetric.
The general form for $R(N)$ is
\renewcommand{\arraystretch}{2}
\begin{align*}
\newcommand*{\temp}{\multicolumn{1}{r|}{}}
R(N) = \left[\begin{array}{ccc}
\mathbf{R}_u & \temp & \mathbf{R}_{yu} \\ \cline{1-3}
\mathbf{R}_{yu}^T & \temp & \mathbf{R}_y
\end{array}\right]
\end{align*}
where the blocks are found as
\begin{align*}
% Top left.
\mathbf{R}_u &= \left[\begin{array}{c c c c}
\hat{R}_u^N(0) & \hat{R}_u^N(1) & \cdots & \hat{R}_u^N(n_b) \\
\hat{R}_u^N(1) & \hat{R}_u^N(0) & \cdots & \hat{R}_u^N(n_b-1) \\
\vdots & \vdots & \ddots & \vdots \\
\hat{R}_u^N(n_b) & \hat{R}_u^N(n_b-1) & \cdots & \hat{R}_u^N(0)
\end{array}\right] \\
% % Top right.
\mathbf{R}_{yu} &= \left[\begin{array}{c c c}
\hat{R}_{yu}^N(1) & \cdots & \hat{R}_{yu}^N(n_a) \\
\hat{R}_{yu}^N(0) & \cdots & \hat{R}_{yu}^N(n_a-1) \\
\vdots & \ddots & \vdots \\
\hat{R}_{yu}^N(1-n_b) & \cdots & \hat{R}_{yu}^N(n_a-n_b)
\end{array}\right] \\
% Bottom right.
\mathbf{R}_y &= \left[\begin{array}{c c c c}
\hat{R}_y^N(0) & \hat{R}_y^N(1) & \cdots & \hat{R}_y^N(n_a-1) \\
\hat{R}_y^N(1) & \hat{R}_y^N(0) & \cdots & \hat{R}_y^N(n_a-2) \\
\vdots & \vdots & \ddots & \vdots \\
\hat{R}_y^N(n_a-1) & \hat{R}_y^N(n_a-2) & \cdots & \hat{R}_y^N(0)
\end{array}\right]
\end{align*}
Note that the $\mathbf{R}_u$ and $\mathbf{R}_y$ blocks are Toeplitz.
We also have that
\begin{align*}
f(N) = \left[\begin{array}{c}
\hat{R}_{yu}^N(0) \\
\hat{R}_{yu}^N(1) \\
\vdots \\
\hat{R}_{yu}^N(n_b) \\
\midrule
\hat{R}_y^N(1) \\
\vdots \\
\hat{R}_y^N(n_a) \end{array}\right]
\end{align*}
Conclusion: the least squares estimate for the parameter is filled with auto-/cross-covariance function estimates.
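As a quick numerical check of this structure (a sketch assuming NumPy and arbitrary signals; the agreement is exact when the covariance estimates use the same summation window as the regressor):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
N, nb, na = 5000, 1, 2
u = rng.standard_normal(N)
y = rng.standard_normal(N)       # any signals work for this structural check

n = max(na, nb)
Phi = np.array([[u[t], u[t-1], -y[t-1], -y[t-2]]
                for t in range(n, N)]).T          # p x (N-n)
R = Phi @ Phi.T / Phi.shape[1]

# Sample covariances over the same summation window
t = np.arange(n, N)
print(R[0, 0], np.mean(u[t] * u[t]))      # entry (1,1) vs R_u(0)
print(R[0, 1], np.mean(u[t] * u[t-1]))    # entry (1,2) vs R_u(1)
\end{verbatim}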
\subsection{Invertibility of $R(N)$}%
\label{sec:upersistent}
For an FIR model the regressor contains only input samples, so $R(N)$ consists of the $\mathbf{R}_u$ block alone.
Then, for FIR, the input $\{u(t)\}$ must be persistently exciting to ensure the invertibility of $\mathbf{R}_u$.
When the model is not FIR, we still assume $\{u(t)\}$ is persistently exciting so that $\mathbf{R}_u$ is invertible, and we use the block determinant formula below to determine whether the full $R(N)$ is invertible.
\begin{definition}
The block determinant formula, which is closely related to the matrix inversion lemma, states that
\begin{equation*}
\det\left(\left[\begin{array}{c c}A&B\\C&D\end{array}\right]\right) = \det(A) \cdot \det(D-CA^{-1}B)
\end{equation*}
\end{definition}
For $R(N)$ to be invertible it must be true that $\det(\mathbf{R}_y-\mathbf{R}_{yu}^T\mathbf{R}_u^{-1}\mathbf{R}_{yu})\neq~0$.
Note that $\mathbf{R}_y$ is usually perturbed by noise, which tends to keep this Schur complement nonsingular.
However, we still need $\{u(t)\}$ to be a persistently exciting signal.
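As a quick numerical sanity check of the block determinant formula (a sketch with an arbitrary positive definite matrix, assuming NumPy):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)

# Arbitrary symmetric positive definite matrix, split into 2x2 blocks
M = rng.standard_normal((5, 5))
M = M @ M.T + 5 * np.eye(5)
A, B = M[:3, :3], M[:3, 3:]
C, D = M[3:, :3], M[3:, 3:]

lhs = np.linalg.det(M)
rhs = np.linalg.det(A) * np.linalg.det(D - C @ np.linalg.inv(A) @ B)
print(lhs, rhs)    # the two agree up to round-off
\end{verbatim}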
\subsection{Consistency of the Parameter Estimation}%
\label{sec:peconsistency}
Does the estimate converge to the true parameter? That is, is
\begin{equation*}
\lim_{N\to\infty} \thn=\theta_0 \text{~w.p.~} 1
\end{equation*}
a true statement? If we use the model
\begin{equation*}
\mathcal{M}: y (t) = \varphi^T (t)\theta+e (t,\theta)
\end{equation*}
and assume that the system is
\begin{equation*}
\mathcal{S}: y (t) = \varphi^T (t)\theta_0 + v (t)
\end{equation*}
where the assumption means that $\theta$ and $\theta_0$ have the same dimension, $\{v(t)\}$ is not necessarily white noise, $E\{v(t)\} = 0$, and $u\perp~v$, then
\begin{align}
\label{eq:thn}
\thn &= R^{-1}(N)f(N) \nonumber \\
&= R^{-1}(N)\left(\frac{1}{N}\sum_{t=1}^N\varphi(t)y(t)\right) \nonumber \\
&= R^{-1}(N)\left( \underbrace{\frac{1}{N}\sum_{t=1}^N\varphi(t)\varphi^T(t)}_{R(N)}\theta_0+\frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)\right) \nonumber\\
&= \theta_0 + R^{-1}(N)\cdot \frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)
\end{align}
This leads to
\begin{equation*}
\lim_{N\to\infty} E\{\thn\} = \theta_0 + \lim_{N\to\infty}E\left\{R^{-1}(N)\frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)\right\}
\end{equation*}
We want the last term to go to zero so that
\begin{equation*}
\lim_{N\to\infty} E\{\thn\} = \theta_0
\end{equation*}
That happens when
\begin{equation*}
\lim_{N\to\infty} R^{-1}(N)R_{\varphi v}(0) = 0
\end{equation*}
Recall that $\varphi(t)$ is given by (\ref{eq:regressor}) and $u\perp v$.
Then we have that
\begin{equation*}
R_{\varphi v}(0) = \left[\begin{array}{c} R_{uv}(0) \\ R_{uv}(-1) \\ \vdots \\ R_{uv}(-n_b) \\ \midrule R_{yv}(-1) \\ \vdots \\ R_{yv}(-n_a) \end{array}\right]
\end{equation*}
The first $n_b+1$ terms in the upper block are zero because $u\perp~v$.
The last $n_a$ terms in the lower block are zero if and only if $\{v(t)\}$ is white noise: only then is the current noise uncorrelated with the past outputs that appear in the regressor.
Conclusion: For $\mathcal{S}: y (t) = \varphi^T (t)\theta_0 + v (t)$ it must be true that $\{v (t)\}$ is white noise in order to avoid bias in the least squares estimation of the parameter.
\subsection{Careful with the White Noise Assumption}%
\label{sec:carefulwhitenoise}
Given a system and model such that
\begin{align*}
\mathcal{S}: y(t) &= G_0(q)u(t) + e(t) \\
\mathcal{M}: y(t) &= \varphi^T(t)\theta + e(t,\theta)
\end{align*}
where $\{e(t)\}$ is white noise, we get
\begin{equation*}
\thn = \arg\min_\theta \frac{1}{N}\sum_{t=1}^N e^2(t,\theta)
\end{equation*}
What happens to
\begin{equation*}
\lim_{N\to\infty} E\{\thn\}
\end{equation*}
in this case? Does it equal the real parameter, $\theta_0$?
We can rewrite this as
\begin{align*}
y(t) &= \frac{B(q)}{A(q)}u(t) + e(t) \\
A(q)y(t) &= B(q)u(t) + A(q)e(t) \\
y(t) &= \varphi^T(t)\theta_0 + v(t) \\
v(t) &= A(q)e(t)
\end{align*}
The last equality shows that the noise is actually colored by the filter $A(q)$, so $\{v(t)\}$ is \textit{not} white noise.
The noise we actually measure is $\{v(t)\}$ and not $\{e(t)\}$.
The only time that we can say that we have white noise in this situation is when
\begin{equation*}
\mathcal{S}: y(t) = G_0(q)u(t) + H_0(q)e(t)
\end{equation*}
with
\begin{equation*}
H_0(q) = \frac{1}{A(q)}
\end{equation*}