\mainmatter%
\setcounter{page}{1}
\lectureseries[\course]{\course}
\auth[\lecAuth]{Lecturer: \lecAuth\\ Scribe: \scribe}
\date{October 20, 2009}
\setaddress%
% the following hack starts the lecture numbering at 8
\setcounter{lecture}{7}
\setcounter{chapter}{7}
\lecture{Properties of Least Squares Estimates}
\section{Least Squares/Linear Regression Review}
These notes correspond to Chapter 7.3 in Ljung.
The least squares estimate is our first attempt to generate a parametric model.
The system and model are given by
\begin{align*}
\mathcal{S}: y(t) &= \varphi^T(t)\theta_0 \\
\mathcal{M}: y(t) &= \varphi^T(t)\theta + e(t,\theta)
\end{align*}
where $\varphi(t)$ is known as the ``regressor'' and is the same for the model and the system.
Also, $\theta_0$ is the ``real'' parameter and $\theta$ is the model parameter.
Then the least squares estimate of the parameter, $\theta$, is
\begin{align*}
\hat{\theta}_{LS} &= \arg\min_\theta ||e(t,\theta)|| \\
\thn &= \arg\min_\theta \frac{1}{N}\sum_{t=1}^N e^2(t,\theta) = \arg\min_\theta \hat{R}_e^N(0) = \arg\min_\theta\hat{E}\{e^2(t,\theta)\}
\end{align*}
where $\hat{E}\{e^2(t,\theta)\}$ is the sample estimate of the variance (mean square) of the prediction error.
Recall that the regressor, $\varphi(t)$, is all of the past inputs and outputs such that
\begin{align}
\label{eq:regressor}
\varphi^T(t) &= \left[\begin{array}{c c c c c c c}
u(t) & u(t-1) & \cdots & u(t-n_b) & -y(t-1) & \cdots & -y(t-n_a) \end{array}\right]
\end{align}
and the parameter contains
\begin{align}
\label{eq:parameter}
\theta &= \left[\begin{array}{c c c c c c c c}
b_0 & b_1 & \cdots & b_{n_b} & a_1 & a_2 & \cdots & a_{n_a} \end{array}\right]^T
\end{align}
The model given by
\begin{equation*}
y(t) = \varphi^T(t)\theta + e(t,\theta)
\end{equation*}
can be accumulated over $N$ samples and written in matrix form as
\begin{align*}
\underbrace{\left[\begin{array}{c} y(1) \\ y(2) \\ \vdots \\ y(N) \end{array}\right]}_{\mathbf{y}} &= \underbrace{\left[\begin{array}{c} \varphi^T(1) \\ \varphi^T(2) \\ \vdots \\ \varphi^T(N) \end{array}\right]}_{\mathbf{\varphi}^T} \theta
+ \underbrace{\left[\begin{array}{c} e(1,0) \\ e(2,0) \\ \vdots \\ e(N,0) \end{array}\right]}_{\mathbf{e}} \\
\Rightarrow \tfrac{1}{N}\mathbf{\varphi}\mathbf{y} &= \tfrac{1}{N}\mathbf{\varphi}\mathbf{\varphi}^T\theta + \tfrac{1}{N}\mathbf{\varphi}\mathbf{e} \\
\hat{\theta} &= \arg\min_\theta \tfrac{1}{N}\mathbf{e}^T\mathbf{e}
\end{align*}
The least squares solution orthogonally projects $\mathbf{y}$ onto the column space of $\mathbf{\varphi}^T$, which makes the residual $\mathbf{e}$ as small as possible.
Rearranging to get the parameter estimate yields
\begin{equation*}
\thn = {\left(\tfrac{1}{N}\mathbf{\varphi\varphi}^T\right)}^{-1} \left(\tfrac{1}{N}\mathbf{\varphi y}\right)
\end{equation*}
Note that this is the solution of the normal equations; explicitly forming and inverting $\tfrac{1}{N}\mathbf{\varphi\varphi}^T$ is not the best way to compute the least squares estimate from a numerical methods standpoint.
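As a numerical illustration (a minimal sketch, assuming NumPy; the signals, dimensions, and parameter values below are hypothetical), the normal-equation solve and a QR/SVD-based solver such as \texttt{np.linalg.lstsq} return the same estimate on well-conditioned data, but the latter is preferred in practice:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Simulated linear regression y = Phi^T theta_0 + e (hypothetical data)
N, p = 1000, 4
theta_0 = np.array([1.0, -0.5, 0.25, 2.0])
PhiT = rng.standard_normal((N, p))        # rows are phi^T(t)
y = PhiT @ theta_0 + 0.1 * rng.standard_normal(N)

# Normal-equation form: (1/N Phi Phi^T)^{-1} (1/N Phi y)
R_N = PhiT.T @ PhiT / N
f_N = PhiT.T @ y / N
theta_normal = np.linalg.solve(R_N, f_N)

# Numerically preferred: QR/SVD-based least squares solver
theta_lstsq, *_ = np.linalg.lstsq(PhiT, y, rcond=None)

print(theta_normal)
print(theta_lstsq)    # the two agree closely on well-conditioned data
\end{verbatim}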
\section{Applications}
\subsection{ARX Model}
From the perspective of modeling discrete-time systems, the least squares estimate can be used when the model is given by
\begin{align}
\mathcal{M}: y(t) = \varphi^T(t)\theta + e(t,\theta)
\end{align}
using (\ref{eq:regressor}) and (\ref{eq:parameter}).
This is equivalent to writing
\begin{align*}
y(t) &= b_0u(t) + b_1u(t-1) + \cdots + b_{n_b}u(t-n_b) \\
&\quad- a_1y(t-1) - a_2y(t-2) - \cdots - a_{n_a}y(t-n_a) + e(t,\theta) \\
\end{align*}
\begin{align*}
\Rightarrow &(1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a})y(t) \\
&\qquad = (b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b})u(t) + e(t,\theta)
\end{align*}
\begin{align*}
\Rightarrow y(t) &= G(q,\theta)u(t) + H(q,\theta)e(t,\theta) \\
&= \frac{b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b}}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}u(t) \\
&\quad + \frac{1}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}e(t,\theta)
\end{align*}
Here we can see that the denominators for $G(q,\theta)$ and $H(q,\theta)$ are the same.
This is referred to as an ``AutoRegressive with eXogenous input'' (ARX) model.
The ARX model has shared poles between $G(q,\theta)$ and $H(q,\theta)$, and the numerator of $H(q,\theta)$ is $1$.
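A minimal simulation sketch of an ARX least squares fit (assuming NumPy; the helper \texttt{arx\_regressor}, the orders, and the coefficient values below are hypothetical) builds the regressor of (\ref{eq:regressor}) and solves for the parameter of (\ref{eq:parameter}):
\begin{verbatim}
import numpy as np

def arx_regressor(u, y, nb, na):
    # Rows are [u(t), ..., u(t-nb), -y(t-1), ..., -y(t-na)]
    n = max(na, nb)
    rows = [[u[t - k] for k in range(nb + 1)]
            + [-y[t - k] for k in range(1, na + 1)]
            for t in range(n, len(y))]
    return np.array(rows), y[n:]

# Hypothetical second-order ARX system: A(q) y = B(q) u + e, with e white
rng = np.random.default_rng(1)
N = 2000
u = rng.standard_normal(N)
e = 0.05 * rng.standard_normal(N)
a1, a2, b0, b1 = -1.5, 0.7, 1.0, 0.5
y = np.zeros(N)
for t in range(2, N):
    y[t] = -a1*y[t-1] - a2*y[t-2] + b0*u[t] + b1*u[t-1] + e[t]

Phi, Y = arx_regressor(u, y, nb=1, na=2)
theta_hat, *_ = np.linalg.lstsq(Phi, Y, rcond=None)
print(theta_hat)    # approximately [b0, b1, a1, a2] = [1.0, 0.5, -1.5, 0.7]
\end{verbatim}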
\subsection{Linear Regression and Noise}
If we look at a system with white noise such as
\begin{equation*}
\mathcal{S}: y(t) = \varphi^T(t)\theta_0 + e(t)
\end{equation*}
where $\{e(t)\}$ is white noise and the parameter $\theta_0$ is unknown, we can rewrite this as
\begin{equation*}
y(t) = G_0(q)u(t) + v(t)
\end{equation*}
where $v(t)=H_0(q)e(t)$.
How do we find the transfer functions of this system? First, note that $G(q,\theta_0)=G_0(q)$.
Next we need to find the regressor, $\varphi(t)$.
We can assume that the regressor is of the form (\ref{eq:regressor}) and contains all past inputs and outputs.
Then we know
\begin{align}
\label{eq:g}
G_0(q) = \frac{b_0+b_1q^{-1}+b_2q^{-2}+\cdots+b_{n_b}q^{-n_b}}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}
\end{align}
and
\begin{align}
\label{eq:h}
H_0(q) = \frac{1}{1+a_1q^{-1}+a_2q^{-2}+\cdots+a_{n_a}q^{-n_a}}
\end{align}
This means that we are implicitly assuming that the noise on the system is filtered such that it has exactly the same poles as $G_0(q)$.
That is a very restrictive assumption that does not hold for many real-world systems.
If we use $\mathcal{S}: y(t) = G_0(q)u(t) + e(t)$ where $\{e(t)\}$ is white noise, then we cannot write it as a linear regression because the noise and the system transfer functions do not have the same poles.
An exception is a Finite Impulse Response (FIR) system.
In that case the denominators of both transfer functions are $1$ and the current output has no dependence on past outputs.
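For example, in the FIR case the model reduces to
\begin{equation*}
y(t) = b_0u(t) + b_1u(t-1) + \cdots + b_{n_b}u(t-n_b) + e(t)
\end{equation*}
so that $G_0(q) = b_0+b_1q^{-1}+\cdots+b_{n_b}q^{-n_b}$, $H_0(q)=1$, and the regressor contains only input samples.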
\subsection{State Space}
Given a state space model
\begin{align*}
\mathcal{M}: \begin{cases} x(t+1)&=Ax(t)+Bu(t)+v(t) \\ y(t)&=Cx(t)+Du(t)+w(t) \end{cases}
\end{align*}
where $\{v(t)\}$ and $\{w(t)\}$ are white noise and assuming that we have access to measurements of $\{u(t)\}$, $\{y(t)\}$, and the state $\{x(t)\}$, then the estimates of the system matrices can be written as
\begin{align}
\left[\begin{array}{c} x(t+1) \\ y(t) \end{array}\right] &=
\left[\begin{array}{c c} A & B \\ C & D \end{array}\right] \left[\begin{array}{c} x(t) \\ u(t) \end{array}\right] +
\left[\begin{array}{c} v(t) \\ w(t) \end{array}\right] \nonumber\\
\bar{y} (t) &= \bar{\theta}^T\bar{\varphi} (t) + \bar{e} (t) \nonumber\\
\bar{y}^T (t) &= \bar{\varphi}^T (t)\bar{\theta} + \bar{e}^T (t)
\end{align}
Conclusion: if $\{u(t)\}$, $\{y(t)\}$, and $\{x(t)\}$ are all available, then the estimation of $A,B,C,D$ can be written as a least squares problem.
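A minimal sketch of this least squares problem (assuming NumPy, full state measurements, and a hypothetical two-state system; all matrices below are made up for illustration) stacks $[x(t+1);\,y(t)]$ and $[x(t);\,u(t)]$ over time and solves one regression for the block matrix:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)

# Hypothetical two-state system with measured states
A = np.array([[0.9, 0.1], [0.0, 0.8]])
B = np.array([[0.0], [1.0]])
C = np.array([[1.0, 0.0]])
D = np.array([[0.0]])

N = 500
u = rng.standard_normal((N, 1))
x = np.zeros((N + 1, 2))
y = np.zeros((N, 1))
for t in range(N):
    y[t] = C @ x[t] + D @ u[t] + 0.01 * rng.standard_normal(1)
    x[t + 1] = A @ x[t] + B @ u[t] + 0.01 * rng.standard_normal(2)

# Regression: [x(t+1); y(t)] = [A B; C D] [x(t); u(t)] + noise
Z   = np.hstack([x[1:], y])      # stacked "outputs", one row per t
Phi = np.hstack([x[:-1], u])     # stacked regressors [x(t), u(t)]
Theta, *_ = np.linalg.lstsq(Phi, Z, rcond=None)
print(Theta.T)                   # estimate of the block matrix [A B; C D]
\end{verbatim}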
\section{Properties of Least Squares Estimate}
Some important properties of the least squares estimate, $\thn$, are given below.
\subsection{Relation to the Covariance Functions}%
\label{sec:lsecovfns}
\begin{equation*}
\thn = {(\tfrac{1}{N}\mathbf{\varphi\varphi}^T)}^{-1}(\tfrac{1}{N}\mathbf{\varphi y}) \triangleq \underbrace{R^{-1}(N)}_{p\times p} \underbrace{f(N)}_{p\times 1}
\end{equation*}
Looking more closely at $R(N)$ we can see that it is a $p\times p$ matrix built from elements of (\ref{eq:regressor}).
\begin{equation*}
R(N) = \frac{1}{N}\left[\begin{array}{c c c} \varphi(1) & \cdots & \varphi(N) \end{array}\right]
\left[\begin{array}{c} \varphi^T(1) \\ \vdots \\ \varphi^T(N) \end{array}\right]
\end{equation*}
Calculating a few of the elements of $R (N)$ gives
\begin{align*}
R{(N)}_{1,1} &= \frac{1}{N}\sum_{t=1}^N u(t)u(t) = \hat{R}_u^N(0) \\
R{(N)}_{1,2} &= \frac{1}{N}\sum_{t=1}^N u(t)u(t-1) = \hat{R}_u^N(1) \\
R{(N)}_{2,1} &= \frac{1}{N}\sum_{t=1}^N u(t-1)u(t) = \hat{R}_u^N(1)
\end{align*}
The last two equalities show that $R(N)$ is symmetric.
The general form for $R(N)$ is
\renewcommand{\arraystretch}{2}
\begin{align*}
\newcommand*{\temp}{\multicolumn{1}{r|}{}}
R(N) = \left[\begin{array}{ccc}
\mathbf{R}_u & \temp & \mathbf{R}_{yu} \\ \cline{1-3}
\mathbf{R}_{yu}^T & \temp & \mathbf{R}_y
\end{array}\right]
\end{align*}
where the blocks are found as
\begin{align*}
% Top left.
\mathbf{R}_u &= \left[\begin{array}{c c c c}
\hat{R}_u^N(0) & \hat{R}_u^N(1) & \cdots & \hat{R}_u^N(n_b) \\
\hat{R}_u^N(1) & \hat{R}_u^N(0) & \cdots & \hat{R}_u^N(n_b-1) \\
\vdots & \vdots & \ddots & \vdots \\
\hat{R}_u^N(n_b) & \hat{R}_u^N(n_b-1) & \cdots & \hat{R}_u^N(0)
\end{array}\right] \\
% % Top right.
\mathbf{R}_{yu} &= \left[\begin{array}{c c c}
\hat{R}_{yu}^N(1) & \cdots & \hat{R}_{yu}^N(n_a) \\
\hat{R}_{yu}^N(0) & \cdots & \hat{R}_{yu}^N(n_a-1) \\
\vdots & \ddots & \vdots \\
\hat{R}_{yu}^N(1-n_b) & \cdots & \hat{R}_{yu}^N(n_a-n_b)
\end{array}\right] \\
% Bottom right.
\mathbf{R}_y &= \left[\begin{array}{c c c c}
\hat{R}_y^N(0) & \hat{R}_y^N(1) & \cdots & \hat{R}_y^N(n_a-1) \\
\hat{R}_y^N(1) & \hat{R}_y^N(0) & \cdots & \hat{R}_y^N(n_a-2) \\
\vdots & \vdots & \ddots & \vdots \\
\hat{R}_y^N(n_a-1) & \hat{R}_y^N(n_a-2) & \cdots & \hat{R}_y^N(0)
\end{array}\right]
\end{align*}
Note that the $\mathbf{R}_u$ and $\mathbf{R}_y$ blocks are Toeplitz.
We also have that
\begin{align*}
f(N) = \left[\begin{array}{c}
\hat{R}_{yu}^N(0) \\
\hat{R}_{yu}^N(1) \\
\vdots \\
\hat{R}_{yu}^N(n_b) \\
\midrule
\hat{R}_y^N(1) \\
\vdots \\
\hat{R}_y^N(n_a) \end{array}\right]
\end{align*}
Conclusion: the least squares estimate for the parameter is filled with auto-/cross-covariance function estimates.
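As a quick numerical check of this structure (a sketch assuming NumPy and arbitrary signals; the agreement is exact when the covariance estimates use the same summation window as the regressor):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
N, nb, na = 5000, 1, 2
u = rng.standard_normal(N)
y = rng.standard_normal(N)       # any signals work for this structural check

n = max(na, nb)
Phi = np.array([[u[t], u[t-1], -y[t-1], -y[t-2]]
                for t in range(n, N)]).T          # p x (N-n)
R = Phi @ Phi.T / Phi.shape[1]

# Sample covariances over the same summation window
t = np.arange(n, N)
print(R[0, 0], np.mean(u[t] * u[t]))      # entry (1,1) vs R_u(0)
print(R[0, 1], np.mean(u[t] * u[t-1]))    # entry (1,2) vs R_u(1)
\end{verbatim}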
\subsection{Invertibility of $R(N)$}%
\label{sec:upersistent}
For an FIR model the regressor contains only input samples, so $R(N)$ consists of the $\mathbf{R}_u$ block alone.
Then, for FIR, the input $\{u(t)\}$ must be persistently exciting to ensure the invertibility of $\mathbf{R}_u$.
When the model is not FIR, we still assume $\{u(t)\}$ is persistently exciting so that $\mathbf{R}_u$ is invertible, and we use the block determinant formula below to determine whether the full $R(N)$ is invertible.
\begin{definition}
The block determinant formula, which is closely related to the matrix inversion lemma, states that
\begin{equation*}
\det\left(\left[\begin{array}{c c}A&B\\C&D\end{array}\right]\right) = \det(A) \cdot \det(D-CA^{-1}B)
\end{equation*}
\end{definition}
For $R(N)$ to be invertible it must be true that $\det(\mathbf{R}_y-\mathbf{R}_{yu}^T\mathbf{R}_u^{-1}\mathbf{R}_{yu})\neq~0$.
Note that $\mathbf{R}_y$ is usually perturbed by noise, which tends to keep this Schur complement nonsingular.
However, we still need $\{u(t)\}$ to be a persistently exciting signal.
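As a quick numerical sanity check of the block determinant formula (a sketch with an arbitrary positive definite matrix, assuming NumPy):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)

# Arbitrary symmetric positive definite matrix, split into 2x2 blocks
M = rng.standard_normal((5, 5))
M = M @ M.T + 5 * np.eye(5)
A, B = M[:3, :3], M[:3, 3:]
C, D = M[3:, :3], M[3:, 3:]

lhs = np.linalg.det(M)
rhs = np.linalg.det(A) * np.linalg.det(D - C @ np.linalg.inv(A) @ B)
print(lhs, rhs)    # the two agree up to round-off
\end{verbatim}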
\subsection{Consistency of the Parameter Estimation}%
\label{sec:peconsistency}
Does the estimate converge to the true parameter? That is, is
\begin{equation*}
\lim_{N\to\infty} \thn=\theta_0 \text{~w.p.~} 1
\end{equation*}
a true statement? If we use the model
\begin{equation*}
\mathcal{M}: y (t) = \varphi^T (t)\theta+e (t,\theta)
\end{equation*}
and assume that the system is
\begin{equation*}
\mathcal{S}: y (t) = \varphi^T (t)\theta_0 + v (t)
\end{equation*}
where the assumption means that $\theta$ and $\theta_0$ have the same dimension, $\{v(t)\}$ is not necessarily white noise, $E\{v(t)\} = 0$, and $u\perp~v$, then
\begin{align}
\label{eq:thn}
\thn &= R^{-1}(N)f(N) \nonumber \\
&= R^{-1}(N)\left(\frac{1}{N}\sum_{t=1}^N\varphi(t)y(t)\right) \nonumber \\
&= R^{-1}(N)\left( \underbrace{\frac{1}{N}\sum_{t=1}^N\varphi(t)\varphi^T(t)}_{R(N)}\theta_0+\frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)\right) \nonumber\\
&= \theta_0 + R^{-1}(N)\cdot \frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)
\end{align}
This leads to
\begin{equation*}
\lim_{N\to\infty} E\{\thn\} = \theta_0 + \lim_{N\to\infty}E\left\{R^{-1}(N)\frac{1}{N}\sum_{t=1}^N\varphi(t)v(t)\right\}
\end{equation*}
We want the last term to go to zero so that
\begin{equation*}
\lim_{N\to\infty} E\{\thn\} = \theta_0
\end{equation*}
That happens when
\begin{equation*}
\lim_{N\to\infty} R^{-1}(N)R_{\varphi v}(0) = 0
\end{equation*}
Recall that $\varphi(t)$ is given by (\ref{eq:regressor}) and $u\perp v$.
Then we have that
\begin{equation*}
R_{\varphi v}(0) = \left[\begin{array}{c} R_{uv}(0) \\ R_{uv}(-1) \\ \vdots \\ R_{uv}(-n_b) \\ \midrule R_{yv}(-1) \\ \vdots \\ R_{yv}(-n_a) \end{array}\right]
\end{equation*}
The first $n_b+1$ terms in the upper block are zero because $u\perp~v$.
The last $n_a$ terms in the lower block are zero if and only if $\{v(t)\}$ is white noise: only then is the current noise uncorrelated with the past outputs that appear in the regressor.
Conclusion: For $\mathcal{S}: y (t) = \varphi^T (t)\theta_0 + v (t)$ it must be true that $\{v (t)\}$ is white noise in order to avoid bias in the least squares estimation of the parameter.
\subsection{Careful with the White Noise Assumption}%
\label{sec:carefulwhitenoise}
Given a system and model such that
\begin{align*}
\mathcal{S}: y(t) &= G_0(q)u(t) + e(t) \\
\mathcal{M}: y(t) &= \varphi^T(t)\theta + e(t,\theta)
\end{align*}
where $\{e(t)\}$ is white noise, we get
\begin{equation*}
\thn = \arg\min_\theta \frac{1}{N}\sum_{t=1}^N e^2(t,\theta)
\end{equation*}
What happens to
\begin{equation*}
\lim_{N\to\infty} E\{\thn\}
\end{equation*}
in this case? Does it equal the real parameter, $\theta_0$?
We can rewrite this as
\begin{align*}
y(t) &= \frac{B(q)}{A(q)}u(t) + e(t) \\
A(q)y(t) &= B(q)u(t) + A(q)e(t) \\
y(t) &= \varphi^T(t)\theta_0 + v(t) \\
v(t) &= A(q)e(t)
\end{align*}
The last equality shows that the noise is actually colored by the filter $A(q)$, so $\{v(t)\}$ is \textit{not} white noise.
The noise we actually measure is $\{v(t)\}$ and not $\{e(t)\}$.
The only time that we can say that we have white noise in this situation is when
\begin{equation*}
\mathcal{S}: y(t) = G_0(q)u(t) + H_0(q)e(t)
\end{equation*}
with
\begin{equation*}
H_0(q) = \frac{1}{A(q)}
\end{equation*}