% turing-21.tex
\documentclass[11pt]{beamer}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{subfigure}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{multicol}
%\def\argmax{\operatornamewithlimits{arg\,max}}
%\def\argmin{\operatornamewithlimits{arg\,min}}
\def\O{{\ensuremath{\mathcal{O}}}}
\def\M{{\ensuremath{\mathcal{M}}}}
\def\t{{\ensuremath{\theta}}}
\def\tt{{\ensuremath{\tilde{\theta}}}}
\def\R{\rm I\!R}
\usepackage{algorithmic}
%%%%%%% Referencing
\newcommand\reffig[1]{Figure \ref{fig:#1}}
\newcommand\refsec[1]{Section \ref{sec:#1}}
%%%%%%% Comments
\newcommand\sw[1]{\emph{\textcolor{red}{#1}}}
%%%%%%% Stuff
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\newcommand{\xbest}{\mathbf{\vx}^{+}}
\newcommand{\xstar}{\mathbf{\vx}^{*}}
\newcommand{\ystar}{\mathbf{\vy}^{*}}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\T}{^\intercal}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand*{\thead}[1]{\multicolumn{1}{c}{#1}}
\newcommand{\pluseq}{\mathrel{+}=}
%%%%%%% Kamil's expectations and vars.
\newcommand{\ex}[1]{{\mathbb E}\left[ #1 \right]}
\newcommand{\exc}[2]{{\mathbb E}\left[ #1 \,\middle \vert\, #2 \right]}
\newcommand{\exs}[2]{{\mathbb E_{#1}}\left[ #2 \right]}
\newcommand{\vars}[2]{{\mathbb V_{#1}}\left[ #2 \right]}
\newcommand{\excs}[3]{{\mathbb E_{#1}}\left[ #2 \,\middle \vert\, #3 \right]}
\newcommand{\bld}[1]{\emph{\textcolor{red}{#1}}}
\newcommand{\fd}[1]{\textcolor{gray}{#1}}
\definecolor{lightyellow}{rgb}{1,0.98,0.71}
\newcommand{\paper}[1]{\begin{center}\colorbox{lightyellow}{%
\begin{minipage}{0.9\textwidth}\scriptsize
#1\end{minipage}}\end{center}}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Return:}}
\renewcommand{\algorithmiccomment}[1]{// \textit{#1}}
% Weighted QMIX commands
\newcommand{\optimOper}{\ensuremath{\mathcal{T}^*}}
\newcommand{\qmixOper}{\ensuremath{\mathcal{T}^*_{\text{Qmix}}}}
\newcommand{\wOper}{\ensuremath{\mathcal{T}^*_{w}}}
\newcommand{\wqmixOper}{\ensuremath{\mathcal{T}^*_{\text{WQMIX}}}}
\newcommand{\qmixSpace}{\ensuremath{\mathcal{Q}^{mix}}}
\newcommand{\qmixProj}{\ensuremath{\Pi_{\text{Qmix}}}}
\newcommand{\wProj}{\ensuremath{\Pi_{w}}}
\mode<presentation>
{
\usetheme{Boadilla}
\usecolortheme{dolphin}
\usefonttheme{structurebold}
% \setbeamercovered{transparent}
\setbeamertemplate{navigation symbols}{}
}
%\pgfdeclareimage[height=0.5cm]{institution-logo}{uva_logo}
%\logo{\pgfuseimage{institution-logo}}
\title[Factored VFs for Cooperative MARL]{Factored Value Functions for Cooperative Multi-Agent Reinforcement Learning}
\author[Whiteson \& Rashid (Oxford)]{Shimon Whiteson \& Tabish Rashid \\ Dept.\ of Computer Science \\ University of Oxford \\ \ \\ joint work with Jakob Foerster, Gregory Farquhar, Bei Peng,\\ Mikayel Samvelyan, and Christian Schroeder de Witt}
\date{\today}
\begin{document}
\frame{\titlepage}
\frame{
\frametitle{Coordination Problems are Everywhere}
%%%%%%%%%%%%%%%%%
\begin{center}
% \includegraphics[scale = 0.4]{traffic-jam}
% \hspace{1cm}
% \includegraphics[scale = 0.125]{many-drones}
\includegraphics[width=0.45\linewidth]{traffic-jam}
\includegraphics[width=0.45\linewidth]{many-drones}\\
\includegraphics[width=0.45\linewidth]{firemen}
\includegraphics[width=0.45\linewidth]{kiva}
\end{center}
}
%\frame{
%\frametitle{Update rules}
%%%%%%%%%%%%%%%%%%
%
%\begin{itemize}
%
% \item \sw{Temporal difference} (TD) update rule:
% \begin{equation*}
% V(s_t) \leftarrow V(s_t) + \alpha[r_t + \gamma V(s_{t+1}) - V(s_t)]
% \end{equation*}
% \item \sw{Sarsa} update rule:
% \begin{equation*}
% Q(s_t,u_t) \leftarrow Q(s_t,u_t) + \alpha[r_t + \gamma Q(s_{t+1},u_{t+1}) - Q(s_t,u_t)]
% \end{equation*}
% \item \sw{Q-learning} update rule:
% \begin{equation*}
% Q(s_t,u_t) \leftarrow Q(s_t,u_t) + \alpha[r_t + \gamma \max_u Q(s_{t+1},u) - Q(s_t,u_t)]
% \end{equation*}
% \item Act (soft) \sw{greedily} w.r.t.\ $Q$-values:
% \begin{equation*}
% u_t = \argmax_u Q(s_t,u)
% \end{equation*}
%\end{itemize}
%}
%
%\frame{
%\frametitle{Policy Gradient Methods}
%%%%%%%%%%%%%%%%%%
%
%\begin{itemize}
% \item What about when \sw{greedification} is hard, e.g., continuous actions?
% \vspace{2mm}
% \item Optimise $\pi_\theta$ with gradient ascent on expected return:
% \begin{equation*}
% J_\theta=\exs{s\sim\rho(s),u\sim\pi_\theta(s,\cdot)}{r(s,u)}
% \end{equation*}
%% \vspace{4mm}
% \item Policy gradient theorem [Sutton et al.\ 2000]:
% \begin{equation*}
% \nabla_\theta J_\theta = \exs{s\sim\rho^\pi(s),u\sim\pi_\theta(s,\cdot)}{\nabla_{\theta} \log \pi_\theta (u \vert s)Q^\pi(s,u)}
% \end{equation*}
% \item Estimate gradient with trajectory $\tau$ and learned \sw{critic} $Q(s,u)$:
% \begin{equation*}
% \nabla_\theta J_\theta \approx g(\tau) = \sum_{t=0}^{T} \nabla_{\theta} \log \pi_\theta (u_t \vert s_t)Q(s_t,u_t)
% \end{equation*}
%\end{itemize}
%}
%
%\frame{
%\frametitle{Baselines}
%%%%%%%%%%%%%%%%%%
%
%\begin{itemize}
% \item Reduce variance with a \sw{baseline} $b(s)$:
% \begin{equation*}
% g(\tau) = \sum_{t=0}^{T} \nabla_{\theta} \log \pi_\theta (u_t \vert s_t)(Q(s_t,u_t)-b(s_t))
% \end{equation*}
%% \vspace{5mm}
% \item $b(s) = V(s) \implies Q(s,u) - b(s) = A(s,u)$, the \sw{advantage function}:
% \begin{equation*}
% g(\tau) = \sum_{t=0}^{T} \nabla_{\theta} \log \pi_\theta (u_t \vert s_t)A(s_t,u_t)
% \end{equation*}
%% \vspace{5mm}
% \item \sw{TD-error} $r_t + \gamma V(s_{t+1}) - V(s)$ is an unbiased estimate of $A(s_t,u_t)$:
%\begin{equation*}
% g(\tau) = \sum_{t=0}^{T} \nabla_{\theta} \log \pi_\theta (u_t \vert s_t)(r_t + \gamma V(s_{t+1}) - V(s_t))
% \end{equation*}
%\end{itemize}
%}
%\frame{
%\frametitle{Deep Actor-Critic Methods}
%%%%%%%%%%%%%%%%%%
%
%\begin{itemize}
% \item Actor and critic are both deep neural networks
% \begin{itemize}
% \vspace{2mm}
% \item Convolutional and recurrent layers
% \vspace{2mm}
% \item Actor and critic share layers
% \end{itemize}
% \vspace{5mm}
% \item Both trained with stochastic gradient descent
% \vspace{2mm}
% \begin{itemize}
% \item Actor follows policy gradient
% \vspace{2mm}
% \item Critic performs TD or Sarsa updates
% \end{itemize}
%\end{itemize}
%}
%\frame{
%\frametitle{Multi-Agent RL Methods from WhiRL}
%%%%%%%%%%%%%%%%%%
%
%\begin{itemize}
% \item DIAL [Foerster et al.\ 2015]
% \vspace{0.11cm}
% \item Multi-Agent Fingerprints [Foerster et al.\ 2017]
% \vspace{0.1cm}
% \item COMA [Foerster et al.\ 2018]
% \vspace{0.1cm}
% \item \bld{QMIX [Rashid et al.\ 2018]}
% \vspace{0.1cm}
% \item LOLA [Foerster et al.\ 2019]
% \vspace{0.1cm}
% \item SOS [Letcher et al.\ 2019]
% \vspace{0.1cm}
% \item MACKRL [Schroeder de Witt et al.\ 2019]
% \vspace{0.1cm}
% \item MAVEN [Mahajan et al.\ 2019]
% \vspace{0.1cm}
% \item \bld{WQMIX [Rashid et al.\ 2020]}
% \vspace{0.1cm}
% \item COMIX [Schroeder de Witt et al.\ 2020]
%\end{itemize}
%}
\frame{
\frametitle{Setting}
%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale = 0.55]{setting}\\ \ \\
(Figure by Jakob Foerster)
\end{center}
}
\frame{
\frametitle{Markov Decision Process}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Agent observes \sw{state} $s \in S$ and selects an \sw{action} $u \in U$
\vspace{3mm}
\item State transitions: $P(s'|s,u): S \times U \times S \rightarrow [0,1]$
\vspace{3mm}
\item Receives \sw{reward}: $r(s,u): S \times U \rightarrow \mathbb{R}$
\vspace{3mm}
\item Goal: maximise expected cumulative discounted \sw{return}:
\begin{equation*}
R_t = \sum_{k=0}^\infty \gamma^k r_{t+k}
\end{equation*}
\item \sw{Value functions} given policy $\pi(s,u)$:
\begin{equation*}
V^\pi(s) = \exs{\pi}{R_t | s_t = s}~~\textup{and}~~Q^\pi(s,u) = \exs{\pi}{R_t | s_t = s, u_t=u}
\end{equation*}
\end{itemize}
}
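\frame{
\frametitle{Bellman Recursion (Reference)}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item For reference, a quick sanity check on these definitions: $Q^\pi$ satisfies the standard Bellman recursion
\begin{equation*}
Q^\pi(s,u) = r(s,u) + \gamma \sum_{s'} P(s'|s,u) \, \exs{u' \sim \pi(s',\cdot)}{Q^\pi(s',u')}
\end{equation*}
\vspace{1mm}
\item The optimal $Q^*$ replaces the expectation with a maximisation:
\begin{equation*}
Q^*(s,u) = r(s,u) + \gamma \sum_{s'} P(s'|s,u) \max_{u'} Q^*(s',u')
\end{equation*}
\vspace{1mm}
\item This inner $\max$ is exactly the operation that becomes expensive once $u$ is a joint action
\end{itemize}
}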
\frame{
\frametitle{Multi-Agent MDP}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item All agents see the global state $s$
\vspace{5mm}
\item Individual actions: $u^a \in U$
\vspace{5mm}
\item State transitions: $P(s'|s,\mathbf{u}): S \times \mathbf{U} \times S \rightarrow [0,1]$
\vspace{5mm}
\item Shared team reward: $r(s,\mathbf{u}): S \times \mathbf{U} \rightarrow \mathbb{R}$
\vspace{5mm}
\item Equivalent to an MDP with a factored action space
\end{itemize}
}
\frame{
\frametitle{Dec-POMDP}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Observation function: $O(s, a): S \times A \rightarrow Z$
\vspace{5mm}
\item Action-observation history: $\tau^a \in T \equiv (Z \times U)^{*}$
\vspace{5mm}
\item Decentralised policies: $\pi^a(u^a|\tau^a): T \times U \rightarrow [0,1]$
\vspace{5mm}
\item Natural decentralisation: communication and sensory constraints
\vspace{5mm}
\item Artificial decentralisation: improve tractability
\vspace{5mm}
\item Centralised learning of decentralised policies
\end{itemize}
}
\frame{
\frametitle{Independent Learning}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Independent $Q$-learning [Tan 1993]
\begin{itemize}
\item Each agent learns independently with its own $Q$-function
\item Treats other agents as part of the environment
\end{itemize}
\vspace{5mm}
\item Independent actor-critic [Foerster et al.\ 2018]
\begin{itemize}
\item Each agent learns independently with its own actor-critic
\item Treats other agents as part of the environment
\end{itemize}
\vspace{5mm}
\item Speed up learning with \sw{parameter sharing}
\begin{itemize}
\item Different inputs, including the agent's ID $a$, induce different behaviour
\item Still independent: value functions condition only on $\tau^a$ and $u^a$
\end{itemize}
\vspace{5mm}
\item Limitations:
\begin{itemize}
\item Nonstationary learning
\item Hard to learn to coordinate
\end{itemize}
\end{itemize}
}
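\frame{
\frametitle{Independent $Q$-Learning: Update Sketch}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item A minimal sketch of the per-agent update (tabular form; with parameter sharing, a shared network replaces the table):
\begin{equation*}
Q_a(\tau^a_t, u^a_t) \pluseq \alpha \left[ r_t + \gamma \max_{u'} Q_a(\tau^a_{t+1}, u') - Q_a(\tau^a_t, u^a_t) \right]
\end{equation*}
\vspace{1mm}
\item $r_t$ is the shared \emph{team} reward, but the bootstrap uses only agent $a$'s own values
\vspace{1mm}
\item While agent $a$ learns, the other agents change their policies, so $a$'s targets drift: this is the nonstationarity above
\end{itemize}
}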
\frame{
\frametitle{Centralised Critics {\small [Lowe et al.\ 2017; Foerster et al.\ 2018]}}
%%%%%%%%%%%%%%%%%
\vspace{2mm}
\centering Centralised $V(s,\boldsymbol{\tau})$ or $Q(s,\boldsymbol{\tau},\mathbf{u}) \rightarrow$ hard greedification $\rightarrow$ actor-critic\\
\vspace{-2mm}
\begin{center}
\includegraphics[scale = 0.4]{coma_architecture}
\end{center}
}
\frame{
\frametitle{Factored Joint Value Functions}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \sw{Factored value functions} [Guestrin et al.\ 2003] can improve scalability:
\begin{equation*}
Q_{tot}(\boldsymbol{\tau}, \mathbf{u}; \boldsymbol{\theta}) = \sum_{e=1}^E Q_e (\boldsymbol{\tau}^e, \mathbf{u}^e;\boldsymbol{\theta}^e)
\end{equation*}
where each $e$ indicates a subset of the agents
\end{itemize}
\begin{center}
\includegraphics[scale = 0.15]{factor-graph.pdf}\\ \ \\
\end{center}
}
\frame{
\frametitle{Value Decomposition Networks {\small [Sunehag et al.\ 2017]}}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Most extreme factorisation: one factor per agent:
\begin{equation*}
Q_{tot}(\boldsymbol{\tau}, \mathbf{u}; \boldsymbol{\theta}) = \sum_{a=1}^N Q_a (\tau^a, u^a;\theta^a)
\end{equation*}
\vspace{0.2cm}
\end{itemize}
\begin{center}
\includegraphics[scale = 0.15]{VDN.pdf}\\ \ \\
\end{center}
}
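\frame{
\frametitle{VDN: Worked Two-Agent Example}
%%%%%%%%%%%%%%%%%
An illustrative example with hand-picked utilities (two agents, two actions each):
\begin{center}
\begin{tabular}{lcc}
\toprule
 & action 1 & action 2 \\
\midrule
$Q_1(\tau^1, \cdot)$ & 1 & 3 \\
$Q_2(\tau^2, \cdot)$ & 2 & 5 \\
\bottomrule
\end{tabular}
\end{center}
\begin{itemize}
\item Additivity gives, e.g., $Q_{tot}(\boldsymbol{\tau}, (2,2)) = 3 + 5 = 8$
\item Each agent maximises locally: $\argmax_{u^1} Q_1 = 2$ and $\argmax_{u^2} Q_2 = 2$
\item The local argmaxes assemble into the joint argmax: $\max_{\mathbf{u}} Q_{tot} = 8$
\item No additive decomposition can express payoffs where one agent's best action depends on the other's choice
\end{itemize}
}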
\frame{
\frametitle{Decentralisability}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Added benefit: the $\max$ and $\argmax$ decentralise:
\begin{align*}
\max_{\mathbf{u}}Q_{tot}(\boldsymbol{\tau}, \mathbf{u}; \boldsymbol{\theta}) &= \sum_a \max_{u^a}Q_a(\tau^a, u^a; \theta^a) \\ \ \\
\argmax_{\mathbf{u}}Q_{tot}(\boldsymbol{\tau}, \mathbf{u}; \boldsymbol{\theta}) &=
\begin{pmatrix}
\argmax_{u^1}Q_1(\tau^1, u^1; \theta^1) \\
\vdots \\
\argmax_{u^n}Q_n(\tau^n, u^n; \theta^n) \\
\end{pmatrix}
\end{align*}
\vspace{2mm}
\item No more hard greedification $\implies Q$-learning, not actor-critic (loop sketch on the next slide):
\begin{align*}
\mathcal{L}(\boldsymbol{\theta})&=\sum\limits_{i=1}^{b}\left[\left(y_i^{\text{tot}}-Q_{tot}(\boldsymbol{\tau}_i, \mathbf{u}_i; \boldsymbol{\theta})\right)^2\right],\\
y^{\text{tot}}_i&=r_i+\gamma\max_{\mathbf{u}^\prime}Q_{tot}(\boldsymbol{\tau}^\prime_i, \mathbf{u}^\prime; \boldsymbol{\theta^-})
\end{align*}
\end{itemize}
}
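\frame{
\frametitle{Factored Deep $Q$-Learning: Loop Sketch}
%%%%%%%%%%%%%%%%%
A schematic of the resulting training loop (replay details, $\epsilon$-schedule, and recurrence omitted):
\begin{algorithmic}
\REQUIRE agent networks $\boldsymbol{\theta}$, target networks $\boldsymbol{\theta}^-$, replay buffer $\mathcal{D}$
\WHILE{training}
\STATE act $\epsilon$-greedily with $u^a_t = \argmax_{u} Q_a(\tau^a_t, u; \theta^a)$; store the episode in $\mathcal{D}$
\STATE sample a batch of $b$ transitions from $\mathcal{D}$
\STATE $y_i^{\text{tot}} \leftarrow r_i + \gamma \max_{\mathbf{u}^\prime} Q_{tot}(\boldsymbol{\tau}^\prime_i, \mathbf{u}^\prime; \boldsymbol{\theta}^-)$ \COMMENT{max decomposes per agent}
\STATE gradient step on $\mathcal{L}(\boldsymbol{\theta}) = \sum_i (y_i^{\text{tot}} - Q_{tot}(\boldsymbol{\tau}_i, \mathbf{u}_i; \boldsymbol{\theta}))^2$
\STATE periodically set $\boldsymbol{\theta}^- \leftarrow \boldsymbol{\theta}$
\ENDWHILE
\end{algorithmic}
}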
\frame{
\frametitle{QMIX's Monotonicity Constraint}
%%%%%%%%%%%%%%%%%
\vspace{2mm}
To decentralise $\max/\argmax$, it suffices to enforce:
$
\frac{\partial Q_{tot}}{\partial Q_a} \geq 0,~ \forall a \in A
$
\begin{center}
\includegraphics[scale = 0.6]{monotonic_2}
\end{center}
}
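\frame{
\frametitle{Why Monotonicity Suffices (Sketch)}
%%%%%%%%%%%%%%%%%
An informal argument: let $\bar{u}^a = \argmax_{u^a} Q_a(\tau^a, u^a)$ and view $Q_{tot}$ as a function of the agent utilities. For any joint action $\mathbf{u}$,
\begin{equation*}
Q_{tot}\big(Q_1(\tau^1, u^1), \ldots, Q_n(\tau^n, u^n)\big) \leq Q_{tot}\big(Q_1(\tau^1, \bar{u}^1), \ldots, Q_n(\tau^n, \bar{u}^n)\big)
\end{equation*}
\begin{itemize}
\item Swap each $u^a$ for $\bar{u}^a$ one agent at a time: every swap (weakly) increases one argument
\item By $\frac{\partial Q_{tot}}{\partial Q_a} \geq 0$, increasing an argument cannot decrease $Q_{tot}$
\item Hence the vector of local argmaxes attains the global $\max$, and no joint search is needed
\end{itemize}
}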
\frame{
\frametitle{Representational Capacity}
%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[width=0.7\textwidth]{Shimon}\\
\includegraphics[width=\textwidth]{games.pdf}
\end{center}
\ \ \ \ \ \ \ \ \ VDN \& QMIX \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Just QMIX \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Neither\\ \ \\
}
\frame{
\frametitle{Bootstrapping}
%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[width=\textwidth]{Students}\\
\end{center}
\vspace{-2mm}
\begin{align*}
\mathcal{L}(\boldsymbol{\theta})&=\sum\limits_{i=1}^{b}\left[\left(y_i^{\text{tot}}-Q_{tot}(\boldsymbol{\tau}_i, \mathbf{u}_i, s_i; \boldsymbol{\theta})\right)^2\right],\\
y^{\text{tot}}_i&=r_i+\gamma\max_{\mathbf{u}^\prime}\textcolor{red}{Q_{tot}(\boldsymbol{\tau}^\prime_i, \mathbf{u}^\prime, s^\prime_i; \boldsymbol{\theta^-})}
\end{align*}
}
\frame{
\frametitle{Two-Step Game}
%%%%%%%%%%%%%%%%%
\vspace{-2mm}
\begin{center}
\includegraphics[width=\textwidth]{2step.pdf}
\end{center}
}
\frame{
\frametitle{Two-Step Game Results}
%%%%%%%%%%%%%%%%%
\vspace{-2mm}
\begin{center}
\includegraphics[width=\textwidth]{2step-results.pdf}
\end{center}
}
\frame{
\frametitle{QMIX [Rashid et al.\ 2018]}
%%%%%%%%%%%%%%%%%
\vspace{-0.2cm}
\begin{center}
\includegraphics[width=\textwidth]{All_Three_Blow_up.png}
\end{center}
\begin{itemize}
\item Agent network: represents $Q_a (\tau^a, u^a;\theta^a)$
\vspace{2mm}
\item Mixing network: combines the agent utilities into $Q_{tot}(\boldsymbol{\tau}, \mathbf{u}, s)$ using nonnegative weights
\vspace{2mm}
\item Hypernetwork: generates the mixing network's weights from the global state $s$ (schematic form on the next slide)
\end{itemize}
}
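\frame{
\frametitle{Mixing Network: Schematic Form}
%%%%%%%%%%%%%%%%%
Schematically (layer sizes and details elided), with the agent utilities stacked as $\mathbf{q} = (Q_1, \ldots, Q_n)$, the mixing network computes
\begin{equation*}
Q_{tot}(\boldsymbol{\tau}, \mathbf{u}, s) = \mathbf{w}_2(s) \cdot \text{ELU}\big( \mathbf{W}_1(s)\, \mathbf{q} + \mathbf{b}_1(s) \big) + b_2(s)
\end{equation*}
\begin{itemize}
\item $\mathbf{W}_1(s)$ and $\mathbf{w}_2(s)$ are hypernetwork outputs forced elementwise nonnegative (e.g.\ via absolute values)
\item Nonnegative weights plus the monotone ELU enforce $\frac{\partial Q_{tot}}{\partial Q_a} \geq 0$
\item The biases $\mathbf{b}_1(s)$ and $b_2(s)$ are unconstrained, so state information can shift $Q_{tot}$ freely
\end{itemize}
}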
\frame{
\frametitle{Random Matrix Games (The Students Were Right)}
%%%%%%%%%%%%%%%%%
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/2 Agents 2 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/2 Agents 3 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/2 Agents 4 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/3 Agents 2 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/3 Agents 3 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/3 Agents 4 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/4 Agents 2 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/no_legend/4 Agents 3 Actions_max_qtot_median"}
\includegraphics[width=0.32\textwidth]{"figures/results/matrix_games/legend/4 Agents 4 Actions_max_qtot_median"}
}
\frame{
\frametitle{StarCraft Multi-Agent Challenge (SMAC)\\{\small [Samvelyan et al.\ 2019]}}
%%%%%%%%%%%%%%%%%
\begin{figure}
\centering
\includegraphics[width=0.32\textwidth]{3s_vs_5z.jpg}
\includegraphics[width=0.32\textwidth]{8m_8m.jpg}
\includegraphics[width=0.32\textwidth]{banelings4.jpg}\\ \ \\
\includegraphics[width=0.32\textwidth]{micro_colossus2.jpg}
\includegraphics[width=0.32\textwidth]{micro_corridor.jpg}
\includegraphics[width=0.32\textwidth]{MMM.jpg}
\end{figure}
\begin{center}
\url{https://github.com/oxwhirl/smac}
\url{https://github.com/oxwhirl/pymarl}
\end{center}
}
\frame{
\frametitle{Partial Observability in SMAC}
%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[width=0.75\textwidth]{smac_agent_obs} \\
\vspace{0.2cm}
Cyan = sight range \ \ \ \ \ Red = shooting range
\end{center}
}
\frame{
\frametitle{SMAC Maps}
\scalebox{.75}{
\begin{tabular}{ccc}
\toprule
Name & Ally Units & Enemy Units \\
\midrule
\texttt{2s3z} & 2 Stalkers \& 3 Zealots & 2 Stalkers \& 3 Zealots \\
\texttt{3s5z} & 3 Stalkers \& 5 Zealots & 3 Stalkers \& 5 Zealots \\
\texttt{1c3s5z} & 1 Colossus, 3 Stalkers \& 5 Zealots & 1 Colossus, 3 Stalkers \& 5 Zealots \\
\hline
\texttt{5m\_vs\_6m} & 5 Marines & 6 Marines \\
\texttt{10m\_vs\_11m} & 10 Marines & 11 Marines \\
\texttt{27m\_vs\_30m} & 27 Marines & 30 Marines \\
\texttt{3s5z\_vs\_3s6z} & 3 Stalkers \& 5 Zealots & 3 Stalkers \& 6 Zealots \\
\texttt{MMM2} & 1 Medivac, 2 Marauders \& 7 Marines
& 1 Medivac, 3 Marauders \& 8 Marines \\
\hline
\texttt{2s\_vs\_1sc}& 2 Stalkers & 1 Spine Crawler \\
\texttt{3s\_vs\_5z} & 3 Stalkers & 5 Zealots \\
\texttt{6h\_vs\_8z} & 6 Hydralisks & 8 Zealots \\
\texttt{bane\_vs\_bane} & 20 Zerglings \& 4 Banelings & 20 Zerglings \& 4 Banelings \\
\texttt{2c\_vs\_64zg}& 2 Colossi & 64 Zerglings \\
\texttt{corridor} & 6 Zealots & 24 Zerglings \\
\bottomrule
\end{tabular}}
}
\frame{
\frametitle{Overall Results (The Students Were Right)}
%%%%%%%%%%%%%%%%%
\begin{center}
% \includegraphics[width=0.475\textwidth]{figures/results/median_test_win.png}
\includegraphics[width=\textwidth]{figures/results/maps_best.png}
\end{center}
%\ \ \ \ \ Median test win \% \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Total scenarios with best test win \%
}
\frame{
\frametitle{State Ablations}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=0.475\textwidth]{figures/results/ablations/1/3s5z_test_battle_won_mean_median.png}
\includegraphics[width=0.475\textwidth]{figures/results/ablations/1/2c_vs_64zg_test_battle_won_mean_median.png}
\includegraphics[width=0.475\textwidth]{figures/results/ablations/1/legend/MMM2_test_battle_won_mean_median.png}
\end{figure}
}
\frame{
\frametitle{Linear Ablations}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=0.475\textwidth]{figures/results/ablations/3/3s5z_test_battle_won_mean_median.png}
\includegraphics[width=0.475\textwidth]{figures/results/ablations/3/2c_vs_64zg_test_battle_won_mean_median.png}
\includegraphics[width=0.475\textwidth]{figures/results/ablations/3/legend/MMM2_test_battle_won_mean_median.png}
\end{figure}
}
\frame{
\frametitle{Learned Mixing Functions (2c\_vs\_64zg)}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=0.475\textwidth]{figures/vis/2c_64zg/2c_64zg_elu_0t.png}
\includegraphics[width=0.475\textwidth]{figures/vis/2c_64zg/2c_64zg_elu_50t.png}
\end{figure}
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ $t=0$ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ $t=50$
}
\frame{
\frametitle{Multi-Layer Linear Mixing (Regression)}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=\textwidth]{figures/results/regression/mixing_net_parametrisation_loss.png}
\end{figure}
}
\frame{
\frametitle{Multi-Layer Linear Mixing (SMAC)}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=0.475\textwidth]{figures/results/ablations/2/2c_vs_64zg_test_battle_won_mean_median.png}
\includegraphics[width=0.475\textwidth]{figures/results/ablations/2/legend/MMM2_test_battle_won_mean_median.png}
\end{figure}
}
\frame{
\frametitle{Tanh Activation}
%%%%%%%%%%%%%%%%%
\begin{figure}[h!]
\centering
\includegraphics[width=0.475\textwidth]{figures/vis/2c_64zg/2c_64zg_tanh_0t.png}
\includegraphics[width=0.475\textwidth]{figures/vis/2c_64zg/2c_64zg_tanh_50t.png}
\end{figure}
\vspace{-4mm}
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ $t=0$ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ $t=50$
\begin{figure}
\includegraphics[width=0.4\textwidth]{figures/results/tanh/3s5z_test_battle_won_mean_median.png}
\includegraphics[width=0.4\textwidth]{figures/results/tanh/2c_vs_64zg_test_battle_won_mean_median.png}
\end{figure}
}
\frame{
\frametitle{QMIX Takeaways}
%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Value function factorisation is crucial
\vspace{4mm}
\item Flexible conditioning on central state is crucial
\vspace{4mm}
\item Richly parameterised mixing is crucial
\vspace{4mm}
\item Nonlinear mixing is not crucial (on SMAC)
\end{itemize}
}
\frame{
\frametitle{Revisiting Representational Capacity}
\begin{center}
\includegraphics[width=0.7\textwidth]{Shimon}\\ \ \\
% \begin{itemize}
%\item
How important is this in general?
% \end{itemize}
\end{center}
}
\frame{
\frametitle{Limitations of QMIX}
\begin{itemize}
\item QMIX cannot represent \textbf{all} possible joint action $Q$-values\\
\item Can have catastrophic consequences:
\end{itemize}
\begin{center}
\begin{tabular}{ | c | c | c |}
\hline
8 & -12 & -12 \\
\hline
-12 & 0 & 0 \\
\hline
-12 & 0 & 0 \\
\hline
\end{tabular}
~~~~~~~~~~~~
\begin{tabular}{ | c | c | c |}
\hline
-12 & -12 & -12 \\
\hline
-12 & 0 & 0 \\
\hline
-12 & 0 & 0 \\
\hline
\end{tabular}\\
\vspace{0.1cm}
% \includegraphics[width=0.7\textwidth]{qtran_matrixgame}\\
Matrix game payoffs \ \ \ \ \ \ \ \ \ \ What QMIX learns \ \ \\ \ \\
(Example from [Son et al.\ 2019])
\end{center}
\begin{itemize}
\item Failure to recover the correct argmax!
\item Failure to recover the correct maximum $Q$-value!
\end{itemize}
}
\frame{
\frametitle{Fundamental Limitations of QMIX}
\begin{itemize}
\item This limitation is \textbf{fundamental} to QMIX\\
\vspace{3mm}
\item Consequence of deliberate factorisation of the joint $Q$-function\\
\vspace{3mm}
\item Not resolved with bigger networks, more training, more compute, etc.\\
\vspace{3mm}
\item QMIX \textbf{cannot} represent all joint-action $Q$\\
\vspace{3mm}
\end{itemize}
\begin{center}
$\implies$ Let's try to understand \textit{why} QMIX fails
\end{center}
}
\frame{
\frametitle{Theoretical Framework for Analysing QMIX}
\begin{itemize}
\item Consider an \textit{idealised} version of QMIX
\item Use an \textit{operator} that represents QMIX:
\begin{center}
$\mathcal{T}^*_{\text{Qmix}} := \overbrace{\Pi_{\text{Qmix}}}^{\textcolor{red}{\mathclap{\text{\emph{QMIX projection operator}}}}} \circ \underbrace{\mathcal{T}^*}_{\textcolor{blue}{\mathclap{\text{\emph{Standard Bellman Optimality Operator}}}}}$
\end{center}
\vspace{2mm}
where the projection is defined as:
\end{itemize}
\begin{center}
$\qmixProj Q := \argmin_{q \in \underbrace{\qmixSpace}_{\textcolor{red}{\mathclap{\text{\emph{$Q$s that QMIX can represent}}}}}} \sum_{\mathbf{u} \in \mathbf{U}}
(\overbrace{Q(s,\mathbf{u})}^{\textcolor{blue}{\mathclap{\text{\emph{Target $Q$s we are trying to learn}}}}} - q(s,\mathbf{u}))^2$
\end{center}
}
\frame{
\frametitle{Why QMIX Fails}
\begin{center}
$\qmixProj Q := \argmin_{q \in \qmixSpace} \sum_{\mathbf{u} \in \mathbf{U}}
(Q(s,\mathbf{u}) - q(s,\mathbf{u}))^2$
\end{center}
\begin{itemize}
\item QMIX weights the error for \textbf{all} joint actions equally
\end{itemize}
\begin{center}
\begin{tabular}{ | c | c | c |}
\hline
\bld{8} & -12 & -12 \\
\hline
-12 & 0 & 0 \\
\hline
-12 & 0 & 0 \\
\hline
\end{tabular}
~~~$\overset{\qmixProj}{\implies}$~~~
\begin{tabular}{ | c | c | c |}
\hline
\bld{-12} & -12 & -12 \\
\hline
-12 & 0 & 0 \\
\hline
-12 & 0 & 0 \\
\hline
\end{tabular}\\
\vspace{0.1cm}
\ \ \ \ \ \ \ \ \ \ $Q$ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ $\argmin_{q \in \qmixSpace}$ \ \ \
\end{center}
\begin{itemize}
\item Monotonicity couples the entries: fitting the single $8$ would drag up the estimates for the $-12$ entries in its row and column, so the equally weighted projection sacrifices the $8$ instead
\item Result is failure on this simple matrix game
\end{itemize}
}
\frame{
\frametitle{Hypothetical}
\begin{itemize}
\item \textit{Hypothetically}, what if we knew what the optimal joint action was?
\item What if we \textbf{only} consider error on the optimal joint action?
\end{itemize}
\begin{center}
\begin{tabular}{ | c | c | c |}
\hline
\bld{8} & -12 & -12 \\
\hline
-12 & 0 & 0 \\
\hline
-12 & 0 & 0 \\
\hline
\end{tabular}
~~~~$\implies$~~~~
\begin{tabular}{ | c | c | c |}
\hline
\bld{8} & ? & ? \\
\hline
? & ? & ? \\
\hline
? & ? & ? \\
\hline
\end{tabular}\\
\end{center}
\begin{itemize}
\item If only a single joint action mattered, QMIX's restricted representation would be no limitation at all
\item In general, though, we must also estimate $Q$-values for the other joint actions
\item Ideally, those estimates should not distort the optimal joint action's $Q$-value
% \item Ideally, we also want to learn $Q$-values for the other joint-actions that are below the optimal action's $Q$ \sw{This doesn't seem right: we *need* to estimate those other joint values in oder to know which joint action is optimal in the first place.}
\end{itemize}
}
\frame{
\frametitle{Weighting Function}
\begin{itemize}
\item Introduce a weighting function into our objective:
\end{itemize}
\begin{center}
$\wProj Q := \argmin_{q \in \qmixSpace} \sum_{\mathbf{u} \in \mathbf{U}} \underbrace{\textcolor{red}{w(s,\mathbf{u})}}_{\mathclap{\text{\bld{Weighting function}}}}(Q(s,\mathbf{u}) - q(s,\mathbf{u}))^2$
\end{center}
\begin{itemize}
\item The weighting changes the $q \in \qmixSpace$ that is returned
\end{itemize}
}
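\frame{
\frametitle{Weighting: Effect on the Matrix Game (Sketch)}
%%%%%%%%%%%%%%%%%
A back-of-the-envelope check on the earlier payoff matrix, labelling the optimal joint action $(1,1)$:
\begin{itemize}
\item Set $w(s,(1,1)) = 1$ and $w = \alpha \approx 0$ elsewhere
\item The projection is then (nearly) free on the other entries: any $q \in \qmixSpace$ with $q(s,(1,1)) = 8$ is (almost) optimal
\item A monotonic $q$ can achieve this by making action $1$ look best to both agents, at the cost of accuracy on the $-12$ and $0$ entries
\item Both the $\argmax$ and the maximal value are then recovered, unlike with $w \equiv 1$
\end{itemize}
}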
\frame{
\frametitle{Candidates for the Weighting Functions}
% \sw{Can you improve the alignment on this slide?}
\begin{itemize}
\item \textbf{Idealised Central Weighting:}
\end{itemize}
\begin{multicols}{2}
\begin{equation*}
w(s,\mathbf{u}) =
\begin{cases}
% 1 & Q_{critic}(s,\mathbf{u}) > Q_{critic}(s,\mathbf{u}^*) ~~\text{or}~~ \mathbf{u} = \mathbf{u}^* \\
1 & \mathbf{u} = \argmax_{\mathbf{u}'} Q(s,\mathbf{u}')\\
\alpha & \text{otherwise}.
\end{cases}
\end{equation*}
\columnbreak
\vspace*{-2mm}
\vspace*{\fill}
\begin{itemize}
\item Not practical
\item Requires approximation
\end{itemize}
\vspace*{\fill}
\end{multicols}
\begin{itemize}
\item \textbf{Optimistic Weighting:}
\end{itemize}
\begin{multicols}{2}
\begin{equation*}
w(s,\mathbf{u}) =
\begin{cases}
1 & Q_{tot}(s,\mathbf{u}) < Q(s,\mathbf{u}) \\
\alpha & \text{otherwise}.
\end{cases}
\end{equation*}
\columnbreak
\vspace*{-2mm}
\vspace*{\fill}
\begin{itemize}
\item Easy to use
\item Less obvious that it works
\end{itemize}
\vspace*{\fill}
\end{multicols}
\begin{itemize}
\item Theoretical results apply to both
\end{itemize}
}
\frame{
\frametitle{Weighted QMIX}
Three components of Weighted QMIX:\\
\vspace{2mm}
\begin{enumerate}
\item QMIX's $Q$-values: $Q_{tot}$
\begin{itemize}
\item Produces the decentralised agents
\item Used to efficiently maximise
\end{itemize}
\vspace{2mm}
\item Weighted QMIX loss
\begin{itemize}
\item Changes the $Q_{tot}$ learnt by QMIX
\item Ensures correct argmax action and correct maximum $Q$-value
\end{itemize}
\vspace{2mm}
\item Centralised $Q$-values: $\hat{Q}^*$
\begin{itemize}
\item Unrestricted joint action $Q$-value
\item Used to compute the targets $y_i$
\end{itemize}
\end{enumerate}
}
\frame{
\frametitle{Deep RL Weighted QMIX}
%SW: mention the points below when you speak but you don't need them on the slide
% \begin{itemize}
% \item Ultimately, we are interested in the Deep MARL setting
% \item Our analysis provides a firm theoretical foundation to build on
% \item How can we realise Weighted QMIX in practice?
% \end{itemize}
\begin{center}
\includegraphics[width=1\textwidth]{wqmix_targets.png}
\end{center}
\begin{multicols}{2}
Central Weighting (\textbf{CW-QMIX})
\begin{equation*}
w(s,\mathbf{u}) =
\begin{cases}
% 1 & Q_{critic}(s,\mathbf{u}) > Q_{critic}(s,\mathbf{u}^*) ~~\text{or}~~ \mathbf{u} = \mathbf{u}^* \\
1 & \mathbf{u} = \hat{\mathbf{u}} ~\text{or}~ y_i > \hat{Q}^*(s, \hat{\mathbf{u}}) \\
\alpha & \text{otherwise}.
\end{cases}
\end{equation*}
% \vspace*{\fill}
% \vfill\null
\\
\columnbreak
Optimistic Weighting (\textbf{OW-QMIX})
\begin{equation*}
w(s,\mathbf{u}) =
\begin{cases}
1 & Q_{tot}(s,\mathbf{u}) < y_i \\
\alpha & \text{otherwise}.
\end{cases}
\end{equation*}
\end{multicols}
}
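\frame{
\frametitle{Weighted QMIX: One Training Step (Sketch)}
%%%%%%%%%%%%%%%%%
A sketch of a single update using the targets and weightings above (target networks and other details omitted):
\begin{algorithmic}
\REQUIRE batch $\{(s_i, \boldsymbol{\tau}_i, \mathbf{u}_i, r_i, s^\prime_i, \boldsymbol{\tau}^\prime_i)\}_{i=1}^b$
\STATE $\hat{\mathbf{u}}^\prime_i \leftarrow \argmax_{\mathbf{u}^\prime} Q_{tot}(\boldsymbol{\tau}^\prime_i, \mathbf{u}^\prime, s^\prime_i)$ \COMMENT{cheap decentralised argmax}
\STATE $y_i \leftarrow r_i + \gamma \hat{Q}^*(s^\prime_i, \boldsymbol{\tau}^\prime_i, \hat{\mathbf{u}}^\prime_i)$ \COMMENT{evaluated by the unrestricted critic}
\STATE update $\hat{Q}^*$ on $\sum_i \big(y_i - \hat{Q}^*(s_i, \boldsymbol{\tau}_i, \mathbf{u}_i)\big)^2$
\STATE update $Q_{tot}$ on $\sum_i w(s_i, \mathbf{u}_i)\big(y_i - Q_{tot}(\boldsymbol{\tau}_i, \mathbf{u}_i, s_i)\big)^2$
\end{algorithmic}
}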
\frame{
\frametitle{Similarities to Actor-Critic}
\begin{itemize}
\item Define deterministic QMIX policy as:
\vspace{5mm}
\end{itemize}
\begin{equation*}
\pi(s) = \begin{pmatrix}
\argmax_{u^1}Q_1(s, u^1)\\
\vdots \\
\argmax_{u^n}Q_n(s, u^n)
\end{pmatrix}
\end{equation*}
\begin{itemize}
\item Think of $\hat{Q}^*$ as a critic
\item Weighted QMIX then looks like an off-policy actor-critic algorithm
\item Difference is in how the actors are trained
\end{itemize}
}
\frame{
\frametitle{Relevant Baselines}
\begin{itemize}
\item MADDPG and MASAC
\begin{itemize}
\item Implementations as close to Weighted QMIX as possible
\item Only difference is how the agents are optimised
\end{itemize}
\end{itemize}
\begin{itemize}
\item QTRAN
\begin{itemize}
\item Can be viewed as specific choices of the three components in Weighted QMIX
\end{itemize}
\end{itemize}
\begin{itemize}
\item QPLEX
\begin{itemize}
\item Can theoretically represent \textbf{all} joint-action $Q$-functions
\item Maintains consistency as well!
\end{itemize}
\end{itemize}
}
\frame{
\frametitle{Predator Prey}
\begin{itemize}
\item Two agents must capture the prey at the \textbf{same} timestep
\item The team is punished if only one agent attempts the capture
\end{itemize}
\begin{center}
\includegraphics[width=0.8\textwidth]{wqmix_predprey_test_return.png}
\end{center}
}
\frame{
\frametitle{Implicit Weighting in Deep RL Setting}
\begin{itemize}
\item Deep RL QMIX has an \textbf{implicit} weighting function:
\end{itemize}
\begin{center}
Loss $\approx \sum_{i} \underbrace{\textcolor{red}{b(s_i,\mathbf{u}_i)}}_{\mathclap{\text{\bld{Implicit Weighting}}}}(y_i - Q_{tot}(s_i,\mathbf{u}_i))^2$
\end{center}
\begin{itemize}
\item $b$ reflects which $(s_i,\mathbf{u}_i)$ pairs appear in the minibatch, so it is hard to control (sketch on the next slide)
\item Deep RL Weighted QMIX also has an \textbf{explicit} weighting:
\end{itemize}
\begin{center}
Weighted Loss $\approx \sum_{i} \underbrace{\textcolor{blue}{w(s_i,\mathbf{u}_i)}}_{\mathclap{\text{\textcolor{blue}{User-defined Weighting}}}}b(s_i,\mathbf{u}_i)(y_i - Q_{tot}(s_i,\mathbf{u}_i))^2$
\end{center}
}
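\frame{
\frametitle{Implicit Weighting: Where $b$ Comes From (Sketch)}
%%%%%%%%%%%%%%%%%
A short derivation sketch: minibatches are sampled from the replay buffer $\mathcal{D}$, so in expectation the loss is
\begin{equation*}
\exs{(s,\mathbf{u}) \sim \mathcal{D}}{(y - Q_{tot}(s,\mathbf{u}))^2} = \sum_{s,\mathbf{u}} \underbrace{p_{\mathcal{D}}(s,\mathbf{u})}_{=\, b(s,\mathbf{u})} (y - Q_{tot}(s,\mathbf{u}))^2
\end{equation*}
\begin{itemize}
\item $b$ is the visitation frequency under the ($\epsilon$-greedy) behaviour policy
\item It changes as the policy and the buffer change, which is why it is hard to control
\end{itemize}
}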
\frame{
\frametitle{Robustness to Exploration Results}
%\begin{itemize}
%\item
QMIX can be brittle in the presence of significant exploration
% \end{itemize}
\begin{center}
\includegraphics[width=0.8\textwidth]{wqmix_3s5z_robustness_epsilon.png}