\documentclass[12pt]{article}
\usepackage{scribe}
\providecommand{\setN}{\mathbb{N}}
\providecommand{\setZ}{\mathbb{Z}}
\providecommand{\setQ}{\mathbb{Q}}
\providecommand{\setR}{\mathbb{R}}
\usepackage{mdwlist}
\Scribe{Friedrich Eisenbrand}
\Lecturer{Friedrich Eisenbrand}
\LectureNumber{1}
\LectureDate{22.09.2010}
\LectureTitle{Following experts' advice}
\begin{document}
\MakeScribeTop
\section{Motivation}
In many situations, one has to choose repeatedly from a given set of
options. In finance, for example, one is regularly confronted with
the question on how to invest. There are many \emph{experts} that
may tell you how they foresee a future development. These experts
are far from perfect and could even give bad advice. Nevertheless,
some experts could be qualified. But who the good, the bad and the
ugly ones are can be determined only in hindsight. However, you
observe that some experts get rewarded along the way and this could
help you to bet on the right ones. Our studies are
motivated by the following question: \emph{Are there strategies that
are competitive with those of the best expert in hindsight?}
We give a somewhat positive answer to this question and quantify
the competitiveness exactly. The next
lectures circle around these methods and cover in particular
\begin{itemize*}
\item Weighted majority algorithms
\item Zero sum games and their approximate solution
\item Construction of a portfolio that is competitive with the best
constantly re-balanced portfolio
\end{itemize*}
%
\section{A forecasting problem}
Consider the following setting. There are $N$ experts and time-steps
$t=0,\ldots,T$.
At time $t$, expert $j$ makes a prediction $y_j^t \in
\{0,1\}$, the \emph{forecaster} predicts $\hat{p}_t \in \{0,1\}$ and
\emph{nature reveals} $z_t \in \{0,1\}$. The forecaster makes a
\emph{prediction mistake} at time $t$ if $z_t \neq \hat{p}_t$ holds.
Some experts are good, others are bad. The question is: \emph{Can we
predict nearly as well as the best expert in hindsight?}
Let us assume, for a moment, that at least one expert is perfect and
never makes a prediction mistake, i.e., $y_j^t = z_t$ for each
$t=0,\ldots,T$. Here is an algorithm that penalizes experts making a
mistake and finds the perfect one.
\begin{algorithm}\label{alg:1}
~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> If $|\{j \colon w_j = 1, y_j^t = 1\}| \geq |\{j \colon w_j = 1, y_j^t = 0\}|$ \\
\> Then $\hat{p}_t =1$ \\
\> Else $\hat{p}_t =0$ \\
\\
\> (Observe and Penalize) \\
\> For \= each $j \in \{1,\ldots,N\}$ with $y_j^t \neq z_t$ \\
\> \> set $w_j :=0$
\end{tabbing}
\end{algorithm}
\begin{theorem}\label{thr:1}
If one expert is perfect, then using Algorithm~\ref{alg:1}, the
forecaster makes at most $\lfloor\log_2(N)\rfloor$ prediction mistakes.
\end{theorem}
\begin{proof}
The forecaster predicts as the majority of active (those with $w_j=1$) experts does. If
the forecaster errs, then at least half of the still active experts
become inactive. Let $A_m$ denote the number of active experts
after the $m$-th mistake of the forecaster. One has $A_0 = N$ and
$A_{m+1} \leq A_m/2$ and since $A_m\geq1$ it follows that $1\leq A_m\leq
N/2^m$ holds and thus $m \leq \lfloor\log_2(N)\rfloor$.
\end{proof}
What if there is no perfect expert? One solution would be to
re-activate all experts once all of them have become inactive. It is
quite straightforward to see that the number of mistakes committed by
the forecaster is bounded by $(\log_2(N) +1) (m_j+1)$, if $m_j$ is the
number of mistakes committed by the $j$-th expert and thus that by
using this scheme, the forecaster's number of mistakes is roughly the
logarithm of the number of experts times the number of mistakes
committed by the best expert. However, one can do much better with the
next algorithm.
\begin{algorithm}[Weighted Majority Algorithm]\label{alg:2}
~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> If $\sum_{j\colon y_j^t=1}w_j \geq\sum_{j\colon y_j^t=0}w_j$\\
\> Then $\hat{p}_t =1$ \\
\> Else $\hat{p}_t =0$ \\
\\
\> (Observe and Re-Weight) \\
\> For \= each $j \in \{1,\ldots,N\}$ with $y_j^t \neq z_t$ \\
\> \> set $w_j :=w_j/2$
\end{tabbing}
\end{algorithm}
\begin{theorem}
\label{thr:2}
If $m^*$ denotes the number of forecasting mistakes and $m_j$
denotes the number of mistakes committed by expert $j$, then
\begin{displaymath}
m^* \leq \frac{1}{\log_2 (4/3)} (m_j + \log_2 (N)).
\end{displaymath}
\end{theorem}
\begin{proof}
Let $W = \sum_{j=1}^N w_j$ denote the total weight of the experts that
is changing from one time-step to another. After each forecasting
mistake, the total weight decreases to at most $3/4$ of its previous
value, and since $W = N$ initially, after $m^*$ forecasting mistakes
one has $W \leq N \cdot (3/4)^{m^*}$. Since
$W \geq w_j = (1/2)^{m_j}$ one has $(1/2)^{m_j} \leq N \cdot (3/4)^{m^*}$
and thus
\begin{eqnarray*}
(4/3)^{m^*} & \leq & N \cdot 2^{m_j} \\
\Longleftrightarrow \quad m^* \log_2(4/3) & \leq & \log_2 (N) + m_j
\end{eqnarray*}
which implies the claim.
\end{proof}
We can \emph{re-interpret} this forecasting problem as follows. In each round
a \emph{loss vector} $\ell^t \in \{0,1\}^N$ is revealed that here depends
on the predictions of the experts and the actual observation of
$z_t$. The number of mistakes of expert $j$ is the \emph{total loss}
of expert $j$
\begin{displaymath}
\sum_{t = 0}^T \ell_j^t.
\end{displaymath}
Also the forecaster experiences a loss $\hat{\ell}^t$ at time $t$ and
the weighted majority algorithm guarantees
\begin{displaymath}
\sum_{t=0}^T \hat{\ell}^t \leq (1/\log_2(4/3)) \cdot \left(\sum_{t=0}^T \ell_j^t +
\log_2(N)\right).
\end{displaymath}
%
If the loss vector $\ell^t$ is defined as
\begin{displaymath}
\ell_j^t =
\begin{cases}
1 & \text{ if } z_t \neq y_j^t \\
0 & \text{ if } z_t = y_j^t,
\end{cases}
\end{displaymath}
%
then one obtains the setting of Algorithm~\ref{alg:2}.
\section{The randomized weighted majority algorithm}
We finally come to the strongest algorithm for online-prediction. We
generalize the setting along the lines of the re-interpretation given
above. Again, we have $N$ experts. In each round, nature provides us
with a loss vector $\ell^t \in [0,1]^N$. Notice that in this setting, the
experts do not make a prediction anymore. Throughout the time-steps,
the expert $j$ experiences a loss $L^j = \sum_{t = 0}^T \ell_j^t$.
At time $t$, the forecaster chooses an expert $j$ (before the loss vector
$\ell^t$ is revealed) and experiences the same loss $\ell_j^t$ as the
expert $j$. In the randomized weighted majority algorithm, this choice
of the expert is done at random according to a changing probability
distribution on the experts. Here are the details.
\begin{algorithm}[Randomized Weighted Majority Algorithm]\label{alg:3}
~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> Select expert $j$ with probability $p_j = w_j /
\sum_{k=1}^N w_k$ \\
\\
\> (Observe and Re-Weight) \\
\> Observe loss vector $\ell^t$ \\
\> Forecaster experiences loss $\ell_j^t$\\
\> For \= $j=1,\ldots,N$ \\
\> \> set $w_j :=w_j (1-\varepsilon)^{\ell_j^t}$
\end{tabbing}
\end{algorithm}
%
% We now come to the main result
% of today's lecture.
% %
% \begin{theorem}\label{thr:3}
% Let $0 < \varepsilon \leq 1/2$. If $E[L]$ denotes the expected loss of the forecaster, then one has
% for each expert $j$
% \begin{displaymath}
% E[L] \leq \frac{\ln n}{\varepsilon} + (1+\varepsilon) \cdot L^j.
% \end{displaymath}
% \end{theorem}
% \begin{proof}
% Denote the total weight at time $t+1$ by $W^{t+1} = \sum_{j=0}^N
% w_j^{t+1}$, where $w_j^t$ denotes the weight of expert $j$ at time
% $t$. Let $\hat{L}^t$ be the loss of the forecaster at time $t$. One has $W^0 = N$ and
% \begin{eqnarray}
% W^{t+1} & = & \sum_{j=1}^N w_j^t (1-\varepsilon)^{\ell_j^t} \\ \label{eq:1}
% & \leq & \sum_{j=1}^N w_j^t (1-\varepsilon\cdot {\ell_j^t}) \\\label{eq:2}
% & = & W^t - \varepsilon (w^t)^T \ell^t \\\label{eq:3}
% & = & W^t(1 - \varepsilon (p^t)^T \ell^t) \\\label{eq:4}
% & = & W^t(1 - \varepsilon E[\hat{L}^t]) \\\label{eq:5}
% & \leq & W^t e^{- \varepsilon E[\hat{L}^t]} \\\label{eq:6}
% & \leq & N e^{- \varepsilon (E[\hat{L}^0+\cdots+\hat{L}^t])}.
% \end{eqnarray}
% The first inequality~(\ref{eq:1}) follows from $(1-\varepsilon)^x \leq (1-\varepsilon x)$
% for $x \in [0,1]$. For~(\ref{eq:2}) we used the notation $w^t$ for the
% vector of weights at time $t$. To derive~(\ref{eq:3}) we used $p^t =
% w^t / W^t$ and in (\ref{eq:4}) $E[L^t] = (p^t)^T \ell^t $ is the
% expected loss of the forecaster at time $t$. In (\ref{eq:5}) we used
% the inequality $1+x\leq e^x$ and in (\ref{eq:6}) we applied recursion
% and linearity of expectation.
% On the other hand, $W^{t+1}\geq (1-\varepsilon)^{L_j}$. From this we conclude
% \begin{displaymath}
% N \cdot e^{-\varepsilon E[L]} \geq (1-\varepsilon)^{L_j}
% \end{displaymath}
% and thus
% \begin{displaymath}
% \varepsilon \cdot E[L] \leq \ln N + \ln \left(\frac{1}{1-\varepsilon}\right) L^j \leq \ln N + (\varepsilon + \varepsilon^2) L^j
% \end{displaymath}
% from which the claim follows. In the last step, we used
% $\ln \left(\frac{1}{1-\varepsilon}\right) \leq \varepsilon + \varepsilon^2$ for $0< \varepsilon \leq 1/2$.
% \end{proof}
% This result allows us now to nicely balance the loss of the forecaster
% by the logarithm of the number of experts and the loss of the best
% expert. We will see applications of this theorem in the next lectures.
Further reading on these topics can be found in the papers~\cite{HSSW98,AHK05}.
\bibliographystyle{abbrv} % if you need a bibliography
\bibliography{papers} % assuming yours is named mybib.bib
\end{document}