\documentclass[12pt]{article}
\usepackage{scribe}
\providecommand{\setN}{\mathbb{N}}
\providecommand{\setZ}{\mathbb{Z}}
\providecommand{\setQ}{\mathbb{Q}}
\providecommand{\setR}{\mathbb{R}}
\usepackage{mdwlist}

\Scribe{Friedrich Eisenbrand}
\Lecturer{Friedrich Eisenbrand}
\LectureNumber{1}
\LectureDate{22.09.2010}
\LectureTitle{Following experts' advice}

\begin{document}
\MakeScribeTop

\section{Motivation}

In many situations, one has to choose repeatedly from a given set of options. In finance, for example, one is regularly confronted with the question of how to invest. There are many \emph{experts} who may tell you how they foresee future developments. These experts are far from perfect and could even give bad advice. Nevertheless, some experts could be qualified. But who the good, the bad and the ugly ones are can be determined only in hindsight. However, you observe that some experts get rewarded along the way, and this could help you to bet on the right ones. Our studies are motivated by the following question:
\emph{Are there strategies that are competitive with the strategy of the best expert in hindsight?}
We give a somewhat positive answer to this question and quantify the competitiveness exactly. The next lectures revolve around these methods and cover in particular
\begin{itemize*}
\item Weighted majority algorithms
\item Zero-sum games and their approximate solution
\item Construction of a portfolio that is competitive with the best constantly re-balanced portfolio
\end{itemize*}
%
\section{A forecasting problem}

Consider the following setting. There are $N$ experts and time-steps $t=0,\ldots,T$. At time $t$, expert $j$ makes a prediction $y_j^t \in \{0,1\}$, the \emph{forecaster} predicts $\hat{p}_t \in \{0,1\}$ and \emph{nature reveals} $z_t \in \{0,1\}$. The forecaster makes a \emph{prediction mistake} at time $t$ if $z_t \neq \hat{p}_t$ holds. Some experts are good, others are bad. The question is: \emph{Can we predict nearly as well as the best expert in hindsight?}

Let us assume, for a moment, that at least one expert is perfect and never makes a prediction mistake, i.e., $y_j^t = z_t$ for each $t=0,\ldots,T$. Here is an algorithm that penalizes experts making a mistake and finds the perfect one.

\begin{algorithm}\label{alg:1} ~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> If $|\{j \colon w_j = 1, y_j^t = 1\}| \geq |\{j \colon w_j = 1, y_j^t = 0\}|$ \\
\> Then $\hat{p}_t =1$ \\
\> Else $\hat{p}_t =0$ \\
\\
\> (Observe and Penalize) \\
\> For \= each $j \in 1,\ldots,N$ with $y_j^t \neq z_t$ \\
\> \> set $w_j :=0$
\end{tabbing}
\end{algorithm}

\begin{theorem}\label{thr:1}
If one expert is perfect, then using Algorithm~\ref{alg:1}, the forecaster makes at most $\lfloor\log_2(N)\rfloor$ prediction mistakes.
\end{theorem}

\begin{proof}
The forecaster predicts as the majority of the active experts (those with $w_j=1$) does. If the forecaster errs, then at least half of the still active experts become inactive. Let $A_m$ denote the number of active experts after the $m$-th mistake of the forecaster. One has $A_0 = N$ and $A_{m+1} \leq A_m/2$. Since the perfect expert never becomes inactive, $A_m\geq1$, and it follows that $1\leq A_m\leq N/2^m$ holds and thus $m \leq \lfloor\log_2(N)\rfloor$.
\end{proof}
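
To make the procedure concrete, the following Python sketch implements Algorithm~\ref{alg:1}. It is only an illustration: the function name \texttt{halving\_forecast} and the representation of the input as lists indexed by time-step and expert are choices made for this example and do not appear in the lecture.

\begin{verbatim}
# Sketch of Algorithm 1: majority vote over the still-active experts;
# experts that err are deactivated (their weight is set to 0).
def halving_forecast(expert_predictions, outcomes):
    """expert_predictions[t][j] in {0,1}; outcomes[t] in {0,1}."""
    N = len(expert_predictions[0])
    active = [True] * N          # w_j = 1 means expert j is still active
    mistakes = 0
    for preds, z in zip(expert_predictions, outcomes):
        ones = sum(1 for j in range(N) if active[j] and preds[j] == 1)
        zeros = sum(1 for j in range(N) if active[j] and preds[j] == 0)
        p_hat = 1 if ones >= zeros else 0        # majority forecast
        if p_hat != z:
            mistakes += 1
        for j in range(N):                       # penalize wrong experts
            if preds[j] != z:
                active[j] = False
    return mistakes
\end{verbatim}

The returned count is the number of forecasting mistakes, which Theorem~\ref{thr:1} bounds by $\lfloor\log_2(N)\rfloor$ whenever a perfect expert exists.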
What if there is no perfect expert? One solution would be to re-activate all experts once all of them have become inactive. It is quite straightforward to see that the number of mistakes committed by the forecaster under this scheme is bounded by $(\log_2(N) +1) (m_j+1)$, where $m_j$ is the number of mistakes committed by the $j$-th expert: between two re-activations every expert errs at least once, so there are at most $m_j+1$ phases, and within each phase the forecaster makes at most $\lfloor\log_2(N)\rfloor+1$ mistakes. Thus, with this scheme, the forecaster's number of mistakes is roughly the logarithm of the number of experts times the number of mistakes committed by the best expert. However, one can do much better with the next algorithm.

\begin{algorithm}[Weighted Majority Algorithm]\label{alg:2} ~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> If $\sum_{j\colon y_j^t=1}w_j \geq\sum_{j\colon y_j^t=0}w_j$\\
\> Then $\hat{p}_t =1$ \\
\> Else $\hat{p}_t =0$ \\
\\
\> (Observe and Re-Weight) \\
\> For \= each $j \in 1,\ldots,N$ with $y_j^t \neq z_t$ \\
\> \> set $w_j :=w_j/2$
\end{tabbing}
\end{algorithm}

\begin{theorem} \label{thr:2}
If $m^*$ denotes the number of forecasting mistakes and $m_j$ denotes the number of mistakes committed by expert $j$, then
\begin{displaymath}
m^* \leq \frac{1}{\log_2 (4/3)} (m_j + \log_2 (N)).
\end{displaymath}
\end{theorem}

\begin{proof}
Let $W = \sum_{j=1}^N w_j$ denote the total weight of the experts, which changes from one time-step to the next. After each forecasting mistake, the total weight drops at least by a factor of $3/4$: if the forecaster errs, then experts carrying at least half of the total weight were wrong, and their weight is halved, so the new total weight is at most $W/2 + (1/2)(W/2) = (3/4)W$. Since the total weight is initially $N$, it is at most $N \cdot (3/4)^{m^*}$ after $m^*$ forecasting mistakes. On the other hand, $W \geq w_j = (1/2)^{m_j}$, so one has $(1/2)^{m_j} \leq N \cdot (3/4)^{m^*}$ and thus
\begin{eqnarray*}
(4/3)^{m^*} & \leq & N \cdot 2^{m_j} \\
\Longleftrightarrow \quad m^* \log_2(4/3) & \leq & \log_2 (N) + m_j,
\end{eqnarray*}
which implies the claim.
\end{proof}

We can \emph{re-interpret} this forecasting problem as follows. In each round a \emph{loss vector} $\ell^t \in \{0,1\}^N$ is revealed, which here depends on the predictions of the experts and the actual observation $z_t$. The number of mistakes of expert $j$ is the \emph{total loss} of expert $j$,
\begin{displaymath}
\sum_{t = 0}^T \ell_j^t.
\end{displaymath}
The forecaster also experiences a loss $\hat{\ell}^t$ at time $t$, and the weighted majority algorithm guarantees
\begin{displaymath}
\sum_{t=0}^T \hat{\ell}^t \leq \frac{1}{\log_2(4/3)} \cdot \left(\sum_{t=0}^T \ell_j^t + \log_2(N)\right).
\end{displaymath}
%
If the loss vector $\ell^t$ is defined as
\begin{displaymath}
\ell_j^t =
\begin{cases}
1 & \text{ if } z_t \neq y_j^t \\
0 & \text{ if } z_t = y_j^t,
\end{cases}
\end{displaymath}
%
then one obtains the setting of Algorithm~\ref{alg:2}.
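
Analogously to the sketch of Algorithm~\ref{alg:1} above, here is a Python sketch of Algorithm~\ref{alg:2}; again, the function name \texttt{weighted\_majority\_forecast} and the input format are chosen only for this illustration. The single change compared to the previous sketch is that wrong experts have their weight halved instead of being deactivated.

\begin{verbatim}
# Sketch of Algorithm 2 (Weighted Majority): predict with the weighted
# majority and halve the weight of every expert that was wrong.
def weighted_majority_forecast(expert_predictions, outcomes):
    """expert_predictions[t][j] in {0,1}; outcomes[t] in {0,1}."""
    N = len(expert_predictions[0])
    w = [1.0] * N
    mistakes = 0
    for preds, z in zip(expert_predictions, outcomes):
        weight_one = sum(w[j] for j in range(N) if preds[j] == 1)
        weight_zero = sum(w[j] for j in range(N) if preds[j] == 0)
        p_hat = 1 if weight_one >= weight_zero else 0  # weighted majority vote
        if p_hat != z:
            mistakes += 1
        for j in range(N):                             # re-weight wrong experts
            if preds[j] != z:
                w[j] /= 2
    return mistakes
\end{verbatim}

The returned count is the quantity $m^*$ that Theorem~\ref{thr:2} bounds in terms of the mistakes of any single expert.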
\section{The randomized weighted majority algorithm}

We finally come to the strongest algorithm for online prediction. We generalize the setting along the lines of the re-interpretation given above. Again, we have $N$ experts. In each round, nature provides us with a loss vector $\ell^t \in [0,1]^N$. Notice that in this setting, the experts do not make a prediction anymore. Over the time-steps, expert $j$ experiences a total loss $L^j = \sum_{t = 0}^T \ell_j^t$. At time $t$, the forecaster chooses an expert $j$ (before the loss vector $\ell^t$ is revealed) and experiences the same loss $\ell_j^t$ as expert $j$. In the randomized weighted majority algorithm, this choice of the expert is made at random according to a changing probability distribution on the experts. Here are the details.

\begin{algorithm}[Randomized Weighted Majority Algorithm]\label{alg:3} ~
\begin{tabbing}
Initialize: \quad \quad \= Set $w_j :=1$ for $j=1,\ldots,N$. \\
At time $t$: \> (Forecast) \\
\> Select expert $j$ with probability $p_j = w_j / \sum_{k=1}^N w_k$ \\
\\
\> (Observe and Re-Weight) \\
\> Observe loss vector $\ell^t$ \\
\> Forecaster experiences loss $\ell_j^t$\\
\> For \= $j=1,\ldots,N$ \\
\> \> set $w_j :=w_j (1-\varepsilon)^{\ell_j^t}$
\end{tabbing}
\end{algorithm}

We now come to the main result of today's lecture.

\begin{theorem}\label{thr:3}
Let $0 < \varepsilon \leq 1/2$. If $E[L]$ denotes the expected loss of the forecaster, then one has for each expert $j$
\begin{displaymath}
E[L] \leq \frac{\ln N}{\varepsilon} + (1+\varepsilon) \cdot L^j.
\end{displaymath}
\end{theorem}

\begin{proof}
Denote the total weight at time $t+1$ by $W^{t+1} = \sum_{j=1}^N w_j^{t+1}$, where $w_j^t$ denotes the weight of expert $j$ at time $t$. Let $\hat{L}^t$ be the loss of the forecaster at time $t$. One has $W^0 = N$ and
\begin{eqnarray}
W^{t+1} & = & \sum_{j=1}^N w_j^t (1-\varepsilon)^{\ell_j^t} \\ \label{eq:1}
& \leq & \sum_{j=1}^N w_j^t (1-\varepsilon\cdot {\ell_j^t}) \\ \label{eq:2}
& = & W^t - \varepsilon (w^t)^T \ell^t \\ \label{eq:3}
& = & W^t(1 - \varepsilon (p^t)^T \ell^t) \\ \label{eq:4}
& = & W^t(1 - \varepsilon E[\hat{L}^t]) \\ \label{eq:5}
& \leq & W^t e^{- \varepsilon E[\hat{L}^t]} \\ \label{eq:6}
& \leq & N e^{- \varepsilon (E[\hat{L}^0+\cdots+\hat{L}^t])}.
\end{eqnarray}
The first inequality~(\ref{eq:1}) follows from $(1-\varepsilon)^x \leq (1-\varepsilon x)$ for $x \in [0,1]$. For~(\ref{eq:2}) we used the notation $w^t$ for the vector of weights at time $t$. To derive~(\ref{eq:3}) we used $p^t = w^t / W^t$, and in (\ref{eq:4}), $E[\hat{L}^t] = (p^t)^T \ell^t$ is the expected loss of the forecaster at time $t$. In (\ref{eq:5}) we used the inequality $1+x\leq e^x$, and in (\ref{eq:6}) we applied recursion and linearity of expectation.

On the other hand, $W^{T+1} \geq w_j^{T+1} = (1-\varepsilon)^{L^j}$. From this we conclude
\begin{displaymath}
N \cdot e^{-\varepsilon E[L]} \geq (1-\varepsilon)^{L^j}
\end{displaymath}
and thus
\begin{displaymath}
\varepsilon \cdot E[L] \leq \ln N + \ln \left(\frac{1}{1-\varepsilon}\right) L^j \leq \ln N + (\varepsilon + \varepsilon^2) L^j,
\end{displaymath}
from which the claim follows. In the last step, we used $\ln \left(\frac{1}{1-\varepsilon}\right) \leq \varepsilon + \varepsilon^2$ for $0< \varepsilon \leq 1/2$.
\end{proof}

This result allows us to balance the loss of the forecaster against the logarithm of the number of experts and the loss of the best expert. We will see applications of this theorem in the next lectures.

Further reading on these topics can be found in the papers \cite{HSSW98,AHK05}.

\bibliographystyle{abbrv}
\bibliography{papers}

\end{document}