\documentclass[twoside]{article}
\usepackage[T1]{fontenc}
\usepackage[latin9]{inputenc}
\usepackage{amssymb, amsmath}
\usepackage{mathrsfs}

\usepackage{esint}

\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.2in \headsep 0.2in
\textwidth   6.5in \textheight 9in 
\parskip 1.5ex  \parindent 0ex \footskip 40pt
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
\newcommand{\noun}[1]{\textsc{#1}}

\begin{document}

\framebox[6.4in]{
\begin{minipage}{6.4in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 7 - 10/14/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\large Reproducing Kernel Hilbert Spaces and Kernel-based Learning Methods (2 of 2)  }} 
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribes: Mark Wagner and Weidong Shao}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{ \it *Unedited Notes}}

\section{Support Vector Machine (continued) }

Given $\left(\vec{x_{i}}, y_{i}\right) \in \mathbb{R}^{n} \times \left\{ -1, +1\right\} $, we want to find a good classification hyperplane. 

Assumptions: data
are linearly separable, i.e., there exists a hyperplane such that $\begin{cases}
\left\langle w,x\right\rangle +b\geq1 & y=1\\
\left\langle w,x\right\rangle +b\leq-1 & y=-1\end{cases}$, or\[
y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right)\geq1\]
 To decide on a hyperplane, we want to maximize margin.


\subsection{Problem statement}

 (\noun{Primal)}
\begin{eqnarray*}
\min_{w,b} & \frac{1}{2}\left|\left|w\right|\right|_{2}^{2}\\
\mbox{s.t.} & y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right) & \geq1\end{eqnarray*}
 The Lagrangian for the above problem
\[
L\left(w,b,\alpha\right)=\frac{1}{2}\left|\left|w\right|\right|^{2}-\sum_{i=1}^{n}\alpha_{i}\left(y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right)-1\right)\]

We can view this as a 2-player game, i.e.
\[ \min_{w,b}\max_{\alpha\geq0}L\left(w,b,\alpha\right)\]
 where player A chooses $w$ and $b$ while player B chooses an $\alpha$

In particular, if A chooses an \emph{infeasible} point (i.e., some constraint
is violated), B can make the expression arbitrarily large. If A chooses a \emph{feasible} point, then for every $i$ such that
\[ y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right)-1>0 \]
B must choose $\alpha_{i}=0$, since any $\alpha_{i}>0$ would only decrease $L$.

Hence, if A's point is feasible, the inner maximum equals $\frac{1}{2}\left|\left|w\right|\right|^{2}$, and A is simply minimizing $\frac{1}{2}\left|\left|w\right|\right|^{2}$ over the feasible set.

Alternatively: Consider the {}``Dual game''

\[ \max_{\alpha\geq0} \min_{w,b}L\left(w,b,\alpha\right)\leq \min_{w,b}\max_{\alpha\geq0}L\left(w,b,\alpha\right)\]
 which is weak duality. But for a wide class of objectives \emph{equality} holds (i.e., there is no duality gap---minimax).

\underbar{L}\underbar{\noun{et}}

\begin{eqnarray*}
\left(w^{*}, b^{*}\right)&=& \arg\min_{w,b} \max_{\alpha}L\left(w,b,\alpha\right) \hspace{2mm} \left(\mathbf {*A}\right) \\
\alpha^{*}&=&\arg\max_{\alpha}\min_{w,b}L\left(w,b,\alpha\right) \hspace{2mm} \left(\mathbf{*B}\right)  
\end{eqnarray*}

\begin{eqnarray*}
L\left(w^{*}, b^{*},\alpha^{*}\right) & \leq & \max_{\alpha}L\left(w^{*},b^{*},\alpha\right)\\
 & = & \min_{w,b}\max_{\alpha}L\left(w,b,\alpha\right)
\end{eqnarray*}


by $\left(\mathbf{*A}\right)$

and then by minimax we can switch the order from above (not proved)

\begin{eqnarray*}
 & = & \max_{\alpha}\min_{w,b}L\left(w,b,\alpha\right)\\
 & \leq & \min_{w,b}L\left(w,b,\alpha^{*}\right)\end{eqnarray*}
 by $\left(\mathbf{*B}\right)$ \[
\leq L\left(w^{*},b^{*},\alpha^{*}\right)\]


Therefore all the above inequalities are equalities

Since $L\left( \cdot,\alpha\right)$ is convex with respect to $w,b$
for a fixed $\alpha$, we can find the optimum by the First Order
Condition (fix $\alpha$).

\begin{eqnarray*}
\frac{\partial L}{\partial b}  =  0 &\rightarrow&\sum_{i}\alpha_{i}y_{i}=0\\
\frac{\partial L}{\partial w}  =  0&\rightarrow& w^{*}=\sum_{i}\alpha_{i}y_{i}x_{i}
\end{eqnarray*}
 i.e., the optimal solution can be written as a linear combination of the data points:

\[ \vec{w} =\sum_{i}\alpha_{i}y_{i}\vec{x_{i}}\] 

\subsection{Dual problem}



(\noun{Dual}) \begin{eqnarray*}
&& \max\sum_{i}\alpha_{i}-\frac{1}{2}\sum_{ij}\alpha_{i}y_{i}\alpha_{j}y_{j}\left\langle x_{i},x_{j}\right\rangle \\
\mbox{s.t.} && \alpha_{i}  \geq  0\\
&&\sum_{i}\alpha_{i}y_{i}  =  0\end{eqnarray*}


where the inner products $\left\langle x_{i},x_{j}\right\rangle $ are the entries of the kernel (Gram)
matrix, i.e., $k\left(x_{i},x_{j}\right)=\left\langle x_{i},x_{j}\right\rangle $.

\subsection{Generalizations}

 What if there are a few outliers? The data might not
be separable, or might be separable but might have noise. 

Problem statement (\noun{Primal}). Define slack variable $\zeta$.
Define regularization parameter $\eta$.

\begin{eqnarray*}
\min_{w,b,\zeta} & \frac{1}{2}\left|\left|w\right|\right|_{2}^{2}+\eta\left|\left|\zeta\right|\right|_{1}\\
\mbox{s.t.} & y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right)   \geq1-\zeta_{i} \\
&\zeta\geq0\end{eqnarray*}
where $\zeta_i$ measures the degree of misclassification of the $x_i$.
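 To remove the constraints, define the Lagrangian with multipliers $\alpha_{i}\geq0$ and $\mu_{i}\geq0$:
\[
L\left(w,b,\zeta,\alpha,\mu\right)=\frac{1}{2}\left|\left|w\right|\right|^{2}+\eta\sum_{i=1}^{n}\zeta_{i}-\sum_{i=1}^{n}\alpha_{i}\left(y_{i}\left(\left\langle w,x_{i}\right\rangle +b\right)-1+\zeta_{i}\right)-\sum_{i=1}^{n}\mu_{i}\zeta_{i}\]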


In the dual:

\begin{eqnarray*}
&\max\sum_{i}\alpha_{i}-\frac{1}{2}\sum_{ij}\alpha_{i}y_{i}\alpha_{j}y_{j}\left\langle x_{i},x_{j}\right\rangle \\
&\eta\geq\alpha_{i}  \geq  0\\
&\sum_{i}\alpha_{i}y_{i}  =  0\end{eqnarray*}

\underbar{\noun{Idea: }}

$k\left(x_{i},x_{j}\right)\sim$ correlation
matrix based on dot products\[
\Phi:x\rightarrow\Phi\left(x\right)\in \mathcal{F}\]
where $\mathcal{F}$ is the feature space, which may be high dimensional. Work with $\left(\Phi\left(x_{i}\right),y_{i}\right)$ in $\mathcal{F}$.
But since $\mathcal{F}$ is higher dimensional it will be worse algorithmically
and statistically.

Good news: 
\begin{itemize}
\item hyperplanes are particularly nice - regularized heavily, vector space
computations (eigenvalues, convex optimization)
\item for certain $\mathcal{F}$ this works
\end{itemize}
Note: $k(x, y)$ may be very inexpensive to calculate, even though $\Phi(x)$ itself may be very expensive to calculate (e.g., in high dimensions). Examples,

\begin{eqnarray*}
k\left(x,y\right) &\sim& \exp\left(-\beta\left|\left|x-y\right|\right|^{2}\right) \\
& \sim& \left(\left\langle x,y\right\rangle +1\right)^{\beta} \\
&\sim& \tanh\left(\alpha\left\langle x,y\right\rangle +\beta\right)
\end{eqnarray*}

$k$ can also be defined {}``operationally'' from data-defined graphs.

\section{Reproducing Kernel Hilbert Spaces}

\subsection{Hilbert Space}
\underbar{\noun{Define}} A vector space is a space with things (vectors)
such that addition and scalar multiplication (over a field) are defined.
e.g. $\mathbb{R}$ , $\mathbb{R}^{n}$, $\mathbb{R}^{\mathbb{R}}-$functions
from $\mathbb{R}\rightarrow\mathbb{R}$. $\mathbb{R}^{X}-$set of
functions from $X\rightarrow\mathbb{R}$ (where $X$ might be $\mathbb{R}^{n}$
or some subset of it).

\underbar{\noun{Define} } A \noun{Banach space }is a vector space with a
norm (i.e., elements have some ``size'' measure) that is complete with respect to that norm.

e.g., consider $\mathbb{R}^n$, fix a number $p\geq 1$, then $\left|\left|x\right|\right|_{p}=\left(\sum_{i}\left|x_{i}\right|^{p}\right)^{1/p}$
(the $p$-norm)

Define $L_{p}=\left\{ f:\mathbb{R}^{n}\rightarrow\mathbb{R}:\int\left|f\right|^{p}dx<\infty\right\} $
with norm $\left|\left|f\right|\right|_{L_{p}}=\left(\int\left|f\right|^{p}dx\right)^{1/p}$.

A \noun{Hilbert space } is a vector space with an inner product that is complete with respect to the norm induced by the inner product, i.e., a Banach space whose norm comes from an inner product. (Note: A metric space $M$ is complete if every Cauchy sequence in $M$ converges in $M$).

Examples:

$\mathbb{R}^{n}$ where $\left\langle x,y\right\rangle =\sum_{i=1}^{n}x_{i}y_{i}$

$l_{2}$ where $\left\langle x,y\right\rangle _{l_{2}}=\sum_{i=1}^{\infty}x_{i}y_{i}$ 

$L_{2}$ where $\left\langle x,y\right\rangle _{L_{2}}=\int_{-\infty}^{\infty}x\left(t\right)y\left(t\right)dt$

Intuitively, $L_{2}$ is an infinite-dimensional version of $\mathbb{R}^{n}$
\underbar{but} 
\begin{itemize}
\item it's too big to get tractable algorithms, too big for good generalization
properties
\item too many {}``weird'' or ``pathological'' functions
\end{itemize}
So, consider a subset of it (RKHS):

\subsection{RKHS}

\underbar{\noun{Define}}\underbar{ }Let $X$ be a compact subset of $\mathbb{R}^{n}$
and $H$ a Hilbert space of functions from $X\rightarrow\mathbb{R}$.
$H$ is a \noun{Reproducing Kernel Hilbert Space }if $\exists$ some
kernel $k:X\times X\rightarrow\mathbb{R}$ such that
\begin{enumerate}
\item $k$ has the {\it reproducing property}: $\left\langle k(\cdot,x),f\right\rangle =f\left(x\right)$ for all $f\in H$ and $x\in X$
\item $k$ spans $H$, i.e. $ \mbox{span}  \left\{ k ( \cdot, x):x\in X \right\} =H$
\end{enumerate}
\underbar{Technical point}

Riesz Representation Theorem.

If $\Phi$ is a \underbar{bounded }linear functional on $H$ then $\exists$
\underbar{unique }$u\in H$ such that $\Phi\left(f\right)=\left\langle f,u\right\rangle _{H}\ \forall f\in H$ 

\underbar{Define} a function/operator $k$ is \underbar{positive-definite}
if for all nonzero functions $f$, $\int\int f\left(x\right)k\left(x, x'\right)f\left(x'\right)dx\,dx'>0$

The high level idea:
\begin{enumerate}
\item start with kernel $k$

\item define universe $V$ of $H$ ie a set of functions and \underbar{define
}a dot product on $V\times V$

\item This dot product gives a norm, which makes a reproducing kernel Hilbert
space
\end{enumerate}

Given a positive-definite kernel $k\left(x,x'\right)$ \underbar{AND
}$x_{1},..x_{n}$, \underbar{define }a Gram matrix $K$ such that

\[ K_{ij}=k\left(x_{i},x_{j}\right) \]

\underbar{Note }Cauchy Schwarz holds ie $k\left(x_{i},x_{j}\right)^{2}\leq k\left(x_{i},x_{i}\right)k\left(x_{j},x_{j}\right)$ 

Define the feature map

\[\Phi:x\rightarrow k\left(\cdot,x\right) \]

i.e., represent each $x$ by its behavior with respect to every other point.

Construct a vector space by linear combinations of $k\left(\cdot,x\right)$ 

\[f\left(\cdot\right)=\sum_{i=1}^{n}\alpha_{i}k\left(\cdot,x_{i}\right)\]
--- vector space, reproducing kernel Hilbert space

Define dot product:

\[g\left(\cdot\right)=\sum_{j}\beta_{j}k\left(\cdot,x_{j}\right)\]
Then
$\left\langle f\left(\cdot\right),g\left(\cdot\right)\right\rangle =\sum_{ij}\alpha_{i}\beta_{j}\left\langle k\left(\cdot,x_{i}\right),k\left(\cdot,x_{j}\right)\right\rangle =\sum_{ij}\alpha_{i}\beta_{j} k\left(x_{i},x_{j}\right)$ 

claim: this is an inner product

\[\left\langle k\left(\cdot,x\right),f\right\rangle =\sum_{i}\alpha_{i}k\left(x_{i},x\right)=f\left(x\right),\]
i.e., $k\left(\cdot,x\right)$ is the {}``representer'' of evaluation at $x$ (analog of the
delta function).

In particular, one possible $f$ could be the kernel $k\left(\cdot,x\right)$
in which case the dot product:

\[\left\langle k\left(\cdot,x\right),k\left(\cdot,x'\right)\right\rangle =k\left(x,x'\right)\]

this is reproducing

\subsection{Mercer Theorem}

If $k$ is a positive definite kernel, then $\exists$ continuous functions $\left\{ \Phi_{i}\right\} _{i=1}^{\infty}$ and nonnegative $\left\{ \lambda_{i}\right\} _{i=1}^{\infty}$
such that $k\left(x,x'\right)=\sum_{i=1}^{\infty}\lambda_{i}\Phi_{i}\left(x\right)\Phi_{i}\left(x'\right)$ 

\underbar{will show}~ 
\begin{itemize}
\item that we can represent data as a finite set of points
\item solutions to optimization problems can be written in terms of data
points
\end{itemize}
This is the basis for kernel methods: any algorithm that depends on the data only through
dot products can be expressed in terms of $k\left(x,x'\right)$ 
\begin{itemize}
\item construction of data dependent kernels (isomap, lle, laplacian eigenmaps)
\end{itemize}

\section{References}
\begin{enumerate}
   \item Cortes and Vapnik, ``Support-Vector Networks'', \emph{Machine Learning},
    pp.~273--297, 1995

\item  Sch\"olkopf, Smola, and M\"uller, ``Nonlinear component analysis as a kernel eigenvalue problem'', 1998
\item  Sch\"olkopf, Herbrich, Smola, and Williamson, ``A Generalized Representer Theorem''


\end{enumerate}


\end{document}
