\documentclass[twoside]{article}

\usepackage{amssymb, amsmath}
\usepackage{mathrsfs}
\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.2in \headsep 0.2in
\textwidth   6.5in \textheight 9in 
\parskip 1.5ex  \parindent 0ex \footskip 40pt


\begin{document}

\framebox[6.8in]{
\begin{minipage}{6.8in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 8 - 10/19/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\Large Spectral \& Kernel Methods for Nonlinear Dimensionality Reduction (1 of 2)}} 
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribe: Gourab Mukherjee and Deyan Simeonov}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{ \it *Unedited notes}}

\section{ Linear Dimensionality Reduction }
\subsection{Definition}
Let $K(x,y)$ be a kernel. A function $\psi$ is an \textit{eigenfunction} of $K$ with eigenvalue $\lambda$ if 
$$\int K(x,x') \psi(x') \, d x' = \lambda \psi(x).$$

\subsection{Theorem[Mercer]}
Let $K$ be a positive-definite kernel, $\int \int K(x,x') dx dx' < \infty$.
Then we can write $K$ in terms of a countable set of orthonormal eigenfunctions and eigenvalues,
in other words there exist $(\psi_i, \lambda_i)$, such that
$$K(x,x') = \sum_{i=1}^{\infty} \lambda_i \psi_i(x) \psi_i(x')$$

Our goal is to use this coordinate representation to construct feature maps.

Define $H = \{  \sum_{j=1}^{\infty} \psi_j(x) c_j \} $ (note that we have $K(x,x') \in H$).\\
Define $f(x) = \sum_j c_j \psi_j(x)$ and
	$g(x) = \sum_j d_j \psi_j(x)$.\\
Then $\langle f, g \rangle = \sum_j \frac{1}{\lambda_j} c_j d_j$ (the $\frac{1}{\lambda_j}$ factor smooths out the space).\\
\\
\textbf{Claim.} This is a reproducing kernel Hilbert space (RKHS):
$$\langle f(\cdot), K(\cdot, x') \rangle = \sum_j \frac{1}{\lambda_j} c_j \lambda_j \psi_j(x') = \sum_j c_j \psi_j(x') = f(x').$$
Above, we are given a positive-definite kernel $K$. Then we can construct an RKHS with $K$ as its reproducing kernel. In addition to defining $H$, we can use this to find the feature map $\phi : X \rightarrow H$ such that $K(x,x') = \langle \phi(x), \phi(x') \rangle$.\\
\\
There are 2 representations:\\
1. Use $H_K$ as the feature space:\\
Define $\phi(x) = K( \cdot , x)$. Then 
$$\langle \phi(x), \phi(x') \rangle = \langle K( \cdot , x), K( \cdot , x') \rangle = K(x,x').$$
2. Use $l_2$ as the feature space:\\
Define $\phi(x) = (\sqrt{\lambda_1}\, \psi_1(x), \sqrt{\lambda_2}\, \psi_2(x), \dots)$. Then 
$$\langle \phi(x), \phi(x') \rangle = \sum_{j=1}^{\infty} \lambda_j \psi_j(x) \psi_j(x') = K(x,x').$$

We want to use this to do optimizations.

\subsection{Representer theorem}
We have a kernel $K$ on $X \times X$, data $(\vec{x_i}, y_i) \in X \times \mathbb{R}$ ($1 \leq i \leq m$), a strictly increasing function $g: [0, \infty) \rightarrow \mathbb{R}$, a cost function $c: {(X \times \mathbb{R}^2)^m} \rightarrow \mathbb{R} \cup \{\infty\}$, and a class of functions $f: X \rightarrow \mathbb{R}, f( \cdot ) = \sum_i \beta_i K( \cdot , z_i)$ with an RKHS norm (i.e. $\| \sum_i \beta_i K( \cdot , z_i) \|^2 = \sum_{i,j} \beta_i \beta_j K(z_i, z_j)$).\\
If $c$ is a regularized risk function, then
$\min_f \left[ c((x_1,y_1,f(x_1)),\dots,(x_m,y_m,f(x_m)))+g(\|f\|) \right]$ will admit a solution of the form $f( \cdot ) = \sum_{i=1}^{m} \alpha_i K( \cdot , x_i)$. So any algorithm written in terms of dot products can be ``kernelized''. 
\\
\\
There are ``a priori'' kernels:
\begin{itemize}
\item Gaussian kernel
\item Polynomial kernel
\item etc.
\end{itemize}
One can construct a ``data-dependent'' kernel:\\
1. Construct a similarity graph.\\
2. Do eigenanalysis on the Laplacian or adjacency matrix. Then ``embed'' the data using these eigenvectors. In addition, we can view these things as kernels. They rely on a small number of algorithmic primitives. Think of these procedures as data analysis tools.

\subsection{Linear dimensionality reduction methods}

\subsubsection{PCA}
\begin{itemize}
\item maximize variance
\item minimize reconstruction error
\item $C \sim X X^T$
\end{itemize}
You can ``kernelize'' PCA, i.e.\ write it in terms of dot products.
\\
\subsubsection{MDS (Multi-Dimensional Scaling)}
\begin{itemize}
\item minimize dot product error
\item $G \sim X^T X$
\end{itemize}

Let $\delta_{ij} = ||x_i-x_j||^2_2 = (x_i - x_j)^T (x_i - x_j)$.\\
Let $A_{ij} = - \frac{1}{2} \delta_{ij}$.\\
$B = HAH$, where $H$ is a ``centering'' matrix ($H=I-\frac{1}{n} \mathbf{1}\mathbf{1}^T$).\\
$B$ is SPSD.\\
% $B_{ij} = (x_i - x)^T (x_j - x) = H x^T x H = (Hx^T)(Hx^T)^T$ - SPSD matrix.\\
\\ 
\textbf{Fact:}\\
If $K_{ij} = f(||x_i-x_j||)$, then $K_{ij} = r(\delta_{ij})$ ($r(0) = 1$).
$$\tilde{\delta}_{ij} 
= \text{squared Euclidean distance in feature space} 
= (\phi(x_i) - \phi(x_j))^T (\phi(x_i) - \phi(x_j)) 
= 2(1-r(\delta_{ij})),$$
i.e. $A$ is such that $A_{ij} = r(\delta_{ij}) -1$, $A=K-\mathbf{1}\mathbf{1}^T$, so (fact) $HAH=HKH$.
\\
\\
In the linear case both PCA and MDS rely on SVD and can be constructed in $O(mn^2)$ time ($m > n$).
\\
Note: For isotropic kernels, i.e. $K_{ij} = f(\|x_i-x_j\|)$, PCA is a form of MDS and vice versa.

\section{Non-Linear Dimensionality Reduction}

\textbf{General Framework}
\begin{itemize}
\item Derive some graph (often sparse) from the data.
\item Derive a matrix from the graph (e.g., adjacency matrix, Laplacian).
\item Derive an embedding into $\Re^d$ from the eigenvectors of that matrix.
\end{itemize}

\subsection{ISOMAP}
\emph{Algorithm}
\begin{itemize}
\item Build the nearest neighbor graph.
\item Look at the shortest path or geodesic distance between all pairs.
\item Do multidimensional scaling (MDS) based on $A$ (the shortest path distance matrix).
\end{itemize}
\emph{Advantages}
\begin{itemize}
\item Polynomial Time.
\item No local minima.
\item Non-iterative.
\end{itemize}
\emph{Disadvantages}
\begin{itemize}
\item Non-linear Time.
\item No immediate out of sample extension.
\end{itemize}

\subsection{Local Linear Embedding (LLE) }
\emph{Algorithm}
\begin{description}
\item [Step1 : Construct the Adjacency Graph]  There are two variations:
\begin{enumerate}
\item  $\epsilon$ neighborhood 
\item  K Nearest neighbor Graph 
\end{enumerate}
\item  [Step2 : Choosing  weights]  Weights are chosen based on the projection of each data point on the 
linear subspace generated by its neighbors. 
$ W_{ij} =\left\{ \begin{array}{c c}
0 & \text{if vertex $j$ is not a neighbor of $i$} \\ 
\arg\min_{w} \sum_i \|x_i-\sum_j w_{ij}x_j\|^2 & \text{otherwise} \end{array}\right.$ 
\item [Step3 : Mapping to Embedded Co-ordinates] Compute output $y \in \Re^d$ such that
\begin{equation}
\psi(y)=\sum_{i} ||y_i - \sum_j W_{ij} y_j ||^2  \text{  is minimized .}
\end{equation}
\begin{itemize}
\item The above minimization reduces to finding the eigenvectors corresponding to the $d+1$ lowest eigenvalues of the positive semidefinite matrix $(I-W)^T(I-W)$.
\item The lowest eigenvalue is uninteresting, so the corresponding eigenvector is thrown out.
\end{itemize}
\end{description}

\subsection{Laplacian Eigenmaps (LE)}
\emph{Algorithm}
\begin{description}
\item [Step1 : Construct the Adjacency Graph]  There are two variations:
\begin{enumerate}
\item  $\epsilon$ neighborhood 
\item  K Nearest neighbor Graph 
\end{enumerate}
\item [Step2 : Choosing weights] 
\[ W_{ij}=\left\{ \begin{array}{c c}
e^{-\frac{||x_i-x_j||^2}{4t}} & \text{if vertices i \& j are connected by an edge} \\
0  & \text{otherwise} \end{array} \right.\] 
\item [Step3 : Eigen maps] We compute $ y \in \Re^d $ such that
\begin{equation}
\psi(y)=\sum_{i,j}\frac{w_{ij}||y_i-y_j||^2}{\sqrt{D_{ii} \cdot D_{jj}}}
\end{equation}
is minimized for each connected component of the graph, where $ D=\mathrm{diag}\{\sum_i w_{ij} : j=1, \dots, n \}$. 
\end{description}



\section{References}
\begin{enumerate}
\item Saul, Weinberger, Ham, Sha, and Lee. Spectral methods for dimensionality reduction. \emph{Semisupervised Learning}, MIT Press, Cambridge, MA, 2006.
\item Belkin and Niyogi. Laplacian eigenmaps for dimensionality reduction and data representation. \emph{Neural Computation}, MIT Press, 2003.
\end{enumerate}
\end{document}


