\documentclass[twoside]{article}

\usepackage{amssymb, amsmath}
\usepackage{mathrsfs}
\usepackage{dsfont}

\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.2in \headsep 0.2in
\textwidth   6.5in \textheight 9in 
\parskip 1.5ex  \parindent 0ex \footskip 40pt

\newcommand{\R}{\ensuremath{\mathbf{R}}}
\newcommand{\Remp}{\ensuremath{R_{emp}[f]}}
\newcommand{\half}{\ensuremath{\frac{1}{2}}}


\begin{document}

\framebox[6.4in]{
\begin{minipage}{6.4in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 13 - 11/06/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\large Global and Local Spectral Methods for Clustering and Partitioning Graphs and Data }}
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribes: David Fong and Rajendra Shinde}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{ \it *Unedited Notes}}

\section{Graph partitioning using spectral methods}

Recall Cheeger's inequality

\begin{equation}\label{version:1}
\frac{d-\lambda_2}{2} \leq h_G \leq \sqrt{2d(d-\lambda_2)}
\end{equation}

Here $G$ is a $d$-regular graph and $h_G  = \min_{S,|S| \leq \frac{|V|}{2}} \frac{E(S,\bar{S})}{|S|}$ denotes the edge-expansion of the graph.  Also $\lambda_2 = \max_{x \perp \bar{1}} \frac{x^TAx}{x^Tx}$ denotes the second largest eigenvalue of the adjacency matrix $A$ of the graph (so that $d - \lambda_2$ is the Fiedler value, i.e.\ the least non-zero eigenvalue of the Laplacian $dI - A$ when $G$ is connected) and $\bar{1}$ denotes the all ones vector.

An alternate form of Cheeger's inequality, which holds for all connected graphs $G$~\cite{Chung}, is
\begin{equation}\label{version:chung}
2h_G \geq \lambda_G \geq \alpha_G^2/2 \geq h^2_G/2
\end{equation}

where $G$ is any connected graph, $h_G$ is the Cheeger constant of the graph, defined as $h_G = \min_{S,\text{Vol}(S) \leq \text{Vol}(V)/2} \frac{|\partial S|}{\text{Vol}(S)}$\footnote{Vol$(S) = \sum_{i \in S}d_i$}, $\lambda_G$ denotes the Fiedler value of the graph Laplacian (i.e.\ the second smallest eigenvalue of the normalized graph Laplacian) and $\alpha_G$ represents the minimum value among all Cheeger ratios of initial segments of vertices when all vertices are arranged in a line using the eigenvector associated with $\lambda_G$. 

{\bf Proof of: }{$\frac{d-\lambda_2}{2} \leq h_G$ in \eqref{version:1}}

Consider the quadratic form
\begin{equation}
\begin{array}{ll}
\sum_{ij} A_{ij}{(x_i - x_j)}^2 & = 2d(x^Tx) - 2 \sum_{ij} x_i A_{ij} x_j. \\
 & = 2d(x^Tx) - 2 x^T A x. \\
\end{array}
\end{equation}
Recall
\begin{equation}
\begin{array}{ll}
\lambda_2 &= \text{max}_{x \perp \bar{1}} \frac{x^TAx}{x^Tx}  \\
 &= \text{max}_{x \perp \bar{1}} \frac{dx^Tx - (1/2)\sum_{ij}A_{ij}{(x_i-x_j)}^2}{x^Tx}  \\
 &= d - \text{min}_{x \perp \bar{1}} \frac{\sum_{ij}A_{ij}{(x_i-x_j)}^2}{2x^Tx}  \\
\end{array}
\end{equation}

Let $S$ denote the set which achieves the minimum Cheeger ratio, i.e.\ $\frac{E(S,\bar{S})}{\text{min}(|S|,|\bar{S}|)}  = h_G$. Let $p  = |S|/n$ and $q = |\bar{S}|/n = 1 - p$. 
Let 
\begin{equation}\begin{array}{ll}
x_i &= q \text{  if  } i \in S \\
x_i &= -p \text{  if  } i \in \bar{S} 
\end{array}\end{equation}
Then  $x \cdot \bar{1} = |S|q - |\bar{S}|p = npq - nqp = 0 \implies x \perp \bar{1}$. 
Also,   
$x^Tx = |S|q^2 + |\bar{S}|p^2 = npq^2 + nqp^2 = nqp(p+q) = npq$. 
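Then, since this particular $x$ is feasible for the minimization and $\sum_{ij}A_{ij}{(x_i-x_j)}^2 = 2E(S,\bar{S}){(p+q)}^2 = 2E(S,\bar{S})$,

\begin{equation}\label{ref1}
\begin{array}{ll}
d - \lambda_2 & = \text{min}_{x \perp \bar{1}} \frac{ \sum_{ij}A_{ij}{(x_i-x_j)}^2 }{2x^Tx}  \\
& \leq E(S,\bar{S})/npq \\
& = nE(S,\bar{S})/(|S||\bar{S}|) \\
\end{array}
\end{equation}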

Now we know that the sparsity$(sp)$ of the cut $(S,\bar{S})$ is defined as $sp(S) = \frac{nE(S,\bar{S})}{|S||\bar{S}|}$. 
Further, $sp(S) = \frac{nE(S,\bar{S})}{\text{min}(|S|,|\bar{S}|)\text{max}(|S|,|\bar{S}|)}$ and using the fact that $\text{max}(|S|,|\bar{S}|) \geq n/2$, we get $sp(S) \leq 2 \frac{E(S,\bar{S})}{\text{min}(|S|,|\bar{S}|)} = 2h_G$. Hence we get $d -\lambda_2 \leq 2h_G$. 

In fact if $\phi$ denotes the {\it sparsest cut} (i.e. the cut of minimum sparsity) then it can be shown that 
\begin{equation} \label{fact} \phi \geq h_G \geq (1/2)\phi \end{equation}
Hence, an approximation to the sparsest cut is a 2-approximation to $h_G$. 

{\bf Another explanation:}
We have already seen that 
\begin{equation}\begin{array}{l}
\sum_{i,j} A_{ij} {(x_i - x_j)}^2 = 2 d x^T x - 2 x^T A x \\
\implies \sum_{i,j} {(x_i - x_j)}^2 = 2 n x^T x - 2 x^T {\bf 1} x \\
\text{where } {\bf 1} \text{ denotes the matrix of all ones, so that } {\bf 1} \cdot x = 0,\text{  } \forall x \perp \bar{1}
\end{array}\end{equation}

Plugging this back in \eqref{ref1}, we get, 
\begin{equation}\label{ref1b}
\begin{array}{ll}
d - \lambda_2 & = \text{min}_{x \perp \bar{1}} \frac{ \sum_{ij}A_{ij}{(x_i-x_j)}^2 }{2x^Tx}  \\
			   & = \text{min}_{x \perp \bar{1}} \frac{ \sum_{ij}A_{ij}{(x_i-x_j)}^2 }{(1/n) \sum_{ij}{(x_i-x_j)}^2}  \\
\end{array}
\end{equation}
 
However observe that the right side is invariant under the transformation $x \to x + c \bar{1}$. Hence we can choose $c$ in order to eliminate the constraint $x \perp \bar{1}$. 

Hence  
\begin{equation}\label{ref2}
\begin{array}{ll}
d - \lambda_2 & = \text{min}_{x \in {\mathds{R}}^n } \frac{ \sum_{ij}A_{ij}{(x_i-x_j)}^2 }{(1/n) \sum_{ij}{(x_i-x_j)}^2} \\
\end{array}
\end{equation}

Now again, recall the sparsest cut 
\begin{equation}\label{ref3} 
\begin{array}{ll}
\phi &= \text{min}_{S} \frac{nE(S,\bar{S})}{|S||\bar{S}|} \\ 
	 &= \text{min}_{x \in {\{0,1\}}^n} \frac{ \sum_{ij}A_{ij}|x_i-x_j| }{(1/n) \sum_{ij}|x_i-x_j|} \\
     &= \text{min}_{x \in {\{0,1\}}^n} \frac{ \sum_{ij}A_{ij}{(x_i-x_j)}^2 }{(1/n) \sum_{ij}{(x_i-x_j)}^2}
\end{array} 
\end{equation}

Hence, comparing~\eqref{ref2} and~\eqref{ref3}, we get $d - \lambda_2 \leq \phi$. Also, using~\eqref{fact}, we conclude that $d - \lambda_2 \leq 2h_G$.
%d - \lambda_2 = \text{min} \frac{A_{ij}{(x_i-x_j)}^2}{2x^Tx}  \leq \frac{2hnp}{npq} = \frac{2h}{q} \leq h
%\end{array}

\section{Claim: $h\leq \sqrt{2d(d-\lambda_2)}$ for $d$-regular graphs}
\paragraph{Proof:}
\[ 
    \vdash_1: (\forall y \in \mathbb{R}^n) \quad 
    \sum_{i,j} A_{i,j} |y_i^2 - y_j^2| 
    \leq \sqrt{2d y^T y - 2 y^T A y } \sqrt{ 4 d y^T y} 
\]
\begin{align*}
    \text{LHS} 
    &= \sum_{i,j} A_{i,j}^\half |y_i - y_j| |y_i + y_j| A_{i,j}^\half
  \\&\leq \left( \sum_{i,j} A_{i,j}|y_i - y_j|^2 \right)^\half
          \left( \sum_{i,j} A_{i,j}|y_i + y_j|^2 \right)^\half
  \\&\leq \left( \sum_{i,j} A_{i,j}|y_i - y_j|^2 \right)^\half
          \left( \sum_{i,j} 2 A_{i,j}(y_i^2 + y_j^2) \right)^\half
  \\&= \left( \sum_{i,j} A_{i,j}|y_i - y_j|^2 \right)^\half
          \left( 2d \sum_{i} y_i^2 + 2d \sum_{j} y_j^2 \right)^\half
  \\&= \left( 2d y^T y - 2y^TAy \right)^\half
          \left( 4d y^T y \right)^\half
\end{align*}
\[
  \vdash_2:
    \begin{cases}
      \text{Let $x$ be an eigenvector with eigenvalue $\lambda_2$, } 
      \text{i.e. $xA = \lambda_2 x$ s.t. $|\{i: x_i >0\}|\leq \frac{n}{2}$ } \\
      \text{Define $y: y_i= \max\{x_i,0\}$, }
      \text{then $yA\geq \lambda_2 y$ componentwise.}
    \end{cases}
\]
Since $A$ is entrywise nonnegative and $y \geq x$ componentwise, we have
\[
  \begin{cases}
    x_i>0: & (yA)_i \geq (xA)_i = (\lambda_2 x)_i = (\lambda_2 y)_i \\
    x_i\leq 0: & (yA)_i \geq 0 = (\lambda_2 y)_i
  \end{cases}
\]
\[
  \text{For $y$ defined before, }
  \sum_{i,j} A_{i,j} |y_i^2 - y_j^2| \geq 2hy^T y
\]
Arrange the vertices so that the components of $y$ are in non-increasing order
\[ y(v_1) \geq y(v_2) \geq \cdots \geq y(v_n) \]
With $t$ of them strictly greater than 0, i.e.
\[ y(v_t)> y(v_{t+1}) = \cdots = y(v_n) = 0 \]
Let $K$ be the set of indices at which a jump occurs:
\[ K = \{ k: y(v_k)> y(v_{k+1})\} \]
\begin{align*}
      \sum_{u,v} A_{u,v} |y(u)^2-y(v)^2|
      &= 2 \sum_{i=1}^t \sum_{j=i+1}^n A_{v_i,v_j}(y(v_i)^2-y(v_j)^2)
      \\&= 2 \sum_{k\in K}\sum_{i\leq k} \sum_{j>k} A_{v_i,v_j}(y(v_k)^2-y(v_{k+1})^2)
\end{align*}
For each $k=1,\cdots,n$, let
\begin{align*}
  L_k &= \{v_i: i\leq k\} \\
  L_0 &= \emptyset
\end{align*}
Note $\sum_{i\leq k} \sum_{j>k} A_{v_i,v_j} \geq h|L_k|$ for $k \leq t$, since then $|L_k| \leq t \leq \frac{n}{2}$ \quad (*)
\begin{align*}
  \text{*RHS}
  &\geq 2\sum_k h|L_k| (y(v_k)^2 - y(v_{k+1})^2)
  \\&= 2h \sum_k (|L_k|- |L_{k'}|) y(v_k)^2 
  \\&= 2h \sum_k |\{v: y(v) = y(v_k) \}| y(v_k)^2 
  \\&= 2h \sum_v y(v)^2 
  \\&= 2h y^T y
\end{align*}
\begin{align*}
  h
  &\leq \frac{\sum_{i,j} A_{i,j} |y_i^2 -y_j^2|}{2y^Ty}
  \\&\leq \frac{(2dy^Ty - 2y^TAy)^\half (4dy^T y)^\half}{2y^T y}
  \\&\leq (2d(d-\lambda_2))^\half
\end{align*}
\paragraph{Algorithm (Spectral Graph Partitioning)}
\begin{enumerate}
  \item Compute 2nd eigenvector of A/L
  \item Perform a sweep cut in some way (i.e.\ check the set of best nodes
    derived from the eigenvector)
  \item Keep the best cut
\end{enumerate}
\paragraph{Potential Issues}
\begin{itemize}
  \item The actual set returned might not be ``good'', e.g.\ close to optimal
  \item The eigenvector computation may be too expensive.
  \item Local information may be reliable, but global not. For extremely large 
    graphs, or graphs with local information nice, global properties bad. We may 
    want to cluster locally, pull out a set of nodes that are good near you.
\end{itemize}
Lots of heuristics are motivated by this: Cut out nearest neighbors, 2nd 
nearest neighbors, \dots

Can we inherit some of the nice properties of the global spectral method? For example,
Cheeger's inequality, sweep cuts, \dots
\paragraph{2 ways to be local:}
\begin{itemize}
  \item Bias yourself locally, but still do computation that depends on the size
    of the graph.
  \item Have computation that depends on the size of the output, not the size 
    of the graph.
\end{itemize}


\begin{thebibliography}{10}
\bibitem {Chung} F. Chung. Four proofs of the Cheeger inequality and graph partition algorithms. 
\end{thebibliography}

\end{document}

