\documentclass[twoside]{article}

\usepackage{amssymb, amsmath,amsthm}
\usepackage{mathrsfs}

\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.2in \headsep 0.2in
\textwidth   6.5in \textheight 9in 
\parskip 1.5ex  \parindent 4ex \footskip 40pt

\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}

\begin{document}

\framebox[6.4in]{
\begin{minipage}{6.4in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 4 - 10/05/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\Large Norms of Random Matrices \& Low-Rank via Sampling }} 
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribes: Jacob Bien and Noah Youngs}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{\it *Unedited Notes}}

\section{ Overview}
\underline{Looked at so far:} \\
	\indent -Randomized algorithms for (fast) low-rank approximation\\
	\indent -Johnson Lindenstrauss Lemma $\rightarrow$ Random Projections\\
	\indent -Approximate multiplication $\rightarrow$ Random Sampling Algorithms\\
	\indent \indent Both have additive error\\
	\\
\underline{Today:}\\
\indent Wigner's Semicircle Law\\
\indent \indent Random Matrix Theory $\rightarrow$ Element-wise sampling algorithm\\
\\
\\
\underline{Next Time:}\\
\indent Approximate L2 Regression $\rightarrow$ $(1 + \epsilon)$ Approximation
\section{ Wigner's Semicircle Law }
What does a $10^6$ dimensional data-set look like?\\
 \indent -What would the ``null hypothesis'' of a truly random data set look like?\\
 \\
 \underline{Wigner's Semicircle Law:}\\
 \indent Let $A = (a_{ij}) $ be a symmetric matrix $(a_{ij} = a_{ji})$ such that\\
\indent \indent \indent 1. $E(a_{ij}) = 0$\\
\indent \indent \indent 2. $Var(a_{ij}) = \sigma^2$\\
\indent \indent \indent 3. $|a_{ij}| \leq K$\\
\\
\indent Let $\lambda_1 \geq \lambda_2 \geq \dots \geq \lambda_n \in \mathbb{R}$ be the eigenvalues of $A$\\
\indent Let $W_n(x) = $ the empirical distribution of the eigenvalues.\\
\\
\indent Then $\lim_{n\to +\infty} W_n(x \cdot 2 \sigma \sqrt{n}) = W(x)$ in probability\\
\\
\indent Where $W(x) = \begin{cases}
\frac{2}{\pi}(1-x^2)^{1/2} & \text{for $|x| \leq1$}\\
0 & \text{for $|x| > 1$}
\end{cases}$\\
\\
This function is a semicircle centered at the origin with radius 1\\
\\
\underline{Proof Idea:} Compare the moments of the rescaled empirical distribution $W_n(x \cdot 2 \sigma \sqrt{n})$ with the moments of $\frac{2}{\pi}(1-x^2)^{1/2}$
\section{Extensions}
1. If $E(a_{ij}) = \mu \neq 0$\\
\indent Then $\lambda_1(A)  \sim N(n \mu + \frac{\sigma^2}{\mu}, 2\sigma^2)$\\
\indent And the rest of the eigenvalues will follow the semicircle law\\
\\
2. Apply Wigner's Law to the adjacency matrix of a random graph\\
\\
\indent If $G = (V,E)$, and $\forall ij \in V \times V$, $ij \in E$ with probability $p$\\
\indent Then $A_{ij} = \begin{cases}
1 & \text{with probability $p$}\\
0 & \text{with probability $1-p$}
\end{cases}$\\
\\
\indent Therefore the entries of this matrix have non-zero mean, but fixed variance\\
\\
\indent How sparse can this matrix be before the results break down?\\
\indent Let $d = np$ be the average degree of the graph\\
\indent Then the ``trace argument'' only works if $p \geq \frac{\log(n)^4}{n}$\\
\indent In other words, if the above relation holds, then $d_{empirical} \approx d_{expected}$\\
\\
\indent What if $p = \frac{2}{n}$ ?\\
\indent What if you want a random graph with expected degrees?
\section{Bounds on the largest eigenvalue}
\underline{We want:}
\indent For a fixed n, to be able to make some statement about the largest eigenvalue\\
\\
\underline{Fact:} If $G$ is a random $n \times d$ matrix, $d \leq n$, with i.i.d.\ entries $N( 0, \sigma^2)$, and for a fixed $\epsilon, k$\\
\indent then w.p. $\geq \frac{1}{poly(k,\epsilon)}$\\
\indent $||G||_2 = ||G_k||_2 \approx (2+\epsilon) \sigma \sqrt{n}$\\
\indent $||G_k||_F \leq (2+\epsilon) \sigma \sqrt{nk}$\\
\\
\indent Is this scale good or bad?\\
\\
\indent Let D be the trivial rank-k approximation to G obtained by keeping the first k columns.\\
\indent \indent Then $||D||_F \approx \sigma \sqrt{nk}$\\
\indent \indent Rank(D) $\leq k$, by definition, so $||D||_2 \geq \frac{||D||_F}{\sqrt{k}} \approx \sigma \sqrt{n}$\\
\\
\indent This generalizes to the case where the distribution of the entries in $G$ has:\\
\indent \indent -Mean zero\\
\indent \indent -Bounded entries\\
\indent \indent -Independence\\
\\
The following results about the largest eigenvalue of a random symmetric matrix can be found in the references:\\
\\
From Furedi and Komlos:\\
 Let A be a random symmetric matrix with: $|a_{ij}| \leq k$, $E(a_{ij}) = 0$, and $var(a_{ij}) = \sigma^2$\\
 Then there exists a constant $C = C(\sigma,k)$ such that with high probability:\\
 \indent \indent $2\sigma \sqrt{n} -Cn^{1/3}\log(n) \leq \lambda_1(A) \leq 2\sigma \sqrt{n} +Cn^{1/3}\log(n)$\\
 \\
 This theorem is extended in Alon, Krivelevich, and Vu:\\
 Then given A,$ \forall$ t, $P[|\lambda_1(A) - E(\lambda_1(A))| > ct] \leq 4e^{\frac{-t^2}{32}}$






\section{``How can we use these ideas to do something low rank without causing too much damage?''}
Suppose $A\in \mathbb{R}^{m\times n}$ and $N$ is a noise matrix (i.e.\ 0 mean, etc.) such that $A+N\approx A$.  Then we will try to add a data-dependent noise matrix to get a speed-up (without messing things up too much).

\begin{lemma}
Let $A$ and $N$ be such that $\hat A = A+N$.  Then
\begin{enumerate}
\item[1)] $||A-A_k||_2 \le ||A-\hat A_k||_2 \le ||A-A_k||_2 + 2||N_k||_2 $
\item[2)] $||A-A_k||_F \le ||A-\hat A_k||_F \le ||A-A_k||_F + ||N_k||_F + 2\sqrt{||N_k||_F||A_k||_F} $
\end{enumerate}
(Note: $A_k$, $\hat A_k$, and $N_k$ denote the best rank $k$ approximations to $A$, $\hat A$ and $N$ respectively.)
\end{lemma}
\begin{proof}[Proof of Lemma]
The first inequality of both (1) and (2) follows from the definition of $A_k$ as the best rank $k$ approximation to $A$.  Now, let $B$ be any matrix (e.g. $A+N$).
\begin{enumerate}
\item[1)]
\begin{eqnarray*}
||A-B_k||_2 &\le& ||A-B||_2 + ||B-B_k||_2 \text{~by the triangle inequality}\\
&\le&||A-B||_2 + ||B-A_k||_2 \text{~since $B_k$ is the best rank $k$ approximation to $B$}\\
&\le&||A-B||_2 + ||B-A||_2 + ||A-A_k||_2\text{~by the triangle inequality}\\
&=&  ||A-A_k||_2 + 2||B-A||_2\\
&=&  ||A-A_k||_2 + 2||(B-A)_k||_2
\end{eqnarray*}
where the last equality holds since $B-A$ and $(B-A)_k$ have the same largest singular value.
Part (1) of the lemma follows by letting $B=\hat A = A+N$.

\item[2)]
Let $P_{M}$ denote the projection onto the column space of the matrix $M$.
\begin{claim}
$||P_{A_k}A||_F \le ||P_{B_k}A||_F + 2||(A-B)_k||_F$
\end{claim}
\begin{proof}[Proof of Claim]
\begin{eqnarray*}
||P_{A_k}A||_F &\le& ||P_{A_k}(A-B)||_F +||P_{A_k}B||_F \text{~by the triangle inequality}\\
 &\le& ||P_{A_k}(A-B)||_F +||P_{B_k}B||_F \text{~since $B_k$ is the best rank $k$ approximation to $B$}\\
&\le& ||P_{A_k}(A-B)||_F +||P_{B_k}(B-A)||_F + ||P_{B_k}A||_F\\
&\le& ||P_{B_k}A||_F + 2||P_{(A-B)_k}(A-B)||_F\text{~since $(A-B)_k$ is the best rank $k$ approx. to $A-B$}\\
&=& ||P_{B_k}A||_F + 2||(A-B)_k||_F\text{~since $P_{(A-B)_k}(A-B) = (A-B)_k$}
\end{eqnarray*}
which proves the claim.
\end{proof}
Now, from the claim it follows that
\begin{eqnarray*}
||P_{B_k}A||^2_F&\ge& ( ||P_{A_k}A||_F-2||(A-B)_k||_F )^2\\
 &=& ||P_{A_k}A||^2_F -4||P_{A_k}A||_F\cdot||(A-B)_k||_F + 4||(A-B)_k||^2_F.
\end{eqnarray*}
Thus, we have that
\begin{equation}\label{eq:*}
||P_{B_k}A||^2_F\ge ||P_{A_k}A||^2_F -4||P_{A_k}A||_F\cdot||(A-B)_k||_F,
\end{equation}
which we shall use shortly.
Next, observe that 
\begin{eqnarray*}
||A-B_k||_F &\le& ||A-P_{B_k}A||_F  + ||P_{B_k}A-B_k||_F \text{~by the triangle inequality}\\
&\le&||A-P_{B_k}A||_F + ||P_{B_k}(A-B)||_F \text{~since~$P_{B_k}B = B_k$}\\
&\le&||A-P_{B_k}A||_F + ||P_{(A-B)_k}(A-B)||_F\text{~since $(A-B)_k$ is the best rank $k$ approx. to $A-B$}.
\end{eqnarray*}
Thus,
\begin{equation}
\label{eq:**}
||A-B_k||_F \le||A-P_{B_k}A||_F + ||(A-B)_k||_F.
\end{equation}
Finally, we make use of Equations \ref{eq:*} and \ref{eq:**}:
\begin{eqnarray*}
||A-P_{B_k}A||_F &=& (||A||_F^2 - ||P_{B_k}A||_F^2)^{1/2} \text{~since $(A-P_{B_k}A)\perp P_{B_k}A$ }\\
&\le&(||A||_F^2 - ||P_{A_k}A||^2_F +4||P_{A_k}A||_F\cdot||(A-B)_k||_F)^{1/2} \text{~by Equation \ref{eq:*}}\\
&=&(||A-A_k||_F^2 +4||A_k||_F\cdot||(A-B)_k||_F)^{1/2}\\
&\le&||A-A_k||_F + 2(||A_k||_F\cdot||(A-B)_k||_F)^{1/2}
\end{eqnarray*}
using that $\sqrt{x+y}\le \sqrt{x}+\sqrt{y}$.  And by Equation \ref{eq:**},
\begin{eqnarray*}
||A-B_k||_F &\le& ||A-P_{B_k}A||_F + ||(A-B)_k||_F\\
&\le& ||A-A_k||_F + 2(||A_k||_F\cdot||(A-B)_k||_F)^{1/2} + ||(A-B)_k||_F.
\end{eqnarray*}
Taking $B=\hat A = A+N$, this gives
\begin{equation}
||A-\hat A_k||_F\le  ||A-A_k||_F + 2(||A_k||_F\cdot||N_k||_F)^{1/2} + ||N_k||_F,
\end{equation}
which completes the proof of the second part of the lemma.
\end{enumerate}
\end{proof}

\section{Applications of the above lemma}
The lemma above establishes that a rank $k$ approximation of the perturbed matrix may not be too much worse.  Now, we give two examples where perturbing can help in terms of memory and speed:
\begin{enumerate}
\item
In representing A, each $a_{ij}$ takes 32 or 64 bits.
\item
Iterative eigensolvers depend on the number of non-zero entries of $A$.
\end{enumerate}
\subsection{Quantize the data}
Given $A$, let $b=\max_{ij}|A_{ij}|$ and define
\begin{equation}
\hat A_{ij} = \begin{cases}
+b & \text{w.p.~} 1/2+A_{ij}/(2b)\\
-b & \text{w.p.~} 1/2-A_{ij}/(2b)
\end{cases}
\end{equation}
Now, $E[\hat A_{ij}]=A_{ij}$ and $Var[\hat A_{ij}]=b^2-A_{ij}^2$ and all the $\hat A_{ij}$ are independent.
It follows from Theorem 3.1 of the Achlioptas and McSherry paper, that with high probability, $||A-\hat A||_{F\text{~or~}2}$ is not too large.  By the lemmas, a low rank approximation to this quantized version of $A$ will not be too bad.
\subsection{Sparsify the data}
Let $p \ge \frac{(8\log n)^4}{n}$. Here, we sample (independently) elementwise:
\begin{equation}
\hat A_{ij} = \begin{cases}
A_{ij}/p & \text{w.p.~} p\\
0 & \text{w.p.~} 1-p
\end{cases}
\end{equation}
So $E[\hat A_{ij}]=A_{ij}$ and $Var[\hat A_{ij}]=A_{ij}^2(1/p - 1)\le b^2/p$.  And with high probability $||(A-\hat A)_k||_2 \le \text{``}2(1+\epsilon)\sigma\sqrt{n}\text{''} \le 4b\sqrt{n/p}$.

We can actually do better by non-uniform sampling:  Let $p_{ij} = pA_{ij}^2/b^2$ and 
\begin{equation}
\hat A_{ij} = \begin{cases}
A_{ij}/p_{ij} & \text{w.p.~} p_{ij}\\
0 & \text{w.p.~} 1-p_{ij}
\end{cases}
\end{equation}
We still have $E[\hat A_{ij}]=A_{ij}$ and $Var[\hat A_{ij}]=A_{ij}^2(1/p_{ij} - 1)$ and the error bounds still hold.
The expected number of non-zero elements is in this case
\begin{equation}
E[\text{Num. non-zeros}] = \sum p_{ij} = p||A||_F^2/b^2 = \frac{pmn}{b^2}\frac{||A||_F^2}{mn}. 
\end{equation}
(Alternatively, we can use $p_{ij}\propto |A_{ij}|$ for small entries to keep from violating the bound constraint.)
\section{References:}
D. Achlioptas, F. McSherry, \emph{Fast computation of low-rank matrix approximations}, Journal of the ACM 54.2 (2007).\\
\\
N. Alon, M. Krivelevich, and V. Vu, \emph{On the concentration of eigenvalues of random symmetric matrices}, Israel Journal of Mathematics 131.2 (2002), 259--267.\\
\\
Z. F\"uredi and J. Koml\'os, \emph{The eigenvalues of random symmetric matrices}, Combinatorica 1 (1981), 233--241.\\
\\
\end{document}

