\documentclass[twoside]{article}

\usepackage{amssymb, amsmath, amsthm}
\usepackage{mathrsfs}



% Theorem-like environment `claim', printed as "Claim"; its counter is
% numbered within the current section (the optional [section] argument).
\newtheorem{claim}{Claim}[section]
% Proof environment `pf'. Takes one optional argument, the heading text,
% which defaults to "Proof". The body is set inside a trivlist whose single
% item label is the heading in bold, preceded by a \labelsep skip.
% No end-of-proof mark is inserted automatically.
\newenvironment{pf}[1][Proof]{\begin{trivlist}
\item[\hskip \labelsep {\bfseries #1}]}{\end{trivlist}}
% End-of-proof mark. In vertical mode the macro does nothing (\relax).
% Otherwise: if the last skip on the current list is smaller than 1.5em, it is
% cancelled and replaced by a 1.5em skip that may shrink by up to 0.5em; a
% line break before the mark is then forbidden, and a solid rule
% (0.5em wide, 0.75em high, 0.25em deep) is drawn.
\newcommand{\done}{\nobreak \ifvmode \relax \else
      \ifdim\lastskip<1.5em \hskip-\lastskip
      \hskip1.5em plus0em minus0.5em \fi \nobreak
      \vrule height0.75em width0.5em depth0.25em\fi}
% Manual page layout, set by assigning the standard LaTeX length registers.
% Side-margin offsets are zero on both odd and even pages; the top-margin
% offset is -0.5in.
\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
% Height of the running-head box and the gap between the head and the body.
\headheight 0.2in \headsep 0.2in
% Dimensions of the text block: 6.5in wide, 9in tall.
\textwidth   6.5in \textheight 9in 
% Paragraphs get 1.5ex of vertical separation and no first-line indent;
% \footskip is set to 40pt.
\parskip 1.5ex  \parindent 0ex \footskip 40pt


\begin{document}

\framebox[6.4in]{
\begin{minipage}{6.4in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 2 - 09/28/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\Large Projection Algorithm and Random Sampling }} 
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribes: Meghana Vishvanath and Erik Goldman}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{ \it *Unedited notes}}

\section{ Projection Algorithm }
\textbf{Johnson-Lindenstrauss lemma (JL)} addresses how well a metric can be embedded in $\mathscr{L}_2$. If the metric is Euclidean, it can be done with an $\epsilon$-distortion for every $\epsilon$. For other metrics, it is usually worse: maybe with a constant or logarithmic distortion.

\textbf{Question:} Which metric is the least like $\mathscr{L}_2$? An expander.

\textbf{Papadimitriou, Raghavan, Tamaki, and Vempala (PRTV) Algorithm}\\
LSI is a way to represent documents in a vector space. From last time, we know that for both SVD and PCA, if you keep a few directions, you can capture most of the norm. \\
\indent How long does it take to compute? $O(\text{min}\{mn^2, m^2n\})$ time. If you take advantage of sparsity and other factors, it can be faster. For example, if you only want $k$-components, you can do that in approximately $O(mnk^2\text{log}(\cdot))$ time, where $mn=M$= the number of nonzero elements. 

\textbf{Recall:} Frobenius Norm: $||\Omega||_\text{F}^2 = \text{tr}(\Omega^T\Omega) = \sum_{i,j}{|\Omega_{ij}|^2} = \sum_i{\sigma^2_{i}}$ and spectral norm: $||\Omega||_2 = \max_{x \neq 0}{\dfrac{||\Omega x||_2}{||x||_2}}$\\

\textbf{Random Projection Algorithm}\\
INPUT: $A \in {\cal{R}}^{m \times n}$
\begin{enumerate}
\item[$1)$] 
$R \in {\cal{R}}^{m \times l}$ where $R$ is a random matrix whose entries are distributed as $R_{ij} \sim N(0,1)$\\
and $l \geq \dfrac{c\text{log}(n)}{\epsilon^2}$
\item[$2)$]  Construct $B = \dfrac{1}{\sqrt{l}} R^T A \in {\cal{R}}^{l \times n}$
\item[$3)$] Compute the SVD of $B$, $B = \sum_{i=1}^l{\lambda_i a_i b_i^T}$, where $b_i$ are the right singular vectors and $a_i$ are the left singular vectors.
\item[$4)$] Return $B_k$ or $\tilde{A} = A \sum_{i=1}^k{b_i b_i^T}$
\end{enumerate}
In the last step, we return either $B_k$, the basis, or $\tilde{A}$, the actual projection. The runtime of step $1$ is $O(ml)$, step $2$ is $O(mnl)$, and step $3$ is $O(nl^2)$, so the total runtime is $O\left(\dfrac{mn\text{log}(n)}{\epsilon^2}\right)$.

\begin{claim} $||A-\tilde{A}||_F^2 \leq ||A-A_k||_F^2 + \epsilon ||A_k||_F^2$ with probability $1-4n^{-(\epsilon^2-\epsilon^3)\frac{l}{4}}$.
\end{claim}
\begin{pf}
Recall $A=\sum_i{\sigma_i u_i v_i^T}$, $B=\sum_i{\lambda_i a_i b_i^T}$, and $\tilde{A}=A \sum_{i=1}^k{b_i b_i^T}$.
\begin{eqnarray*}
||A-\tilde{A}||_F^2 &=& \sum_{i=1}^n{||(A-\tilde{A}) b_i||^2_2} = \sum_i{||Ab_i - \tilde{A}b_i||_2^2} \\
 &=& \sum_i{\left|\left|Ab_i - A\left(\sum_{j=1}^k{b_jb_j^T}\right)b_i\right|\right|_2^2} = \sum_{i=k+1}^n{||Ab_i||^2_2}\\
 &=&||A||_F^2 - \sum_{i=1}^k{||Ab_i||_2^2} = ||A-A_k||_F^2 + ||A_k||_F^2 - \sum_{i=1}^k{||Ab_i||_2^2}
\end{eqnarray*}
We want to relate $\sum_{i=1}^k{||Ab_i||^2_2}$ to the singular values of $B$.
\begin{eqnarray*}
\sum_{i=1}^k{\lambda_i^2} &=& \sum_{i=1}^k{||Bb_i||_2^2} = \sum_{i=1}^k{\dfrac{1}{l}||R^TAb_i||^2_2}\\
 &\leq& \left(1+\dfrac{\epsilon}{2}\right)\sum_{i=1}^k{||Ab_i||_2^2} \text{~~~~by JL}
\end{eqnarray*}
Hence, $\sum_{i=1}^k{||Ab_i||^2_2} \geq \dfrac{1}{1+\frac{\epsilon}{2}}\sum_{i=1}^k{\lambda_i^2}$.\\
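Since the $v_i$ are the right singular vectors of $A$, and the top $k$ right singular vectors of $B$ capture at least as much of $B$ as any other $k$ orthonormal vectors,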
\begin{eqnarray*}
\sum_{i=1}^k{\lambda_i^2} &\geq& \sum_{i=1}^k{v_i^TB^TBv_i} = \sum_{i=1}^k{\dfrac{1}{l}v_i^TA^TRR^TAv_i} \\
&=& \sum_{i=1}^k {\left|\left|\dfrac{1}{\sqrt{l}} R^TAv_i\right|\right|^2_2} \geq \left(1-\frac{\epsilon}{2}\right)\sum_{i=1}^k{||Av_i||_2^2} \\
&=& \left(1-\frac{\epsilon}{2}\right) ||A_k||_F^2
\end{eqnarray*}
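Combining gives $$\sum_{i=1}^k{||Ab_i||_2^2} ~~\geq~~ \dfrac{1-\frac{\epsilon}{2}}{1+\frac{\epsilon}{2}}||A_k||_F^2 ~~\geq~~ (1-\epsilon)||A_k||_F^2$$ Substituting this bound into the expression for $||A-\tilde{A}||_F^2$ derived above gives $||A-\tilde{A}||_F^2 \leq ||A-A_k||_F^2 + \epsilon ||A_k||_F^2$.\done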
\end{pf}

\section{ Approximating Matrix Products and Random Sampling for Low Rank Approximation }

\textbf{Streaming model:} assume that we can only take passes on the data, no random access is allowed.  The resources required for a streaming algorithm are thus the number of passes, the additional time, and additional space required.

\textbf{Concept:} take a pass over the data, keep a sample, and return output based on that sample.  If all data is similar, uniform random sampling should be sufficient.  If not, we should change our probability distribution to be non-uniform.  Also, we should analyze how much worse this technique will be compared to operating on all of our data.

\textbf{SELECT algorithm}
\begin{enumerate} 
	\item[$1)$] Set $D=0$
	\item[$2)$] while (more data exists in the stream)
	\item[$3)$] \hspace{20pt} read item $\{i, a_i\}$
	\item[$4)$] \hspace{20pt} Set $D=D+a_i$
	\item[$5)$] \hspace{20pt} with probability $\frac{a_i}{D}$, set $i^*=i$ and $a^* = a_i$
	\item[$6)$] output $i^*$ and $a^*$
\end{enumerate}

\textbf{Lemma:} In one pass and $O(1)$ space and time, SELECT returns $i^*$ and $a^* = a_{i^*}$ such that $P[i^*=i] = \frac{a_i}{\sum_{j=1}^n a_j}$ for every $i$ (proof by induction).

\textbf{Lemma:} If you run SELECT on $(i, j, A_{ij}^2)$, then $P( \{i^*, j^*\} = \{i,j\}) = \frac{A_{ij}^2}{||A||_F^2}$.

\textbf{Basic Matrix Multiplication Algorithm:}\\
Inputs: $A \in \mathbb{R}^{m \times n}$, $B \in \mathbb{R}^{n \times p}$, number of samples $c \ge 1$, probability distribution $P_i$ defined for $i = 1, \ldots, n$\\
Output: $C$, an $m \times c$ matrix of the sampled (and rescaled) columns of $A$, and $R$, a $c \times p$ matrix of the sampled (and rescaled) rows of $B$.

\begin{enumerate} 
	\item[$1)$] for $t = 1 \rightarrow c$
	\item[$2)$] \hspace{20pt} pick $i_t \in [n]$ with $P(i_t = k) = P_k$
	\item[$3)$] \hspace{20pt} $C^{(t)} = \frac{A^{(i_t)}}{\sqrt{c P_{i_t}}}$
	\item[$4)$] \hspace{20pt} $R_{(t)} = \frac{B_{(i_t)}}{\sqrt{c P_{i_t}}}$
	\item[$5)$] return $C$, $R$
\end{enumerate}

We thus have \\
$CR = \sum_{t=1}^c C^{(t)}R_{(t)} = \sum_{t=1}^{c} \frac{1}{c P_{i_t}} A^{(i_t)}B_{(i_t)} \approx \sum_{t=1}^n A^{(t)}B_{(t)} = AB$


\end{document}
