\documentclass[twoside]{article}

\usepackage{amssymb, amsmath}
\usepackage{mathrsfs}
\usepackage{textcomp}
\usepackage{amsthm, algorithm, algorithmic}

\oddsidemargin  0in \evensidemargin 0in \topmargin -0.5in
\headheight 0.2in \headsep 0.2in
\textwidth   6.5in \textheight 9in 
\parskip 1.5ex  \parindent 0ex \footskip 40pt

\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}

\DeclareMathOperator{\range}{range}
\DeclareMathOperator{\spn}{span}
\DeclareMathOperator{\Tr}{Tr}

\begin{document}

\framebox[6.4in]{
\begin{minipage}{6.4in}
  \vspace{1mm}
  \center \makebox[6.2in]{{\bf CS369M: Algorithms for Modern Massive Data Set Analysis \hfill Lecture 3 - 09/30/2009}} 
  \vspace{2mm} \\
  \center \makebox[6.2in]{{\Large Approximating matrix multiplication and low-rank approximation}} 
  \vspace{1mm} \\
  \center \makebox[6.2in]{{\it Lecturer: Michael Mahoney \hfill Scribes: Richa Bhayani and Daniel Chen}}
  \vspace{1mm}
\end{minipage}
} \vspace{2mm} \\
\mbox{{\it *Unedited notes}}

\section{ Matrix Multiplication using sampling }
Given two matrices $A$ of size $m*n$ and $B$ of size $n*p$ ,our goal is to produce an approximation to the matrix multiplication product $A*B$. We do this by performing $c$ independent trials , where in each trial we randomly sample an element of ${1,2,....,n}$ with an appropriate probability distribution $P$.We form a matrix $C$ of size $m*c$ consisting of sampled columns of $A$ and a matrix $R$ of size $c*n$ consisting of sampled rows of $B$, both scaled appropriately.If $P$ and the scaling factors are chosen judiciously, we show that $CR$ is a good approximation of $AB$. More precisely we show that,
\begin{equation}\label{approximation_multiplication}
 || (AB) - (CR) ||_F =O(||A||_F||B||_F/\sqrt{c})
\end{equation}

\subsection{Pass efficiency}
   The pass-efficient model is based on the notion that you have the ability to store large amounts of data but do not have random access to it.Data is assumed to be on an external disk, to be presented to the algorithm on a read-only tape.The only access the algorithm to the data is via a pass, where a pass is a sequential read of the entire data. Therefore, the resources of interest are the number of passes over the data,the additional space and time required.

\subsection{Lemmas}
\begin{lemma}
Suppose that ${a_1,a_2,....,a_n}$, $a_i$ \textgreater= 0, are read in one pass i.e.,one sequential read over the data, by the $SELECT$ algorithm.Then the $SELECT$ algorithm requires $O(1)$ i.e.,constant with respect to $n$, additional space and returns a random $i^{*}$ sampled from the probability distribution $Pr[i^{*}=i]=a_i /{\sum_{i^{'}=1}^n}{a_{i^{'}}}$.

% {\sum_{i^{'}^n}=1^n {a_{i^{'}}}}
\end{lemma}

Note that this choice of probability assignment naturally favors heavier elements.

\begin{algorithm}
\caption{SELECT algorithm}
\label{ltsvd}
\begin{algorithmic}[1]
\REQUIRE ${a_1,...,a_n}$, $a_i \textgreater = 0$, read in one pass.
\STATE D=0.
\FOR{$i=1$ to $n$}
\STATE $D=D+a$.
\STATE With probability $a_i /D$, let $i_* =i $ and $a_{i^*}=a_i$.
\ENDFOR
\RETURN$i_*,a_{i^*}$ .
\end{algorithmic}
\end{algorithm}


\begin{lemma}
Suppose that $A \epsilon  R^{m*n}$ is presented in the sparse-unordered representation and is read in one pass by the SELECT algorithm. Then, the algorithm requires $O(1)$ and returns $Pr[i^* =1 and {j^*} =j] = {|A_i|}^2/ {||A||_F}^2$
\end{lemma}

$Pr[i^* =1 and {j^*} =j] = {|A_i|}^2/ {||A||_F}^2$ is obtained simply by marginal probability definition, whereas  the first claim follows from the previous lemma.

Algorithms such as SELECT which select elements without looking at all the data, are known as reservoir algorithms.

\subsection{The BasicMatrixMultiplication algorithm}

This is a simple algorithm to approximate the product AB.Randomly sample with replacement, the terms from $A$ and $B$ , in the summation $c$ times according to a probability distribution $p_i$ where i runs from 1 to n; scale each term appropriately and output the sum of the scaled terms.

\begin{algorithm}
\caption{BASICMATRIXMULTIPLICATION algorithm}
\label{ltsvd}
\begin{algorithmic}[1]
\REQUIRE  $A \epsilon  R^{m*n}$ and  $B \epsilon  R^{n*p}$,  $C \epsilon  Z^{+}$ such that 1 \textless = $c$ \textless =n and ${(p_i)}_{i=1}^n$, are such that $p_i \textgreater=0  and \sum{{p_i}_{i=1}}^n=1$
\FOR{$t=1$ to $c$}
\STATE Pick $i_t \epsilon {1,....,n}$ with $Pr[i_t =k]=p_k, k=1,....,n$, independently.
\STATE Set $C^{(t)}=A^{(i_t)}/ \sqrt{cp_{i_t}}$ and $R_{(t)}=B_{(i_t)}/ \sqrt{cp_{i_t}}$
\ENDFOR
\RETURN$C,R$ .
\end{algorithmic}
\end{algorithm}


\subsection{Analysis of the sampling and running time }
In the case of uniform sampling, you simply perform 'c' trials and construct $C$ and $R$ , thus needing $O(c(m+p))$ additional time and space. In the case of non-uniform sampling however, one must first scan through column-row pairs to decide which to sample, requiring $O(n)$ additional time and space.

\subsection{Analysis of the algorithm for arbitrary probabilities}
We make use of the Jensen's inequality to prove upper bounds for $||AB-CR||_F ^2$. 
\begin{lemma}
Suppose  $A \epsilon  R^{m*n}$ and  $B \epsilon  R^{n*p}$,  $C \epsilon  Z^{+}$ such that 1 \textless = $c$ \textless =n and ${(p_i)}_{i=1}^n$, are such that $p_i \textgreater=0  and \sum{{p_i}_{i=1}}^n=1$.Then, $E[(CR)_{ij}]$= $(AB)_{ij}$ and $Var[(CR)_{ij}]$= (1/c) *$(AB)_{ij}$
\end{lemma}

This lemmma proves that the expectation of the $(i,j)$th element of the approximation is equal to the $(i,j)$th element of the exact product.
The proof for this lemma is discussed in class. We fix $i,j$, and define for some $t$ , $X_t$ = $(A^{(i_t)}B_{(i_t)})/ cp_{i_t} )_{ij}$. Thus, using the definition of expectation of a random variable, we sum over different values of $t$ , ranging from 1 to $n$ and get the desired expressions for expectation and variance.

\begin{lemma}
Suppose  $A \epsilon  R^{m*n}$ and  $B \epsilon  R^{n*p}$,  $C \epsilon  Z^{+}$ such that 1 \textless = $c$ \textless =n and ${(p_i)}_{i=1}^n$, are such that $p_i \textgreater=0  and \sum{{p_i}_{i=1}}^n=1$.Then, E[${||AB - CR||_{F}}^2$]= ${{\sum_{k=1}^n}{|A^{k}|^2 }{|B_{k}|^2)/cp_{k}} - {1/c *{||AB||_F}^2}}$ . Furthermore, if $p_k = |A^{(k)}|B_{(k)}|/{\sum_{k^{'}=1}^n} |A^{(k^{'})}|B_{(k^{'})}|$ then , E[${||AB - CR||_{F}}^2$]= ${1/c*{\sum_{k=1}^n} {{(|A^{k}| }{|B_{k}|})}^2 - {1/c *{||AB||_F}^2}}$
\end{lemma}
Note that by definition $E[{||AB-CR||_F}^2]={\sum_{i=1}^m}
{\sum_{j=1}^p}Var[(CR)_{ij}]$ and so from lemma above, by mere substitution we get the result. 
To prove that this result is optimal however, we use a Lagrange multiplier.
Let $f(p_1,...p_n)= {\sum_{k=1}^n} {1/p_k}*({|A^{k}|}^2 ){|B_{k}|^2}$
Then, let $\lambda$ be the Lagrange multiplier and we define a 
$g(p_1,....,g_n)=f{..}+\lambda ({\sum_{k=1}^n}p_k -1)$.
We get the minimum by taking  the derivatives.And hence, the results.

\subsection{Analysis of the algorithm for near-optimal probabilities}
We proved the above results by using Jensen's. Now to prove the tightness we analyse for near optimal probabilities. Optimal probabilities are those that minimize the $E[{||AB-CR||_F}^2$. For proving that the results change in only small $\beta$ dependent loss in accuracy if the probabilities were defined instead to be $p_k \textgreater = \beta* |A^{(k)}|B_{(k)}|/{\sum_{k^{'}=1}^n} |A^{(k^{'})}|B_{(k^{'})}|$.

\begin{theorem}Suppose  $A \epsilon  R^{m*n}$ and  $B \epsilon  R^{n*p}$,  $C \epsilon  Z^{+}$ such that 1 \textless = $c$ \textless =n and ${(p_i)}_{i=1}^n$ and for some small constant $\beta$ \textless = 1,
	 $p_k \textgreater = \beta* |A^{(k)}|B_{(k)}|/{\sum_{k^{'}=1}^n} |A^{(k^{'})}|B_{(k^{'})}|$
then,
	 $E[{||AB-CR||_F}^2 \textless = (1/ \beta *c) *{{||A||_F}^2 {||B||_F}^2}$ . Furthermore, let $\delta \epsilon (0,1)$ and $\eta = 1+ \sqrt{(8/\beta)log(1/ \beta)}$. Then, with probability atleast $1-\delta$,
	$((||AB-CR||)_F)^2 \textless = (\eta ^2/ \beta *c)*{{||A||_F}^2 {||B||_F}^2}$
   
\end{theorem}

Using the same steps as previous lemma , only using the updates values of probabilities we get the updated expectation, which by Cauchy Schwartz gives you $E[{||AB-CR||_F}^2 \textless = (1/ \beta *c) *{{||A||_F}^2 {||B||_F}^2}$ .
To derive the rest one can make use of triangle inequality and Cauchy Schwartz. Interested readers can look at the paper.

An important consequence of this theorem is that by choosing enough number of column-row pairs, the error can be made arbitrarily small.

\section{Low-Rank Matrix Approximation by Sampling}
Given a matrix $A$, we seek to compute what is in some sense an approximation to the SVD of $A$. When we compute a SVD, we find a rank-$k$ matrix $U_k$ that best approximates the column space of $A$. Here, we seek a rank-$k$ matrix $H_k$ which does not do much worse than $U_k$. To do so, we begin with two facts from matrix perturbation theory that will be central to our analysis: If $A,E \in \mathbb{R}^{m \times n}$, then
\begin{equation}\label{perturbation_spectral_bound}
\max_{1 \leq t \leq n} | \sigma_t(A+E) - \sigma_t(A) | \leq \|E\|_2
\end{equation}
and 
\begin{equation}\label{perturbation_frobenius_bound}
\sum_{k=1}^n  (\sigma_t(A+E) - \sigma_t(A))^2  \leq \|E\|_F^2
\end{equation}

The algorithm is described as follows:

\begin{algorithm}
\caption{LinearTimeSVD}
\label{ltsvd}
\begin{algorithmic}[1]
\REQUIRE $A \in \mathbb{R}^{m \times n}$, $c,k \in \mathbb{Z}^+$ such that $1 \leq k \leq c \leq n$, $\{ p_i \}_{i=1}^n$ such that $p_i \leq 0$ and $\sum_{i=1}^n p_i = 1$.
\ENSURE $H_k \in \mathbb{R}^{m \times k}$ and $\sigma_t(C),t=1,\dots,k$.
\FOR{$t=1$ to $c$}
\STATE Pick $i_t \in 1, \dots, n$ with $\Pr(i_t = \alpha) = p_\alpha, \alpha = 1,\dots, n$.
\STATE Set $C^{(t)} = A^{(i_t)} / \sqrt{cp_{i_t}}$.
\ENDFOR
\STATE Compute $C^TC$ and its SVD; say $C^TC = \sum_{t=1}^c \sigma_t^2(C)y^t{y^t}^T$.
\STATE Compute $h^t = Cy^t / \sigma_t(C)$ for $t = 1, \dots, k$.
\RETURN $H_k$, where $H_k^{(t)} = h^t$, and $\sigma_t(C),t=1,\dots,k$.
\end{algorithmic}
\end{algorithm}

This algorithm is pass efficient because the probabilities $p_\alpha$ can be calculated in one pass and the matrix $C$ can be determined in another pass. It uses $O(mc^2)$ time for computing $C^TC$ and its SVD, and uses $O(mck)$ total time to compute the columns $h^t$. Therefore, the total additional time required by this algorithm is $O(mc^2)$, which is linear in the dimension $m$. The approximation $H_k$ achieves the following bounds:

\begin{theorem}
Suppose $A \in \mathbb{R}^{m \times n}$ and let $H_k$ be constructed from the LinearTimeSVD algorithm. Then
\begin{equation}\label{spectral_bound}
\| A - H_kH_k^TA \|_2^2 \leq \| A - A_k \|_2^2 + 2 \|AA^T - CC^T \|_2
\end{equation}
and
\begin{equation}\label{frobenius_bound}
\| A - H_kH_k^TA \|_F^2 \leq \| A - A_k \|_F^2 + 2 \sqrt{k} \|AA^T - CC^T \|_F
\end{equation}
\end{theorem}
We will prove Equation (\ref{spectral_bound}) in class. We will sketch the proof for Equation (\ref{frobenius_bound}) but Interested readers can find the the full proof in the references.
\begin{proof}[Proof for Equation (\ref{spectral_bound})]
Let $\mathcal{H}_k = \range(H_k) = \spn(h^{(1)}, h^{(2)}, \dots, h^{(k)})$ and let $\mathcal{H}_{m-k}$ be the orthogonal complement of $\mathcal{H}_k$. Furthermore, consider any vector $x \in \mathbb{R}^m$. We can write $x$ as $\alpha y + \beta z$ where $y \in \mathcal{H}_k$ and $z \in \mathcal{H}_{m-k}$. Then, we have:
\begin{align}
\| A - H_kH_k^TA \|_2 &= \max_{x \in \mathbb{R}^m, |x| = 1} |x^T(A - H_kH_k^TA)|\label{spectral}\\
&= \max_{y \in \mathcal{H}_k, |y| = 1, z \in \mathcal{H}_{m-k}, |z| = 1, \alpha^2 + \beta^2 = 1} |(\alpha y^T + \beta z^T) (A - H_kH_k^TA) |\\
&\leq \max_{y \in \mathcal{H}_k, |y| = 1}  |y^T(A - H_kH_k^TA)| + \max_{z \in \mathcal{H}_{m-k}, |z| = 1}  |z^T(A - H_kH_k^TA)|\\
&= \max_{z \in \mathcal{H}_{m-k}, |z| = 1} |z^TA|\label{orthogonal}
\end{align}
Equation (\ref{spectral}) follows because for any matrix $B$, $\|B\|_2 = \sigma_1(B) = \sigma_1(B^T) = \|B^T\|_2$ and equation (\ref{orthogonal}) follows because $y$ projected onto $\mathcal{H}_k$ is $y$ and $z$ projected onto $\mathcal{H}_k$ is $0$. Then, we consider $z^TA$ with $z \in \mathcal{H}_{m-k}$:
\begin{align}
|z^TA|^2 &= z^TAA^Tz\\
&= z^TCC^Tz + z^T(AA^T-CC^T)z\\
&\leq \sigma_{k+1}^2(C)  + \|AA^T - CC^T \|_2\label{maxpossible}\\
&= \sigma_{k+1}(CC^T) + \|AA^T - CC^T \|_2\\
&\leq \sigma_{k+1}(AA^T) +  2\|AA^T - CC^T \|_2\label{matrixperturb}\\
&= \sigma_{k+1}^2(A) +  2\|AA^T - CC^T \|_2\\
&= \|A - A_k \|_2^2 + 2\|AA^T - CC^T \|_2
\end{align}
The first half of inequality (\ref{maxpossible}) follows because $z$ is in $\mathcal{H}_{m-k}$ and hence the maximum value of $|z^TC|$ is $\sigma_{k+1}(C)$. The second half of (\ref{maxpossible}) follows from submultiplicativity of the spectral norm. Inequality (\ref{matrixperturb}) follows from (\ref{perturbation_spectral_bound}).
\end{proof}
\begin{proof}[Sketch of Proof for Equation (\ref{frobenius_bound})]
We first note that $||X||_F^2 = \Tr(X^TX)$. Then, we can expand $\| A - H_kH_k^TA \|_F^2$ as a trace of a matrix product and obtain $\|A\|_F^2 - \|A^TH_k\|_F^2$. Then, we prove the following claim:
\begin{equation}
\left| \|A^TH_k \|_F^2 - \sum_{t=1}^k \sigma_t^2(C) \right| \leq \sqrt{k} \| AA^T - CC^T \|_F
\end{equation}
With (\ref{perturbation_frobenius_bound}) we can also show:
\begin{equation}
\left|  \sum_{t=1}^k \sigma_t^2(C) - \sum_{t=1}^k \sigma_t^2(A)  \right| \leq \sqrt{k} \| AA^T - CC^T \|_F
\end{equation}
From these two inequalities, we obtain (\ref{frobenius_bound}).
\end{proof}
From (\ref{spectral_bound}) and (\ref{frobenius_bound}), it can then be show that with high probability,
\begin{equation}
\| A - H_kH_k^TA \|_2 \leq \|A - A_k \|_2^2 + \epsilon \|A\|_F^2
\end{equation}
and
\begin{equation}
\| A - H_kH_k^TA \|_F \leq \|A - A_k \|_F^2 + \epsilon \|A\|_F^2
\end{equation}
if $p_i$ are nearly optimal and $\epsilon$ is chosen judiciously.

\end{document}
