\documentclass{article}
\usepackage[left=1.25in,top=1.25in,right=1.25in,bottom=1.25in,head=1.25in]{geometry}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{verbatim,float,url,enumerate}
\usepackage{graphicx,subfigure,psfrag}
\usepackage{natbib,xcolor,dsfont}
\usepackage{microtype}
\usepackage{mathtools}
\newtheorem{algorithm}{Algorithm}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}
\newcommand{\maximize}{\mathop{\mathrm{maximize}}}
\newcommand{\st}{\mathop{\mathrm{subject\,\,to}}}
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\P{\mathbb{P}}
\def\S{\mathbb{S}}
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
\def\half{\frac{1}{2}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\dim{\mathrm{dim}}
\def\dom{\mathrm{dom}}
\def\prox{\mathrm{prox}}
\begin{document}
\title{Homework 5}
\author{\Large Convex Optimization 10-725}
\date{{\bf Due Friday November 15 at 11:59pm} \\
\bigskip
Submit your work as a single PDF on Gradescope. Make sure to prepare your
solution to each problem on a separate page. (Gradescope will ask you to select the
pages which contain the solution to each problem.) \\
\bigskip
Choose to solve {\bf one of Q3 or Q4}. \\
That is, you will either submit Q1+Q2+Q3 or Q1+Q2+Q4. \\
\bigskip
Note that the programming questions in this assignment, Q1(e), Q3(g), Q4(f),
will be {\bf peer-graded}. Instructions to follow about how this will be done. \\
\bigskip
Total: 63 points}
\maketitle
\section{Coordinatewise optima of smooth + separable convex functions (23
points)}
Let $f(x) = g(x) + \sum_{i=1}^n h_i(x_i)$, where $g$ and $h_i$, $i=1,\ldots,n$
are convex, and $g$ is differentiable. Let $x$ be a point that is a {\it
coordinatewise minimizer}, i.e.,
$$
f(x + v e_i) \geq f(x), \;\;\; \text{for all $v$ and $i=1,\ldots,n$}.
$$
(Here $e_i$ denotes the $i$th standard basis vector, which is all 0s except
for a 1 in the $i$th component, for $i=1,\ldots,n$.) We will show that
$x$ must be a global minimizer of $f$, by fixing an arbitrary $y$ and
establishing that $f(y) \geq f(x)$.
\bigskip
\noindent
(a, 2 pts) Using convexity of $g$, show that
$$
f(y)-f(x) \geq \sum_{i=1}^n \underbrace{\big(\nabla_i g(x) (y_i-x_i) +
h_i(y_i)-h_i(x_i)\big)}_{a_i},
$$
where $\nabla_i g$ denotes the $i$th component of the vector-valued gradient
map $\nabla g$.
\bigskip
\noindent
(b, 4 pts) Using the fact that $x$ is a coordinatewise minimizer, and
subgradient optimality, show that $a_i \geq 0$, $i=1,\ldots,n$, and thus $f(y)
\geq f(x)$.
\bigskip
\noindent
(c, 5 pts) Now let $g(x) = \frac{1}{2} x^T Q x - b^T x$ for $Q \succ 0$, and
$h(x)=\lambda \|x\|_1$. Write out the updates for one cycle of coordinate
descent:
$$
x^{(k)}_i = \argmin_{x_i} \; f \big(x_1^{(k)},\ldots,
x_{i-1}^{(k)},x_i,x_{i+1}^{(k-1)},\ldots,x_n^{(k-1)}\big), \;\;\; i=1,\ldots,n,
$$
as explicitly as possible. Write out the updates for one cycle of coordinate
proximal gradient descent:
$$
x^{(k)}_i = \prox_{h_i,t_{ki}} \Big(
x^{(k-1)}_i - t_{ki} \cdot \nabla_i g \big(x_1^{(k)},\ldots,
x_i^{(k-1)},\ldots,x_n^{(k-1)}\big)\Big), \;\;\; i=1,\ldots,n,
$$
again as explicitly as possible. Show that these two are equal under certain
choices of step sizes $t_{ki}$, $i=1,\ldots,n$.
\bigskip
\noindent
(d, 2 pts) Argue that the step sizes, which make coordinate proximal gradient descent equivalent
to coordinate descent, can be viewed as the result of {\it exact} step size
optimization, i.e., exact line search,
$$
t_{ki} = \argmin_{t \geq 0} \; f \big(x_1^{(k)},\ldots,
x_{i-1}^{(k)},x^{(k)}_i(t),x_{i+1}^{(k-1)},\ldots,x_n^{(k-1)}\big), \;\;\;
i=1,\ldots,n,
$$
where $x^{(k)}_i(t)$ denotes the $i$th update from coordinate proximal gradient
descent with step size $t$.
\bigskip
\noindent
(e, 10 pts) Design and conduct an experiment to empirically investigate, for the
given class of functions $f=g+h$ (quadratic plus $\ell_1$), the use of exact
step sizes in coordinate proximal gradient descent. That is, we know (from
parts (c) and (d)) that coordinate descent is the same as using exact line
search at each step of coordinate proximal gradient descent; how much does this
help over fixed step sizes, or backtracking line search?
Some general tips: be completely explicit about all your experimental design
choices; think in particular about the problem conditioning; use figures rather
than tables to report what you find; aggregate results over multiple simulation
instances. Your simulation will be graded on the following criteria (3 points
each):
\begin{itemize}
\item is the setup clearly explained?
\item are the results clearly explained?
\item are the conclusions justified?
\end{itemize}
As usual, append all code in an appendix (1 point for readable/organized code).
\section{Conjugates, duality, and proximal mappings (18 points)}
Let $f,g$ be closed and convex functions, and $f^*,g^*$ denote their
conjugates.
\bigskip
\noindent
(a, 2 pts) For a matrix $A \in \R^{m\times n}$, prove that the dual problem of
\begin{equation}
\label{eq:primal}
\min_x \; f(x) + g(Ax)
\end{equation}
is
\begin{equation}
\label{eq:dual}
\max_y \; -f^*(-A^T y) - g^*(y).
\end{equation}
\bigskip
\noindent
(b, 3 pts) Assume that $f$ is strictly convex. Prove that this implies $f^*$ is
differentiable, and that
$$
\nabla f^*(y) = \argmin_z \; f(z) - y^T z.
$$
Hint: use the fact that $x \in \partial f^* (y) \iff y \in \partial f(x)$, as you
established in Q2(d) of Homework 3.
\bigskip
\noindent
From now on, assume that $f$ is strictly convex and smooth, and $g$ is not
smooth, but we know its proximal operator
$$
\prox_{g,t} (x) = \argmin_z \; \frac{1}{2t} \|x-z\|_2^2 + g(z).
$$
We note that this {\it does not} necessarily mean that we know the proximal
operator for $h(x)=g(Ax)$. Therefore we cannot easily apply proximal gradient
descent to the primal problem \eqref{eq:primal}. However, as you will show in
the next few parts, knowing the proximal mapping of $g$ {\it does} lead to
the proximal mapping of $g^*$, which leads to an algorithm on the dual problem
\eqref{eq:dual}.
\bigskip
\noindent
(c, 4 pts) Prove first that
$$
\prox_{g,1}(x) + \prox_{g^*,1}(x) = x,
$$
for all $x$. This is sometimes called {\it Moreau's theorem}. Note the
specification $t=1$ in the above. Hint: again, use $x \in \partial g^* (y) \iff
y \in \partial g(x)$.
\bigskip
\noindent
(d, 4 pts) Verify that for $t>0$, we have $(tg)^*(x) = tg^*(x/t)$. Use this,
and part (c), to prove that for any $t>0$,
$$
\prox_{g,t}(x) + t \cdot \prox_{g^*,1/t}(x/t) = x,
$$
for all $x$. Hint: apply part (c) to the function $tg$. Then note that
$\prox_{g,t}(x)=\prox_{tg,1}(x)$, and the same for $g^*$.
\bigskip
\noindent
(e, 2 pts) Now write down a proximal gradient ascent algorithm for the dual
problem \eqref{eq:dual}. Use parts (b) and (d) of this question to express all
quantities in terms of $f$ and $g$. That is, your proximal gradient ascent
updates should not have any appearances of $\nabla f^*$ and
$\prox_{g^*,t}(\cdot)$.
\bigskip
\noindent
(f, 3 pts) Write down the steps of ADMM applied to problem \eqref{eq:primal},
after substituting $g(z)$ for $g(Ax)$ in the criterion, and introducing the
equality constraint $Ax=z$. Compare these to the steps of the dual proximal
gradient algorithm from part (e). How are they different? Briefly explain any
advantages/disadvantages you see to using each method.
\section{Coordinate descent for the graphical lasso (22 points)}
Let $X \in \R^{n \times p}$ be a data matrix whose rows are independent
observations from $N(0,\Sigma)$. Normality theory tells us that for $x \sim
N(0,\Sigma)$, $(\Sigma^{-1})_{ij}=0$ implies that the variables $x_i$ and $x_j$
are conditionally independent given all the other variables $\{x_k\}_{k \notin
\{i,j\}}$. So, if we believe that many pairs of features recorded in $X$ are
conditionally independent given the other features (which is often a reasonable
belief if $p$ is large), then we want an estimate of $\Sigma$ such that
$\Sigma^{-1}$ is sparse. This goal can be achieved by solving the {\it graphical
lasso} problem
\begin{equation}
\label{eq:glasso}
\min_{\Theta} \; -\log \det \Theta + \tr(S\Theta) + \lambda\|\Theta\|_1.
\end{equation}
Here the domain of the minimization problem is $\S_{++}^p$ (the space of
symmetric $p\times p$ positive definite matrices), $S=X^TX/n$ is
the sample covariance matrix, and \smash{$\|\Theta\|_1=\sum_{i,j=1}^p
|\Theta_{ij}|$}. Note that the solution \smash{$\hat\Theta$} in the above
serves as our estimate for $\Sigma^{-1}$.
\bigskip
\noindent
(a, 3 pts) Prove that the subgradient optimality condition for the graphical
lasso problem \eqref{eq:glasso} is
\begin{equation}
\label{eq:glasso_subg}
-\Theta^{-1} + S + \lambda \Gamma = 0,
\end{equation}
where $\Gamma_{ij} \in \partial |\Theta_{ij}|$ for each $i,j$. Let
$W=\Theta^{-1}$. Verify that the above implies that $W_{ii}=S_{ii}+\lambda$ for
each $i$.
\bigskip
\noindent
Consider now partitioning $W$ as
$$
W= \begin{pmatrix}
W_{11} & w_{12} \\
w_{21} & w_{22}
\end{pmatrix},
$$
where $W_{11} \in \R^{(p-1) \times (p-1)}$, $w_{12} \in \R^{p-1}, w_{21}^T \in
\R^{p-1}$, and $w_{22} \in \R$. Consider partitioning the other matrices
$\Theta$, $S$, and $\Gamma$ in the same manner.
\bigskip
\noindent
(b, 2 pts) Using the fact that $W\Theta = I$ and the subgradient optimality
condition from part (a), show that
$$
w_{12} = -W_{11} \theta_{12}/\theta_{22}
$$
and therefore
$$
W_{11} \frac{\theta_{12}}{\theta_{22}} + s_{12} + \lambda \gamma_{12} = 0.
$$
\bigskip
\noindent
(c, 3 pts) Let now $\beta = \theta_{12} / \theta_{22} \in \R^{p-1}$. Write a
lasso problem (with $\beta$ as the optimization variable) such that
$\beta = \theta_{12} / \theta_{22}$ is the solution. Note: in this lasso
problem, you may directly write out the quadratic and linear terms in the loss
(i.e., this will be easier than directly specifying the form of the least
squares loss).
\bigskip
\noindent
Observe that, once the lasso problem from part (c) is solved, it is easy to
recover $w_{12}=-W_{11} \beta$ and $w_{21}=w_{12}^T$. Furthermore,
$\theta_{12}$, $\theta_{21}^T$, and $\theta_{22}$ are directly obtained by
solving
$$
\begin{pmatrix}
W_{11} & w_{12} \\
w_{21} & w_{22}
\end{pmatrix}
\begin{pmatrix}
\Theta_{11} & \theta_{12} \\
\theta_{21} & \theta_{22}
\end{pmatrix}
=
\begin{pmatrix}
I & 0 \\
0 & 1
\end{pmatrix}.
$$
By rearranging appropriately the entries of $W$, $\Theta$, $S$, and $\Gamma$,
one can then iterate this procedure to solve for the entire matrix $W$.
\bigskip
\noindent
(d, 3 pts) Describe an algorithm to estimate $\Theta$ based on iterative
blockwise minimization using the results of parts (a), (b), and (c).
\bigskip
\noindent
(e, 1 pt) Explain why, strictly speaking, such an algorithm is in fact
\textit{not} a blockwise coordinate descent algorithm for the graphical lasso
problem as formulated in \eqref{eq:glasso}.
\bigskip
\noindent
(f, 6 pts) We will now show that the coordinate descent algorithm suggested by
parts (a), (b), and (c), which optimizes over the matrix $W$ and is known as
the {\it glasso} algorithm\footnote{Friedman et al.\ (2007), ``Sparse inverse
covariance estimation with the graphical lasso''.}, is in fact a proper
coordinate descent algorithm for the {\it dual} of the graphical lasso
problem\footnote{Mazumder and Hastie (2013), ``The graphical lasso: New insights
and alternatives''.}. Prove that the dual of \eqref{eq:glasso} is (equivalent
to)
\begin{equation}
\label{eq:glasso_dual}
\min_{\tilde\Gamma} \; -\log \det (\tilde \Gamma + S)
- p \;\; \st \;\; \|\tilde \Gamma\|_{\infty} \leq \lambda.
\end{equation}
Write down the KKT conditions for the dual problem \eqref{eq:glasso_dual}. Show
that the subgradient optimality condition for the primal graphical lasso problem
\eqref{eq:glasso} can be retrieved from the KKT conditions for the dual problem
(possibly after appropriate changes of variable). Finally, briefly clarify why
the glasso algorithm that you derived in part (d) is a proper coordinate descent
for \eqref{eq:glasso_dual}.
\bigskip
\noindent
(g, 4 pts) Produce an empirical example to verify that the glasso algorithm
(which you will implement) is not a descent algorithm on the primal graphical
lasso problem \eqref{eq:glasso}, but is indeed a descent algorithm on its
equivalent dual \eqref{eq:glasso_dual}. Explain clearly your setup and
results. As always, append your code.
\section{Coordinate descent and Dykstra (22 points)}
Given $y \in \R^n$, $X \in \R^{n \times p}$, consider the regularized least
squares program
\begin{equation}
\label{eq:reg}
\min_{w \in \R^p} \; \frac{1}{2} \|y - Xw\|_2^2 +
\sum_{i=1}^d h_i(w_i),
\end{equation}
where $w = (w_1,\ldots,w_d)$ is a block decomposition with $w_i \in \R^{p_i}$,
$i=1,\ldots,d$, and where $h_i$, $i=1,\ldots,d$ are convex functions. Let
$X_i \in \R^{n \times p_i}$, $i=1,\ldots,d$ be a corresponding block
decomposition of the columns of $X$. Consider coordinate descent, which repeats
the following updates:
\begin{equation}
\label{eq:cd}
w_i^{(k)} = \argmin_{w_i \in \R^{p_i}} \; \frac{1}{2} \bigg\| y - \sum_{j<i} X_j w_j^{(k)} - \sum_{j>i} X_j w_j^{(k-1)} - X_iw_i \bigg\|_2^2 + h_i(w_i),
\quad i=1,\ldots,d,
\end{equation}
for $k=1,2,3,\ldots$. Assume that $h_i$, $i=1,\ldots,d$ are each support
functions
$$
h_i(v) = \max_{u \in D_i} \; \langle u,v \rangle, \quad i=1,\ldots,d,
$$
where $D_i \subseteq \R^{p_i}$, $i=1,\dots,d$ are closed, convex sets.
\bigskip
\noindent
(a, 3 pts) Show that the dual of \eqref{eq:reg} is what is sometimes called the
{\it best approximation problem}
\begin{equation}
\label{eq:bap}
\min_{u \in \R^n} \; \|y-u\|_2^2
\quad \st \quad u \in C_1 \cap \cdots \cap C_d,
\end{equation}
where each $C_i = (X_i^T)^{-1}(D_i) \subseteq \R^n$, the inverse image of $D_i$
under the linear map $X_i^T$. Show also that the relationship between the
primal and dual solutions $w,u$ is
\begin{equation}
\label{eq:prim_dual}
u = y - Xw.
\end{equation}
\bigskip
\noindent
(b, 2 pts) Assume that each $X_i$ has full column rank. Show that, for each $i$
and any $a \in \R^n$,
$$
w^*_i = \argmin_{w_i \in \R^{p_i}} \; \frac{1}{2} \| a - X_iw_i \|_2^2 +
h_i(w_i) \quad \iff \quad X_iw^*_i = a - P_{C_i}(a).
$$
Hint: write $X_iw_i^*$ in terms of a proximal operator then use Moreau's theorem
in Q2(c).
\bigskip
\noindent
{\it Dykstra's algorithm} for problem \eqref{eq:bap} can be described as
follows. We set \smash{$u_d^{(0)}=y$}, \smash{$z_1^{(0)}=\cdots=z_d^{(0)}=0$},
and then repeat:
\begin{equation}
\begin{aligned}
\label{eq:dyk}
&u_0^{(k)} = u_d^{(k-1)}, \\
&\begin{rcases*}
u_i^{(k)} = P_{C_i} (u_{i-1}^{(k)} + z_i^{(k-1)}), & \\
z_i^{(k)} = u_{i-1}^{(k)} + z_i^{(k-1)} - u_i^{(k)}, &
\end{rcases*}
\quad \text{for $i=1,\ldots,d$},
\end{aligned}
\end{equation}
for $k=1,2,3,\ldots$. As $k \to \infty$, the iterate $u_0^{(k)}$ in
\eqref{eq:dyk} will approach the solution in \eqref{eq:bap}.
\bigskip
\noindent
(c, 6 pts) Assuming we initialize $w^{(0)}=0$, show that coordinate descent
\eqref{eq:cd} for problem \eqref{eq:reg} and Dykstra's algorithm \eqref{eq:dyk}
for problem \eqref{eq:bap} are in fact completely equivalent, and satisfy
$$
z_i^{(k)} = X_iw_i^{(k)} \quad\text{and}\quad
u_i^{(k)}=y - \sum_{j \leq i} X_j w_j^{(k)} - \sum_{j > i} X_j
w_j^{(k-1)}, \quad\text{for $i=1,\ldots,d$},
$$
at all iterations $k=1,2,3,\ldots$. Hint: use an inductive argument, and the
result in part (b).
\bigskip
\noindent
Now let $\gamma_1,\ldots,\gamma_d>0$ be arbitrary weights with
$\sum_{i=1}^d \gamma_i = 1$. Consider the problem
\begin{equation}
\label{eq:bap_prod}
\min_{u=(u_1,\ldots,u_d) \in \R^{nd}} \; \sum_{i=1}^d \gamma_i
\|y-u_i\|_2^2 \quad \st \quad u \in C_0 \cap (C_1 \times \cdots \times C_d),
\end{equation}
where $C_0=\{ (u_1,\ldots,u_d) \in \R^{nd} : u_1=\cdots=u_d\}$. Observe that
this is equivalent to \eqref{eq:bap}, and is sometimes called the {\it
product-space reformulation} of \eqref{eq:bap}, or the {\it consensus form}
of \eqref{eq:bap}.
\bigskip
\noindent
(d, 3 pts) Rescale \eqref{eq:bap_prod} to turn the loss into an unweighted
squared loss, then apply Dykstra's algorithm to the resulting best approximation
problem. Show that the resulting algorithm repeats:
\begin{equation}
\begin{aligned}
\label{eq:dyk_par}
&u_0^{(k)} = \sum_{i=1}^d \gamma_i u_i^{(k-1)}, \\
&\begin{rcases*}
u_i^{(k)} = P_{C_i} (u_0^{(k)} + z_i^{(k-1)}), & \\
z_i^{(k)} = u_0^{(k)} + z_i^{(k-1)} - u_i^{(k)}, &
\end{rcases*}
\quad \text{for $i=1,\ldots,d$},
\end{aligned}
\end{equation}
for $k=1,2,3,\ldots$. Importantly, the steps enclosed by the curly brace above
can all be performed in parallel, so that \eqref{eq:dyk_par} is a parallel
version of Dykstra's algorithm \eqref{eq:dyk} for problem \eqref{eq:bap}.
\bigskip
\noindent
(e, 4 pts) Prove that the iterations \eqref{eq:dyk_par} can be rewritten in
equivalent form as
\begin{equation}
\label{eq:cd_dyk_par}
w_i^{(k)} = \argmin_{w_i \in \R^{p_i}} \;
\frac{1}{2} \Big\|y - Xw^{(k-1)} + X_i w_i^{(k-1)}/\gamma_i -
X_i w_i/\gamma_i\Big\|_2^2 + h_i (w_i/\gamma_i),
\quad i=1,\ldots,d,
\end{equation}
for $k=1,2,3,\ldots$. Importantly, the updates above can all be performed in
parallel, so that \eqref{eq:cd_dyk_par} is a parallel version of coordinate
descent \eqref{eq:cd} for problem \eqref{eq:reg}. Hint: use an inductive
argument and the result in part (b), similar to your proof in part (c).
\bigskip
\noindent
(f, 4 pts) Produce an empirical example to verify that coordinate descent and
Dykstra on \eqref{eq:reg} and \eqref{eq:bap}, respectively (which you will both
implement), are equivalent. Explain clearly your setup and results. As always,
append your code.
\end{document}