\documentclass{article}
\usepackage[left=1.25in,top=1.25in,right=1.25in,bottom=1.25in,head=1.25in]{geometry}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{verbatim,float,url,enumerate}
\usepackage{graphicx,subfigure,psfrag}
\usepackage{natbib,xcolor}
\usepackage{microtype}
\newtheorem{algorithm}{Algorithm}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}
\newcommand{\maximize}{\mathop{\mathrm{maximize}}}
\newcommand{\st}{\mathop{\mathrm{subject\,\,to}}}
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\P{\mathbb{P}}
\def\S{\mathbb{S}}
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
\def\half{\frac{1}{2}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\dim{\mathrm{dim}}
\def\dom{\mathrm{dom}}
\begin{document}
\title{Homework 1}
\author{\Large Convex Optimization 10-725}
\date{{\bf Due Friday September 13 at 11:59pm} \\
\bigskip
Submit your work as a single PDF on Gradescope. Make sure to prepare your
solution to each problem on a separate page. (Gradescope will ask you to select the
pages which contain the solution to each problem.) \\
\bigskip
Total: 66 points (+ 10 bonus points)}
\maketitle
\section{Convex sets (16 points)}
\noindent
(a, 12 pts) Closed sets and convex sets.
\begin{enumerate}[i.]
\item Show that a polyhedron $\{x \in \R^n : Ax \leq b \}$, for some $A\in
\R^{m\times n}, b\in \R^m$, is both convex and closed.
\item Show that if $S_i \subseteq \R^n$, $i\in I$ is a collection of convex
sets, then their intersection $\cap_{i\in I} S_i$ is also convex. Show that
the same statement holds if we replace ``convex'' with ``closed''.
\item Give an example of a closed set in $\R^2$ whose convex hull is not
closed.
\item Let $A\in\R^{m\times n}$. Show that if $S\subseteq \R^m$ is convex then
so is $A^{-1}(S) =\{ x \in \R^n : Ax \in S\}$, which is called the preimage of
$S$ under the map $A :\R^n \to \R^m$. Show that the same statement holds if
we replace ``convex'' with ``closed''.
\item Let $A\in\R^{m\times n}$. Show that if $S\subseteq \R^n$ is convex then
so is $A(S) = \{ Ax : x \in S \}$, called the image of $S$ under $A$.
\item Give an example of a matrix $A\in\R^{m\times n}$ and a set $S\subseteq
\R^n$ that is closed and convex but such that $A(S)$ is not closed.
\end{enumerate}
\bigskip
\noindent
(b, 4 pts) Polyhedra.
\begin{enumerate}[i.]
\item Show that if $P \subseteq \R^n$ is a polyhedron, and $A \in \R^{m \times
n}$, then $A(P)$ is a polyhedron. Hint: you may use the fact that
$$
\text{$P \subseteq \R^{m+n}$ is a polyhedron}
\; \Rightarrow \;
\text{$\{x\in \R^n : \text{$(x,y)\in P$ for some $y\in\R^m$}\}$
is a polyhedron}.
$$
\item Show that if $Q \subseteq \R^m$ is a polyhedron, and $A \in \R^{m \times
n}$, then $A^{-1}(Q)$ is a polyhedron.
\end{enumerate}
\section{Convex functions (14 points)}
\noindent
(a, 2 pts) Prove that the {\it entropy function}, defined as
$$
f(x) = -\sum_{i=1}^n x_i \log(x_i),
$$
with \smash{$\dom(f) = \{x \in \R^n_{++} : \sum_{i=1}^n x_i = 1\}$},
is strictly concave.
\bigskip
\noindent
(b, 4 pts) Let $f$ be twice differentiable, with $\mathrm{dom}(f)$ convex.
Prove that $f$ is convex if and only if
$$
(\nabla f(x) - \nabla f(y))^T (x-y) \geq 0,
$$
for all $x,y$. This property is called {\it monotonicity} of the gradient
$\nabla f$.
\bigskip
\noindent
(c, 2 pts) Give an example of a strictly convex function that does not attain
its infimum.
\bigskip
\noindent
(d, 3 pts) A function $f : \R^n \to \R$ is said to be {\it coercive} provided
that $f(x) \to \infty$ as $\|x\|_2 \to \infty$. A key fact about coercive
functions is that they attain their infimums. Prove that a twice
differentiable, strongly convex function is coercive and hence attains its
infimum. Hint: use Q3 part (b.iv).
\bigskip
\noindent
(e, 3 pts) Prove that the maximum of a convex function over a bounded
polyhedron must occur at one of the vertices. Hint: you may use the fact that a
bounded polyhedron can be represented as the convex hull of its vertices.
\section{Partial optimization with $\ell_2$ penalties (10 bonus points)}
Consider the problem
\begin{equation}
\label{eq:L2_prob}
\min_{\beta, \, \sigma \geq 0} \; f(\beta) + \frac{\lambda}{2} \sum_{i=1}^n
g(\beta_i,\sigma_i),
\end{equation}
for some convex $f$ with domain $\R^n$, $\lambda \geq 0$, and
$$
g(x,y) = \begin{cases}
x^2/y + y & \text{if $y > 0$} \\
0 & \text{if $x=0$, $y=0$} \\
\infty & \text{else}.
\end{cases}
$$
In other words, the problem \eqref{eq:L2_prob} is just the weighted $\ell_2$
penalized problem
$$
\min_{\beta, \, \sigma \geq 0} \;\, f(\beta) +
\frac{\lambda}{2} \sum_{i=1}^n
\Big(\frac{\beta_i^2}{\sigma_i}+\sigma_i\Big),
$$
but being careful to treat the $i$th term in the sum as zero when
$\beta_i=\sigma_i=0$.
\bigskip
\noindent
(a, 5 pts) Prove that $g$ is convex. Hence argue that \eqref{eq:L2_prob} is a
convex problem. Note that this means we can perform partial
optimization in \eqref{eq:L2_prob} and expect it to return another
convex problem. Hint: use the definition of convexity.
\bigskip
\noindent
(b, 2 pts) Argue that $\min_{y \geq 0} \, g(x,y) = 2|x|$.
\bigskip
\noindent
(c, 3 pts) Argue that minimizing over $\sigma \geq 0$ in \eqref{eq:L2_prob}
gives the $\ell_1$ penalized problem
$$
\min_\beta \; f(\beta) + \lambda \|\beta\|_1.
$$
\section{Lipschitz gradients and strong convexity (18 points)}
Let $f$ be convex and twice continuously differentiable.
\bigskip
\noindent
(a, 10 pts) Show that the following statements are equivalent.
\begin{enumerate}[i.]
\item $\nabla f$ is Lipschitz with constant $L$;
\item $(\nabla f(x) - \nabla f(y))^T(x-y) \leq L \|x-y\|_2^2$ for all
$x,y$;
\item $\nabla^2 f(x) \preceq LI$ for all $x$;
\item $f(y) \leq f(x) + \nabla f(x)^T (y-x) + \frac{L}{2} \|y-x\|_2^2$
for all $x,y$.
\end{enumerate}
Your solution should have 5 parts, where you prove i $\Rightarrow$ ii, ii
$\Rightarrow$ iii, iii $\Rightarrow$ iv, iv $\Rightarrow$ ii, and iii
$\Rightarrow$ i.
\bigskip
\noindent
(b, 8 pts) Show that the following statements are equivalent.
\begin{enumerate}[i.]
\item $f$ is strongly convex with constant $m$;
\item $(\nabla f(x) - \nabla f(y))^T(x-y) \geq m \|x-y\|_2^2$ for all
$x,y$;
\item $\nabla^2 f(x) \succeq mI$ for all $x$;
\item $f(y) \geq f(x) + \nabla f(x)^T (y-x) + \frac{m}{2}
\|y-x\|_2^2$ for all $x,y$.
\end{enumerate}
Your solution should have 4 parts, where you prove i $\Rightarrow$ ii, ii
$\Rightarrow$ iii, iii $\Rightarrow$ iv, and iv $\Rightarrow$ i.
\section{Solving optimization problems with CVX (18 points)}
CVX is a fantastic framework for disciplined convex programming: it's rarely
the fastest tool for the job, but it's widely applicable, and so it's a great
tool to be comfortable with. In this exercise we will set up the CVX
environment and solve a convex optimization problem.
Generally speaking, for homeworks in this class, your solution to
programming-based problems should include plots and whatever explanation
necessary to answer the questions asked. In addition, your full code should be
submitted as an appendix to the homework document.
CVX variants are available for each of the major numerical programming
languages. There are some minor syntactic and functional differences between the
variants but all provide essentially the same functionality. Download the CVX
variant of your choosing:
\begin{itemize}
\item Matlab: \url{http://cvxr.com/cvx/}
\item Python: \url{http://www.cvxpy.org/}
\item R: \url{https://cvxr.rbind.io}
\item Julia: \url{https://github.com/JuliaOpt/Convex.jl}
\end{itemize}
and consult the documentation to understand the basic functionality. Make sure
that you can solve the least squares problem $\min_\beta \; \|y-X\beta\|_2^2$
for an arbitrary vector $y$ and matrix $X$. Check your answer by comparing with
the closed-form solution $(X^T X)^{-1} X^T y$.
\bigskip
\noindent
(a, 10 pts) Given labels $y \in \{-1,1\}^n$, and a feature matrix $X \in
\R^{n\times p}$ with rows $x_1,\ldots x_n$, recall the support vector machine
(SVM) problem
\begin{alignat*}{2}
&\min_{\beta,\beta_0,\xi} \quad
&& \frac{1}{2} \|\beta\|_2^2 + C \sum_{i=1}^n \xi_i \\
&\st \quad && \xi_i \geq 0, \; i=1,\ldots n \\
& && y_i(x_i^T \beta + \beta_0) \geq 1-\xi_i, \;
i=1,\ldots n.
\end{alignat*}
\begin{enumerate}[i.]
\item Load the training data in {\tt xy\_train.csv}. This is a matrix of $n=200$
rows and 3 columns. The first two columns give the first $p=2$ features, and
the third column gives the labels. Using CVX, solve the SVM problem with
$C=1$. Report the optimal criterion value, and the optimal coefficients
$\beta \in \R^2$ and intercept $\beta_0 \in \R$.
\item Recall that the SVM solution defines a hyperplane
$$
\beta_0 + \beta^T x = 0,
$$
which serves as the decision boundary for the SVM classifier. Plot the
training data and color the points from the two classes differently. Draw
the decision boundary on top.
\item Now define \smash{$\tilde{X} \in \R^{n \times p}$} to have rows
$\tilde{x}_i=y_i x_i$, $i=1,\ldots,n$, and solve using CVX the problem
\begin{alignat*}{2}
&\max_w \quad && -\frac{1}{2} w^T \tilde{X} \tilde{X}^T w + 1^T w \\
&\st \quad && 0 \leq w \leq C1, \; w^T y = 0,
\end{alignat*}
(Above, we use 1 to denote the vector of all 1s.) Report the optimal
criterion value; it should match that from part i. Also report
\smash{$\tilde{X}^T w$} at the optimal $w$; this should match the optimal
$\beta$ from part i. Note: this is not a coincidence, and is an example of
{\it duality}, as we will study in detail later in the course.
\item Investigate many values of the cost parameter $C=2^a$, as $a$ varies from
$-5$ to $5$. For each one, solve the SVM problem, form the decision boundary,
and calculate the misclassification error on the test data in {\tt
xy\_test.csv}. Make a plot of misclassification error (y-axis) versus $C$
(x-axis, which you will probably want to put on a log scale).
\end{enumerate}
\bigskip
\noindent
(b, 8 pts) Disciplined convex programming (DCP) is a system for composing
functions while ensuring their convexity. It is the language that underlies
CVX. Essentially, each node in the parse tree for a convex expression is tagged
with attributes for curvature (convex, concave, affine, constant) and sign
(positive, negative) allowing for reasoning about the convexity of entire
expressions. The website \url{http://dcp.stanford.edu/} provides visualization
and analysis of simple expressions.
Typically, writing problems in the DCP form is natural, but in some cases
manipulation is required to construct expressions that satisfy the rules. For
each set of mathematical expressions below, first briefly explain why each
defines a convex set. Then, give an equivalent DCP expression along with a brief
explanation of why the DCP expression is equivalent to the original for each
set. DCP expressions should be given in a form that passes analysis (a green
tick on the left of the expression box) at
\url{http://dcp.stanford.edu/analyzer}.
Note: this question is really about developing a better understanding of the
various composition rules for convex functions.
\begin{enumerate}[i.]
\item $\|(x, y, z)\|_2^2 \le 1$
\item $\sqrt{x^2 + 1} \le 3x + y$
\item $1/x + 2/y \le 5, x > 0, y > 0$
\item $(x+y)^2/\sqrt{y} \le x - y + 5, y > 0$
\item $(x+z)y \ge 1, x+z \ge 0, y \ge 0$
\item $\|(x + 2y, x-y)\|_2 = 0$
\item $x\sqrt{y} \ge 1$, $x \ge 0, y \ge 0$
\item $\log(e^{y-1} + e^{x/2}) \le -e^x $
\end{enumerate}
\end{document}