@Proceedings{COLT2020,
title = {Proceedings of Machine Learning Research},
booktitle = {Proceedings of Machine Learning Research},
editor = {Jacob Abernethy and Shivani Agarwal},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = 125
}
@InProceedings{abernethy20a,
title = {Conference on Learning Theory 2020: Preface},
author = {Abernethy, Jacob and Agarwal, Shivani},
pages = {1--2},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/abernethy20a/abernethy20a.pdf},
url = {http://proceedings.mlr.press/v125/abernethy20a.html},
abstract = {Preface to the proceedings of the 32nd Conference on Learning Theory}
}
@InProceedings{acharya20a,
title = {Domain Compression and its Application to Randomness-Optimal Distributed Goodness-of-Fit},
author = {Acharya, Jayadev and Canonne, Cl{\'e}ment L and Han, Yanjun and Sun, Ziteng and Tyagi, Himanshu},
pages = {3--40},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/acharya20a/acharya20a.pdf},
url = {http://proceedings.mlr.press/v125/acharya20a.html},
abstract = { We study goodness-of-fit of discrete distributions in the distributed setting, where samples are divided between multiple users who can only release a limited amount of information about their samples due to various information constraints. Recently, a subset of the authors showed that having access to a common random seed (i.e., shared randomness) leads to a significant reduction in the sample complexity of this problem. In this work, we provide a complete understanding of the interplay between the amount of shared randomness available, the stringency of information constraints, and the sample complexity of the testing problem by characterizing a tight trade-off between these three parameters. We provide a general distributed goodness-of-fit protocol that as a function of the amount of shared randomness interpolates smoothly between the private- and public-coin sample complexities. We complement our upper bound with a general framework to prove lower bounds on the sample complexity of this testing problems under limited shared randomness. Finally, we instantiate our bounds for the two archetypal information constraints of communication and local privacy, and show that our sample complexity bounds are optimal as a function of all the parameters of the problem, including the amount of shared randomness. A key component of our upper bounds is a new primitive of \textit{domain compression}, a tool that allows us to map distributions to a much smaller domain size while preserving their pairwise distances, using a limited amount of randomness. }
}
@InProceedings{acharya20b,
title = {Distributed Signal Detection under Communication Constraints},
author = {Acharya, Jayadev and Canonne, Cl{\'e}ment L and Tyagi, Himanshu},
pages = {41--63},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/acharya20b/acharya20b.pdf},
url = {http://proceedings.mlr.press/v125/acharya20b.html},
abstract = { Independent draws from a $d$-dimensional spherical Gaussian distribution are distributed across users, each holding one sample. A central server seeks to distinguish between the two hypotheses: the distribution has zero mean, or the mean has $\ell_2$-norm at least $\varepsilon$, a pre-specified threshold. However, the users can each transmit at most $\ell$ bits to the server. This is the problem of detecting whether an observed signal is simply white noise in a distributed setting. We study this distributed testing problem with and without the availability of a common randomness shared by the users. We design schemes with and without such shared randomness which achieve sample complexities. We then obtain lower bounds for protocols with public randomness, tight when $\ell=O(1)$. We finally conclude with several conjectures and open problems.}
}
@InProceedings{agarwal20a,
title = {Optimality and Approximation with Policy Gradient Methods in Markov Decision Processes},
author = {Agarwal, Alekh and Kakade, Sham M and Lee, Jason D and Mahajan, Gaurav},
pages = {64--66},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/agarwal20a/agarwal20a.pdf},
url = {http://proceedings.mlr.press/v125/agarwal20a.html},
abstract = { Policy gradient (PG) methods are among the most effective methods in challenging reinforcement learning problems with large state and/or action spaces. However, little is known about even their most basic theoretical convergence properties, including: if and how fast they converge to a globally optimal solution (say with a sufficiently rich policy class); how they cope with approximation error due to using a restricted class of parametric policies; or their finite sample behavior. Such characterizations are important not only to compare these methods to their approximate value function counterparts (where such issues are relatively well understood, at least in the worst case), but also to help with more principled approaches to algorithm design. This work provides provable characterizations of computational, approximation, and sample size issues with regards to policy gradient methods in the context of discounted Markov Decision Processes (MDPs). We focus on both: 1) “tabular” policy parameterizations, where the optimal policy is contained in the class and where we show global convergence to the optimal policy, and 2) restricted policy classes, which may not contain the optimal policy and where we provide agnostic learning results. In the \emph{tabular setting}, our main results are: 1) convergence rate to global optimum for direct parameterization and projected gradient ascent 2) an asymptotic convergence to global optimum for softmax policy parameterization and PG; and a convergence rate with additional entropy regularization, and 3) dimension-free convergence to global optimum for softmax policy parameterization and Natural Policy Gradient (NPG) method with exact gradients. In \emph{function approximation}, we further analyze NPG with exact as well as inexact gradients under certain smoothness assumptions on the policy parameterization and establish rates of convergence in terms of the quality of the initial state distribution. One insight of this work is in formalizing how a favorable initial state distribution provides a means to circumvent worst-case exploration issues. Overall, these results place PG methods under a solid theoretical footing, analogous to the global convergence guarantees of iterative value function based algorithms.}
}
@InProceedings{agarwal20b,
title = {Model-Based Reinforcement Learning with a Generative Model is Minimax Optimal},
author = {Agarwal, Alekh and Kakade, Sham and Yang, Lin F.},
pages = {67--83},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/agarwal20b/agarwal20b.pdf},
url = {http://proceedings.mlr.press/v125/agarwal20b.html},
abstract = { This work considers the sample and computational complexity of obtaining an $\epsilon$-optimal policy in a discounted Markov Decision Process (MDP), given only access to a generative model. In this model, the learner accesses the underlying transition model via a sampling oracle that provides a sample of the next state, when given any state-action pair as input. We are interested in a basic and unresolved question in model based planning: is this naïve “plug-in” approach — where we build the maximum likelihood estimate of the transition model in the MDP from observations and then find an optimal policy in this empirical MDP — non-asymptotically, minimax optimal? Our main result answers this question positively. With regards to computation, our result provides a simpler approach towards minimax optimal planning: in comparison to prior model-free results, we show that using \emph{any} high accuracy, black-box planning oracle in the empirical model suffices to obtain the minimax error rate. The key proof technique uses a leave-one-out analysis, in a novel “absorbing MDP” construction, to decouple the statistical dependency issues that arise in the analysis of model-based planning; this construction may be helpful more generally.}
}
@InProceedings{ahn20a,
title = {From Nesterov’s Estimate Sequence to Riemannian Acceleration},
author = {Ahn, Kwangjun and Sra, Suvrit},
pages = {84--118},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/ahn20a/ahn20a.pdf},
url = {http://proceedings.mlr.press/v125/ahn20a.html},
abstract = { We propose the first global accelerated gradient method for Riemannian manifolds. Toward establishing our results, we revisit Nesterov’s estimate sequence technique and develop a conceptually simple alternative from first principles. We then extend our analysis to Riemannian acceleration, localizing the key difficulty into “metric distortion.” We control this distortion via a novel geometric inequality, which enables us to formulate and analyze global Riemannian acceleration.}
}
@InProceedings{alon20a,
title = {Closure Properties for Private Classification and Online Prediction},
author = {Alon, Noga and Beimel, Amos and Moran, Shay and Stemmer, Uri},
pages = {119--152},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/alon20a/alon20a.pdf},
url = {http://proceedings.mlr.press/v125/alon20a.html},
abstract = { Let H be a class of boolean functions and consider a composed class H’ that is derived from H using some arbitrary aggregation rule (for example, H’ may be the class of all 3-wise majority-votes of functions in H). We upper bound the Littlestone dimension of H’ in terms of that of H. As a corollary, we derive closure properties for online learning and private PAC learning. The derived bounds on the Littlestone dimension exhibit an undesirable exponential dependence. For private learning, we prove close to optimal bounds that circumvents this suboptimal dependency. The improved bounds on the sample complexity of private learning are derived algorithmically via transforming a private learner for the original class H to a private learner for the composed class H’. Using the same ideas we show that any (proper or improper) private algorithm that learns a class of functions H in the realizable case (i.e., when the examples are labeled by some function in the class) can be transformed to a private algorithm that learns the class H in the agnostic case.}
}
@InProceedings{alon20b,
title = {Hierarchical Clustering: A 0.585 Revenue Approximation},
author = {Alon, Noga and Azar, Yossi and Vainstein, Danny},
pages = {153--162},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/alon20b/alon20b.pdf},
url = {http://proceedings.mlr.press/v125/alon20b.html},
abstract = { Hierarchical Clustering trees have been widely accepted as a useful form of clustering data, resulting in a prevalence of adopting fields including phylogenetics, image analysis, bioinformatics and more. Recently, Dasgupta (STOC 16’) initiated the analysis of these types of algorithms through the lenses of approximation. Later, the dual problem was considered by Moseley and Wang (NIPS 17’) dubbing it the Revenue goal function. In this problem, given a nonnegative weight $w_{ij}$ for each pair $i,j \in [n]=\{1,2, \ldots ,n\}$, the objective is to find a tree $T$ whose set of leaves is $[n]$ that maximizes the function $\sum_{i1$; (2) almost exact recovery for $ 1 \le k \le o\left( \frac{\log n}{\log \log n} \right)$ if $ \liminf \frac{kD(P_n||Q_n)}{\log n}>1, $ where $\alpha_n \triangleq -2 \log \int \sqrt{d P_n d Q_n}$ is the Rényi divergence of order $\frac{1}{2}$ and $D(P_n||Q_n)$ is the Kullback-Leibler divergence. Under mild distributional assumptions, these conditions are shown to be information-theoretically necessary for any algorithm to succeed. A key challenge in the analysis is the enumeration of $2k$-NN graphs that differ from the hidden one by a given number of edges. We also analyze several computationally efficient algorithms and provide sufficient conditions under which they achieve exact/almost exact recovery. In particular, we develop a polynomial-time algorithm that attains the threshold for exact recovery under the small-world model.}
}
@InProceedings{dong20a,
title = {Root-n-Regret for Learning in Markov Decision Processes with Function Approximation and Low Bellman Rank},
author = {Dong, Kefan and Peng, Jian and Wang, Yining and Zhou, Yuan},
pages = {1554--1557},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/dong20a/dong20a.pdf},
url = {http://proceedings.mlr.press/v125/dong20a.html},
abstract = { In this paper, we consider the problem of online learning of Markov decision processes (MDPs) with very large state spaces. Under the assumptions of realizable function approximation and low Bellman ranks, we develop an online learning algorithm that learns the optimal value function while at the same time achieving very low cumulative regret during the learning process. Our learning algorithm, Adaptive Value-function Elimination (AVE), is inspired by the policy elimination algorithm proposed in (Jiang et al., 2017), known as OLIVE. One of our key technical contributions in AVE is to formulate the elimination steps in OLIVE as contextual bandit problems. This technique enables us to apply the active elimination and expert weighting methods from (Dudik et al., 2011), instead of the random action exploration scheme used in the original OLIVE algorithm, for more efficient exploration and better control of the regret incurred in each policy elimination step. To the best of our knowledge, this is the first root-n-regret result for reinforcement learning in stochastic MDPs with general value function approximation.}
}
@InProceedings{finocchiaro20a,
title = {Embedding Dimension of Polyhedral Losses},
author = {Finocchiaro, Jessie and Frongillo, Rafael and Waggoner, Bo},
pages = {1558--1585},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/finocchiaro20a/finocchiaro20a.pdf},
url = {http://proceedings.mlr.press/v125/finocchiaro20a.html},
abstract = { A common technique in supervised learning with discrete losses, such as 0-1 loss, is to optimize a convex surrogate loss over Rd, calibrated with respect to the original loss. In particular, recent work has investigated embedding the original predictions (e.g. labels) as points in Rd, showing an equivalence to using polyhedral surrogates. In this work, we study the notion of the embedding dimension of a given discrete loss: the minimum dimension d such that an embedding exists. We characterize d-embeddability for all d, with a particularly tight characterization for d=1 (embedding into the real line), and useful necessary conditions for d>1 in the form of a quadratic feasibility program. We illustrate our results with novel lower bounds for abstain loss.}
}
@InProceedings{fotakis20a,
title = {Efficient Parameter Estimation of Truncated Boolean Product Distributions},
author = {Fotakis, Dimitris and Kalavasis, Alkis and Tzamos, Christos},
pages = {1586--1600},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/fotakis20a/fotakis20a.pdf},
url = {http://proceedings.mlr.press/v125/fotakis20a.html},
abstract = { We study the problem of estimating the parameters of a Boolean product distribution in $d$ dimensions, when the samples are truncated by a set $S \subset \{0, 1\}^d$ accessible through a membership oracle. This is the first time that the computational and statistical complexity of learning from truncated samples is considered in a discrete setting. We introduce a natural notion of \emph{fatness} of the truncation set $S$, under which truncated samples reveal enough information about the true distribution. We show that if the truncation set is sufficiently fat, samples from the true distribution can be generated from truncated samples. A stunning consequence is that virtually any statistical task (e.g., learning in total variation distance, parameter estimation, uniformity or identity testing) that can be performed efficiently for Boolean product distributions, can also be performed from truncated samples, with a small increase in sample complexity. We generalize our approach to ranking distributions over $d$ alternatives, where we show how fatness implies efficient parameter estimation of Mallows models from truncated samples. Exploring the limits of learning discrete models from truncated samples, we identify three natural conditions that are necessary for efficient identifiability: (i) the truncation set $S$ should be rich enough; (ii) $S$ should be accessible through membership queries; and (iii) the truncation by $S$ should leave enough randomness in all directions. By carefully adapting the Stochastic Gradient Descent approach of (Daskalakis et al., FOCS 2018), we show that these conditions are also sufficient for efficient learning of truncated Boolean product distributions.}
}
@InProceedings{franks20a,
title = {Rigorous Guarantees for Tyler’s M-Estimator via Quantum Expansion},
author = {Franks, William Cole and Moitra, Ankur},
pages = {1601--1632},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/franks20a/franks20a.pdf},
url = {http://proceedings.mlr.press/v125/franks20a.html},
abstract = { Estimating the shape of an elliptical distribution is a fundamental problem in statistics. One estimator for the shape matrix, Tyler’s M-estimator, has been shown to have many appealing asymptotic properties. It performs well in numerical experiments and can be quickly computed in practice by a simple iterative procedure. Despite the many years the estimator has been studied in the statistics community, there was neither a non-asymptotic bound on the rate of the estimator nor a proof that the iterative procedure converges in polynomially many steps. Here we observe a surprising connection between Tyler’s M-estimator and operator scaling, which has been intensively studied in recent years in part because of its connections to the Brascamp-Lieb inequality in analysis. We use this connection, together with novel results on quantum expanders, to show that Tyler’s M-estimator has the optimal rate up to factors logarithmic in the dimension, and that in the generative model the iterative procedure has a linear convergence rate even without regularization.}
}
@InProceedings{ganassali20a,
title = {From tree matching to sparse graph alignment},
author = {Ganassali, Luca and Massouli\'e, Laurent},
pages = {1633--1665},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/ganassali20a/ganassali20a.pdf},
url = {http://proceedings.mlr.press/v125/ganassali20a.html},
abstract = { In this paper we consider alignment of sparse graphs, for which we introduce the Neighborhood Tree Matching Algorithm (NTMA). For correlated Erdős-R{é}nyi random graphs, we prove that the algorithm returns – in polynomial time – a positive fraction of correctly matched vertices, and a vanishing fraction of mismatches. This result holds with average degree of the graphs in $O(1)$ and correlation parameter $s$ that can be bounded away from $1$, conditions under which random graph alignment is particularly challenging. As a byproduct of the analysis we introduce a matching metric between trees and characterize it for several models of correlated random trees. These results may be of independent interest, yielding for instance efficient tests for determining whether two random trees are correlated or independent.}
}
@InProceedings{garber20a,
title = {On the Convergence of Stochastic Gradient Descent with Low-Rank Projections for Convex Low-Rank Matrix Problems},
author = {Garber, Dan},
pages = {1666--1681},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/garber20a/garber20a.pdf},
url = {http://proceedings.mlr.press/v125/garber20a.html},
abstract = { We revisit the use of Stochastic Gradient Descent (SGD) for solving convex optimization problems that serve as highly popular convex relaxations for many important low-rank matrix recovery problems such as matrix completion, phase retrieval, and more. The computational limitation of applying SGD to solving these relaxations in large-scale is the need to compute a potentially high-rank singular value decomposition (SVD) on each iteration in order to enforce the low-rank-promoting constraint. We begin by considering a simple and natural sufficient condition so that these relaxations indeed admit low-rank solutions. This condition is also necessary for a certain notion of low-rank-robustness to hold. Our main result shows that under this condition which involves the eigenvalues of the gradient vector at optimal points, SGD with mini-batches, when initialized with a “warm-start" point, produces iterates that are low-rank with high probability, and hence only a low-rank SVD computation is required on each iteration. This suggests that SGD may indeed be practically applicable to solving large-scale convex relaxations of low-rank matrix recovery problems. Our theoretical results are accompanied with supporting preliminary empirical evidence. As a side benefit, our analysis is quite simple and short.}
}
@InProceedings{gerbelot20a,
title = {Asymptotic Errors for High-Dimensional Convex Penalized Linear Regression beyond Gaussian Matrices},
author = {Gerbelot, C\'{e}dric and Abbara, Alia and Krzakala, Florent},
pages = {1682--1713},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/gerbelot20a/gerbelot20a.pdf},
url = {http://proceedings.mlr.press/v125/gerbelot20a.html},
abstract = { We consider the problem of learning a coefficient vector $\bf x_0 \in \mathbb R^N$ from noisy linear observations $\mathbf{y} = \mathbf{F}{\mathbf{x}_{0}}+\mathbf{w} \in \mathbb R^M$ in high dimensional limit $M,N \to \infty$ with $\alpha \equiv M/N$ fixed. We provide a rigorous derivation of an explicit formula —first conjectured using heuristics method from statistical physics— for the asymptotic mean squared error obtained by penalized convex estimators such as the LASSO or the elastic net, for a sequence of very generic random matrix $\mathbf{F}$ corresponding to rotationally invariant data matrices of arbitrary spectrum. The proof is based on a convergence analysis of an oracle version of vector approximate message-passing (oracle-VAMP) and on the properties of its state evolution equations. Our method leverages on and highlights the link between vector approximate message-passing, Douglas-Rachford splitting and proximal descent algorithms, extending previous results obtained with i.i.d. matrices for a large class of problems. We illustrate our results on some concrete examples and show that even though they are asymptotic, our predictions agree remarkably well with numerics even for very moderate sizes.}
}
@InProceedings{ghai20a,
title = {No-Regret Prediction in Marginally Stable Systems},
author = {Ghai, Udaya and Lee, Holden and Singh, Karan and Zhang, Cyril and Zhang, Yi},
pages = {1714--1757},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/ghai20a/ghai20a.pdf},
url = {http://proceedings.mlr.press/v125/ghai20a.html},
abstract = { We consider the problem of online prediction in a marginally stable linear dynamical system subject to bounded adversarial or (non-isotropic) stochastic perturbations. This poses two challenges. Firstly, the system is in general unidentifiable, so recent and classical results on parameter recovery do not apply. Secondly, because we allow the system to be marginally stable, the state can grow polynomially with time; this causes standard regret bounds in online convex optimization to be vacuous. In spite of these challenges, we show that the online least-squares algorithm achieves sublinear regret (improvable to polylogarithmic in the stochastic setting), with polynomial dependence on the system’s parameters. This requires a refined regret analysis, including a structural lemma showing the current state of the system to be a small linear combination of past states, even if the state grows polynomially. By applying our techniques to learning an autoregressive filter, we also achieve logarithmic regret in the partially observed setting under Gaussian noise, with polynomial dependence on the memory of the associated Kalman filter.}
}
@InProceedings{golowich20a,
title = {Last Iterate is Slower than Averaged Iterate in Smooth Convex-Concave Saddle Point Problems},
author = {Golowich, Noah and Pattathil, Sarath and Daskalakis, Constantinos and Ozdaglar, Asuman},
pages = {1758--1784},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/golowich20a/golowich20a.pdf},
url = {http://proceedings.mlr.press/v125/golowich20a.html},
abstract = { In this paper we study the smooth convex-concave saddle point problem. Specifically, we analyze the last iterate convergence properties of the Extragradient (EG) algorithm. It is well known that the ergodic (averaged) iterates of EG converge at a rate of $O(1/T)$ (Nemirovski, 2004). In this paper, we show that the last iterate of EG converges at a rate of $O(1/\sqrt{T})$. To the best of our knowledge, this is the first paper to provide a convergence rate guarantee for the last iterate of EG for the smooth convex-concave saddle point problem. Moreover, we show that this rate is tight by proving a lower bound of $\Omega(1/\sqrt{T})$ for the last iterate. This lower bound therefore shows a quadratic separation of the convergence rates of ergodic and last iterates in smooth convex-concave saddle point problems.}
}
@InProceedings{gopi20a,
title = {Locally Private Hypothesis Selection},
author = {Gopi, Sivakanth and Kamath, Gautam and Kulkarni, Janardhan and Nikolov, Aleksandar and Wu, Zhiwei Steven and Zhang, Huanyu},
pages = {1785--1816},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/gopi20a/gopi20a.pdf},
url = {http://proceedings.mlr.press/v125/gopi20a.html},
abstract = { We initiate the study of hypothesis selection under local differential privacy. Given samples from an unknown probability distribution $p$ and a set of $k$ probability distributions $\mathcal{Q}$, we aim to output, under the constraints of $\varepsilon$-differential privacy, a distribution from $\mathcal{Q}$ whose total variation distance to $p$ is comparable to the best such distribution. This is a generalization of the classic problem of $k$-wise simple hypothesis testing, which corresponds to when $p \in \mathcal{Q}$, and we wish to identify $p$. Absent privacy constraints, this problem requires $O(\log k)$ samples from $p$, and it was recently shown that the same complexity is achievable under (central) differential privacy. However, the naive approach to this problem under local differential privacy would require $\tilde O(k^2)$ samples. We first show that the constraint of local differential privacy incurs an exponential increase in cost: any algorithm for this problem requires at least $\Omega(k)$ samples. Second, for the special case of $k$-wise simple hypothesis testing, we provide a non-interactive algorithm which nearly matches this bound, requiring $\tilde O(k)$ samples. Finally, we provide sequentially interactive algorithms for the general case, requiring $\tilde O(k)$ samples and only $O(\log \log k)$ rounds of interactivity. Our algorithms are achieved through a reduction to maximum selection with adversarial comparators, a problem of independent interest for which we initiate study in the parallel setting. For this problem, we provide a family of algorithms for each number of allowed rounds of interaction $t$, as well as lower bounds showing that they are near-optimal for every $t$. Notably, our algorithms result in exponential improvements on the round complexity of previous methods.}
}
@InProceedings{hao20a,
title = {Bessel Smoothing and Multi-Distribution Property Estimation},
author = {Hao, Yi and Li, Ping},
pages = {1817--1876},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/hao20a/hao20a.pdf},
url = {http://proceedings.mlr.press/v125/hao20a.html},
abstract = { We consider a basic problem in statistical learning: estimating properties of multiple discrete distributions. Denoting by $\Delta_k$ the standard simplex over $[k]:=\{0,1,\ldots, k\}$, a property of $d$ distributions is a mapping from $\Delta_k^d$ to $\mathbb R$. These properties include well-known distribution characteristics such as Shannon entropy and support size ($d=1$), and many important divergence measures between distributions ($d=2$). The primary problem being considered is to learn the property value of an $\emph{unknown}$ $d$-tuple of distributions from its sample. The study of such problems dates back to the works of Efron and Thisted (1976b); Thisted and Efron (1987); Good (1953b); Carlton (1969), and has been pushed forward steadily during the past decades. Surprisingly, before our work, the general landscape of this fundamental learning problem was insufficiently understood, and nearly all the existing results are for the special case $d\le 2$. Our first main result provides a near-linear-time computable algorithm that, given independent samples from any collection of distributions and for a broad class of multi-distribution properties, learns the property as well as the empirical plug-in estimator that uses samples with logarithmic-factor larger sizes. As a corollary of this, for any $\varepsilon>0$ and fixed $d\in \mathbb Z^+$, a $d$-distribution property over $[k]$ that is Lipschitz and additively separable can be learned to an accuracy of $\varepsilon$ using a sample of size $\mathcal{O}(k/(\varepsilon^3\sqrt{\log k}))$, with high probability. Our second result addresses a closely related problem– tolerant independence testing: One receives samples from the unknown joint and marginal distributions, and attempts to infer the $\ell_1$ distance between the joint distribution and the product distribution of the marginals. We show that this testing problem also admits a sample complexity sub-linear in the alphabet sizes, demonstrating the broad applicability of our approach.}
}
@InProceedings{hazan20a,
title = {Faster Projection-free Online Learning},
author = {Hazan, Elad and Minasyan, Edgar},
pages = {1877--1893},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/hazan20a/hazan20a.pdf},
url = {http://proceedings.mlr.press/v125/hazan20a.html},
abstract = { In many online learning problems the computational bottleneck for gradient-based methods is the projection operation. For this reason, in many problems the most efficient algorithms are based on the Frank-Wolfe method, which replaces projections by linear optimization. In the general case, however, online projection-free methods require more iterations than projection-based methods: the best known regret bound scales as $T^{3/4}$. Despite significant work on various variants of the Frank-Wolfe method, this bound has remained unchanged for a decade. In this paper we give an efficient projection-free algorithm that guarantees $T^{2/3}$ regret for general online convex optimization with smooth cost functions and one linear optimization computation per iteration. As opposed to previous Frank-Wolfe approaches, our algorithm is derived using the Follow-the-Perturbed-Leader method and is analyzed using an online primal-dual framework.}
}
@InProceedings{hinder20a,
title = {Near-Optimal Methods for Minimizing Star-Convex Functions and Beyond},
author = {Hinder, Oliver and Sidford, Aaron and Sohoni, Nimit},
pages = {1894--1938},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/hinder20a/hinder20a.pdf},
url = {http://proceedings.mlr.press/v125/hinder20a.html},
abstract = { In this paper, we provide near-optimal accelerated first-order methods for minimizing a broad class of smooth nonconvex functions that are unimodal on all lines through a minimizer. This function class, which we call the class of smooth quasar-convex functions, is parameterized by a constant $$\gamma \in (0,1]$$: $$\gamma = 1$$ encompasses the classes of smooth convex and star-convex functions, and smaller values of $$\gamma$$ indicate that the function can be "more nonconvex." We develop a variant of accelerated gradient descent that computes an $$\epsilon$$-approximate minimizer of a smooth $$\gamma$$-quasar-convex function with at most $$O(\gamma^{-1} \epsilon^{-1/2} \log(\gamma^{-1} \epsilon^{-1}))$$ total function and gradient evaluations. We also derive a lower bound of $$\Omega(\gamma^{-1} \epsilon^{-1/2})$$ on the worst-case number of gradient evaluations required by any deterministic first-order method, showing that, up to a logarithmic factor, no deterministic first-order method can improve upon ours.}
}
@InProceedings{holtzman20a,
title = {A Greedy Anytime Algorithm for Sparse PCA},
author = {Holtzman, Guy and Soffer, Adam and Vilenchik, Dan},
pages = {1939--1956},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/holtzman20a/holtzman20a.pdf},
url = {http://proceedings.mlr.press/v125/holtzman20a.html},
abstract = { The taxing computational effort that is involved in solving some high-dimensional statistical problems, in particular problems involving non-convex optimization, has popularized the development and analysis of algorithms that run efficiently (polynomial-time) but with no general guarantee on statistical consistency. In light of the ever-increasing compute power and decreasing costs, a more useful characterization of algorithms is by their ability to calibrate the invested computational effort with various characteristics of the input at hand and with the available computational resources. We propose a new greedy algorithm for the $\ell_0$-sparse PCA problem which supports the calibration principle. We provide both a rigorous analysis of our algorithm in the spiked covariance model, as well as simulation results and comparison with other existing methods. Our findings show that our algorithm recovers the spike in SNR regimes where all polynomial-time algorithms fail while running in a reasonable parallel-time on a cluster.}
}
@InProceedings{hopkins20a,
title = {Noise-tolerant, Reliable Active Classification with Comparison Queries},
author = {Hopkins, Max and Kane, Daniel and Lovett, Shachar and Mahajan, Gaurav},
pages = {1957--2006},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/hopkins20a/hopkins20a.pdf},
url = {http://proceedings.mlr.press/v125/hopkins20a.html},
abstract = { With the explosion of massive, widely available unlabeled data in the past years, finding label and time efficient, robust learning algorithms has become ever more important in theory and in practice. We study the paradigm of active learning, in which algorithms with access to large pools of data may adaptively choose what samples to label in the hope of exponentially increasing efficiency. By introducing comparisons, an additional type of query comparing two points, we provide the first time and query efficient algorithms for learning non-homogeneous linear separators robust to bounded (Massart) noise. We further provide algorithms for a generalization of the popular Tsybakov low noise condition, and show how comparisons provide a strong reliability guarantee that is often impractical or impossible with only labels - returning a classifier that makes no errors with high probability.}
}
@InProceedings{hu20a,
title = {Smooth Contextual Bandits: Bridging the Parametric and Non-differentiable Regret Regimes},
author = {Hu, Yichun and Kallus, Nathan and Mao, Xiaojie},
pages = {2007--2010},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/hu20a/hu20a.pdf},
url = {http://proceedings.mlr.press/v125/hu20a.html},
abstract = { We study a nonparametric contextual bandit problem where the expected reward functions belong to a Hölder class with smoothness parameter $\beta$. We show how this interpolates between two extremes that were previously studied in isolation: non-differentiable bandits ($\beta\leq1$), where rate-optimal regret is achieved by running separate non-contextual bandits in different context regions, and parametric-response bandits (satisfying $\beta=\infty$), where rate-optimal regret can be achieved with minimal or no exploration due to infinite extrapolatability. We develop a novel algorithm that carefully adjusts to all smoothness settings and we prove its regret is rate-optimal by establishing matching upper and lower bounds, recovering the existing results at the two extremes. In this sense, our work bridges the gap between the existing literature on parametric and non-differentiable contextual bandit problems and between bandit algorithms that exclusively use global or local information, shedding light on the crucial interplay of complexity and regret in contextual bandits.}
}
@InProceedings{jana20a,
title = {Extrapolating the profile of a finite population},
author = {Jana, Soham and Polyanskiy, Yury and Wu, Yihong},
pages = {2011--2033},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/jana20a/jana20a.pdf},
url = {http://proceedings.mlr.press/v125/jana20a.html},
abstract = { We study a prototypical problem in empirical Bayes. Namely, consider a population consisting of $k$ individuals each belonging to one of $k$ types (some types can be empty). Without any structural restrictions, it is impossible to learn the composition of the full population having observed only a small (random) subsample of size $m = o(k)$. Nevertheless, we show that in the sublinear regime of $m =\omega(k/\log k)$, it is possible to consistently estimate in total variation the \emph{profile} of the population, defined as the empirical distribution of the sizes of each type, which determines many symmetric properties of the population. We also prove that in the linear regime of $m=c k$ for any constant $c$ the optimal rate is $\Theta(1/\log k)$. Our estimator is based on Wolfowitz’s minimum distance method, which entails solving a linear program (LP) of size $k$. We show that there is a single infinite-dimensional LP whose value simultaneously characterizes the risk of the minimum distance estimator and certifies its minimax optimality. The sharp convergence rate is obtained by evaluating this LP using complex-analytic techniques. }
}
@InProceedings{javanmard20a,
title = {Precise Tradeoffs in Adversarial Training for Linear Regression},
author = {Javanmard, Adel and Soltanolkotabi, Mahdi and Hassani, Hamed},
pages = {2034--2078},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/javanmard20a/javanmard20a.pdf},
url = {http://proceedings.mlr.press/v125/javanmard20a.html},
abstract = { Despite breakthrough performance, modern learning models are known to be highly vulnerable to small adversarial perturbations in their inputs. While a wide variety of recent \emph{adversarial training} methods have been effective at improving robustness to perturbed inputs (robust accuracy), often this benefit is accompanied by a decrease in accuracy on benign inputs (standard accuracy), leading to a tradeoff between often competing objectives. Complicating matters further, recent empirical evidence suggest that a variety of other factors (size and quality of training data, model size, etc.) affect this tradeoff in somewhat surprising ways. In this paper we provide a precise and comprehensive understanding of the role of adversarial training in the context of linear regression with Gaussian features. In particular, we characterize the fundamental tradeoff between the accuracies achievable by any algorithm regardless of computational power or size of the training data. Furthermore, we precisely characterize the standard/robust accuracy and the corresponding tradeoff achieved by a contemporary mini-max adversarial training approach in a high-dimensional regime where the number of data points and the parameters of the model grow in proportion to each other. Our theory for adversarial training algorithms also facilitates the rigorous study of how a variety of factors (size and quality of training data, model overparametrization etc.) affect the tradeoff between these two competing accuracies. }
}
@InProceedings{jeong20a,
title = {Robust causal inference under covariate shift via worst-case subpopulation treatment effects},
author = {Jeong, Sookyo and Namkoong, Hongseok},
pages = {2079--2084},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/jeong20a/jeong20a.pdf},
url = {http://proceedings.mlr.press/v125/jeong20a.html},
abstract = { We propose a notion of worst-case treatment effect (WTE) across all subpopulations of a given size, a conservative notion of topline treatment effect. Compared to the average treatment effect (ATE) that solely relies on the covariate distribution of collected data, WTE is robust to unanticipated covariate shifts, and ensures reliable inference uniformly over underrepresented minority groups. We develop a semiparametrically efficient estimator for the WTE, leveraging machine learning-based estimates of heterogenous treatment effects and propensity scores. By virtue of satisfying a key (Neyman) orthogonality property, our estimator enjoys central limit behavior—oracle rates with true nuisance parameters—even when estimates of nuisance parameters converge at slower-than-parameteric rates. In particular, this allows using black-box machine learning methods to construct asymptotically exact confidence intervals for the WTE. For both observational and randomized studies, we prove that our estimator achieves the \emph{optimal} asymptotic variance, by establishing a semiparametric efficiency lower bound. On real datasets, we illustrate the non-robustness of ATE under even small amounts distributional shift, and demonstrate that WTE allows us to guard against brittle findings that are invalidated by unanticipated covariate shifts. }
}
@InProceedings{jezequel20a,
title = {Efficient improper learning for online logistic regression},
author = {J{\'e}z{\'e}quel, R{\'e}mi and Gaillard, Pierre and Rudi, Alessandro},
pages = {2085--2108},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/jezequel20a/jezequel20a.pdf},
url = {http://proceedings.mlr.press/v125/jezequel20a.html},
abstract = { We consider the setting of online logistic regression and consider the regret with respect to the $\ell_2$-ball of radius $B$. It is known (see Hazan et al. (2014)) that any proper algorithm which has logarithmic regret in the number of samples (denoted $n$) necessarily suffers an exponential multiplicative constant in $B$. In this work, we design an efficient improper algorithm that avoids this exponential constant while preserving a logarithmic regret. Indeed, Foster et al. (2018) showed that the lower bound does not apply to improper algorithms and proposed a strategy based on exponential weights with prohibitive computational complexity. Our new algorithm based on regularized empirical risk minimization with surrogate losses satisfies a regret scaling as $O(B\log(Bn))$ with a per-round time-complexity of order $O(d^2 + \log(n))$.}
}
@InProceedings{ji20a,
title = {Gradient descent follows the regularization path for general losses},
author = {Ji, Ziwei and Dud{\'i}k, Miroslav and Schapire, Robert E. and Telgarsky, Matus},
pages = {2109--2136},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/ji20a/ji20a.pdf},
url = {http://proceedings.mlr.press/v125/ji20a.html},
abstract = { Recent work across many machine learning disciplines has highlighted that standard descent methods, even without explicit regularization, do not merely minimize the training error, but also exhibit an \emph{implicit bias}. This bias is typically towards a certain regularized solution, and relies upon the details of the learning process, for instance the use of the cross-entropy loss. In this work, we show that for empirical risk minimization over linear predictors with \emph{arbitrary} convex, strictly decreasing losses, if the risk does not attain its infimum, then the gradient-descent path and the \emph{algorithm-independent} regularization path converge to the same direction (whenever either converges to a direction). Using this result, we provide a justification for the widely-used exponentially-tailed losses (such as the exponential loss or the logistic loss): while this convergence to a direction for exponentially-tailed losses is necessarily to the maximum-margin direction, other losses such as polynomially-tailed losses may induce convergence to a direction with a poor margin. }
}
@InProceedings{jin20a,
title = {Provably efficient reinforcement learning with linear function approximation},
author = {Jin, Chi and Yang, Zhuoran and Wang, Zhaoran and Jordan, Michael I},
pages = {2137--2143},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/jin20a/jin20a.pdf},
url = {http://proceedings.mlr.press/v125/jin20a.html},
abstract = { Modern Reinforcement Learning (RL) is commonly applied to practical problems with an enormous number of states, where \emph{function approximation} must be deployed to approximate either the value function or the policy. The introduction of function approximation raises a fundamental set of challenges involving computational and statistical efficiency, especially given the need to manage the exploration/exploitation tradeoff. As a result, a core RL question remains open: how can we design provably efficient RL algorithms that incorporate function approximation? This question persists even in a basic setting with linear dynamics and linear rewards, for which only linear function approximation is needed. This paper presents the first provable RL algorithm with both polynomial runtime and polynomial sample complexity in this linear setting, without requiring a “simulator” or additional assumptions. Concretely, we prove that an optimistic modification of Least-Squares Value Iteration (LSVI)—a classical algorithm frequently studied in the linear setting—achieves $\tilde{\mathcal{O}}(\sqrt{d^3H^3T})$ regret, where $d$ is the ambient dimension of feature space, $H$ is the length of each episode, and $T$ is the total number of steps. Importantly, such regret is independent of the number of states and actions. }
}
@InProceedings{kaledin20a,
title = {Finite Time Analysis of Linear Two-timescale Stochastic Approximation with Markovian Noise},
author = {Kaledin, Maxim and Moulines, Eric and Naumov, Alexey and Tadic, Vladislav and Wai, Hoi-To},
pages = {2144--2203},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kaledin20a/kaledin20a.pdf},
url = {http://proceedings.mlr.press/v125/kaledin20a.html},
abstract = { Linear two-timescale stochastic approximation (SA) scheme is an important class of algorithms which has become popular in reinforcement learning (RL), particularly for the policy evaluation problem. Recently, a number of works have been devoted to establishing the finite time analysis of the scheme, especially under the Markovian (non-i.i.d.) noise settings that are ubiquitous in practice. In this paper, we provide a finite-time analysis for linear two timescale SA. Our bounds show that there is no discrepancy in the convergence rate between Markovian and martingale noise, only the constants are affected by the mixing time of the Markov chain. With an appropriate step size schedule, the transient term in the expected error bound is $o(1/k^c)$ and the steady-state term is ${\cal O}(1/k)$, where $c>1$ and $k$ is the iteration number. Furthermore, we present an asymptotic expansion of the expected error with a matching lower bound of $\Omega(1/k)$. A simple numerical experiment is presented to support our theory.}
}
@InProceedings{kamath20a,
title = {Private Mean Estimation of Heavy-Tailed Distributions},
author = {Kamath, Gautam and Singhal, Vikrant and Ullman, Jonathan},
pages = {2204--2235},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kamath20a/kamath20a.pdf},
url = {http://proceedings.mlr.press/v125/kamath20a.html},
abstract = { We give new upper and lower bounds on the minimax sample complexity of differentially private mean estimation of distributions with bounded $k$-th moments. Roughly speaking, in the univariate case, we show that $$n = \Theta\left(\frac{1}{\alpha^2} + \frac{1}{\alpha^{\frac{k}{k-1}}\varepsilon}\right)$$ samples are necessary and sufficient to estimate the mean to $\alpha$-accuracy under $\varepsilon$-differential privacy, or any of its common relaxations. This result demonstrates a qualitatively different behavior compared to estimation absent privacy constraints, for which the sample complexity is identical for all $k \geq 2$. We also give algorithms for the multivariate setting whose sample complexity is a factor of $O(d)$ larger than the univariate case.}
}
@InProceedings{kamath20b,
title = {{Approximate is Good Enough: Probabilistic Variants of Dimensional and Margin Complexity}},
author = {Kamath, Pritish and Montasser, Omar and Srebro, Nathan},
pages = {2236--2262},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kamath20b/kamath20b.pdf},
url = {http://proceedings.mlr.press/v125/kamath20b.html},
abstract = { We present and study approximate notions of dimensional and margin complexity, which correspond to the minimal dimension or norm of an embedding required to {\em approximate}, rather then exactly represent, a given hypothesis class. We show that such notions are not only sufficient for learning using linear predictors or a kernel, but unlike the exact variants, are also necessary. Thus they are better suited for discussing limitations of linear or kernel methods.}
}
@InProceedings{kaplan20a,
title = {Privately Learning Thresholds: Closing the Exponential Gap},
author = {Kaplan, Haim and Ligett, Katrina and Mansour, Yishay and Naor, Moni and Stemmer, Uri},
pages = {2263--2285},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kaplan20a/kaplan20a.pdf},
url = {http://proceedings.mlr.press/v125/kaplan20a.html},
abstract = { We study the sample complexity of learning threshold functions under the constraint of differential privacy. It is assumed that each labeled example in the training data is the information of one individual and we would like to come up with a generalizing hypothesis $h$ while guaranteeing differential privacy for the individuals. Intuitively, this means that any single labeled example in the training data should not have a significant effect on the choice of the hypothesis. This problem has received much attention recently; unlike the non-private case, where the sample complexity is independent of the domain size and just depends on the desired accuracy and confidence, for private learning the sample complexity must depend on the domain size $X$ (even for approximate differential privacy). Alon et al. (STOC 2019) showed a lower bound of $\Omega(\log^*|X|)$ on the sample complexity and Bun et al. (FOCS 2015) presented an approximate-private learner with sample complexity $\tilde{O}\left(2^{\log^*|X|}\right)$. In this work we reduce this gap significantly, almost settling the sample complexity. We first present a new upper bound (algorithm) of $\tilde{O}\left(\left(\log^*|X|\right)^2\right)$ on the sample complexity and then present an improved version with sample complexity $\tilde{O}\left(\left(\log^*|X|\right)^{1.5}\right)$. Our algorithm is constructed for the related interior point problem, where the goal is to find a point between the largest and smallest input elements. It is based on selecting an input-dependent hash function and using it to embed the database into a domain whose size is reduced logarithmically; this results in a new database, an interior point of which can be used to generate an interior point of the original database in a differentially private manner.}
}
@InProceedings{kesselheim20a,
title = {Online Learning with Vector Costs and Bandits with Knapsacks},
author = {Kesselheim, Thomas and Singla, Sahil},
pages = {2286--2305},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kesselheim20a/kesselheim20a.pdf},
url = {http://proceedings.mlr.press/v125/kesselheim20a.html},
abstract = { We introduce online learning with vector costs ($OLVC_p$) where in each time step $t \in \{1,\ldots, T\}$, we need to play an action $i \in \{1,\ldots,n\}$ that incurs an unknown vector cost in $[0,1]^d$. The goal of the online algorithm is to minimize the $\ell_p$ norm of the sum of its cost vectors. This captures the classical online learning setting for $d=1$, and is interesting for general $d$ because of applications like online scheduling where we want to balance the load between different machines (dimensions). We study $OLVC_p$ in both stochastic and adversarial arrival settings, and give a general procedure to reduce the problem from $d$ dimensions to a single dimension. This allows us to use classical online learning algorithms in both full and bandit feedback models to obtain (near) optimal results. In particular, we obtain a single algorithm (up to the choice of learning rate) that gives sublinear regret for stochastic arrivals and a tight $O(\min\{p, \log d\})$ competitive ratio for adversarial arrivals. The $OLVC_p$ problem also occurs as a natural subproblem when trying to solve the popular Bandits with Knapsacks (BWK) problem. This connection allows us to use our $OLVC_p$ techniques to obtain (near) optimal results for BWK in both stochastic and adversarial settings. In particular, we obtain a tight $O(\log d \cdot \log T)$ competitive ratio algorithm for adversarial BWK, which improves over the $O(d \cdot \log T)$ competitive ratio algorithm of Immorlica et al. (2019).}
}
@InProceedings{kidger20a,
title = {{Universal Approximation with Deep Narrow Networks}},
author = {Kidger, Patrick and Lyons, Terry},
pages = {2306--2327},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kidger20a/kidger20a.pdf},
url = {http://proceedings.mlr.press/v125/kidger20a.html},
abstract = { The classical Universal Approximation Theorem holds for neural networks of arbitrary width and bounded depth. Here we consider the natural ‘dual’ scenario for networks of bounded width and arbitrary depth. Precisely, let $n$ be the number of inputs neurons, $m$ be the number of output neurons, and let $\rho$ be any nonaffine continuous function, with a continuous nonzero derivative at some point. Then we show that the class of neural networks of arbitrary depth, width $n + m + 2$, and activation function $\rho$, is dense in $C(K; \mathbb{R}^m)$ for $K \subseteq \mathbb{R}^n$ with $K$ compact. This covers every activation function possible to use in practice, and also includes polynomial activation functions, which is unlike the classical version of the theorem, and provides a qualitative difference between deep narrow networks and shallow wide networks. We then consider several extensions of this result. In particular we consider nowhere differentiable activation functions, density in noncompact domains with respect to the $L^p$-norm, and how the width may be reduced to just $n + m + 1$ for ‘most’ activation functions.}
}
@InProceedings{kirschner20a,
title = {Information Directed Sampling for Linear Partial Monitoring},
author = {Kirschner, Johannes and Lattimore, Tor and Krause, Andreas},
pages = {2328--2369},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kirschner20a/kirschner20a.pdf},
url = {http://proceedings.mlr.press/v125/kirschner20a.html},
abstract = { Partial monitoring is a rich framework for sequential decision making under uncertainty that generalizes many well known bandit models, including linear, combinatorial and dueling bandits. We introduce {\em information directed sampling} (IDS) for stochastic partial monitoring with a linear reward and observation structure. IDS achieves adaptive worst-case regret rates that depend on precise observability conditions of the game. Moreover, we prove lower bounds that classify the minimax regret of all finite games into four possible regimes. IDS achieves the optimal rate in all cases up to logarithmic factors, without tuning any hyper-parameters. We further extend our results to the contextual and the kernelized setting, which significantly increases the range of possible applications.}
}
@InProceedings{kobzar20a,
title = {New Potential-Based Bounds for Prediction with Expert Advice},
author = {Kobzar, Vladimir A. and Kohn, Robert V. and Wang, Zhilei},
pages = {2370--2405},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kobzar20a/kobzar20a.pdf},
url = {http://proceedings.mlr.press/v125/kobzar20a.html},
abstract = { This work addresses the classic machine learning problem of online prediction with expert advice. We consider the finite-horizon version of this zero-sum, two-person game. Using verification arguments from optimal control theory, we view the task of finding better lower and upper bounds on the value of the game (regret) as the problem of finding better sub- and supersolutions of certain partial differential equations (PDEs). These sub- and supersolutions serve as the potentials for player and adversary strategies, which lead to the corresponding bounds. To get explicit bounds, we use closed-form solutions of specific PDEs. Our bounds hold for any given number of experts and horizon; in certain regimes (which we identify) they improve upon the previous state of the art. For two and three experts, our bounds provide the optimal leading order term. }
}
@InProceedings{kur20a,
title = {On Suboptimality of Least Squares with Application to Estimation of Convex Bodies},
author = {Kur, Gil and Rakhlin, Alexander and Guntuboyina, Adityanand},
pages = {2406--2424},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kur20a/kur20a.pdf},
url = {http://proceedings.mlr.press/v125/kur20a.html},
abstract = { We develop a technique for establishing lower bounds on the sample complexity of Least Squares (or, Empirical Risk Minimization) for large classes of functions. As an application, we settle an open problem regarding optimality of Least Squares in estimating a convex set from noisy support function measurements in dimension $d\geq 6$. Specifically, we establish that Least Squares is mimimax sub-optimal, and achieves a rate of $\tilde{\Theta}_d(n^{-2/(d-1)})$ whereas the minimax rate is $\Theta_d(n^{-4/(d+3)})$.}
}
@InProceedings{kwon20a,
title = {The EM Algorithm gives Sample-Optimality for Learning Mixtures of Well-Separated Gaussians},
author = {Kwon, Jeongyeol and Caramanis, Constantine},
pages = {2425--2487},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/kwon20a/kwon20a.pdf},
url = {http://proceedings.mlr.press/v125/kwon20a.html},
abstract = { We consider the problem of spherical Gaussian Mixture models with $k \geq 3$ components when the components are well separated. A fundamental previous result established that separation of $\Omega(\sqrt{\log k})$ is necessary and sufficient for identifiability of the parameters with \textit{polynomial} sample complexity (Regev and Vijayaraghavan, 2017). In the same context, we show that $\tilde{O} (kd/\epsilon^2)$ samples suffice for any $\epsilon \lesssim 1/k$, closing the gap from polynomial to linear, and thus giving the first optimal sample upper bound for the parameter estimation of well-separated Gaussian mixtures. We accomplish this by proving a new result for the Expectation-Maximization (EM) algorithm: we show that EM converges locally, under separation $\Omega(\sqrt{\log k})$. The previous best-known guarantee required $\Omega(\sqrt{k})$ separation (Yan, et al., 2017). Unlike prior work, our results do not assume or use prior knowledge of the (potentially different) mixing weights or variances of the Gaussian components. Furthermore, our results show that the finite-sample error of EM does not depend on non-universal quantities such as pairwise distances between means of Gaussian components.}
}
@InProceedings{lattimore20a,
title = {Exploration by Optimisation in Partial Monitoring},
author = {Lattimore, Tor and Szepesv{\'a}ri, Csaba},
pages = {2488--2515},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lattimore20a/lattimore20a.pdf},
url = {http://proceedings.mlr.press/v125/lattimore20a.html},
abstract = { We provide a novel algorithm for adversarial k-action d-outcome partial monitoring that is adaptive, intuitive and efficient. The highlight is that for the non-degenerate locally observable games, the n-round minimax regret is bounded by 6m k^(3/2) sqrt(n log(k)), where m is the number of signals. This matches the best known information-theoretic upper bound derived via Bayesian minimax duality. The same algorithm also achieves near-optimal regret for full information, bandit and globally observable games. High probability bounds and simple experiments are also provided.}
}
@InProceedings{lee20a,
title = {A Closer Look at Small-loss Bounds for Bandits with Graph Feedback},
author = {Lee, Chung-Wei and Luo, Haipeng and Zhang, Mengxiao},
pages = {2516--2564},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lee20a/lee20a.pdf},
url = {http://proceedings.mlr.press/v125/lee20a.html},
abstract = { We study {\it small-loss} bounds for adversarial multi-armed bandits with graph feedback, that is, adaptive regret bounds that depend on the loss of the best arm or related quantities, instead of the total number of rounds. We derive the first small-loss bound for general strongly observable graphs, resolving an open problem of Lykouris et al. (2018). Specifically, we develop an algorithm with regret $\mathcal{\tilde{O}}(\sqrt{\kappa L_*})$ where $\kappa$ is the clique partition number and $L_*$ is the loss of the best arm, and for the special case of self-aware graphs where every arm has a self-loop, we improve the regret to $\mathcal{\tilde{O}}(\min\{\sqrt{\alpha T}, \sqrt{\kappa L_*}\})$ where $\alpha \leq \kappa$ is the independence number. Our results significantly improve and extend those by Lykouris et al. (2018) who only consider self-aware undirected graphs. Furthermore, we also take the first attempt at deriving small-loss bounds for weakly observable graphs. We first prove that no typical small-loss bounds are achievable in this case, and then propose algorithms with alternative small-loss bounds in terms of the loss of some specific subset of arms. A surprising side result is that $\mathcal{\tilde{O}}(\sqrt{T})$ regret is achievable even for weakly observable graphs as long as the best arm has a self-loop. Our algorithms are based on the Online Mirror Descent framework but require a suite of novel techniques that might be of independent interest. Moreover, all our algorithms can be made parameter-free without the knowledge of the environment.}
}
@InProceedings{lee20b,
title = {Logsmooth Gradient Concentration and Tighter Runtimes for Metropolized Hamiltonian Monte Carlo},
author = {Lee, Yin Tat and Shen, Ruoqi and Tian, Kevin},
pages = {2565--2597},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lee20b/lee20b.pdf},
url = {http://proceedings.mlr.press/v125/lee20b.html},
abstract = { We show that the gradient norm $\norm{\nabla f(x)}$ for $x \sim \exp(-f(x))$, where $f$ is strongly convex and smooth, concentrates tightly around its mean. This removes a barrier in the prior state-of-the-art analysis for the well-studied Metropolized Hamiltonian Monte Carlo (HMC) algorithm for sampling from a strongly logconcave distribution. We correspondingly demonstrate that Metropolized HMC mixes in $\tilde{O}(\kappa d)$ iterations, improving upon the $\tilde{O}(\kappa^{1.5}\sqrt{d} + \kappa d)$ runtime of prior work by a factor $(\kappa/d)^{1/2}$ when the condition number $\kappa$ is large. Our mixing time analysis introduces several techniques which to our knowledge have not appeared in the literature and may be of independent interest, including restrictions to a nonconvex set with good conductance behavior, and a new reduction technique for boosting a constant-accuracy total variation guarantee under weak warmness assumptions. This is the first high-accuracy mixing time result for logconcave distributions using only first-order function information which achieves linear dependence on $\kappa$; we also give evidence that this dependence is likely to be necessary for standard Metropolized first-order methods.}
}
@InProceedings{lei20a,
title = {A Fast Spectral Algorithm for Mean Estimation with Sub-Gaussian Rates},
author = {Lei, Zhixian and Luh, Kyle and Venkat, Prayaag and Zhang, Fred},
pages = {2598--2612},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lei20a/lei20a.pdf},
url = {http://proceedings.mlr.press/v125/lei20a.html},
abstract = { We study the algorithmic problem of estimating the mean of a heavy-tailed random vector in R^d, given n i.i.d. samples. The goal is to design an efficient estimator that attains the optimal sub-gaussian error bound, only assuming that the random vector has bounded mean and covariance. Polynomial-time solutions to this problem are known but have high runtime due to their use of semi-definite programming (SDP). Moreover, conceptually, it remains open whether convex relaxation is truly necessary for this problem. In this work, we show that it is possible to go beyond SDP and achieve better computational efficiency. In particular, we provide a spectral algorithm that achieves the optimal statistical performance and runs in time O ( n^2 d ), improving upon the previous fastest runtime O( n^{3.5}+ n^2 d ) by Cherapanamjeri et.al. (COLT ’19). Our algorithm is spectral in that it only requires (approximate) eigenvector computations, which can be implemented very efficiently by, for example, power iteration or the Lanczos method. At the core of our algorithm is a novel connection between the furthest hyperplane problem introduced by Karnin et. al. (COLT ’12) and a structural lemma on heavy-tailed distributions by Lugosi and Mendelson (Ann. Stat. ’19). This allows us to iteratively reduce the estimation error at a geometric rate using only the information derived from the top singular vector of the data matrix, leading to a significantly faster running time.}
}
@InProceedings{li20a,
title = {Learning Over-Parametrized Two-Layer Neural Networks beyond NTK},
author = {Li, Yuanzhi and Ma, Tengyu and Zhang, Hongyang R.},
pages = {2613--2682},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/li20a/li20a.pdf},
url = {http://proceedings.mlr.press/v125/li20a.html},
abstract = { We consider the dynamic of gradient descent for learning a two-layer neural network. We assume the input $x\in\mathbb{R}^d$ is drawn from a Gaussian distribution and the label of $x$ satisfies $f^{\star}(x) = a^{\top}|W^{\star}x|$, where $a\in\mathbb{R}^d$ is a nonnegative vector and $W^{\star} \in\mathbb{R}^{d\times d}$ is an orthonormal matrix. We show that an \emph{over-parameterized} two layer neural network with ReLU activation, trained by gradient descent from \emph{random initialization}, can provably learn the ground truth network with population loss at most $o(1/d)$ in polynomial time with polynomial samples. On the other hand, we prove that any kernel method, including Neural Tangent Kernel, with a polynomial number of samples in $d$, has population loss at least $\Omega(1 / d)$.}
}
@InProceedings{liang20a,
title = {On the Multiple Descent of Minimum-Norm Interpolants and Restricted Lower Isometry of Kernels},
author = {Liang, Tengyuan and Rakhlin, Alexander and Zhai, Xiyu},
pages = {2683--2711},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/liang20a/liang20a.pdf},
url = {http://proceedings.mlr.press/v125/liang20a.html},
abstract = { We study the risk of minimum-norm interpolants of data in Reproducing Kernel Hilbert Spaces. Our upper bounds on the risk are of a multiple-descent shape for the various scalings of $d = n^{\alpha}$, $\alpha\in(0,1)$, for the input dimension $d$ and sample size $n$. Empirical evidence supports our finding that minimum-norm interpolants in RKHS can exhibit this unusual non-monotonicity in sample size; furthermore, locations of the peaks in our experiments match our theoretical predictions. Since gradient flow on appropriately initialized wide neural networks converges to a minimum-norm interpolant with respect to a certain kernel, our analysis also yields novel estimation and generalization guarantees for these over-parametrized models. At the heart of our analysis is a study of spectral properties of the random kernel matrix restricted to a filtration of eigen-spaces of the population covariance operator, and may be of independent interest. }
}
@InProceedings{liang20b,
title = {Learning Entangled Single-Sample Gaussians in the Subset-of-Signals Model},
author = {Liang, Yingyu and Yuan, Hui},
pages = {2712--2737},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/liang20b/liang20b.pdf},
url = {http://proceedings.mlr.press/v125/liang20b.html},
abstract = { In the setting of entangled single-sample distributions, the goal is to estimate some common parameter shared by a family of $n$ distributions, given one single sample from each distribution. This paper studies mean estimation for entangled single-sample Gaussians that have a common mean but different unknown variances. We propose the subset-of-signals model where an unknown subset of $m$ variances are bounded by 1 while there are no assumptions on the other variances. In this model, we analyze a simple and natural method based on iteratively averaging the truncated samples, and show that the method achieves error $O \left(\frac{\sqrt{n\ln n}}{m}\right)$ with high probability when $m=\Omega(\sqrt{n\ln n})$, slightly improving existing bounds for this range of $m$. We further prove lower bounds, showing that the error is $\Omega\left(\left(\frac{n}{m^4}\right)^{1/2}\right)$ when $m$ is between $\Omega(\ln n)$ and $O(n^{1/4})$, and the error is $\Omega\left(\left(\frac{n}{m^4}\right)^{1/6}\right)$ when $m$ is between $\Omega(n^{1/4})$ and $O(n^{1 - \epsilon})$ for an arbitrarily small $\epsilon>0$, improving existing lower bounds and extending to a wider range of $m$.}
}
@InProceedings{lin20a,
title = {Near-Optimal Algorithms for Minimax Optimization},
author = {Lin, Tianyi and Jin, Chi and Jordan, Michael I.},
pages = {2738--2779},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lin20a/lin20a.pdf},
url = {http://proceedings.mlr.press/v125/lin20a.html},
abstract = { This paper resolves a longstanding open question pertaining to the design of near-optimal first-order algorithms for smooth and strongly-convex-strongly-concave minimax problems. Current state-of-the-art first-order algorithms find an approximate Nash equilibrium using $\tilde{O}(\kappa_x+\kappa_y)$ or $\tilde{O}(\text{min}\{\kappa_x\sqrt{\kappa_y}, \sqrt{\kappa_x}\kappa_y\})$ gradient evaluations, where $\kappa_x$ and $\kappa_y$ are the condition numbers for the strong-convexity and strong-concavity assumptions. A gap still remains between these results and the best existing lower bound $\tilde{\Omega}(\sqrt{\kappa_x\kappa_y})$. This paper presents the first algorithm with $\tilde{O}(\sqrt{\kappa_x\kappa_y})$ gradient complexity, matching the lower bound up to logarithmic factors. Our algorithm is designed based on an accelerated proximal point method and an accelerated solver for minimax proximal steps. It can be easily extended to the settings of strongly-convex-concave, convex-concave, nonconvex-strongly-concave, and nonconvex-concave functions. This paper also presents algorithms that match or outperform all existing methods in these settings in terms of gradient complexity, up to logarithmic factors.}
}
@InProceedings{liu20a,
title = {Better Algorithms for Estimating Non-Parametric Models in Crowd-Sourcing and Rank Aggregation},
author = {Liu, Allen and Moitra, Ankur},
pages = {2780--2829},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/liu20a/liu20a.pdf},
url = {http://proceedings.mlr.press/v125/liu20a.html},
abstract = { Motivated by applications in crowd-sourcing and rank aggregation, a recent line of work has studied the problem of estimating an $n \times n$ bivariate isotonic matrix with an unknown permutation acting on its rows (and possibly another unknown permutation acting on its columns) from partial and noisy observations. There are wide and persistent computational vs. statistical gaps for this problem. It is known that the minimax optimal rate is $\widetilde{O}(n^{-1})$ when error is measured in average squared Frobenius norm. However the best known polynomial time computable estimator due to \cite{coltpaper} achieves the rate $\widetilde{O}(n^{-\frac{3}{4}})$, and this is the natural barrier to approaches based on using local statistics to figure out the relative order of pairs of rows without using information from the rest of the matrix. Here we introduce a framework for exploiting global information in shape-constrained estimation problems. In the case when only the rows are permuted, we give an algorithm that achieves error rate $O(n^{-1 + o(1)})$, which essentially closes the computational vs. statistical gap for this problem. When both the rows and columns are permuted, we give an improved algorithm that achieves error rate $O(n^{-\frac{5}{6} + o(1)})$. Additionally, all of our algorithms run in nearly linear time. }
}
@InProceedings{merlis20a,
title = {Tight Lower Bounds for Combinatorial Multi-Armed Bandits},
author = {Merlis, Nadav and Mannor, Shie},
pages = {2830--2857},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/merlis20a/merlis20a.pdf},
url = {http://proceedings.mlr.press/v125/merlis20a.html},
abstract = { The Combinatorial Multi-Armed Bandit problem is a sequential decision-making problem in which an agent selects a set of arms on each round, observes feedback for each of these arms and aims to maximize a known reward function of the arms it chose. While previous work proved regret upper bounds in this setting for general reward functions, only a few works provided matching lower bounds, all for specific reward functions. In this work, we prove regret lower bounds for combinatorial bandits that hold under mild assumptions for all smooth reward functions. We derive both problem-dependent and problem-independent bounds and show that the recently proposed Gini-weighted smoothness parameter (Merlis and Mannor, 2019) also determines the lower bounds for monotone reward functions. Notably, this implies that our lower bounds are tight up to log-factors.}
}
@InProceedings{mhammedi20a,
title = {Lipschitz and Comparator-Norm Adaptivity in Online Learning},
author = {Mhammedi, Zakaria and Koolen, Wouter M.},
pages = {2858--2887},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/mhammedi20a/mhammedi20a.pdf},
url = {http://proceedings.mlr.press/v125/mhammedi20a.html},
abstract = { We study Online Convex Optimization in the unbounded setting where neither predictions nor gradient are constrained. The goal is to simultaneously adapt to both the sequence of gradients and the comparator. We first develop parameter-free and scale-free algorithms for a simplified setting with hints. We present two versions: the first adapts to the squared norms of both comparator and gradients separately using $O(d)$ time per round, the second adapts to their squared inner products (which measure variance only in the comparator direction) in time $O(d^3)$ per round. We then generalize two prior reductions to the unbounded setting; one to not need hints, and a second to deal with the range ratio problem (which already arises in prior work). We discuss their optimality in light of prior and new lower bounds. We apply our methods to obtain sharper regret bounds for scale-invariant online prediction with linear models. }
}
@InProceedings{misra20a,
title = {Information Theoretic Optimal Learning of Gaussian Graphical Models},
author = {Misra, Sidhant and Vuffray, Marc and Lokhov, Andrey Y.},
pages = {2888--2909},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/misra20a/misra20a.pdf},
url = {http://proceedings.mlr.press/v125/misra20a.html},
abstract = { What is the optimal number of independent observations from which a sparse Gaussian Graphical Model can be correctly recovered? Information-theoretic arguments provide a lower bound on the minimum number of samples necessary to perfectly identify the support of any multivariate normal distribution as a function of model parameters. For a model defined on a sparse graph with $p$ nodes, a maximum degree $d$ and minimum normalized edge strength $\kappa$, this necessary number of samples scales at least as $d \log p/\kappa^2$. The sample complexity requirements of existing methods for perfect graph reconstruction exhibit dependency on additional parameters that do not enter in the lower bound. The question of whether the lower bound is tight and achievable by a polynomial time algorithm remains open. In this paper, we constructively answer this question and propose an algorithm, termed DICE, whose sample complexity matches the information-theoretic lower bound up to a universal constant factor. We also propose a related algorithm SLICE that has a slightly higher sample complexity, but can be implemented as a mixed integer quadratic program which makes it attractive in practice. Importantly, SLICE retains a critical advantage of DICE in that its sample complexity only depends on quantities present in the information theoretic lower bound. We anticipate that this result will stimulate future search of computationally efficient sample-optimal algorithms.}
}
@InProceedings{moitra20a,
title = {Parallels Between Phase Transitions and Circuit Complexity?},
author = {Moitra, Ankur and Mossel, Elchanan and Sandon, Colin},
pages = {2910--2946},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/moitra20a/moitra20a.pdf},
url = {http://proceedings.mlr.press/v125/moitra20a.html},
abstract = { In many natural average-case problems, there are or there are believed to be critical values in the parameter space where the structure of the space of solutions changes in a fundamental way. These phase transitions are often believed to coincide with drastic changes in the computational complexity of the associated problem. In this work, we study the circuit complexity of inference in the broadcast tree model, which has important applications in phylogenetic reconstruction and close connections to community detection. We establish a number of qualitative connections between phase transitions and circuit complexity in this model. Specifically we show that there is a $\mathbf{TC}^0$ circuit that competes with the Bayes optimal predictor in some range of parameters above the Kesten-Stigum bound. We also show that there is a $16$ label broadcast tree model beneath the Kesten-Stigum bound in which it is possible to accurately guess the label of the root, but beating random guessing is $\mathbf{NC}^1$-hard on average. The key to locating phase transitions is often to study some intrinsic notions of complexity associated with belief propagation \— e.g. where do linear statistics fail, or when is the posterior sensitive to noise? Ours is the first work to study the complexity of belief propagation in a way that is grounded in circuit complexity.}
}
@InProceedings{mou20a,
title = {On Linear Stochastic Approximation: Fine-grained {P}olyak-{R}uppert and Non-Asymptotic Concentration},
author = {Mou, Wenlong and Li, Chris Junchi and Wainwright, Martin J and Bartlett, Peter L and Jordan, Michael I},
pages = {2947--2997},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/mou20a/mou20a.pdf},
url = {http://proceedings.mlr.press/v125/mou20a.html},
abstract = { We undertake a precise study of the asymptotic and non-asymptotic properties of stochastic approximation procedures with Polyak-Ruppert averaging for solving a linear system $\bar{A} \theta = \bar{b}$. When the matrix $\bar{A}$ is Hurwitz, we prove a central limit theorem (CLT) for the averaged iterates with fixed step size and number of iterations going to infinity. The CLT characterizes the exact asymptotic covariance matrix, which is the sum of the classical Polyak-Ruppert covariance and a correction term that scales with the step size. Under assumptions on the tail of the noise distribution, we prove a non-asymptotic concentration inequality whose main term matches the covariance in CLT in any direction, up to universal constants. When the matrix $\bar{A}$ is not Hurwitz but only has non-negative real parts in its eigenvalues, we prove that the averaged LSA procedure actually achieves an $O(1/T)$ rate in mean-squared error. Our results provide a more refined understanding of linear stochastic approximation in both the asymptotic and non-asymptotic settings. We also show various applications of the main results, including the study of momentum-based stochastic gradient methods as well as temporal difference algorithms in reinforcement learning.}
}
@InProceedings{nanashima20a,
title = {{Extending Learnability to Auxiliary-Input Cryptographic Primitives and Meta-PAC Learning}},
author = {Nanashima, Mikito},
pages = {2998--3029},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/nanashima20a/nanashima20a.pdf},
url = {http://proceedings.mlr.press/v125/nanashima20a.html},
abstract = { We investigate the meaning of efficient learnability from several different perspectives. The purpose is to give new insights into central problems in computational learning theory (CoLT). Specifically, we discuss the following two questions related to efficient PAC learnability. First, we investigate the gap between PAC learnability for polynomial-size circuits and weak cryptographic primitives taking auxiliary-input. Applebaum et al. observed that such a weak primitive is enough to show the hardness of PAC learning. However, the opposite direction is still unknown. In this paper, we introduce the following two notions: (1) a variant model of PAC learning whose hardness corresponds to auxiliary-input one-way functions; (2) a variant of a hitting set generator corresponding to the hardness of PAC learning. The equivalence gives a clearer insight into the gap between the hardness of learning and weak cryptographic primitives. Second, we discuss why proving efficient learnability is difficult. This question is natural because few classes are known to be polynomially learnable at present. In this paper, we formulate a task of determining efficient learnability as a meta-PAC learning problem and show that our meta-PAC learning is exactly as hard as PAC learning. Our result insists on one possibility: a hard-to-learn instance itself yields the hardness of proving efficient learnability. Our technical contribution is to give (1) a general framework for translating the hardness of PAC learning into auxiliary-input primitives, and (2) a new formulation to discuss the hardness of determining efficient learnability. Our work yields new important frontiers related to CoLT, including investigation of the learning hierarchy. }
}
@InProceedings{neu20a,
title = {Fast Rates for Online Prediction with Abstention},
author = {Neu, Gergely and Zhivotovskiy, Nikita},
pages = {3030--3048},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/neu20a/neu20a.pdf},
url = {http://proceedings.mlr.press/v125/neu20a.html},
abstract = { In the setting of sequential prediction of individual $(0, 1)$-sequences with expert advice, we show that by allowing the learner to abstain from the prediction by paying a cost marginally smaller than $0.5$ (say, $0.49$), it is possible to achieve expected regret bounds that are independent of the time horizon T. We exactly characterize the dependence on the abstention cost $c$ and the number of experts N by providing matching upper and lower bounds of order $\frac{log N} {1−2c}$, which is to be contrasted with the best possible rate of $\sqrt{T\log N}$ that is available without the option to abstain. We also discuss various extensions of our model, including a setting where the sequence of abstention costs can change arbitrarily over time, where we show regret bounds interpolating between the slow and the fast rates mentioned above, under some natural assumptions on the sequence of abstention costs}
}
@InProceedings{neu20b,
title = {Efficient and robust algorithms for adversarial linear contextual bandits},
author = {Neu, Gergely and Olkhovskaya, Julia},
pages = {3049--3068},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/neu20b/neu20b.pdf},
url = {http://proceedings.mlr.press/v125/neu20b.html},
abstract = { We consider an adversarial variant of the classic $K$-armed linear contextual bandit problem where the sequence of loss functions associated with each arm are allowed to change without restriction over time. Under the assumption that the $d$-dimensional contexts are generated i.i.d. at random from a known distribution, we develop computationally efficient algorithms based on the classic Exp3 algorithm. Our first algorithm, RealLinExp3, is shown to achieve a regret guarantee of $\widetilde{O}(\sqrt{KdT})$ over $T$ rounds, which matches the best known lower bound for this problem. Our second algorithm, RobustLinExp3, is shown to be robust to misspecification, in that it achieves a regret bound of $\widetilde{O}((Kd)^{1/3}T^{2/3}) + \varepsilon \sqrt{d} T$ if the true reward function is linear up to an additive nonlinear error uniformly bounded in absolute value by $\varepsilon$. To our knowledge, our performance guarantees constitute the very first results on this problem setting.}
}
@InProceedings{lee20c,
title = {An $\widetilde{\mathcal{O}}(m/\varepsilon^{3.5})$-Cost Algorithm for Semidefinite Programs with Diagonal Constraints},
author = {Lee, Yin Tat and Padmanabhan, Swati},
pages = {3069--3119},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/lee20c/lee20c.pdf},
url = {http://proceedings.mlr.press/v125/lee20c.html},
abstract = {We provide a first-order algorithm for semidefinite programs (SDPs) with diagonal constraints on the matrix variable. Our algorithm outputs an $\varepsilon$-optimal solution with a run time of $\widetilde{\mathcal{O}}(m/\varepsilon^{3.5})$, where $m$ is the number of non-zero entries in the cost matrix. This improves upon the previous best run time of $\widetilde{\mathcal{O}}(m/\varepsilon^{4.5})$ by Arora and Kale. As a corollary of our result, given an instance of the Max-Cut problem with $n$ vertices and $m \gg n$ edges, our algorithm returns a $(1 - \varepsilon)\alpha_{GW}$ cut in the faster time of $\widetilde{\mathcal{O}}(m/\varepsilon^{3.5})$, where $\alpha_{GW} \approx 0.878567$ is the approximation ratio by Goemans and Williamson. Our key technical contribution is to combine an approximate variant of the Arora-Kale framework of mirror descent for SDPs with the idea of trading off exact computations in every iteration for variance-reduced estimations in most iterations, only periodically resetting the accumulated error with exact computations. This idea, along with the constructed estimator, are of possible independent interest for other problems that use the mirror descent framework. }
}
@InProceedings{paes-leme20a,
title = {Costly Zero Order Oracles},
author = {Paes Leme, Renato and Schneider, Jon},
pages = {3120--3132},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/paes-leme20a/paes-leme20a.pdf},
url = {http://proceedings.mlr.press/v125/paes-leme20a.html},
abstract = { We study optimization with an approximate zero order oracle where there is a cost $c(\epsilon)$ associated with querying the oracle with $\epsilon$ accuracy. We consider the task of reconstructing a linear function: given a linear function $f : X \subseteq \R^d \rightarrow [-1,1]$ defined on a not-necessarily-convex set $X$, the goal is to reconstruct $\hat f$ such that $\vert{f(x) - \hat f(x)}\vert \leq \epsilon$ for all $x \in X$. We show that this can be done with cost $O(d \cdot c(\epsilon/d))$. The algorithm is based on a (poly-time computable) John-like theorem for simplices instead of ellipsoids, which may be of independent interest. This question is motivated by optimization with threshold queries, which are common in economic applications such as pricing. With threshold queries, approximating a number up to precision $\epsilon$ requires $c(\epsilon) = \log(1/\epsilon)$. For this, our algorithm has cost $O(d \log(d/\epsilon))$ which matches the $\Omega(d \log(1/\epsilon))$ lower bound up to log factors. }
}
@InProceedings{parthasarathy20a,
title = {Adaptive Submodular Maximization under Stochastic Item Costs},
author = {Parthasarathy, Srinivasan},
pages = {3133--3151},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/parthasarathy20a/parthasarathy20a.pdf},
url = {http://proceedings.mlr.press/v125/parthasarathy20a.html},
abstract = { Constrained maximization of non-decreasing utility functions with submodularity-like properties is at the core of several AI and ML applications including viral marketing, pool-based active learning, adaptive feature selection, and sensor deployment. In this work, we develop adaptive policies for maximizing such functions when both the utility function and item costs may be stochastic. First, we study maximization of an adaptive weak submodular function which is submodular in an approximate and probabilistic sense, subject to a stochastic fractional knapsack constraint, which requires total expected item cost to be within a given capacity. We present the $\beta$-GREEDY policy for this problem; our approximation guarantee for it recovers many known greedy maximization guarantees as special cases. Next, we study maximization of an adaptive submodular function, which is submodular in a probabilistic sense, subject to a stochastic knapsack constraint, which requires the total item cost to be within a given capacity with probability one. We present the MIX policy for this problem; our approximation guarantee for it is the first known approximation guarantee for maximizing a non-linear utility function subject to a stochastic knapsack constraint. Using alternative parameterizations of MIX, we also derive the first known approximation guarantees for maximizing an adaptive submodular function subject to a deterministic knapsack constraint. Our guarantees are powered by an innovative differential analysis technique which models the $\beta$-GREEDY policy using a continuous-time stochastic reward process of a particle whose reward is related to the optimal utility through a differential inequality. The solution to this inequality yields our $\beta$-GREEDY guarantee. We combine differential analysis with a variety of other ideas to derive our MIX guarantees.}
}
@InProceedings{perrault20a,
title = {Covariance-adapting algorithm for semi-bandits with application to sparse outcomes},
author = {Perrault, Pierre and Valko, Michal and Perchet, Vianney},
pages = {3152--3184},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/perrault20a/perrault20a.pdf},
url = {http://proceedings.mlr.press/v125/perrault20a.html},
abstract = { We investigate \emph{stochastic combinatorial semi-bandits}, where the entire joint distribution of outcomes impacts the complexity of the problem instance (unlike in the standard bandits). Typical distributions considered depend on specific parameter values, whose prior knowledge is required in theory but quite difficult to estimate in practice; an example is the commonly assumed \emph{sub-Gaussian} family. We alleviate this issue by instead considering a new general family of \emph{sub-exponential} distributions, which contains bounded and Gaussian ones. We prove a new lower bound on the regret on this family, that is parameterized by the \emph{unknown} covariance matrix, a tighter quantity than the sub-Gaussian matrix. We then construct an algorithm that uses covariance estimates, and provide a tight asymptotic analysis of the regret. Finally, we apply and extend our results to the family of sparse outcomes, which has applications in many recommender systems. }
}
@InProceedings{qu20a,
title = {Finite-Time Analysis of Asynchronous Stochastic Approximation and $Q$-Learning},
author = {Qu, Guannan and Wierman, Adam},
pages = {3185--3205},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/qu20a/qu20a.pdf},
url = {http://proceedings.mlr.press/v125/qu20a.html},
abstract = { We consider a general asynchronous Stochastic Approximation (SA) scheme featuring a weighted infinity-norm contractive operator, and prove a bound on its finite-time convergence rate on a single trajectory. Additionally, we specialize the result to asynchronous $Q$-learning. The resulting bound matches the sharpest available bound for synchronous $Q$-learning, and improves over previous known bounds for asynchronous $Q$-learning.}
}
@InProceedings{raghavendra20a,
title = {List Decodable Subspace Recovery},
author = {Raghavendra, Prasad and Yau, Morris},
pages = {3206--3226},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/raghavendra20a/raghavendra20a.pdf},
url = {http://proceedings.mlr.press/v125/raghavendra20a.html},
abstract = { Learning from data in the presence of outliers is a fundamental problem in statistics. In this work, we study robust statistics in the presence of overwhelming outliers for the fundamental problem of subspace recovery. Given a dataset where an $\alpha$ fraction (less than half) of the data is distributed uniformly in an unknown $k$ dimensional subspace in $d$ dimensions, and with no additional assumptions on the remaining data, the goal is to recover a succinct list of $O(\frac{1}{\alpha})$ subspaces one of which is nontrivially correlated with the planted subspace. We provide the first polynomial time algorithm for the ’list decodable subspace recovery’ problem, and subsume it under a more general framework of list decoding over distributions that are "certifiably resilient" capturing state of the art results for list decodable mean estimation and regression.}
}
@InProceedings{rouyer20a,
title = {Tsallis-INF for Decoupled Exploration and Exploitation in Multi-armed Bandits},
author = {Rouyer, Chlo{\'e} and Seldin, Yevgeny},
pages = {3227--3249},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/rouyer20a/rouyer20a.pdf},
url = {http://proceedings.mlr.press/v125/rouyer20a.html},
abstract = { We consider a variation of the multi-armed bandit problem, introduced by Avner et al. (2012), in which the forecaster is allowed to choose one arm to explore and one arm to exploit at every round. The loss of the exploited arm is blindly suffered by the forecaster, while the loss of the explored arm is observed without being suffered. The goal of the learner is to minimize the regret. We derive a new algorithm using regularization by Tsallis entropy to achieve best of both worlds guarantees. In the adversarial setting we show that the algorithm achieves the minimax optimal $O(\sqrt{KT})$ regret bound, slightly improving on the result of Avner et al.. In the stochastic regime the algorithm achieves a time-independent regret bound, significantly improving on the result of Avner et al.. The algorithm also achieves the same time-independent regret bound in the more general stochastically constrained adversarial regime introduced by Wei and Luo (2018).}
}
@InProceedings{safran20a,
title = {How Good is SGD with Random Shuffling?},
author = {Safran, Itay and Shamir, Ohad},
pages = {3250--3284},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/safran20a/safran20a.pdf},
url = {http://proceedings.mlr.press/v125/safran20a.html},
abstract = { We study the performance of stochastic gradient descent (SGD) on smooth and strongly-convex finite-sum optimization problems. In contrast to the majority of existing theoretical works, which assume that individual functions are sampled with replacement, we focus here on popular but poorly-understood heuristics, which involve going over random permutations of the individual functions. This setting has been investigated in several recent works, but the optimal error rates remain unclear. In this paper, we provide lower bounds on the expected optimization error with these heuristics (using SGD with any constant step size), which elucidate their advantages and disadvantages. In particular, we prove that after $k$ passes over $n$ individual functions, if the functions are re-shuffled after every pass, the best possible optimization error for SGD is at least $\Omega\left(1/(nk)^2+1/nk^3\right)$, which partially corresponds to recently derived upper bounds. Moreover, if the functions are only shuffled once, then the lower bound increases to $\Omega(1/nk^2)$. Since there are strictly smaller upper bounds for repeated reshuffling, this proves an inherent performance gap between SGD with single shuffling and repeated shuffling. As a more minor contribution, we also provide a non-asymptotic $\Omega(1/k^2)$ lower bound (independent of $n$) for the incremental gradient method, when no random shuffling takes place. Finally, we provide an indication that our lower bounds are tight, by proving matching upper bounds for univariate quadratic functions.}
}
@InProceedings{schmalhofer20a,
title = {A Nearly Optimal Variant of the Perceptron Algorithm for the Uniform Distribution on the Unit Sphere},
author = {Schmalhofer, Marco},
pages = {3285--3295},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/schmalhofer20a/schmalhofer20a.pdf},
url = {http://proceedings.mlr.press/v125/schmalhofer20a.html},
abstract = { We show a simple perceptron-like algorithm to learn origin-centered halfspaces in $\mathbb{R}^n$ with accuracy $1-\epsilon$ and confidence $1-\delta$ in time $\mathcal{O}\left(\frac{n^2}{\epsilon}\left(\log \frac{1}{\epsilon}+\log \frac{1}{\delta}\right)\right)$ using $\mathcal{O}\left(\frac{n}{\epsilon}\left(\log \frac{1}{\epsilon}+\log \frac{1}{\delta}\right)\right)$ labeled examples drawn uniformly from the unit $n$-sphere. This improves upon algorithms given in Baum(1990), Long(1994) and Servedio(1999). The time and sample complexity of our algorithm match the lower bounds given in Long(1995) up to logarithmic factors. }
}
@InProceedings{shamir20a,
title = {Logistic Regression Regret: What’s the Catch?},
author = {Shamir, Gil I},
pages = {3296--3319},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/shamir20a/shamir20a.pdf},
url = {http://proceedings.mlr.press/v125/shamir20a.html},
abstract = { We address the problem of the achievable regret rates with online logistic regression. We derive lower bounds with logarithmic regret under $L_1$, $L_2$, and $L_\infty$ constraints on the parameter values. The bounds are dominated by $d/2 \log T$, where $T$ is the horizon and $d$ is the dimensionality of the parameter space. We show their achievability for $d=o(T^{1/3})$ in all these cases with Bayesian methods, that achieve them up to a $d/2 \log d$ term. Interesting different behaviors are shown for larger dimensionality. Specifically, on the negative side, if $d = \Omega(\sqrt{T})$, any algorithm is guaranteed regret of $\Omega(d \log T)$ (greater than $\Theta(\sqrt{T})$) under $L_\infty$ constraints on the parameters (and the example features). On the positive side, under $L_1$ constraints on the parameters, there exist Bayesian algorithms that can achieve regret that is sub-linear in $d$ for the asymptotically larger values of $d$. For $L_2$ constraints, it is shown that for large enough $d$, the regret remains linear in $d$ but no longer logarithmic in $T$. Adapting the \emph{redundancy-capacity\/} theorem from information theory, we demonstrate a principled methodology based on grids of parameters to derive lower bounds. Grids are also utilized to derive some upper bounds. Our results strengthen results by Kakade and Ng (2005) and Foster et al. (2018) for upper bounds for this problem, introduce novel lower bounds, and adapt a methodology that can be used to obtain such bounds for other related problems. They also give a novel characterization of the asymptotic behavior when the dimension of the parameter space is allowed to grow with $T$. They additionally strengthen connections to the information theory literature, demonstrating that the actual regret for logistic regression depends on the richness of the parameter class, where even within this problem, richer classes lead to greater regret.}
}
@InProceedings{simchowitz20a,
title = {Improper Learning for Non-Stochastic Control},
author = {Simchowitz, Max and Singh, Karan and Hazan, Elad},
pages = {3320--3436},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/simchowitz20a/simchowitz20a.pdf},
url = {http://proceedings.mlr.press/v125/simchowitz20a.html},
abstract = { We consider the problem of controlling a possibly unknown linear dynamical system with adversarial perturbations, adversarially chosen convex loss functions, and partially observed states, known as non-stochastic control. We introduce a controller parametrization based on the denoised observations, and prove that applying online gradient descent to this parametrization yields a new controller which attains sublinear regret vs. a large class of closed-loop policies. In the fully-adversarial setting, our controller attains an optimal regret bound of $\sqrt{T}$-when the system is known, and, when combined with an initial stage of least-squares estimation, $T^{2/3}$ when the system is unknown; both yield the first sublinear regret for the partially observed setting. Our bounds are the first in the non-stochastic control setting that compete with \emph{all} stabilizing linear dynamical controllers, not just state feedback. Moreover, in the presence of semi-adversarial noise containing both stochastic and adversarial components, our controller attains the optimal regret bounds of $\mathrm{poly}(\log T)$ when the system is known, and $\sqrt{T}$ when unknown. To our knowledge, this gives the first end-to-end $\sqrt{T}$ regret for online Linear Quadratic Gaussian controller, and applies in a more general setting with adversarial losses and semi-adversarial noise.}
}
@InProceedings{steinke20a,
title = {{R}easoning {A}bout {G}eneralization via {C}onditional {M}utual {I}nformation},
author = {Steinke, Thomas and Zakynthinou, Lydia},
pages = {3437--3452},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/steinke20a/steinke20a.pdf},
url = {http://proceedings.mlr.press/v125/steinke20a.html},
abstract = { We provide an information-theoretic framework for studying the generalization properties of machine learning algorithms. Our framework ties together existing approaches, including uniform convergence bounds and recent methods for adaptive data analysis. Specifically, we use Conditional Mutual Information (CMI) to quantify how well the input (i.e., the training data) can be recognized given the output (i.e., the trained model) of the learning algorithm. We show that bounds on CMI can be obtained from VC dimension, compression schemes, differential privacy, and other methods. We then show that bounded CMI implies various forms of generalization.}
}
@InProceedings{syrgkanis20a,
title = {Estimation and Inference with Trees and Forests in High Dimensions},
author = {Syrgkanis, Vasilis and Zampetakis, Manolis},
pages = {3453--3454},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/syrgkanis20a/syrgkanis20a.pdf},
url = {http://proceedings.mlr.press/v125/syrgkanis20a.html},
abstract = { Regression Trees [Breiman et al. 1984] and Random Forests [Breiman 2001], are one of the most widely used estimation methods by machine learning practitioners. Despite their widespread use, their theoretical underpinnings are far from being fully understood. Recent breakthrough advances have shown that such greedily built trees are asymptotically consistent [Biau et al. 2010, Denil et al. 2014, Scornet et al. 2015] in the low dimensional regime, where the number of features is a constant, independent of the sample size. Also, the works of [Mentch et al. 2016, Wager et al. 2018] provide asymptotic normality results for honest versions of Random Forests. In this work, we analyze the performance of regression trees and forests with binary features in the high-dimensional regime, where the number of features can grow exponentially with the number of samples. We show that trees and forests built greedily based on the celebrated CART criterion, provably adapt to sparsity: when only a subset $R$, of size $r$, of the features are relevant, then the mean squared error of appropriately shallow trees, or fully grown honest forests, scales exponentially only with the number of relevant features and depends only logarithmically on the overall number of features. More precisely, we identify three regimes, each providing different dependence on the number of relevant features. When the relevant variables are “weakly” relevant (in the sense that there is not strong separation between the relevant and irrelevant variables in terms of their ability to reduce variance), then shallow trees achieve “slow rates” on the mean squared error of the order of $2^r/\sqrt{n}$, when variables are independent, and $1/n^{1/(r+2)}$, when variables are dependent. When the relevant variables are “strongly” relevant, in that there is a separation in their ability to reduce variance as compared to the irrelevant ones, by a constant $\beta_{\min}$, then we show that greedily built shallow trees and fully grown honest forests can achieve fast parametric mean squared error rates of the order of $2^r/(\beta_{\min}\,{n})$. When variables are strongly relevant, we also show that the predictions of sub-sampled honest forests have an asymptotically normal distribution centered around their true values and whose variance scales at most as $O(2^r \log(n)/(\beta_{\min}\,{n}))$. Thus, sub-sampled honest forests are provably a data-adaptive method for non-parametric inference, that adapts to the latent sparsity dimension of the data generating distribution, as opposed to classical non-parametric regression approaches. Our results show that, at least for the case of binary features, forest based algorithms can offer immense improvement on the statistical power of non-parametric hypothesis tests in high-dimensional regimes. }
}
@InProceedings{turner20a,
title = {Balancing Gaussian vectors in high dimension},
author = {Turner, Paxton and Meka, Raghu and Rigollet, Philippe},
pages = {3455--3486},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/turner20a/turner20a.pdf},
url = {http://proceedings.mlr.press/v125/turner20a.html},
abstract = { Motivated by problems in controlled experiments, we study the discrepancy of random matrices with continuous entries where the number of columns $n$ is much larger than the number of rows $m$. Our first result shows that if $\omega(1) = m = o(n)$, a matrix with i.i.d. standard Gaussian entries has discrepancy $\Theta(\sqrt{n} \, 2^{-n/m})$ with high probability. This provides sharp guarantees for Gaussian discrepancy in a regime that had not been considered before in the existing literature. Our results also apply to a more general family of random matrices with continuous i.i.d. entries, assuming that $m = O(n/\log{n})$. The proof is non-constructive and is an application of the second moment method. Our second result is algorithmic and applies to random matrices whose entries are i.i.d. and have a Lipschitz density. We present a randomized polynomial-time algorithm that achieves discrepancy $e^{-\Omega(\log^2(n)/m)}$ with high probability, provided that $m = O(\sqrt{\log{n}})$. In the one-dimensional case, this matches the best known algorithmic guarantees due to Karmarkar–Karp. For higher dimensions $2 \leq m = O(\sqrt{\log{n}})$, this establishes the first efficient algorithm achieving discrepancy smaller than $O( \sqrt{m} )$. }
}
@InProceedings{wagenmaker20a,
title = {Active Learning for Identification of Linear Dynamical Systems},
author = {Wagenmaker, Andrew and Jamieson, Kevin},
pages = {3487--3582},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/wagenmaker20a/wagenmaker20a.pdf},
url = {http://proceedings.mlr.press/v125/wagenmaker20a.html},
abstract = { We propose an algorithm to actively estimate the parameters of a linear dynamical system. Given complete control over the system’s input, our algorithm adaptively chooses the inputs to accelerate estimation. We show a finite time bound quantifying the estimation rate our algorithm attains and prove matching upper and lower bounds which guarantee its asymptotic optimality, up to constants. In addition, we show that this optimal rate is unattainable when using Gaussian noise to excite the system, even with optimally tuned covariance, and analyze several examples where our algorithm provably improves over rates obtained by playing noise. Our analysis critically relies on a novel result quantifying the error in estimating the parameters of a dynamical system when arbitrary periodic inputs are being played. We conclude with numerical examples that illustrate the effectiveness of our algorithm in practice. }
}
@InProceedings{wei20a,
title = {Taking a hint: How to leverage loss predictors in contextual bandits?},
author = {Wei, Chen-Yu and Luo, Haipeng and Agarwal, Alekh},
pages = {3583--3634},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/wei20a/wei20a.pdf},
url = {http://proceedings.mlr.press/v125/wei20a.html},
abstract = { We initiate the study of learning in contextual bandits with the help of loss predictors. The main question we address is whether one can improve over the minimax regret $\mathcal{O}(\sqrt{T})$ for learning over $T$ rounds, when the total error of the predicted losses relative to the realized losses, denoted as $\mathcal{E} \leq T$, is relatively small. We provide a complete answer to this question, with upper and lower bounds for various settings: adversarial and stochastic environments, known and unknown $\mathcal{E}$, and single and multiple predictors. We show several surprising results, such as 1) the optimal regret is $\mathcal{O}(\min\{\sqrt{T}, \sqrt{\mathcal{E}}T^\frac{1}{4}\})$ when $\mathcal{E}$ is known, in contrast to the standard and better bound $\mathcal{O}(\sqrt{\mathcal{E}})$ for non-contextual problems (such as multi-armed bandits); 2) the same bound cannot be achieved if $\mathcal{E}$ is unknown, but as a remedy, $\mathcal{O}(\sqrt{\mathcal{E}}T^\frac{1}{3})$ is achievable; 3) with $M$ predictors, a linear dependence on $M$ is necessary, even though logarithmic dependence is possible for non-contextual problems. We also develop several novel algorithmic techniques to achieve matching upper bounds, including 1) a key \emph{action remapping} technique for optimal regret with known $\mathcal{E}$, 2) computationally efficient implementation of Catoni’s robust mean estimator via an ERM oracle in the stochastic setting with optimal regret, 3) an underestimator for $\mathcal{E}$ via estimating the histogram with bins of exponentially increasing size for the stochastic setting with unknown $\mathcal{E}$, and 4) a self-referential scheme for learning with multiple predictors, all of which might be of independent interest. }
}
@InProceedings{woodworth20a,
title = {Kernel and Rich Regimes in Overparametrized Models},
author = {Woodworth, Blake and Gunasekar, Suriya and Lee, Jason D. and Moroshko, Edward and Savarese, Pedro and Golan, Itay and Soudry, Daniel and Srebro, Nathan},
pages = {3635--3673},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/woodworth20a/woodworth20a.pdf},
url = {http://proceedings.mlr.press/v125/woodworth20a.html},
abstract = { A recent line of work studies overparametrized neural networks in the “kernel regime,” i.e. when during training the network behaves as a kernelized linear predictor, and thus, training with gradient descent has the effect of finding the corresponding minimum RKHS norm solution. This stands in contrast to other studies which demonstrate how gradient descent on overparametrized networks can induce rich implicit biases that are not RKHS norms. Building on an observation by \citet{chizat2018note}, we show how the \textbf{\textit{scale of the initialization}} controls the transition between the “kernel” (aka lazy) and “rich” (aka active) regimes and affects generalization properties in multilayer homogeneous models. We provide a complete and detailed analysis for a family of simple depth-$D$ linear networks that exhibit an interesting and meaningful transition between the kernel and rich regimes, and highlight an interesting role for the \emph{width} of the models. We further demonstrate this transition empirically for matrix factorization and multilayer non-linear networks.}
}
@InProceedings{xie20a,
title = {Learning Zero-Sum Simultaneous-Move Markov Games Using Function Approximation and Correlated Equilibrium},
author = {Xie, Qiaomin and Chen, Yudong and Wang, Zhaoran and Yang, Zhuoran},
pages = {3674--3682},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/xie20a/xie20a.pdf},
url = {http://proceedings.mlr.press/v125/xie20a.html},
abstract = { In this work, we develop provably efficient reinforcement learning algorithms for two-player zero-sum Markov games with simultaneous moves. We consider a family of Markov games where the reward function and transition kernel possess a linear structure. Two settings are studied: In the offline setting, we control both players and the goal is to find the Nash Equilibrium efficiently by minimizing the worst-case duality gap. In the online setting, we control a single player and play against an arbitrary opponent; the goal is to minimize the regret. For both settings, we propose an optimistic variant of the least-squares minimax value iteration algorithm. We show that our algorithm is computationally efficient and provably achieves an $\tilde O(\sqrt{d^3 H^3 T} )$ upper bound on the duality gap and regret, without requiring additional assumptions on the sampling model. We highlight that our setting requires overcoming several new challenges that are absent in MDPs or turn-based Markov games. In particular, to achieve optimism under the simultaneous-move games, we construct both upper and lower confidence bounds of the value function, and then derive the optimistic policy by solving a general-sum matrix game with these bounds as the payoff matrices. As finding the Nash Equilibrium of this general-sum game is computationally hard, our algorithm instead solves for a Coarse Correlated Equilibrium (CCE), which can be obtained efficiently via linear programming. To our best knowledge, such a CCE-based mechanism for implementing optimism has not appeared in the literature and might be of interest in its own right.}
}
@InProceedings{xu20a,
title = {Tree-projected gradient descent for estimating gradient-sparse parameters on graphs},
author = {Xu, Sheng and Fan, Zhou and Negahban, Sahand},
pages = {3683--3708},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/xu20a/xu20a.pdf},
url = {http://proceedings.mlr.press/v125/xu20a.html},
abstract = { We study estimation of a gradient-sparse parameter vector $\boldsymbol{\theta}^* \in \mathbb{R}^p$, having strong gradient-sparsity $s^*:=\|\nabla_G \boldsymbol{\theta}^*\|_0$ on an underlying graph $G$. Given observations $Z_1,\ldots,Z_n$ and a smooth, convex loss function $\mathcal{L}$ for which $\boldsymbol{\theta}^*$ minimizes the population risk $\mathbb{E}[\mathcal{L}(\boldsymbol{\theta};Z_1,\ldots,Z_n)]$, we propose to estimate $\boldsymbol{\theta}^*$ by a projected gradient descent algorithm that iteratively and approximately projects gradient steps onto spaces of vectors having small gradient-sparsity over low-degree spanning trees of $G$. We show that, under suitable restricted strong convexity and smoothness assumptions for the loss, the resulting estimator achieves the squared-error risk $\frac{s^*}{n} \log (1+\frac{p}{s^*})$ up to a multiplicative constant that is independent of $G$. In contrast, previous polynomial-time algorithms have only been shown to achieve this guarantee in more specialized settings, or under additional assumptions for $G$ and/or the sparsity pattern of $\nabla_G \boldsymbol{\theta}^*$. As applications of our general framework, we apply our results to the examples of linear models and generalized linear models with random design.}
}
@InProceedings{yang20a,
title = {Non-asymptotic Analysis for Nonparametric Testing},
author = {Yang, Yun and Shang, Zuofeng and Cheng, Guang},
pages = {3709--3755},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/yang20a/yang20a.pdf},
url = {http://proceedings.mlr.press/v125/yang20a.html},
abstract = { We develop a non-asymptotic framework for hypothesis testing in nonparametric regression where the true regression function belongs to a Sobolev space. Our statistical guarantees are exact in the sense that Type I and II errors are controlled for any finite sample size. Meanwhile, one proposed test is shown to achieve minimax rate optimality in the asymptotic sense. An important consequence of this non-asymptotic theory is a new and practically useful formula for selecting the optimal smoothing parameter in the testing statistic. Extensions of our results to general reproducing kernel Hilbert spaces and non-Gaussian error regression are also discussed.}
}
@InProceedings{yehudai20a,
title = {Learning a Single Neuron with Gradient Methods},
author = {Yehudai, Gilad and Ohad, Shamir},
pages = {3756--3786},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/yehudai20a/yehudai20a.pdf},
url = {http://proceedings.mlr.press/v125/yehudai20a.html},
abstract = { We consider the fundamental problem of learning a single neuron $\mathbf{x}\mapsto \sigma(\mathbf{w}^\top\mathbf{x})$ in a realizable setting, using standard gradient methods with random initialization, and under general families of input distributions and activations. On the one hand, we show that some assumptions on both the distribution and the activation function are necessary. On the other hand, we prove positive guarantees under mild assumptions, which go significantly beyond those studied in the literature so far. We also point out and study the challenges in further strengthening and generalizing our results.}
}
@InProceedings{yuan20a,
title = {Nearly Non-Expansive Bounds for Mahalanobis Hard Thresholding},
author = {Yuan, Xiao-Tong and Li, Ping},
pages = {3787--3813},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/yuan20a/yuan20a.pdf},
url = {http://proceedings.mlr.press/v125/yuan20a.html},
abstract = { Given a vector $w \in \mathbb{R}^p$ and a positive semi-definite matrix $A \in \mathbb{R}^{p\times p}$, we study the expansion ratio bound for the following defined Mahalanobis hard thresholding operator of $w$: \[ \mathcal{H}_{A,k}(w):=\argmin_{\|\theta\|_0\le k} \frac{1}{2}\|\theta - w\|^2_A, \]{where} $k\le p$ is the desired sparsity level. The core contribution of this paper is to prove that for any $\bar k$-sparse vector $\bar w$ with $\bar k < k$, the estimation error $\|\mathcal{H}_{A,k}(w) - \bar w\|_A$ satisfies \[ \|\mathcal{H}_{A,k}(w) - \bar w\|^2_A \le \left(1+ \mathcal{O}\left(\kappa(A,2k) \sqrt{\frac{\bar k }{k - \bar k}}\right)\right) \|{w} - \bar w\|^2_A, \]{where} $\kappa(A,2k)$ is the restricted strong condition number of $A$ over $(2k)$-sparse subspace. This estimation error bound is nearly non-expansive when $k$ is sufficiently larger than $\bar k$. Specially when $A$ is the identity matrix such that $\kappa(A,2k)\equiv1$, our bound recovers the previously known nearly non-expansive bounds for Euclidean hard thresholding operator. We further show that such a bound extends to an approximate version of $\mathcal{H}_{A,k}(w)$ estimated by Hard Thresholding Pursuit (HTP) algorithm. We demonstrate the applicability of these bounds to the mean squared error analysis of HTP and its novel extension based on preconditioning method. Numerical evidence is provided to support our theory and demonstrate the superiority of the proposed preconditioning HTP algorithm. }
}
@InProceedings{zhang20a,
title = {Wasserstein Control of Mirror Langevin Monte Carlo},
author = {Zhang, Kelvin Shuangjian and Peyr\'e, Gabriel and Fadili, Jalal and Pereyra, Marcelo},
pages = {3814--3841},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/zhang20a/zhang20a.pdf},
url = {http://proceedings.mlr.press/v125/zhang20a.html},
abstract = { Discretized Langevin diffusions are efficient Monte Carlo methods for sampling from high dimensional target densities that are log-Lipschitz-smooth and (strongly) log-concave. In particular, the Euclidean Langevin Monte Carlo sampling algorithm has received much attention lately, leading to a detailed understanding of its non-asymptotic convergence properties and of the role that smoothness and log-concavity play in the convergence rate. Distributions that do not possess these regularity properties can be addressed by considering a Riemannian Langevin diffusion with a metric capturing the local geometry of the log-density. However, the Monte Carlo algorithms derived from discretizations of such Riemannian Langevin diffusions are notoriously difficult to analyze. In this paper, we consider Langevin diffusions on a Hessian-type manifold and study a discretization that is closely related to the mirror-descent scheme. We establish for the first time a non-asymptotic upper-bound on the sampling error of the resulting Hessian Riemannian Langevin Monte Carlo algorithm. This bound is measured according to a Wasserstein distance induced by a Riemannian metric ground cost capturing the squared Hessian structure and closely related to a self-concordance-like condition. The upper-bound implies, for instance, that the iterates contract toward a Wasserstein ball around the target density whose radius is made explicit. Our theory recovers existing Euclidean results and can cope with a wide variety of Hessian metrics related to highly non-flat geometries. }
}
@InProceedings{foster20a,
title = {Open Problem: Model Selection for Contextual Bandits},
author = {Foster, Dylan J. and Krishnamurthy, Akshay and Luo, Haipeng},
pages = {3842--3846},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/foster20a/foster20a.pdf},
url = {http://proceedings.mlr.press/v125/foster20a.html},
abstract = {In statistical learning, algorithms for model selection allow the learner to adapt to the complexity of the best hypothesis class in a sequence. We ask whether similar guarantees are possible for contextual bandit learning.}
}
@InProceedings{koren20a,
title = {Open Problem: Tight Convergence of SGD in Constant Dimension},
author = {Koren, Tomer and Segal, Shahar},
pages = {3847--3851},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/koren20a/koren20a.pdf},
url = {http://proceedings.mlr.press/v125/koren20a.html},
abstract = {Stochastic Gradient Descent (SGD) is one of the most popular optimization methods in machine learning and has been studied extensively since the early 50’s. However, our understanding of this fundamental algorithm is still lacking in certain aspects. We point out to a gap that remains between the known upper and lower bounds for the expected suboptimality of the last SGD point whenever the dimension is a constant independent of the number of SGD iterations $T$, and in particular, that the gap is still unaddressed even in the one dimensional case. For the latter, we provide evidence that the correct rate is $\Theta(1/\sqrt{T})$ and conjecture that the same applies in any (constant) dimension.}
}
@InProceedings{luo20a,
title = {Open Problem: Average-Case Hardness of Hypergraphic Planted Clique Detection},
author = {Luo, Yuetian and Zhang, Anru R},
pages = {3852--3856},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/luo20a/luo20a.pdf},
url = {http://proceedings.mlr.press/v125/luo20a.html},
abstract = {We note the significance of hypergraphic planted clique (HPC) detection in the investigation of computational hardness for a range of tensor problems. We ask if more evidence for the computational hardness of HPC detection can be developed. In particular, we conjecture if it is possible to establish the equivalence of the computational hardness between HPC and PC detection.}
}
@InProceedings{steinke20b,
title = {{O}pen {P}roblem: {I}nformation {C}omplexity of {VC} {L}earning},
author = {Steinke, Thomas and Zakynthinou, Lydia},
pages = {3857--3863},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/steinke20b/steinke20b.pdf},
url = {http://proceedings.mlr.press/v125/steinke20b.html},
abstract = {Uniform convergence approaches learning by studying the complexity of hypothesis classes. In particular, hypothesis classes with bounded Vapnik-Chervonenkis dimension exhibit strong uniform convergence, that is, any hypothesis in the class has low generalization error. On the other hand, a long line of work studies the information complexity of a learning algorithm, as it is connected to several desired properties, including generalization. We ask whether all VC classes admit a learner with low information complexity which achieves the generalization bounds guaranteed by uniform convergence. Specifically, since we know that this is not possible if we consider proper and consistent learners and measure information complexity in terms of the mutual information (Bassily et al., 2018), we are interested in learners with low information complexity measured in terms of the recently introduced notion of CMI (Steinke and Zakynthinou, 2020). Can we obtain tight bounds on the information complexity of a learning algorithm for a VC class (via CMI), thus exactly retrieving the known generalization bounds implied for this class by uniform convergence?}
}
@InProceedings{van-erven20a,
title = {Open Problem: Fast and Optimal Online Portfolio Selection},
author = {{Van Erven}, Tim and {Van der Hoeven}, Dirk and Kot{\l}owski, Wojciech and Koolen, Wouter M.},
pages = {3864--3869},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
address = {},
month = {09--12 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/van-erven20a/van-erven20a.pdf},
url = {http://proceedings.mlr.press/v125/van-erven20a.html},
abstract = {Online portfolio selection has received much attention in the COLT community since its introduction by Cover, but all state-of-the-art methods fall short in at least one of the following ways: they are either i) computationally infeasible; or ii) they do not guarantee optimal regret; or iii) they assume the gradients are bounded, which is unnecessary and cannot be guaranteed. We are interested in a natural follow-the-regularized-leader (FTRL) approach based on the log barrier regularizer, which is computationally feasible. The open problem we put before the community is to formally prove whether this approach achieves the optimal regret. Resolving this question will likely lead to new techniques to analyse FTRL algorithms. There are also interesting technical connections to self-concordance, which has previously been used in the context of bandit convex optimization.}
}