@Proceedings{COLT2022,
title = {Proceedings of Thirty Fifth Conference on Learning Theory},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
editor = {Po-Ling Loh and Maxim Raginsky},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = 178
}
@InProceedings{pmlr-v178-preface-loh22a,
title = {Conference on Learning Theory 2022: Preface},
author = {Loh, Po-Ling and Raginsky, Maxim},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {i--ii},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/preface-loh22a/preface-loh22a.pdf},
url = {https://proceedings.mlr.press/v178/preface-loh22a.html}
}
@InProceedings{pmlr-v178-chewi22a,
title = {Analysis of Langevin Monte Carlo from Poincare to Log-Sobolev},
author = {Chewi, Sinho and Erdogdu, Murat A and Li, Mufan and Shen, Ruoqi and Zhang, Shunshi},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1--2},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/chewi22a/chewi22a.pdf},
url = {https://proceedings.mlr.press/v178/chewi22a.html},
abstract = {Classically, the continuous-time Langevin diffusion converges exponentially fast to its stationary distribution $\pi$ under the sole assumption that $\pi$ satisfies a Poincaré inequality. Using this fact to provide guarantees for the discrete-time Langevin Monte Carlo (LMC) algorithm, however, is considerably more challenging due to the need for working with chi-squared or Rényi divergences, and prior works have largely focused on strongly log-concave targets. In this work, we provide the first convergence guarantees for LMC assuming that $\pi$ satisfies either a Latał{}a–Oleszkiewicz or modified log-Sobolev inequality, which interpolates between the Poincaré and log-Sobolev settings. Unlike prior works, our results allow for weak smoothness and do not require convexity or dissipativity conditions.}
}
@InProceedings{pmlr-v178-safran22a,
title = {Optimization-Based Separations for Neural Networks},
author = {Safran, Itay and Lee, Jason},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3--64},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/safran22a/safran22a.pdf},
url = {https://proceedings.mlr.press/v178/safran22a.html},
abstract = {Depth separation results propose a possible theoretical explanation for the benefits of deep neural networks over shallower architectures, establishing that the former possess superior approximation capabilities. However, there are no known results in which the deeper architecture leverages this advantage into a provable optimization guarantee. We prove that when the data are generated by a distribution with radial symmetry which satisfies some mild assumptions, gradient descent can efficiently learn ball indicator functions using a depth 2 neural network with two layers of sigmoidal activations, and where the hidden layer is held fixed throughout training. By building on and refining existing techniques for approximation lower bounds of neural networks with a single layer of non-linearities, we show that there are $d$-dimensional radial distributions on the data such that ball indicators cannot be learned efficiently by any algorithm to accuracy better than $\Omega(d^{-4})$, nor by a standard gradient descent implementation to accuracy better than a constant. These results establish what is to the best of our knowledge, the first optimization-based separations where the approximation benefits of the stronger architecture provably manifest in practice. Our proof technique introduces new tools and ideas that may be of independent interest in the theoretical study of both the approximation and optimization of neural networks.}
}
@InProceedings{pmlr-v178-vural22a,
title = {Mirror Descent Strikes Again: Optimal Stochastic Convex Optimization under Infinite Noise Variance},
author = {Vural, Nuri Mert and Yu, Lu and Balasubramanian, Krishna and Volgushev, Stanislav and Erdogdu, Murat A},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {65--102},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/vural22a/vural22a.pdf},
url = {https://proceedings.mlr.press/v178/vural22a.html},
abstract = {We study stochastic convex optimization under infinite noise variance. Specifically, when the stochastic gradient is unbiased and has uniformly bounded $(1+\kappa)$-th moment, for some $\kappa \in (0,1]$, we quantify the convergence rate of the Stochastic Mirror Descent algorithm with a particular class of uniformly convex mirror maps, in terms of the number of iterations, dimensionality and related geometric parameters of the optimization problem. Interestingly this algorithm does not require any explicit gradient clipping or normalization, which have been extensively used in several recent empirical and theoretical works. We complement our convergence results with information-theoretic lower bounds showing that no other algorithm using only stochastic first-order oracles can achieve improved rates. Our results have several interesting consequences for devising online/streaming stochastic approximation algorithms for problems arising in robust statistics and machine learning.}
}
@InProceedings{pmlr-v178-milne22a,
title = {Wasserstein GANs with Gradient Penalty Compute Congested Transport},
author = {Milne, Tristan and Nachman, Adrian I},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {103--129},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/milne22a/milne22a.pdf},
url = {https://proceedings.mlr.press/v178/milne22a.html},
abstract = {Wasserstein GANs with Gradient Penalty (WGAN-GP) are a very popular method for training generative models to produce high quality synthetic data. While WGAN-GP were initially developed to calculate the Wasserstein 1 distance between generated and real data, recent works (e.g. [23]) have provided empirical evidence that this does not occur, and have argued that WGAN-GP perform well not in spite of this issue, but because of it. In this paper we show for the first time that WGAN-GP compute the minimum of a different optimal transport problem, the so-called congested transport [7]. Congested transport determines the cost of moving one distribution to another under a transport model that penalizes congestion. For WGAN-GP, we find that the congestion penalty has a spatially varying component determined by the sampling strategy used in [12] which acts like a local speed limit, making congestion cost less in some regions than others. This aspect of the congested transport problem is new, in that the congestion penalty turns out to be unbounded and depends on the distributions to be transported, and so we provide the necessary mathematical proofs for this setting. One facet of our discovery is a formula connecting the gradient of solutions to the optimization problem in WGAN-GP to the time averaged momentum of the optimal mass flow. This is in contrast to the gradient of Kantorovich potentials for the Wasserstein 1 distance, which is just the normalized direction of flow. Based on this and other considerations, we speculate on how our results explain the observed performance of WGAN-GP. Beyond applications to GANs, our theorems also point to the possibility of approximately solving large scale congested transport problems using neural network techniques.}
}
@InProceedings{pmlr-v178-acharya22a,
title = {Robust Estimation for Random Graphs},
author = {Acharya, Jayadev and Jain, Ayush and Kamath, Gautam and Suresh, Ananda Theertha and Zhang, Huanyu},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {130--166},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/acharya22a/acharya22a.pdf},
url = {https://proceedings.mlr.press/v178/acharya22a.html},
abstract = {We study the problem of robustly estimating the parameter $p$ of an Erdős-Rényi random graph on $n$ nodes, where a $\gamma$ fraction of nodes may be adversarially corrupted. After showing the deficiencies of canonical estimators, we design a computationally-efficient spectral algorithm which estimates $p$ up to accuracy $\tilde O(\sqrt{p(1-p)}/n + \gamma\sqrt{p(1-p)} /\sqrt{n}+ \gamma/n)$ for $\gamma < 1/60$. Furthermore, we give an inefficient algorithm with similar accuracy for all $\gamma<1/2$, the information-theoretic limit. Finally, we prove a nearly-matching statistical lower bound, showing that the error of our algorithms is optimal up to logarithmic factors.}
}
@InProceedings{pmlr-v178-liu22a,
title = {Tight query complexity bounds for learning graph partitions},
author = {Liu, Xizhi and Mukherjee, Sayan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {167--181},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/liu22a/liu22a.pdf},
url = {https://proceedings.mlr.press/v178/liu22a.html},
abstract = {Given a partition of a graph into connected components, the membership oracle asserts whether any two vertices of the graph lie in the same component or not. We prove that for $n\ge k\ge 2$, learning the components of an $n$-vertex hidden graph with $k$ components requires at least $(k-1)n-\binom k2$ membership queries. Our result improves on the best known information-theoretic bound of $\Omega(n\log k)$ queries, and exactly matches the query complexity of the algorithm introduced by [Reyzin and Srivastava, 2007] for this problem. Additionally, we introduce an oracle that can learn the number of components of $G$ in asymptotically fewer queries than learning the full partition, thus answering another question posed by the same authors. Lastly, we introduce a more applicable version of this oracle, and prove asymptotically tight bounds of $\widetilde\Theta(m)$ queries for both learning and verifying an $m$-edge hidden graph $G$ using it.}
}
@InProceedings{pmlr-v178-zimmert22a,
title = {Pushing the Efficiency-Regret Pareto Frontier for Online Learning of Portfolios and Quantum States},
author = {Zimmert, Julian and Agarwal, Naman and Kale, Satyen},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {182--226},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/zimmert22a/zimmert22a.pdf},
url = {https://proceedings.mlr.press/v178/zimmert22a.html},
abstract = {We revisit the classical online portfolio selection problem. It is widely assumed that a trade-off between computational complexity and regret is unavoidable, with Cover’s Universal Portfolios algorithm, SOFT-BAYES and ADA-BARRONS currently constituting its state-of-the-art Pareto frontier. In this paper, we present the first efficient algorithm, BISONS, that obtains polylogarithmic regret with memory and per-step running time requirements that are polynomial in the dimension, displacing ADA-BARRONS from the Pareto frontier. Additionally, we resolve a COLT 2020 open problem by showing that a certain Follow-The-Regularized-Leader algorithm with log-barrier regularization suffers an exponentially larger dependence on the dimension than previously conjectured. Thus, we rule out this algorithm as a candidate for the Pareto frontier. We also extend our algorithm and analysis to a more general problem than online portfolio selection, viz. online learning of quantum states with log loss. This algorithm, called SCHRODINGER’S-BISONS, ibs the first efficient algorithm with polylogarithmic regret for this more general problem.}
}
@InProceedings{pmlr-v178-tinsi22a,
title = {Risk bounds for aggregated shallow neural networks using Gaussian priors},
author = {Tinsi, Laura and Dalalyan, Arnak},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {227--253},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/tinsi22a/tinsi22a.pdf},
url = {https://proceedings.mlr.press/v178/tinsi22a.html},
abstract = {Analysing statistical properties of neural networks is a central topic in statistics and machine learning. However, most results in the literature focus on the properties of the neural network minimizing the training error. The goal of this paper is to consider aggregated neural networks using a Gaussian prior. The departure point of our approach is an arbitrary aggregate satisfying the PAC-Bayesian inequality. The main contribution is a precise nonasymptotic assessment of the estimation error appearing in the PAC-Bayes bound. Our analysis is sharp enough to lead to minimax rates of estimation over Sobolev smoothness classes.}
}
@InProceedings{pmlr-v178-beugnot22a,
title = {On the Benefits of Large Learning Rates for Kernel Methods},
author = {Beugnot, Gaspard and Mairal, Julien and Rudi, Alessandro},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {254--282},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/beugnot22a/beugnot22a.pdf},
url = {https://proceedings.mlr.press/v178/beugnot22a.html},
abstract = {This paper studies an intriguing phenomenon related to the good generalization performance of estimators obtained by using large learning rates within gradient descent algorithms. First observed in the deep learning literature, we show that such a phenomenon can be precisely characterized in the context of kernel methods, even though the resulting optimization problem is convex. Specifically, we consider the minimization of a quadratic objective in a separable Hilbert space, and show that with early stopping, the choice of learning rate influences the spectral decomposition of the obtained solution on the Hessian’s eigenvectors. This extends an intuition described by Nakkiran (2020) on a two-dimensional toy problem to realistic learning scenarios such as kernel ridge regression. While large learning rates may be proven beneficial as soon as there is a mismatch between the train and test objectives, we further explain why it already occurs in classification tasks without assuming any particular mismatch between train and test data distributions.}
}
@InProceedings{pmlr-v178-hsu22a,
title = {Near-Optimal Statistical Query Lower Bounds for Agnostically Learning Intersections of Halfspaces with Gaussian Marginals},
author = {Hsu, Daniel J and Sanford, Clayton H and Servedio, Rocco and Vlatakis-Gkaragkounis, Emmanouil Vasileios},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {283--312},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/hsu22a/hsu22a.pdf},
url = {https://proceedings.mlr.press/v178/hsu22a.html},
abstract = {We consider the well-studied problem of learning intersections of halfspaces under the Gaussian distribution in the challenging \emph{agnostic learning} model. Recent work of Diakonikolas et al. (2021) shows that any Statistical Query (SQ) algorithm for agnostically learning the class of intersections of $k$ halfspaces over $\mathbb{R}^n$ to constant excess error either must make queries of tolerance at most $n^{-\tilde{\Omega}(\sqrt{\log k})}$ or must make $2^{n^{\Omega(1)}}$ queries. We strengthen this result by improving the tolerance requirement to $n^{-\tilde{\Omega}(\log k)}$. This lower bound is essentially best possible since an SQ algorithm of Klivans et al. (2008) agnostically learns this class to any constant excess error using $n^{O(\log k)}$ queries of tolerance $n^{-O(\log k)}$. We prove two variants of our lower bound, each of which combines ingredients from Diakonikolas et al. (2021) with (an extension of) a different earlier approach for agnostic SQ lower bounds for the Boolean setting due to Dachman-Soled et al. (2014). Our approach also yields lower bounds for agnostically SQ learning the class of "convex subspace juntas" (studied by Vempala, 2010) and the class of sets with bounded Gaussian surface area; all of these lower bounds are nearly optimal since they essentially match known upper bounds from Klivans et al. (2008).}
}
@InProceedings{pmlr-v178-faw22a,
title = {The Power of Adaptivity in SGD: Self-Tuning Step Sizes with Unbounded Gradients and Affine Variance},
author = {Faw, Matthew and Tziotis, Isidoros and Caramanis, Constantine and Mokhtari, Aryan and Shakkottai, Sanjay and Ward, Rachel},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {313--355},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/faw22a/faw22a.pdf},
url = {https://proceedings.mlr.press/v178/faw22a.html},
abstract = {We study convergence rates of AdaGrad-Norm as an exemplar of adaptive stochastic gradient methods (SGD), where the step sizes change based on observed stochastic gradients, for minimizing non-convex, smooth objectives. Despite their popularity, the analysis of adaptive SGD lags behind that of non adaptive methods in this setting. Specifically, all prior works rely on some subset of the following assumptions: (i) uniformly-bounded gradient norms, (ii) uniformly-bounded stochastic gradient variance (or even noise support), (iii) conditional independence between the step size and stochastic gradient. In this work, we show that AdaGrad-Norm exhibits an order optimal convergence rate of $\mathcal{O}\left(\frac{\mathrm{poly}\log(T)}{\sqrt{T}}\right)$ after $T$ iterations under the same assumptions as optimally-tuned non adaptive SGD (unbounded gradient norms and affine noise variance scaling), and crucially, without needing any tuning parameters. We thus establish that adaptive gradient methods exhibit order-optimal convergence in much broader regimes than previously understood.}
}
@InProceedings{pmlr-v178-cherapanamjeri22a,
title = {Optimal Mean Estimation without a Variance},
author = {Cherapanamjeri, Yeshwanth and Tripuraneni, Nilesh and Bartlett, Peter and Jordan, Michael},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {356--357},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/cherapanamjeri22a/cherapanamjeri22a.pdf},
url = {https://proceedings.mlr.press/v178/cherapanamjeri22a.html},
abstract = {We study the problem of heavy-tailed mean estimation in settings where the variance of the data-generating distribution does not exist. Concretely, given a sample $\bm{X} = \{X_i\}_{i = 1}^n$ from a distribution $\mc{D}$ over $\mb{R}^d$ with mean $\mu$ which satisfies the following \emph{weak-moment} assumption for some ${\alpha \in [0, 1]}$: \begin{equation*} \forall \norm{v} = 1: \mb{E}_{X \ts \mc{D}}[\abs{\inp{X - \mu}{v}}^{1 + \alpha}] \leq 1, \end{equation*} and given a target failure probability, $\delta$, our goal is to design an estimator which attains the smallest possible confidence interval as a function of $n,d,\delta$. For the specific case of $\alpha = 1$, foundational work of Lugosi and Mendelson exhibits an estimator achieving \emph{optimal} subgaussian confidence intervals, and subsequent work has led to computationally efficient versions of this estimator. Here, we study the case of general $\alpha$, and provide a precise characterization of the optimal achievable confidence interval by establishing the following information-theoretic lower bound: \begin{equation*} \Omega \lprp{\sqrt{\frac{d}{n}} + \lprp{\frac{d}{n}}^{\frac{\alpha}{(1 + \alpha)}} + \lprp{\frac{\log 1 / \delta}{n}}^{\frac{\alpha}{(1 + \alpha)}}}. \end{equation*} and devising an estimator matching the aforementioned lower bound up to constants. Moreover, our estimator is computationally efficient.}
}
@InProceedings{pmlr-v178-wagenmaker22a,
title = {Beyond No Regret: Instance-Dependent PAC Reinforcement Learning},
author = {Wagenmaker, Andrew J and Simchowitz, Max and Jamieson, Kevin},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {358--418},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/wagenmaker22a/wagenmaker22a.pdf},
url = {https://proceedings.mlr.press/v178/wagenmaker22a.html},
abstract = {The theory of reinforcement learning has focused on two fundamental problems: achieving low regret, and identifying $\epsilon$-optimal policies. While a simple reduction allows one to apply a low-regret algorithm to obtain an $\epsilon$-optimal policy and achieve the worst-case optimal rate, it is unknown whether low-regret algorithms can obtain the instance-optimal rate for policy identification. We show this is not possible—there exists a fundamental tradeoff between achieving low regret and identifying an $\epsilon$-optimal policy at the instance-optimal rate. Motivated by our negative finding, we propose a new measure of instance-dependent sample complexity for PAC tabular reinforcement learning which explicitly accounts for the attainable state visitation distributions in the underlying MDP. We then propose and analyze a novel, planning-based algorithm which attains this sample complexity—yielding a complexity which scales with the suboptimality gaps and the “reachability” of a state. We show our algorithm is nearly minimax optimal, and on several examples that our instance-dependent sample complexity offers significant improvements over worst-case bounds.}
}
@InProceedings{pmlr-v178-balkanski22a,
title = {Learning Low Degree Hypergraphs},
author = {Balkanski, Eric and Hanguir, Oussama and Wang, Shatian},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {419--420},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/balkanski22a/balkanski22a.pdf},
url = {https://proceedings.mlr.press/v178/balkanski22a.html},
abstract = {We study the problem of learning a hypergraph via edge detecting queries. In this problem, a learner queries subsets of vertices of a hidden hypergraph and observes whether these subsets contain an edge or not. In general, learning a hypergraph with m edges of maximum size d requires Omega((2m/d)^{d/2}) queries. In this paper, we aim to identify families of hypergraphs that can be learned without suffering from a query complexity that grows exponentially in the size of the edges. We show that hypermatchings and low-degree near-uniform hypergraphs with n vertices are learnable with poly(n) queries. For learning hypermatchings (hypergraphs of maximum degree Delta = 1), we give an O(log^3 n)-round algorithm with O(n log^5 n) queries. We complement this upper bound by showing that there are no algorithms with poly(n) queries that learn hypermatchings in o(log log n) adaptive rounds. For hypergraphs with maximum degree Delta and edge size ratio rho, we give a non-adaptive algorithm with O((2n)^{rho Delta+1} log^2 n) queries. To the best of our knowledge, these are the first algorithms with poly(n, m) query complexity for learning non-trivial families of hypergraphs that have a super-constant number of edges of arbitrarily large size.}
}
@InProceedings{pmlr-v178-domingo-enrich22a,
title = {Depth and Feature Learning are Provably Beneficial for Neural Network Discriminators},
author = {Domingo-Enrich, Carles},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {421--447},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/domingo-enrich22a/domingo-enrich22a.pdf},
url = {https://proceedings.mlr.press/v178/domingo-enrich22a.html},
abstract = {We construct pairs of distributions $\mu_d, \nu_d$ on $\mathbb{R}^d$ such that the quantity $|\mathbb{E}_{x \sim \mu_d} [F(x)] - \mathbb{E}_{x \sim \nu_d} [F(x)]|$ decreases as $\Omega(1/d^2)$ for some three-layer ReLU network $F$ with polynomial width and weights, while declining exponentially in $d$ if $F$ is any two-layer network with polynomial weights. This shows that deep GAN discriminators are able to distinguish distributions that shallow discriminators cannot. Analogously, we build pairs of distributions $\mu_d, \nu_d$ on $\mathbb{R}^d$ such that $|\mathbb{E}_{x \sim \mu_d} [F(x)] - \mathbb{E}_{x \sim \nu_d} [F(x)]|$ decreases as $\Omega(1/(d\log d))$ for two-layer ReLU networks with polynomial weights, while declining exponentially for bounded-norm functions in the associated RKHS. This confirms that feature learning is beneficial for discriminators. Our bounds are based on Fourier transforms.}
}
@InProceedings{pmlr-v178-shamir22a,
title = {The Implicit Bias of Benign Overfitting},
author = {Shamir, Ohad},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {448--478},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/shamir22a/shamir22a.pdf},
url = {https://proceedings.mlr.press/v178/shamir22a.html},
abstract = {The phenomenon of benign overfitting, where a predictor perfectly fits noisy training data while attaining low expected loss, has received much attention in recent years, but still remains not fully understood beyond simple linear regression setups. In this paper, we show that for regression, benign overfitting is “biased” towards certain types of problems, in the sense that its existence on one learning problem precludes its existence on other learning problems. On the negative side, we use this to argue that one should not expect benign overfitting to occur in general, for several natural extensions of the plain linear regression problems studied so far. We then turn to classification problems, and show that the situation there is much more favorable. Specifically, we consider a model where an arbitrary input distribution of some fixed dimension k is concatenated with a high-dimensional distribution, and prove that the max-margin predictor (to which gradient-based methods are known to converge in direction) is asymptotically biased towards minimizing the expected \emph{squared hinge loss} w.r.t. the k-dimensional distribution. This allows us to reduce the question of benign overfitting in classification to the simpler question of whether this loss is a good surrogate for the misclassification error, and use it to show benign overfitting in some new settings.}
}
@InProceedings{pmlr-v178-blanchard22a,
title = {Universal Online Learning with Bounded Loss: Reduction to Binary Classification},
author = {Blanchard, Moise and Cosson, Romain},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {479--495},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/blanchard22a/blanchard22a.pdf},
url = {https://proceedings.mlr.press/v178/blanchard22a.html},
abstract = {We study universal consistency of non-i.i.d. processes in the context of online learning. A stochastic process is said to admit universal consistency if there exists a learner that achieves vanishing average loss for any measurable response function on this process. When the loss function is unbounded, [1] showed that the only processes admitting strong universal consistency are those taking a finite number of values almost surely. However, when the loss function is bounded, the class of processes admitting strong universal consistency is much richer and its characterization could be dependent on the response setting [2]. In this paper, we show that this class of processes is independent from the response setting thereby closing an open question of [3] (Open Problem 3). Specifically, we show that the class of processes that admit universal online learning is the same for binary classification as for multiclass classification with countable number of classes. Consequently, any output setting with bounded loss can be reduced to binary classification. Our reduction is constructive and practical. Indeed, we show that the nearest neighbor algorithm is transported by our construction. For binary classification on a process admitting strong universal learning, we prove that nearest neighbor successfully learns at least all finite unions of intervals.}
}
@InProceedings{pmlr-v178-criscitiello22a,
title = {Negative curvature obstructs acceleration for strongly geodesically convex optimization, even with exact first-order oracles},
author = {Criscitiello, Christopher and Boumal, Nicolas},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {496--542},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/criscitiello22a/criscitiello22a.pdf},
url = {https://proceedings.mlr.press/v178/criscitiello22a.html},
abstract = {Hamilton and Moitra (2021) showed that, in certain regimes, it is not possible to accelerate Riemannian gradient descent in the hyperbolic plane if we restrict ourselves to algorithms which make queries in a (large) bounded domain and which receive gradients and function values corrupted by a (small) amount of noise. We show that acceleration remains unachievable for any deterministic algorithm which receives exact gradient and function-value information (unbounded queries, no noise). Our results hold for a large class of Hadamard manifolds including hyperbolic spaces and the symmetric space $\mathrm{SL}(n) / \mathrm{SO}(n)$ of positive definite $n \times n$ matrices of determinant one. This cements a surprising gap between the complexity of convex optimization and geodesically convex optimization: for hyperbolic spaces, Riemannian gradient descent is optimal on the class of smooth and strongly geodesically convex functions (in the regime where the condition number scales with the radius of the optimization domain). The key idea for proving the lower bound consists of perturbing squared distance functions with sums of bump functions chosen by a resisting oracle.}
}
@InProceedings{pmlr-v178-wu22a,
title = {Multi-Agent Learning for Iterative Dominance Elimination: Formal Barriers and New Algorithms},
author = {Wu, Jibang and Xu, Haifeng and Yao, Fan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {543--543},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/wu22a/wu22a.pdf},
url = {https://proceedings.mlr.press/v178/wu22a.html},
abstract = {Dominated actions are natural (and perhaps the simplest possible) multi-agent generalizations of sub-optimal actions as in standard single-agent decision making. Thus similar to standard bandit learning, a fundamental learning question in multi-agent systems is whether agents can efficiently eliminate all iteratively dominated actions in an unknown game if they can only observe noisy bandit feedback about the payoff of their played actions. Surprisingly, despite a seemingly simple task, we show a quite negative result; that is, standard no regret algorithms — including the entire family of Dual Averaging algorithms — provably take exponentially many rounds to eliminate all iteratively dominated actions. Moreover, algorithms with the stronger no swap regret also suffer similar exponential inefficiency. To overcome these barriers, we develop a new algorithm that adjusts Exp3 with Diminishing Historical rewards (termed Exp3-DH); Exp3-DH gradually “forgets” history at carefully tailored rates. We prove that when all agents run Exp3-DH (a.k.a., self-play in multi-agent learning), all iteratively dominated actions can be eliminated within polynomially many rounds. Our experimental results further demonstrate the efficiency of Exp3-DH, and that state-of-the-art bandit algorithms, even those explicitly developed for learning in games, fail to eliminate all iteratively dominated actions efficiently.}
}
@InProceedings{pmlr-v178-kamath22a,
title = {A Private and Computationally-Efficient Estimator for Unbounded Gaussians},
author = {Kamath, Gautam and Mouzakis, Argyris and Singhal, Vikrant and Steinke, Thomas and Ullman, Jonathan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {544--572},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kamath22a/kamath22a.pdf},
url = {https://proceedings.mlr.press/v178/kamath22a.html},
abstract = {We give the first polynomial-time, polynomial-sample, differentially private estimator for the mean and covariance of an arbitrary Gaussian distribution $N(\mu,\Sigma)$ in $\R^d$. All previous estimators are either nonconstructive, with unbounded running time, or require the user to specify a priori bounds on the parameters $\mu$ and $\Sigma$. The primary new technical tool in our algorithm is a new differentially private preconditioner that takes samples from an arbitrary Gaussian $N(0,\Sigma)$ and returns a matrix $A$ such that $A \Sigma A^T$ has constant condition number}
}
@InProceedings{pmlr-v178-canonne22a,
title = {The Price of Tolerance in Distribution Testing},
author = {Canonne, Clement L and Jain, Ayush and Kamath, Gautam and Li, Jerry},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {573--624},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/canonne22a/canonne22a.pdf},
url = {https://proceedings.mlr.press/v178/canonne22a.html},
abstract = {We revisit the problem of tolerant distribution testing. That is, given samples from an unknown distribution $p$ over $\{1, …, n\}$, is it $\varepsilon_1$-close to or $\varepsilon_2$-far from a reference distribution $q$ (in total variation distance)? Despite significant interest over the past decade, this problem is well understood only in the extreme cases. In the noiseless setting (i.e., $\varepsilon_1 = 0$) the sample complexity is $\Theta(\sqrt{n})$, strongly sublinear in the domain size. At the other end of the spectrum, when $\varepsilon_1 = \varepsilon_2/2$, the sample complexity jumps to the barely sublinear $\Theta(n/\log n)$. However, very little is known about the intermediate regime. We fully characterize the price of tolerance in distribution testing as a function of $n$, $\varepsilon_1$, $\varepsilon_2$, up to a single $\log n$ factor. Specifically, we show the sample complexity to be \[\tilde \Theta\mleft(\frac{\sqrt{n}}{\ve_2^{2}} + \frac{n}{\log n} \cdot \max \mleft\{\frac{\ve_1}{\ve_2^2},\mleft(\frac{\ve_1}{\ve_2^2}\mright)^{\!\!2}\mright\}\mright),\]{providing} a smooth tradeoff between the two previously known cases. We also provide a similar characterization for the problem of tolerant equivalence testing, where both $p$ and $q$ are unknown. Surprisingly, in both cases, the main quantity dictating the sample complexity is the ratio $\varepsilon_1/\varepsilon_2^2$, and not the more intuitive $\varepsilon_1/\varepsilon_2$. Of particular technical interest is our lower bound framework, which involves novel approximation-theoretic tools required to handle the asymmetry between $\varepsilon_1$ and $\varepsilon_2$, a challenge absent from previous works.}
}
@InProceedings{pmlr-v178-dagan22a,
title = {A bounded-noise mechanism for differential privacy},
author = {Dagan, Yuval and Kur, Gil},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {625--661},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/dagan22a/dagan22a.pdf},
url = {https://proceedings.mlr.press/v178/dagan22a.html},
abstract = {We present an asymptotically optimal $(\epsilon,\delta)$ differentially private mechanism for answering multiple, adaptively asked, $\Delta$-sensitive queries, settling the conjecture of Steinke and Ullman [2020]. Our algorithm has a significant advantage that it adds independent bounded noise to each query, thus providing an absolute error bound. Additionally, we apply our algorithm in adaptive data analysis, obtaining an improved guarantee for answering multiple queries regarding some underlying distribution using a finite sample. Numerical computations show that the bounded-noise mechanism outperforms the Gaussian mechanism in many standard settings.}
}
@InProceedings{pmlr-v178-cohen22a,
title = {Learning with metric losses},
author = {Cohen, Dan Tsir and Kontorovich, Aryeh},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {662--700},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/cohen22a/cohen22a.pdf},
url = {https://proceedings.mlr.press/v178/cohen22a.html},
abstract = {We propose a practical algorithm for learning mappings between two metric spaces, $\X$ and $\Y$. Our procedure is strongly Bayes-consistent whenever $\X$ and $\Y$ are topologically separable and $\Y$ is “bounded in expectation” (our term; the separability assumption can be somewhat weakened). At this level of generality, ours is the first such learnability result for unbounded loss in the agnostic setting. Our technique is based on metric medoids (a variant of Fréchet means) and presents a significant departure from existing methods, which, as we demonstrate, fail to achieve Bayes-consistency on general instance- and label-space metrics. Our proofs introduce the technique of {\em semi-stable compression}, which may be of independent interest.}
}
@InProceedings{pmlr-v178-klukowski22a,
title = {Rate of Convergence of Polynomial Networks to Gaussian Processes},
author = {Klukowski, Adam},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {701--722},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/klukowski22a/klukowski22a.pdf},
url = {https://proceedings.mlr.press/v178/klukowski22a.html},
abstract = {We examine one-hidden-layer neural networks with random weights. It is well-known that in the limit of infinitely many neurons they simplify to Gaussian processes. For networks with a polynomial activation, we demonstrate that the rate of this convergence in 2-Wasserstein metric is O(1/sqrt(n)), where n is the number of hidden neurons. We suspect this rate is asymptotically sharp. We improve the known convergence rate for other activations, to power-law in n for ReLU and inverse-square-root up to logarithmic factors for erf. We explore the interplay between spherical harmonics, Stein kernels and optimal transport in the non-isotropic setting.}
}
@InProceedings{pmlr-v178-kothari22a,
title = {Private Robust Estimation by Stabilizing Convex Relaxations},
author = {Kothari, Pravesh and Manurangsi, Pasin and Velingker, Ameya},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {723--777},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kothari22a/kothari22a.pdf},
url = {https://proceedings.mlr.press/v178/kothari22a.html},
abstract = {We give the first polynomial time and sample (epsilon, delta)-differentially private (DP) algorithm to estimate the mean, covariance and higher moments in the presence of a constant fraction of adversarial outliers. Our algorithm succeeds for families of distributions that satisfy two well-studied properties in prior works on robust estimation: certifiable subgaussianity of directional moments and certifiable hypercontractivity of degree 2 polynomials. Our recovery guarantees hold in the “right affine-invariant norms”: Mahalanobis distance for mean, multiplicative spectral and relative Frobenius distance guarantees for covariance and injective norms for higher moments. Prior works obtained private robust algorithms for mean estimation of subgaussian distributions with bounded covariance. For covariance estimation, ours is the first efficient algorithm (even in the absence of outliers) that succeeds without any condition-number assumptions. Our algorithms arise from a new framework that provides a general blueprint for modifying convex relaxations for robust estimation to satisfy strong worst-case stability guarantees in the appropriate parameter norms whenever the algorithms produce witnesses of correctness in their run. We verify such guarantees for a modification of standard sum-of-squares (SoS) semidefinite programming relaxations for robust estimation. Our privacy guarantees are obtained by combining stability guarantees with a new “estimate dependent” noise injection mechanism in which noise scales with the eigenvalues of the estimated covariance. We believe this framework will be useful more generally in obtaining DP counterparts of robust estimators. Independently of our work, Ashtiani and Liaw [AL21] also obtained a polynomial time and sample private robust estimation algorithm for Gaussian distributions.}
}
@InProceedings{pmlr-v178-alacaoglu22a,
title = {Stochastic Variance Reduction for Variational Inequality Methods},
author = {Alacaoglu, Ahmet and Malitsky, Yura},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {778--816},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/alacaoglu22a/alacaoglu22a.pdf},
url = {https://proceedings.mlr.press/v178/alacaoglu22a.html},
abstract = {We propose stochastic variance reduced algorithms for solving convex-concave saddle point problems, monotone variational inequalities, and monotone inclusions. Our framework applies to extragradient, forward-backward-forward, and forward-reflected-backward methods both in Euclidean and Bregman setups. All proposed methods converge in exactly the same setting as their deterministic counterparts and they either match or improve the best-known complexities for solving structured min-max problems. Our results reinforce the correspondence between variance reduction in variational inequalities and minimization. We also illustrate the improvements of our approach with numerical evaluations on matrix games.}
}
@InProceedings{pmlr-v178-shen22a,
title = {Self-Consistency of the Fokker Planck Equation},
author = {Shen, Zebang and Wang, Zhenfu and Kale, Satyen and Ribeiro, Alejandro and Karbasi, Amin and Hassani, Hamed},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {817--841},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/shen22a/shen22a.pdf},
url = {https://proceedings.mlr.press/v178/shen22a.html},
abstract = {The Fokker-Planck equation (FPE) is the partial differential equation that governs the density evolution of the Ito process and is of great importance to the literature of statistical physics and machine learning. The FPE can be regarded as a continuity equation where the change of the density is completely determined by a time varying velocity field. Importantly, this velocity field also depends on the current density function. As a result, the ground-truth velocity field can be shown to be the solution of a fixed-point equation, a property that we call self-consistency. In this paper, we exploit this concept to design a potential function of the hypothesis velocity fields, and prove that, if such a function diminishes to zero during the training procedure, the trajectory of the densities generated by the hypothesis velocity fields converges to the solution of the FPE in the Wasserstein-2 sense. The proposed potential function is amenable to neural-network based parameterization as the stochastic gradient with respect to the parameter can be efficiently computed. Once a parameterized model, such as Neural Ordinary Differential Equation is trained, we can generate the entire trajectory to the FPE.}
}
@InProceedings{pmlr-v178-bousquet22a,
title = {Monotone Learning},
author = {Bousquet, Olivier J and Daniely, Amit and Kaplan, Haim and Mansour, Yishay and Moran, Shay and Stemmer, Uri},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {842--866},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/bousquet22a/bousquet22a.pdf},
url = {https://proceedings.mlr.press/v178/bousquet22a.html},
abstract = {The amount of training-data is one of the key factors which determines the generalization capacity of learning algorithms. Intuitively, one expects the error rate to decrease as the amount of training-data increases. Perhaps surprisingly, natural attempts to formalize this intuition give rise to interesting and challenging mathematical questions. For example, in their classical book on pattern recognition, Devroye, Gyorfi and Lugosi (1996) ask whether there exists a {monotone} Bayes-consistent algorithm.This question remained open for over 25 years, until recently Pestov (2021) resolved it for binary classification, using an intricate construction of a monotone Bayes-consistent algorithm. We derive a general result in multiclass classification, showing that every learning algorithm $A$ can be transformed to a monotone one with similar performance. Further, the transformation is efficient and only uses a black-box oracle access to $A$. This demonstrates that one can provably avoid non-monotonic behaviour without compromising performance, thus answering questions asked by Devroye, Gyorfi, and Lugosi (1996), Viering, Mey, and Loog (2019), Viering and Loog (2021), and by Mhammedi (2021). Our general transformation readily implies monotone learners in a variety of contexts: for example, Pestov’s result follows by applying it on \emph{any} Bayes-consistent algorithm (e.g., $k$-Nearest-Neighbours). In fact, our transformation extends Pestov’s result to classification tasks with an arbitrary number of labels. This is contrast with Pestov’s work which is tailored to binary classification. In addition, we provide uniform bounds on the error of the monotone algorithm. This makes our transformation applicable in distribution-free settings. For example, in PAC learning it implies that every learnable class admits a monotone PAC learner. This resolves questions asked by Viering, Mey, and Loog (2019); Viering and Loog (2021); Mhammedi (2021)}
}
@InProceedings{pmlr-v178-christianson22a,
title = {Chasing Convex Bodies and Functions with Black-Box Advice},
author = {Christianson, Nicolas and Handina, Tinashe and Wierman, Adam},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {867--908},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/christianson22a/christianson22a.pdf},
url = {https://proceedings.mlr.press/v178/christianson22a.html},
abstract = {We consider the problem of convex function chasing with black-box advice, where an online decision-maker aims to minimize the total cost of making and switching between decisions in a normed vector space, aided by black-box advice such as the decisions of a machine-learned algorithm. The decision-maker seeks cost comparable to the advice when it performs well, known as \emph{consistency}, while also ensuring worst-case \emph{robustness} even when the advice is adversarial. We first consider the common paradigm of algorithms that switch between the decisions of the advice and a competitive algorithm, showing that no algorithm in this class can improve upon 3-consistency while staying robust. We then propose two novel algorithms that bypass this limitation by exploiting the problem’s convexity. The first, $\textsc{Interp}$, achieves $(\sqrt{2}+\epsilon)$-consistency and $\mathcal{O}(\frac{C}{\epsilon^2})$-robustness for any $\epsilon > 0$, where $C$ is the competitive ratio of an algorithm for convex function chasing or a subclass thereof. The second, $\textsc{BdInterp}$, achieves $(1+\epsilon)$-consistency and $\mathcal{O}(\frac{CD}{\epsilon})$-robustness when the problem has bounded diameter $D$. Further, we show that $\textsc{BdInterp}$ achieves near-optimal consistency-robustness trade-off for the special case where cost functions are $\alpha$-polyhedral.}
}
@InProceedings{pmlr-v178-li22a,
title = {ROOT-SGD: Sharp Nonasymptotics and Asymptotic Efficiency in a Single Algorithm},
author = {Li, Chris Junchi and Mou, Wenlong and Wainwright, Martin and Jordan, Michael},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {909--981},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/li22a/li22a.pdf},
url = {https://proceedings.mlr.press/v178/li22a.html},
abstract = {We study the problem of solving strongly convex and smooth unconstrained optimization problems using stochastic first-order algorithms. We devise a novel algorithm, referred to as \emph{Recursive One-Over-T SGD} (ROOT-SGD), based on an easily implementable, recursive averaging of past stochastic gradients. We prove that it simultaneously achieves state-of-the-art performance in both a finite-sample, nonasymptotic sense and an asymptotic sense. On the nonasymptotic side, we prove risk bounds on the last iterate of ROOT-SGD with leading-order terms that match the optimal statistical risk with a unity pre-factor, along with a higher-order term that scales at the sharp rate of $O(n^{-3/2})$ under the Lipschitz condition on the Hessian matrix. On the asymptotic side, we show that when a mild, one-point Hessian continuity condition is imposed, the rescaled last iterate of (multi-epoch) ROOT-SGD converges asymptotically to a Gaussian limit with the Cramér-Rao optimal asymptotic covariance, for a broad range of step-size choices.}
}
@InProceedings{pmlr-v178-chen22a,
title = {Policy Optimization for Stochastic Shortest Path},
author = {Chen, Liyu and Luo, Haipeng and Rosenberg, Aviv},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {982--1046},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/chen22a/chen22a.pdf},
url = {https://proceedings.mlr.press/v178/chen22a.html},
abstract = {Policy optimization is among the most popular and successful reinforcement learning algorithms, and there is increasing interest in understanding its theoretical guarantees. In this work, we initiate the study of policy optimization for the stochastic shortest path (SSP) problem, a goal-oriented reinforcement learning model that strictly generalizes the finite-horizon model and better captures many applications. We consider a wide range of settings, including stochastic and adversarial environments under full information or bandit feedback, and propose a policy optimization algorithm for each setting that makes use of novel correction terms and/or variants of dilated bonuses (Luo et al., 2021). For most settings, our algorithm is shown to achieve a near-optimal regret bound. One key technical contribution of this work is a new approximation scheme to tackle SSP problems that we call stacked discounted approximation and use in all our proposed algorithms. Unlike the finite-horizon approximation that is heavily used in recent SSP algorithms, our new approximation enables us to learn a near-stationary policy with only logarithmic changes during an episode and could lead to an exponential improvement in space complexity.}
}
@InProceedings{pmlr-v178-nasser22a,
title = {Optimal SQ Lower Bounds for Learning Halfspaces with Massart Noise},
author = {Nasser, Rajai and Tiegel, Stefan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1047--1074},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/nasser22a/nasser22a.pdf},
url = {https://proceedings.mlr.press/v178/nasser22a.html},
abstract = {We give tight statistical query (SQ) lower bounds for learnining halfspaces in the presence of Massart noise. In particular, suppose that all labels are corrupted with probability at most $\eta$. We show that for arbitrary $\eta \in [0,1/2]$ every SQ algorithm achieving misclassification error better than $\eta$ requires queries of superpolynomial accuracy or at least a superpolynomial number of queries. Further, this continues to hold even if the information-theoretically optimal error $\OPT$ is as small as $\exp\Paren{-\log^c(d)}$, where $d$ is the dimension and $0 < c < 1$ is an arbitrary absolute constant, and an overwhelming fraction of examples are noiseless. Our lower bound matches known polynomial time algorithms, which are also implementable in the SQ framework. Previously, such lower bounds only ruled out algorithms achieving error $\OPT + \e$ or error better than $\Omega(\eta)$ or, if $\eta$ is close to $1/2$, error $\eta - o_\eta(1)$, where the term $o_\eta(1)$ is constant in $d$ but going to 0 for $\eta$ approaching $1/2$. As a consequence, we also show that achieving misclassification error better than $1/2$ in the $(A,\alpha)$-Tsybakov model is SQ-hard for $A$ constant and $\alpha$ bounded away from 1.}
}
@InProceedings{pmlr-v178-ashtiani22a,
title = {Private and polynomial time algorithms for learning Gaussians and beyond},
author = {Ashtiani, Hassan and Liaw, Christopher},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1075--1076},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/ashtiani22a/ashtiani22a.pdf},
url = {https://proceedings.mlr.press/v178/ashtiani22a.html},
abstract = {We present a fairly general framework for reducing $(\varepsilon, \delta)$-differentially private (DP) statistical estimation to its non-private counterpart. As the main application of this framework, we give a polynomial time and $(\varepsilon,\delta)$-DP algorithm for learning (unrestricted) Gaussian distributions in $\mathbb{R}^d$. The sample complexity of our approach for learning the Gaussian up to total variation distance $\alpha$ is $\tilde{O}(d^2/\alpha^2 + d^2\sqrt{\ln(1/\delta)}/\alpha \eps + d\ln(1/\delta) / \alpha \eps)$ matching (up to logarithmic factors) the best known information-theoretic (non-efficient) sample complexity upper bound due to Aden-Ali, Ashtiani, and Kamath (2021). In an independent work, Kamath, Mouzakis, Singhal, Steinke, and Ullman (2021) proved a similar result using a different approach and with $O(d^{5/2})$ sample complexity dependence on $d$. As another application of our framework, we provide the first polynomial time $(\varepsilon, \delta)$-DP algorithm for robust learning of (unrestricted) Gaussians with sample complexity $\tilde{O}(d^{3.5})$. In another independent work, Kothari, Manurangsi, and Velingker (2021) also provided a polynomial time $(\epsilon, \delta)$-DP algorithm for robust learning of Gaussians with sample complexity $\tilde{O}(d^8)$.}
}
@InProceedings{pmlr-v178-blanchard22b,
title = {Universal Online Learning: an Optimistically Universal Learning Rule},
author = {Blanchard, Moise},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1077--1125},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/blanchard22b/blanchard22b.pdf},
url = {https://proceedings.mlr.press/v178/blanchard22b.html},
abstract = {We study the subject of universal online learning with non-i.i.d. processes for bounded losses. The notion of universally consistent learning was defined by Hanneke in an effort to study learning theory under minimal assumptions, where the objective is to obtain low long-run average loss for any target function. We are interested in characterizing processes for which learning is possible and whether there exist learning rules guaranteed to be universally consistent given the only assumption that such learning is possible. The case of unbounded losses is very restrictive since the learnable processes almost surely have to visit a finite number of points and as a result, simple memorization is optimistically universal. We focus on the bounded setting and give a complete characterization of the processes admitting strong and weak universal learning. We further show that the k-nearest neighbor algorithm (kNN) is not optimistically universal and present a novel variant of 1NN which is optimistically universal for general input and value spaces in both strong and weak settings. This closes all the COLT 2021 open problems posed on universal online learning.}
}
@InProceedings{pmlr-v178-varshney22a,
title = {(Nearly) Optimal Private Linear Regression for Sub-Gaussian Data via Adaptive Clipping},
author = {Varshney, Prateek and Thakurta, Abhradeep and Jain, Prateek},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1126--1166},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/varshney22a/varshney22a.pdf},
url = {https://proceedings.mlr.press/v178/varshney22a.html},
abstract = {We study the problem of differentially private linear regression where each of the data point is sampled from a fixed sub-Gaussian style distribution. We propose and analyze a one-pass mini-batch stochastic gradient descent method (DP-AMBSSGD) where points in each iteration are sampled without replacement. Noise is added for DP but the noise standard deviation is estimated online. Compared to existing $(\epsilon, \delta)$-DP techniques which have sub-optimal error bounds, DP-AMBSSGD is able to provide nearly optimal error bounds in terms of key parameters like dimensionality $d$, number of points $N$, and the standard deviation \sigma of the noise in observations. For example, when the $d$-dimensional covariates are sampled i.i.d. from the normal distribution, then the excess error of DP-AMBSSGD due to privacy is $\sigma^2 d/N(1+d/(\epsilon^2 N))$, i.e., the error is meaningful when number of samples N\geq d \log d which is the standard operative regime for linear regression. In contrast, error bounds for existing efficient methods in this setting are: $d^3/(\epsilon^2 N^2)$, even for $\sigma=0$. That is, for constant $\epsilon$, the existing techniques require $N=d^{1.5}$ to provide a non-trivial result.}
}
@InProceedings{pmlr-v178-liu22b,
title = {Differential privacy and robust statistics in high dimensions},
author = {Liu, Xiyang and Kong, Weihao and Oh, Sewoong},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1167--1246},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/liu22b/liu22b.pdf},
url = {https://proceedings.mlr.press/v178/liu22b.html},
abstract = {We introduce a universal framework for characterizing the statistical efficiency of a statistical estimation problem with differential privacy guarantees. Our framework, which we call High-dimensional Propose-Test-Release (HPTR), builds upon three crucial components: the exponential mechanism, robust statistics, and the Propose-Test-Release mechanism. Connecting all these together is the concept of resilience, which is central to robust statistical estimation. Resilience guides the design of the algorithm, the sensitivity analysis, and the success probability analysis of the test step in Propose-Test-Release. The key insight is that if we design an exponential mechanism that accesses the data only via one-dimensional and robust statistics, then the resulting local sensitivity can be dramatically reduced. Using resilience, we can provide tight local sensitivity bounds. These tight bounds readily translate into near-optimal utility guarantees in several cases. We give a general recipe for applying HPTR to a given instance of a statistical estimation problem and demonstrate it on canonical problems of mean estimation, linear regression, covariance estimation, and principal component analysis. We introduce a general utility analysis technique that proves that HPTR achieves near-optimal sample complexity under several scenarios studied in the literature.}
}
@InProceedings{pmlr-v178-zadik22a,
title = {Lattice-Based Methods Surpass Sum-of-Squares in Clustering},
author = {Zadik, Ilias and Song, Min Jae and Wein, Alexander S and Bruna, Joan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1247--1248},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/zadik22a/zadik22a.pdf},
url = {https://proceedings.mlr.press/v178/zadik22a.html},
abstract = {Clustering is a fundamental primitive in unsupervised learning which gives rise to a rich class of computationally-challenging inference tasks. In this work, we focus on the canonical task of clustering d-dimensional Gaussian mixtures with unknown (and possibly degenerate) covariance. Recent works (Ghosh et al. ’20; Mao, Wein ’21; Davis, Diaz, Wang ’21) have established lower bounds against the class of low-degree polynomial methods and the sum-of-squares (SoS) hierarchy for recovering certain hidden structures planted in Gaussian clustering instances. Prior work on many similar inference tasks portends that such lower bounds strongly suggest the presence of an inherent statistical-to-computational gap for clustering, that is, a parameter regime where the clustering task is statistically possible but no polynomial-time algorithm succeeds. One special case of the clustering task we consider is equivalent to the problem of finding a planted hypercube vector in an otherwise random subspace. We show that, perhaps surprisingly, this particular clustering model does not exhibit a statistical-to-computational gap, even though the aforementioned low-degree and SoS lower bounds continue to apply in this case. To achieve this, we give a polynomial-time algorithm based on the Lenstra–Lenstra–Lovasz lattice basis reduction method which achieves the statistically-optimal sample complexity of d+1 samples. This result extends the class of problems whose conjectured statistical-to-computational gaps can be "closed" by "brittle" polynomial-time algorithms, highlighting the crucial but subtle role of noise in the onset of statistical-to-computational gaps.}
}
@InProceedings{pmlr-v178-vardi22a,
title = {Width is Less Important than Depth in ReLU Neural Networks},
author = {Vardi, Gal and Yehudai, Gilad and Shamir, Ohad},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1249--1281},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/vardi22a/vardi22a.pdf},
url = {https://proceedings.mlr.press/v178/vardi22a.html},
abstract = {We solve an open question from Lu et al. (2017), by showing that any target network with inputs in $\mathbb{R}^d$ can be approximated by a width $O(d)$ network (independent of the target network’s architecture), whose number of parameters is essentially larger only by a linear factor. In light of previous depth separation theorems, which imply that a similar result cannot hold when the roles of width and depth are interchanged, it follows that depth plays a more significant role than width in the expressive power of neural networks. We extend our results to constructing networks with bounded weights, and to constructing networks with width at most $d+2$, which is close to the minimal possible width due to previous lower bounds. Both of these constructions cause an extra polynomial factor in the number of parameters over the target network. We also show an exact representation of wide and shallow networks using deep and narrow networks which, in certain cases, does not increase the number of parameters over the target network.}
}
@InProceedings{pmlr-v178-kane22a,
title = {Computational-Statistical Gap in Reinforcement Learning},
author = {Kane, Daniel and Liu, Sihan and Lovett, Shachar and Mahajan, Gaurav},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1282--1302},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kane22a/kane22a.pdf},
url = {https://proceedings.mlr.press/v178/kane22a.html},
abstract = {Reinforcement learning with function approximation has recently achieved tremendous results in applications with large state spaces. This empirical success has motivated a growing body of theoretical work proposing necessary and sufficient conditions under which efficient reinforcement learning is possible. From this line of work, a remarkably simple minimal sufficient condition has emerged for sample efficient reinforcement learning: MDPs with optimal value function V* and Q* linear in some known low-dimensional features. In this setting, recent works have designed sample efficient algorithms which require a number of samples polynomial in the feature dimension and independent of the size of state space. They however leave finding computationally efficient algorithms as future work and this is considered a major open problem in the community. In this work, we make progress on this open problem by presenting the first computational lower bound for RL with linear function approximation: unless NP=RP, no randomized polynomial time algorithm exists for deterministic transition MDPs with a constant number of actions and linear optimal value functions. To prove this, we show a reduction from Unique-Sat, where we convert a CNF formula into an MDP with deterministic transitions, constant number of actions and low dimensional linear optimal value functions. This result also exhibits the first computational-statistical gap in reinforcement learning with linear function approximation, as the underlying statistical problem is information-theoretically solvable with a polynomial number of queries, but no computationally efficient algorithm exists unless NP=RP. Finally, we also prove a quasi-polynomial time lower bound under the Randomized Exponential Time Hypothesis.}
}
@InProceedings{pmlr-v178-boursier22a,
title = {Trace norm regularization for multi-task learning with scarce data},
author = {Boursier, Etienne and Konobeev, Mikhail and Flammarion, Nicolas},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1303--1327},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/boursier22a/boursier22a.pdf},
url = {https://proceedings.mlr.press/v178/boursier22a.html},
abstract = {Multi-task learning leverages structural similarities between multiple tasks to learn despite very few samples. Motivated by the recent success of neural networks applied to data-scarce tasks, we consider a linear low-dimensional shared representation model. Despite an extensive literature, existing theoretical results either guarantee weak estimation rates or require a large number of samples per task. This work provides the first estimation error bound for the trace norm regularized estimator when the number of samples per task is small. The advantages of trace norm regularization for learning data-scarce tasks extend to meta-learning and are confirmed empirically on synthetic datasets.}
}
@InProceedings{pmlr-v178-acharya22b,
title = {The Role of Interactivity in Structured Estimation},
author = {Acharya, Jayadev and Canonne, Clement L. and Sun, Ziteng and Tyagi, Himanshu},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1328--1355},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/acharya22b/acharya22b.pdf},
url = {https://proceedings.mlr.press/v178/acharya22b.html},
abstract = {We study high-dimensional sparse estimation under three natural constraints: communication constraints, local privacy constraints, and linear measurements (compressive sensing). Without sparsity assumptions, it has been established that interactivity cannot improve the minimax rates of estimation under these information constraints. The question of whether interactivity helps with natural inference tasks has been a topic of active research. We settle this question in the affirmative for the prototypical problems of high-dimensional sparse mean estimation and compressive sensing, by demonstrating a gap between interactive and noninteractive protocols. We further establish that the gap increases when we have more structured sparsity: for \emph{block sparsity} this gap can be as large as \emph{polynomial} in the dimensionality. Thus, the more structured the sparsity is, the greater is the advantage of interaction. Proving the lower bounds requires a careful breaking of a sum of correlated random variables into independent components using Baranyai’s theorem on decomposition of hypergraphs, which might be of independent interest.}
}
@InProceedings{pmlr-v178-muzellec22a,
title = {Dimension-free convergence rates for gradient Langevin dynamics in RKHS},
author = {Muzellec, Boris and Sato, Kanji and Massias, Mathurin and Suzuki, Taiji},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1356--1420},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/muzellec22a/muzellec22a.pdf},
url = {https://proceedings.mlr.press/v178/muzellec22a.html},
abstract = {Gradient Langevin dynamics (GLD) and stochastic GLD (SGLD) have attracted considerable attention lately, as a way to provide convergence guarantees in a non-convex setting. However, the known rates grow exponentially with the dimension of the space under the dissipative condition. In this work, we provide a convergence analysis of GLD and SGLD when the optimization space is an infinite-dimensional Hilbert space. More precisely, we derive non-asymptotic, dimension-free convergence rates for GLD/SGLD when performing regularized non-convex optimization in a reproducing kernel Hilbert space. Amongst others, the convergence analysis relies on the properties of a stochastic differential equation, its discrete time Galerkin approximation and the geometric ergodicity of the associated Markov chains.}
}
@InProceedings{pmlr-v178-ito22a,
title = {Adversarially Robust Multi-Armed Bandit Algorithm with Variance-Dependent Regret Bounds},
author = {Ito, Shinji and Tsuchiya, Taira and Honda, Junya},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1421--1422},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/ito22a/ito22a.pdf},
url = {https://proceedings.mlr.press/v178/ito22a.html},
abstract = {This paper considers the multi-armed bandit (MAB) problem and provides a new best-of-both-worlds (BOBW) algorithm that works nearly optimally in both stochastic and adversarial settings. In stochastic settings, some existing BOBW algorithms achieve tight gap-dependent regret bounds of $O(\sum_{i: \Delta_i>0} \frac{\log T}{\Delta_i})$ for suboptimality gap $\Delta_i$ of arm $i$ and time horizon $T$. On the other hand, it is shown in Audibert et al. (2007) that the regret bound can be tightened to $O(\sum_{i: \Delta_i>0} (\frac{\sigma_i^2}{\Delta_i} + 1) \log T )$ using the loss variance $\sigma_i^2$ of each arm $i$ in the stochastic environments. In this paper, we propose an algorithm based on the follow-the-regularized-leader method, which employs adaptive learning rates that depend on the empirical prediction error of the loss. This is the first BOBW algorithm with gap-variance-dependent bounds, showing that the variance information can be used even in the possibly adversarial environment. Further, the leading constant factor in our gap-variance dependent bound is only (almost) twice the value for the lower bound. In addition, the proposed algorithm enjoys multiple data-dependent regret bounds in adversarial settings and works well in stochastic settings with adversarial corruptions. Table 1 summarizes the achievable bounds in comparison with UCB-V Audibert et al. (2007), Tsallis-INF (Zimmert and Seldin, 2021) and LB-INF (Ito, 2021).}
}
@InProceedings{pmlr-v178-agarwal22a,
title = {A Sharp Memory-Regret Trade-off for Multi-Pass Streaming Bandits},
author = {Agarwal, Arpit and Khanna, Sanjeev and Patil, Prathamesh},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1423--1462},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/agarwal22a/agarwal22a.pdf},
url = {https://proceedings.mlr.press/v178/agarwal22a.html},
abstract = {The stochastic $K$-armed bandit problem has been studied extensively due to its applications in various domains ranging from online advertising to clinical trials. In practice however, the number of arms can be very large resulting in large memory requirements for simultaneously processing them. In this paper we consider a streaming setting where the arms are presented in a stream and the algorithm uses limited memory to process these arms. Here, the goal is not only to minimize regret, but also to do so in minimal memory. Previous algorithms for this problem operate in one of the two settings: they either use $\Omega(\log \log T)$ passes over the stream \citep{rathod2021reducing, ChaudhuriKa20, Liau+18}, or just a single pass \citep{Maiti+21}. In this paper we study the trade-off between memory and regret when $B$ passes over the stream are allowed, for any $B \geq 1$, and establish \emph{tight} regret upper and lower bounds for any $B$-pass algorithm. Our results uncover a surprising \emph{sharp transition phenomenon}: $O(1)$ memory is sufficient to achieve $\widetilde\Theta\paren{T^{\half + \frac{1}{2^{B+2}-2}}}$ regret in $B$ passes, and increasing the memory to any quantity that is $o(K)$ has almost no impact on further reducing this regret, unless we use $\Omega(K)$ memory. Our main technical contribution is our lower bound which requires the use of \emph{information-theoretic techniques} as well as ideas from \emph{round elimination} to show that the \emph{residual problem} remains challenging over subsequent passes.}
}
@InProceedings{pmlr-v178-gamlath22a,
title = {Approximate Cluster Recovery from Noisy Labels},
author = {Gamlath, Buddhima and Lattanzi, Silvio and Norouzi-Fard, Ashkan and Svensson, Ola},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1463--1509},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/gamlath22a/gamlath22a.pdf},
url = {https://proceedings.mlr.press/v178/gamlath22a.html},
abstract = {Designing algorithms for machine learning problems targeting beyond worst-case analysis and, in particular, analyzing the effect of side-information on the complexity of such problems is a very important line of research with many practical applications. In this paper we study the classic k-means clustering problem in the presence of noisy labels. In this problem, in addition to a set of points and parameter \(k\), we receive cluster labels of each point generated by either an adversarial or a random perturbation of the optimal solution. Our main goal is to formally study the effect of this extra information on the complexity of the k-means problem. In particular, in the context of random perturbations, we give an efficient algorithm that finds a clustering of cost within a factor $1+o(1)$ of the optimum even when the label of each point is perturbed with a large probability (think 99%). In contrast, we show that the side-information with adversarial perturbations is as hard as the original problem even if only a small $\epsilon$ fraction of the labels are perturbed. We complement this negative result by giving a simple algorithm in the case when the adversary is only allowed to perturb an $\epsilon$ fraction of the labels per \emph{each cluster}.}
}
@InProceedings{pmlr-v178-kur22a,
title = {An Efficient Minimax Optimal Estimator For Multivariate Convex Regression},
author = {Kur, Gil and Putterman, Eli},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1510--1546},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kur22a/kur22a.pdf},
url = {https://proceedings.mlr.press/v178/kur22a.html},
abstract = {We study the computational aspects of the task of multivariate convex regression in dimension $d \geq 5$. We present the first computationally efficient minimax optimal (up to logarithmic factors) estimators for the tasks of $L$-Lipschitz and $\Gamma$-bounded convex regression under polytopal support. This work is the first to show the existence of efficient minimax optimal estimators for non-Donsker classes whose corresponding Least Squares Estimators are provably minimax suboptimal. The proof of the correctness of these estimators uses a variety of tools from different disciplines, among them empirical process theory, stochastic geometry, and potential theory.}
}
@InProceedings{pmlr-v178-lattimore22a,
title = {Minimax Regret for Partial Monitoring: Infinite Outcomes and Rustichini’s Regret},
author = {Lattimore, Tor},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1547--1575},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/lattimore22a/lattimore22a.pdf},
url = {https://proceedings.mlr.press/v178/lattimore22a.html},
abstract = {We show that a version of the generalised information ratio of Lattimore and Gyorgy (2020) determines the asymptotic minimax regret for all finite-action partial monitoring games provided that (a) the standard definition of regret is used but the latent space where the adversary plays is potentially infinite; or (b) the regret introduced by Rustichini (1999) is used and the latent space is finite. Our results are complemented by a number of examples. For any p ∈ [1/2, 1] there exists an infinite partial monitoring game for which the minimax regret over n rounds is n^p up to subpolynomial factors and there exist finite games for which the minimax Rustichini regret is n^(4/7) up to subpolynomial factors.}
}
@InProceedings{pmlr-v178-luo22a,
title = {Adaptive Bandit Convex Optimization with Heterogeneous Curvature},
author = {Luo, Haipeng and Zhang, Mengxiao and Zhao, Peng},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1576--1612},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/luo22a/luo22a.pdf},
url = {https://proceedings.mlr.press/v178/luo22a.html},
abstract = {We consider the problem of adversarial bandit convex optimization, that is, online learning over a sequence of arbitrary convex loss functions with only one function evaluation for each of them. While all previous works assume known and homogeneous curvature on these loss functions, we study a heterogeneous setting where each function has its own curvature that is only revealed after the learner makes a decision. We develop an efficient algorithm that is able to adapt to the curvature on the fly. Specifically, our algorithm not only recovers or \emph{even improves} existing results for several homogeneous settings, but also leads to surprising results for some heterogeneous settings — for example, while Hazan and Levy (2014) showed that $\tilde{O}(d^{\frac{3}{2}}\sqrt{T})$ regret is achievable for a sequence of $T$ smooth and strongly convex $d$-dimensional functions, our algorithm reveals that the same is achievable even if $T^{\frac{3}{4}}$ of them are not strongly convex, and sometimes even if a constant fraction of them are not strongly convex. Our approach is inspired by the framework of Bartlett et al. (2007) who studied a similar heterogeneous setting but with stronger gradient feedback. Extending their framework to the bandit feedback setting requires novel ideas such as lifting the feasible domain and using a logarithmically homogeneous self-concordant barrier regularizer.}
}
@InProceedings{pmlr-v178-li22b,
title = {Statistical Estimation and Online Inference via Local SGD},
author = {Li, Xiang and Liang, Jiadong and Chang, Xiangyu and Zhang, Zhihua},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1613--1661},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/li22b/li22b.pdf},
url = {https://proceedings.mlr.press/v178/li22b.html},
abstract = {We analyze the novel Local SGD in federated Learning, a multi-round estimation procedure that uses intermittent communication to improve communication efficiency. Under a $2{+}\delta$ moment condition on stochastic gradients, we first establish a {\it functional central limit theorem} that shows the averaged iterates of Local SGD converge weakly to a rescaled Brownian motion. We next provide two iterative inference methods: the {\it plug-in} and the {\it random scaling}. Random scaling constructs an asymptotically pivotal statistic for inference by using the information along the whole Local SGD path. Both the methods are communication efficient and applicable to online data. Our results show that Local SGD simultaneously achieves both statistical efficiency and communication efficiency.}
}
@InProceedings{pmlr-v178-cohen-addad22a,
title = {Community Recovery in the Degree-Heterogeneous Stochastic Block Model},
author = {Cohen-Addad, Vincent and Mallmann-Trenn, Frederik and Saulpic, David},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1662--1692},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/cohen-addad22a/cohen-addad22a.pdf},
url = {https://proceedings.mlr.press/v178/cohen-addad22a.html},
abstract = {We consider the problem of recovering communities in a random directed graph with planted communities. To model real-world directed graphs such as the Twitter or Instagram graphs that exhibit very heterogeneous degree sequences, we introduce the Degree-Heterogeneous Stochastic Block Model (DHSBM), a generalization of the classic Stochastic Block Model (SBM), where the vertex set is partitioned into communities and each vertex $u$ has two (unknown) associated probabilities, $p_u$ and $q_u$, $p_u > q_u$. An arc from $u$ to $v$ is generated with probability $p_u$ if $u$ and $v$ are in the same community and with probability $q_u$ otherwise. Given a graph generated from this model, the goal is to retrieve the communities. The DHSBM allows to generate graphs with planted communities while allowing heterogeneous degree distributions, a quite important feature of real-world networks. In the case where there are two communities, we present an iterative greedy linear-time algorithm that recovers them whenever $\min_u \frac{p_u - q_u}{\sqrt{p_u}} = \Omega(\sqrt{\log (n)/n})$. We also show that, up to a constant, this condition is necessary. Our results also extend to the standard (undirected) SBM, where $p_u = p$ and $q_u= q$ for all nodes $u$. Our algorithm presents the first linear-time algorithm that recovers exactly the communities at the asymptotic information-theoretic threshold, improving over previous near-linear time spectral approaches.}
}
@InProceedings{pmlr-v178-buzun22a,
title = {Strong Gaussian Approximation for the Sum of Random Vectors},
author = {Buzun, Nazar and Shvetsov, Nikolay and Dylov, Dmitry V.},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1693--1715},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/buzun22a/buzun22a.pdf},
url = {https://proceedings.mlr.press/v178/buzun22a.html},
abstract = {This paper derives a new strong Gaussian approximation bound for the sum of independent random vectors. The approach relies on the optimal transport theory and yields explicit dependence on the dimension size p and the sample size n. This dependence establishes a new fundamental limit for all practical applications of statistical learning theory. Particularly, based on this bound, we prove approximation in distribution for the maximum norm in a high-dimensional setting (p > n).}
}
@InProceedings{pmlr-v178-block22a,
title = {Smoothed Online Learning is as Easy as Statistical Learning},
author = {Block, Adam and Dagan, Yuval and Golowich, Noah and Rakhlin, Alexander},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1716--1786},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/block22a/block22a.pdf},
url = {https://proceedings.mlr.press/v178/block22a.html},
abstract = {Much of modern learning theory has been split between two regimes: the classical offline setting, where data arrive independently, and the online setting, where data arrive adversarially. While the former model is often both computationally and statistically tractable, the latter requires no distributional assumptions. In an attempt to achieve the best of both worlds, previous work proposed the smooth online setting where each sample is drawn from an adversarially chosen distribution, which is smooth, i.e., it has a bounded density with respect to a fixed dominating measure. Existing results for the smooth setting were known only for binary-valued function classes and were computation- ally expensive in general; in this paper, we fill these lacunae. In particular, we provide tight bounds on the minimax regret of learning a nonparametric function class, with nearly optimal dependence on both the horizon and smoothness parameters. Furthermore, we provide the first oracle-efficient, no-regret algorithms in this setting. In particular, we propose an oracle-efficient improper algorithm whose regret achieves optimal dependence on the horizon and a proper algorithm requiring only a single oracle call per round whose regret has the optimal horizon dependence in the classification setting and is sublinear in general. Both algorithms have exponentially worse dependence on the smoothness parameter of the adversary than the minimax rate. We then prove a lower bound on the oracle complexity of any proper learning algorithm, which matches the oracle-efficient upper bounds up to a polynomial factor, thus demonstrating the existence of a statistical-computational gap in smooth online learning. Finally, we apply our results to the contextual bandit setting to show that if a function class is learnable in the classical setting, then there is an oracle-efficient, no-regret algorithm for contextual bandits in the case that contexts arrive in a smooth manner.}
}
@InProceedings{pmlr-v178-bolthausen22a,
title = {Gardner formula for Ising perceptron models at small densities},
author = {Bolthausen, Erwin and Nakajima, Shuta and Sun, Nike and Xu, Changji},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1787--1911},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/bolthausen22a/bolthausen22a.pdf},
url = {https://proceedings.mlr.press/v178/bolthausen22a.html},
abstract = {We consider the Ising perceptron model with N spins and M = N*alpha patterns, with a general activation function U that is bounded above. For U bounded away from zero, or U a one-sided threshold function, it was shown by Talagrand (2000, 2011) that for small densities alpha, the free energy of the model converges in the large-N limit to the replica symmetric formula conjectured in the physics literature (Krauth–Mezard 1989, see also Gardner–Derrida 1988). We give a new proof of this result, which covers the more general class of all functions U that are bounded above and satisfy a certain variance bound. The proof uses the (first and second) moment method conditional on the approximate message passing iterates of the model. In order to deduce our main theorem, we also prove a new concentration result for the perceptron model in the case where U is not bounded away from zero.}
}
@InProceedings{pmlr-v178-bellec22a,
title = {Derivatives and residual distribution of regularized M-estimators with application to adaptive tuning},
author = {Bellec, Pierre C and Shen, Yiwei},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1912--1947},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/bellec22a/bellec22a.pdf},
url = {https://proceedings.mlr.press/v178/bellec22a.html},
abstract = {This paper studies M-estimators with gradient-Lipschitz loss function regularized with convex penalty in linear models with Gaussian design matrix and arbitrary noise distribution. A practical example is the robust M-estimator constructed with the Huber loss and the Elastic-Net penalty and the noise distribution has heavy-tails. Our main contributions are three-fold. (i) We provide general formulae for the derivatives of regularized M-estimators $\hat\beta(y,X)$ where differentiation is taken with respect to both X and y; this reveals a simple differentiability structure shared by all convex regularized M-estimators. (ii) Using these derivatives, we characterize the distribution of the residuals in the intermediate high-dimensional regime where dimension and sample size are of the same order. (iii) Motivated by the distribution of the residuals, we propose a novel adaptive criterion to select tuning parameters of regularized M-estimators. The criterion approximates the out-of-sample error up to an additive constant independent of the estimator, so that minimizing the criterion provides a proxy for minimizing the out-of-sample error. The proposed adaptive criterion does not require the knowledge of the noise distribution or of the covariance of the design. Simulated data confirms the theoretical findings, regarding both the distribution of the residuals and the success of the criterion as a proxy of the out-of-sample error. Finally our results reveal new relationships between the derivatives of the $\hat\beta$ and the effective degrees of freedom of the M-estimators, which are of independent interest.}
}
@InProceedings{pmlr-v178-gopi22a,
title = {Private Convex Optimization via Exponential Mechanism},
author = {Gopi, Sivakanth and Lee, Yin Tat and Liu, Daogao},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1948--1989},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/gopi22a/gopi22a.pdf},
url = {https://proceedings.mlr.press/v178/gopi22a.html},
abstract = {In this paper, we study the private optimization problems for non-smooth convex functions $F(x)=\mathbb{E}_i f_i(x)$ on $\mathbb{R}^d$. We show that modifying the exponential mechanism by adding an $\ell_2^2$ regularizer to $F(x)$ and sampling from $\pi(x)\propto \exp(-k(F(x)+\mu\|x\|_2^2/2))$ recovers both the known optimal empirical risk and population loss under $(\eps,\delta)$-DP. Furthermore, we show how to implement this mechanism using $\widetilde{O}(n \min(d, n))$ queries to $f_i(x)$ where $n$ is the number of samples/users in the DP-SCO. We also give a (nearly) matching lower bound $\widetilde{\Omega}(n \min(d, n))$ on the number of evaluation queries. Our results utilize the following tools that are of independent interests: \begin{itemize} \item We prove Gaussian Differential Privacy (GDP) of the exponential mechanism if the loss function is strongly convex and the perturbation is Lipschitz. Our privacy bound is \emph{optimal} as it includes the privacy of Gaussian mechanism as a special case. \item We show how to sample from $\exp(-F(x)-\mu \|x\|^2_2/2)$ for $G$-Lipschitz $F$ with $\eta$ error in TV distance using $\widetilde{O}((G^2/\mu) \log^2(d/\eta))$ unbiased queries to $F(x)$. This is the first sampler whose query complexity has \emph{polylogarithmic dependence} on both dimension $d$ and accuracy $\eta$. \end{itemize}}
}
@InProceedings{pmlr-v178-huang22a,
title = {Towards Optimal Algorithms for Multi-Player Bandits without Collision Sensing Information},
author = {Huang, Wei and Combes, Richard and Trinh, Cindy},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {1990--2012},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/huang22a/huang22a.pdf},
url = {https://proceedings.mlr.press/v178/huang22a.html},
abstract = {We propose a novel algorithm for multi-player multi-armed bandits without collision sensing information. Our algorithm circumvents two problems shared by all state-of-the-art algorithms: it does not need as an input a lower bound on the minimal expected reward of an arm, and its performance does not scale inversely proportionally to the minimal expected reward. We prove a theoretical regret upper bound to justify these claims. We complement our theoretical results with numerical experiments, showing that the proposed algorithm outperforms state-of-the-art in practice.}
}
@InProceedings{pmlr-v178-bartlett22a,
title = {Generalization Bounds for Data-Driven Numerical Linear Algebra},
author = {Bartlett, Peter and Indyk, Piotr and Wagner, Tal},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2013--2040},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/bartlett22a/bartlett22a.pdf},
url = {https://proceedings.mlr.press/v178/bartlett22a.html},
abstract = {Data-driven algorithms can adapt their internal structure or parameters to inputs from unknown application-specific distributions, by learning from a training sample of inputs. Several recent works have applied this approach to problems in numerical linear algebra, obtaining significant empirical gains in performance. However, no theoretical explanation for their success was known. In this work we prove generalization bounds for those algorithms, within the PAC-learning framework for data-driven algorithm selection proposed by Gupta and Roughgarden (SICOMP 2017). Our main results are closely matching upper and lower bounds on the fat shattering dimension of the learning-based low rank approximation algorithm of Indyk et al. (NeurIPS 2019). Our techniques are general, and provide generalization bounds for many other recently proposed data-driven algorithms in numerical linear algebra, covering both sketching-based and multigrid-based methods. This considerably broadens the class of data-driven algorithms for which a PAC-learning analysis is available.}
}
@InProceedings{pmlr-v178-chewi22b,
title = {The query complexity of sampling from strongly log-concave distributions in one dimension},
author = {Chewi, Sinho and Gerber, Patrik R and Lu, Chen and Gouic, Thibaut Le and Rigollet, Philippe},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2041--2059},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/chewi22b/chewi22b.pdf},
url = {https://proceedings.mlr.press/v178/chewi22b.html},
abstract = {We establish the first tight lower bound of $\Omega(\log\log\kappa)$ on the query complexity of sampling from the class of strongly log-concave and log-smooth distributions with condition number $\kappa$ in one dimension. Whereas existing guarantees for MCMC-based algorithms scale polynomially in $\kappa$, we introduce a novel algorithm based on rejection sampling that closes this doubly exponential gap.}
}
@InProceedings{pmlr-v178-mou22a,
title = {Optimal and instance-dependent guarantees for Markovian linear stochastic approximation},
author = {Mou, Wenlong and Pananjady, Ashwin and Wainwright, Martin and Bartlett, Peter},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2060--2061},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/mou22a/mou22a.pdf},
url = {https://proceedings.mlr.press/v178/mou22a.html},
abstract = {We study stochastic approximation procedures for approximately solving a $d$-dimensional linear fixed point equation based on observing a trajectory of length $n$ from an ergodic Markov chain. We first exhibit a non-asymptotic bound of the order $t_{\mathrm{mix}} \tfrac{d}{n}$ on the squared error of the last iterate of a standard scheme, where $t_{\mathrm{mix}}$ is a mixing time. We then prove a non-asymptotic instance-dependent bound on a suitably averaged sequence of iterates, with a leading term that matches the local asymptotic minimax limit, including sharp dependence on the parameters $(d, t_{\mathrm{mix}})$ in the higher order terms. We complement these upper bounds with a non-asymptotic minimax lower bound that establishes the instance-optimality of the averaged SA estimator. We derive corollaries of these results for policy evaluation with Markov noise—covering the TD($\lambda$) family of algorithms for all $\lambda \in [0, 1)$—and linear autoregressive models. Our instance-dependent characterizations open the door to the design of fine-grained model selection procedures for hyperparameter tuning (e.g., choosing the value of $\lambda$ when running the TD($\lambda$) algorithm).}
}
@InProceedings{pmlr-v178-varre22a,
title = {Accelerated SGD for Non-Strongly-Convex Least Squares},
author = {Varre, Aditya and Flammarion, Nicolas},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2062--2126},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/varre22a/varre22a.pdf},
url = {https://proceedings.mlr.press/v178/varre22a.html},
abstract = {We consider stochastic approximation for the least squares regression problem in the non-strongly convex setting. We present the first practical algorithm that achieves the optimal prediction error rates in terms of dependence on the noise of the problem, as $O(d/t)$ while accelerating the forgetting of the initial conditions to $O(d/t^2)$. Our new algorithm is based on a simple modification of the accelerated gradient descent. We provide convergence results for both the averaged and the last iterate of the algorithm. In order to describe the tightness of these new bounds, we present a matching lower bound in the noiseless setting and thus show the optimality of our algorithm.}
}
@InProceedings{pmlr-v178-vivien22a,
title = {Label noise (stochastic) gradient descent implicitly solves the Lasso for quadratic parametrisation},
author = {Vivien, Loucas Pillaud and Reygner, Julien and Flammarion, Nicolas},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2127--2159},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/vivien22a/vivien22a.pdf},
url = {https://proceedings.mlr.press/v178/vivien22a.html},
abstract = {Understanding the implicit bias of training algorithms is of crucial importance in order to explain the success of overparametrised neural networks. In this paper, we study the role of the label noise in the training dynamics of a quadratically parametrised model through its continuous time version. We explicitly characterise the solution chosen by the stochastic flow and prove that it implicitly solves a Lasso program. To fully complete our analysis, we provide nonasymptotic convergence guarantees for the dynamics as well as conditions for support recovery. We also give experimental results which support our theoretical claims. Our findings highlight the fact that structured noise can induce better generalisation and help explain the greater performances of stochastic dynamics as observed in practice.}
}
@InProceedings{pmlr-v178-suk22a,
title = {Tracking Most Significant Arm Switches in Bandits},
author = {Suk, Joe and Kpotufe, Samory},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2160--2182},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/suk22a/suk22a.pdf},
url = {https://proceedings.mlr.press/v178/suk22a.html},
abstract = {In \emph{bandit with distribution shifts}, one aims to automatically adapt to unknown changes in reward distribution, and \emph{restart} exploration when necessary. While this problem has been studied for many years, a recent breakthrough of Auer et al. (2018, 2019) provides the first adaptive procedure to guarantee an optimal (dynamic) regret $\sqrt{LT}$, for $T$ rounds, and an unknown number $L$ of changes. However, while this rate is tight in the worst case, it remained open whether faster rates are possible, without prior knowledge, if few changes in distribution are actually \emph{severe}. To resolve this question, we propose a new notion of \emph{significant shift}, which only counts very severe changes that clearly necessitate a restart: roughly, these are changes involving not only best arm switches, but also involving large aggregate differences in reward overtime. Thus, our resulting procedure adaptively achieves rates always faster (sometimes significantly) than $O(\sqrt{ST})$, where $S\ll L$ only counts best arm switches, while at the same time, always faster than the optimal $O(V^{\frac{1}{3}}T^{\frac{2}{3}})$ when expressed in terms of \emph{total variation} $V$ (which aggregates differences overtime). Our results are expressed in enough generality to also capture non-stochastic adversarial settings.}
}
@InProceedings{pmlr-v178-gaudio22a,
title = {Exact Community Recovery in Correlated Stochastic Block Models},
author = {Gaudio, Julia and Racz, Miklos Z. and Sridhar, Anirudh},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2183--2241},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/gaudio22a/gaudio22a.pdf},
url = {https://proceedings.mlr.press/v178/gaudio22a.html},
abstract = {We consider the problem of learning latent community structure from multiple correlated networks. We study edge-correlated stochastic block models with two balanced communities, focusing on the regime where the average degree is logarithmic in the number of vertices. Our main result derives the precise information-theoretic threshold for exact community recovery using multiple correlated graphs. This threshold captures the interplay between the community recovery and graph matching tasks. In particular, we uncover and characterize a region of the parameter space where exact community recovery is possible using multiple correlated graphs, even though (1) this is information-theoretically impossible using a single graph and (2) exact graph matching is also information-theoretically impossible. In this regime, we develop a novel algorithm that carefully synthesizes algorithms from the community recovery and graph matching literatures.}
}
@InProceedings{pmlr-v178-yao22a,
title = {Mean-field nonparametric estimation of interacting particle systems},
author = {Yao, Rentian and Chen, Xiaohui and Yang, Yun},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2242--2275},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/yao22a/yao22a.pdf},
url = {https://proceedings.mlr.press/v178/yao22a.html},
abstract = {This paper concerns the nonparametric estimation problem of the distribution-state dependent drift vector field in an interacting $N$-particle system. Observing single-trajectory data for each particle, we derive the mean-field rate of convergence for the maximum likelihood estimator (MLE), which depends on both Gaussian complexity and Rademacher complexity of the function class. In particular, when the function class contains $\alpha$-smooth H{ö}lder functions, our rate of convergence is minimax optimal on the order of $N^{-\frac{\alpha}{d+2\alpha}}$. Combining with a Fourier analytical deconvolution estimator, we derive the consistency of MLE for the external force and interaction kernel in the McKean-Vlasov equation.}
}
@InProceedings{pmlr-v178-jagadeesan22a,
title = {Inductive Bias of Multi-Channel Linear Convolutional Networks with Bounded Weight Norm},
author = {Jagadeesan, Meena and Razenshteyn, Ilya and Gunasekar, Suriya},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2276--2325},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/jagadeesan22a/jagadeesan22a.pdf},
url = {https://proceedings.mlr.press/v178/jagadeesan22a.html},
abstract = {We provide a function space characterization of the inductive bias resulting from minimizing the $\ell_2$ norm of the weights in multi-channel convolutional neural networks with linear activations and empirically test our resulting hypothesis on ReLU networks trained using gradient descent. We define an \emph{induced regularizer} in the function space as the minimum $\ell_2$ norm of weights of a network required to realize a function. For two layer linear convolutional networks with $C$ output channels and kernel size $K$, we show the following: (a) If the inputs to the network are single channeled, the induced regularizer for any $K$ is \emph{independent} of the number of output channels $C$. Furthermore, we derive the regularizer is a norm given by a semidefinite program (SDP). (b) In contrast, for multi-channel inputs, multiple output channels can be necessary to merely realize all matrix-valued linear functions and thus the inductive bias \emph{does} depend on $C$. However, for sufficiently large $C$, the induced regularizer is again given by an SDP that is independent of $C$. In particular, the induced regularizer for $K=1$ and $K=D$ (input dimension) are given in closed form as the nuclear norm and the $\ell_{2,1}$ group-sparse norm, respectively, of the Fourier coefficients of the linear predictor. We investigate the broader applicability of our theoretical results to implicit regularization from gradient descent on linear and ReLU networks through experiments on MNIST and CIFAR-10 datasets.}
}
@InProceedings{pmlr-v178-garber22a,
title = {New Projection-free Algorithms for Online Convex Optimization with Adaptive Regret Guarantees},
author = {Garber, Dan and Kretzu, Ben},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2326--2359},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/garber22a/garber22a.pdf},
url = {https://proceedings.mlr.press/v178/garber22a.html},
abstract = {We present new efficient \emph{projection-free} algorithms for online convex optimization (OCO), where by projection-free we refer to algorithms that avoid computing orthogonal projections onto the feasible set, and instead relay on different and potentially much more efficient oracles. While most state-of-the-art projection-free algorithms are based on the \emph{follow-the-leader} framework, our algorithms are fundamentally different and are based on the \emph{online gradient descent} algorithm with a novel and efficient approach to computing so-called \emph{infeasible projections}. As a consequence, we obtain the first projection-free algorithms which naturally yield \emph{adaptive regret} guarantees, i.e., regret bounds that hold w.r.t. any sub-interval of the sequence. Concretely, when assuming the availability of a linear optimization oracle (LOO) for the feasible set, on a sequence of length $T$, our algorithms guarantee $O(T^{3/4})$ adaptive regret and $O(T^{3/4})$ adaptive expected regret, for the full-information and bandit settings, respectively, using only $O(T)$ calls to the LOO. These bounds match the current state-of-the-art regret bounds for LOO-based projection-free OCO, which are \emph{not adaptive}. We also consider a new natural setting in which the feasible set is accessible through a separation oracle. We present algorithms which, using overall $O(T)$ calls to the separation oracle, guarantee $O(\sqrt{T})$ adaptive regret and $O(T^{3/4})$ adaptive expected regret for the full-information and bandit settings, respectively.}
}
@InProceedings{pmlr-v178-carmon22a,
title = {Making SGD Parameter-Free},
author = {Carmon, Yair and Hinder, Oliver},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2360--2389},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/carmon22a/carmon22a.pdf},
url = {https://proceedings.mlr.press/v178/carmon22a.html},
abstract = {We develop an algorithm for parameter-free stochastic convex optimization (SCO) whose rate of convergence is only a double-logarithmic factor larger than the optimal rate for the corresponding known-parameter setting. In contrast, the best previously known rates for parameter-free SCO are based on online parameter-free regret bounds, which contain unavoidable excess logarithmic terms compared to their known-parameter counterparts. Our algorithm is conceptually simple, has high-probability guarantees, and is also partially adaptive to unknown gradient norms, smoothness, and strong convexity. At the heart of our results is a novel parameter-free certificate for SGD step size choice, and a time-uniform concentration result that assumes no a-priori bounds on SGD iterates.}
}
@InProceedings{pmlr-v178-marsden22a,
title = {Efficient Convex Optimization Requires Superlinear Memory},
author = {Marsden, Annie and Sharan, Vatsal and Sidford, Aaron and Valiant, Gregory},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2390--2430},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/marsden22a/marsden22a.pdf},
url = {https://proceedings.mlr.press/v178/marsden22a.html},
abstract = {We show that any memory-constrained, first-order algorithm which minimizes $d$-dimensional, $1$-Lipschitz convex functions over the unit ball to $1/\mathrm{poly}(d)$ accuracy using at most $d^{1.25 - \delta}$ bits of memory must make at least $\Omega(d^{1 + (4/3)\delta})$ first-order queries (for any constant $\delta \in [0, 1/4]$). Consequently, the performance of such memory-constrained algorithms are a polynomial factor worse than the optimal $\tilde{O}(d)$ query bound for this problem obtained by cutting plane methods that use $\tilde{O}(d^2)$ memory. This resolves one of the open problems in the COLT 2019 open problem publication of Woodworth and Srebro.}
}
@InProceedings{pmlr-v178-kelner22a,
title = {Big-Step-Little-Step: Efficient Gradient Methods for Objectives with Multiple Scales},
author = {Kelner, Jonathan and Marsden, Annie and Sharan, Vatsal and Sidford, Aaron and Valiant, Gregory and Yuan, Honglin},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2431--2540},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kelner22a/kelner22a.pdf},
url = {https://proceedings.mlr.press/v178/kelner22a.html},
abstract = {We provide new gradient-based methods for efficiently solving a broad class of ill-conditioned optimization problems. We consider the problem of minimizing a function $f : \mathbb{R}^d \rightarrow \mathbb{R}$ which is implicitly decomposable as the sum of $m$ unknown non-interacting smooth, strongly convex functions and provide a method which solves this problem with a number of gradient evaluations that scales (up to logarithmic factors) as the product of the square-root of the condition numbers of the components. This complexity bound (which we prove is nearly optimal) can improve almost exponentially on that of accelerated gradient methods, which grow as the square root of the condition number of $f$. Additionally, we provide efficient methods for solving stochastic, quadratic variants of this multiscale optimization problem. Rather than learn the decomposition of $f$ (which would be prohibitively expensive), our methods apply a clean recursive “Big-Step-Little-Step” interleaving of standard methods. The resulting algorithms use $\tilde{\mathcal{O}}(d m)$ space, are numerically stable, and open the door to a more fine-grained understanding of the complexity of convex optimization beyond condition number.}
}
@InProceedings{pmlr-v178-chen22b,
title = {Toward Instance-Optimal State Certification With Incoherent Measurements},
author = {Chen, Sitan and Li, Jerry and O'Donnell, Ryan},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2541--2596},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/chen22b/chen22b.pdf},
url = {https://proceedings.mlr.press/v178/chen22b.html},
abstract = {We revisit the basic problem of quantum state certification: given copies of unknown mixed state ρ∈ℂ^{d×d} and the description of a mixed state σ, decide whether σ=ρ or ‖σ−ρ‖_𝗍𝗋 ≥ ϵ. When σ is maximally mixed, this is mixedness testing, and it is known that Ω(d^{Θ(1)}/ϵ^2) copies are necessary, where the exact exponent depends on the type of measurements the learner can make [OW15, BCL20], and in many of these settings there is a matching upper bound [OW15, BOW19, BCL20]. Can one avoid this d^{Θ(1)} dependence for certain kinds of mixed states σ, e.g. ones which are approximately low rank? More ambitiously, does there exist a simple functional f : ℂ^{d×d} → ℝ_{≥0} for which one can show that Θ(f(σ)/ϵ^2) copies are necessary and sufficient for state certification with respect to any σ? Such instance-optimal bounds are known in the context of classical distribution testing, e.g. [VV17]. Here we give the first bounds of this nature for the quantum setting, showing (up to log factors) that the copy complexity for state certification using nonadaptive incoherent measurements is essentially given by the copy complexity for mixedness testing times the fidelity between σ and the maximally mixed state. Surprisingly, our bound differs substantially from instance optimal bounds for the classical problem, demonstrating a qualitative difference between the two settings.}
}
@InProceedings{pmlr-v178-dagan22b,
title = {EM’s Convergence in Gaussian Latent Tree Models},
author = {Dagan, Yuval and Kandiros, Vardis and Daskalakis, Constantinos},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2597--2667},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/dagan22b/dagan22b.pdf},
url = {https://proceedings.mlr.press/v178/dagan22b.html},
abstract = {We study the optimization landscape of the log-likelihood function and the convergence of the Expectation-Maximization (EM) algorithm in latent Gaussian tree models, i.e. tree-structured Gaussian graphical models whose leaf nodes are observable and non-leaf nodes are unobservable. We show that the unique non-trivial stationary point of the population log-likelihood is its global maximum, and establish that the expectation-maximization algorithm is guaranteed to converge to it in the single latent variable case. Our results for the landscape of the log-likelihood function in general latent tree models provide support for the extensive practical use of maximum likelihood based-methods in this setting. Our results for the expectation-maximization algorithm extend an emerging line of work on obtaining global convergence guarantees for this celebrated algorithm. We show our results for the non-trivial stationary points of the log-likelihood by arguing that a certain system of polynomial equations obtained from the EM updates has a unique non-trivial solution. The global convergence of the EM algorithm follows by arguing that all trivial fixed points are higher-order saddle points.}
}
@InProceedings{pmlr-v178-frei22a,
title = {Benign Overfitting without Linearity: Neural Network Classifiers Trained by Gradient Descent for Noisy Linear Data},
author = {Frei, Spencer and Chatterji, Niladri S and Bartlett, Peter},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2668--2703},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/frei22a/frei22a.pdf},
url = {https://proceedings.mlr.press/v178/frei22a.html},
abstract = {Benign overfitting, the phenomenon where interpolating models generalize well in the presence of noisy data, was first observed in neural network models trained with gradient descent. To better understand this empirical observation, we consider the generalization error of two-layer neural networks trained to interpolation by gradient descent on the logistic loss following random initialization. We assume the data comes from well-separated class-conditional log-concave distributions and allow for a constant fraction of the training labels to be corrupted by an adversary. We show that in this setting, neural networks exhibit benign overfitting: they can be driven to zero training error, perfectly fitting any noisy training labels, and simultaneously achieve minimax optimal test error. In contrast to previous work on benign overfitting that require linear or kernel-based predictors, our analysis holds in a setting where both the model and learning dynamics are fundamentally nonlinear.}
}
@InProceedings{pmlr-v178-agarwal22b,
title = {Minimax Regret Optimization for Robust Machine Learning under Distribution Shift},
author = {Agarwal, Alekh and Zhang, Tong},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2704--2729},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/agarwal22b/agarwal22b.pdf},
url = {https://proceedings.mlr.press/v178/agarwal22b.html},
abstract = {In this paper, we consider learning scenarios where the learned model is evaluated under an unknown test distribution which potentially differs from the training distribution (i.e. distribution shift). The learner has access to a family of weight functions such that the test distribution is a reweighting of the training distribution under one of these functions, a setting typically studied under the name of Distributionally Robust Optimization (DRO). We consider the problem of deriving regret bounds in the classical learning theory setting, and require that the resulting regret bounds hold uniformly for all potential test distributions. We show that the DRO formulation does not guarantee uniformly small regret under distribution shift. We instead propose an alternative method called Minimax Regret Optimization (MRO), and show that under suitable conditions, this method achieves uniformly low regret across all test distributions. We also adapt our technique to have strong guarantees when the test distributions are heterogeneous in their similarity to the training data. Given the widespead optimization of worst case risks in current approaches to robust machine learning, we believe that MRO can be an attractive framework to address a broad range of distribution shift scenarios.}
}
@InProceedings{pmlr-v178-zhan22a,
title = {Offline Reinforcement Learning with Realizability and Single-policy Concentrability},
author = {Zhan, Wenhao and Huang, Baihe and Huang, Audrey and Jiang, Nan and Lee, Jason},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2730--2775},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/zhan22a/zhan22a.pdf},
url = {https://proceedings.mlr.press/v178/zhan22a.html},
abstract = {Sample-efficiency guarantees for offline reinforcement learning (RL) often rely on strong assumptions on both the function classes (e.g., Bellman-completeness) and the data coverage (e.g., all-policy concentrability). Despite the recent efforts on relaxing these assumptions, existing works are only able to relax one of the two factors, leaving the strong assumption on the other factor intact. As an important open problem, can we achieve sample-efficient offline RL with weak assumptions on both factors? In this paper we answer the question in the positive. We analyze a simple algorithm based on the primal-dual formulation of MDPs, where the dual variables (discounted occupancy) are modeled using a density-ratio function against offline data. With proper regularization, the algorithm enjoys polynomial sample complexity, under only realizability and single-policy concentrability. We also provide alternative analyses based on different assumptions to shed light on the nature of primal-dual algorithms for offline RL.}
}
@InProceedings{pmlr-v178-agarwal22c,
title = {Non-Linear Reinforcement Learning in Large Action Spaces: Structural Conditions and Sample-efficiency of Posterior Sampling},
author = {Agarwal, Alekh and Zhang, Tong},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2776--2814},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/agarwal22c/agarwal22c.pdf},
url = {https://proceedings.mlr.press/v178/agarwal22c.html},
abstract = {Provably sample-efficient Reinforcement Learning (RL) with rich observations and function approximation has witnessed tremendous recent progress, particularly when the underlying function approximators are linear. In this linear regime, computationally and statistically efficient methods exist where the potentially infinite state and action spaces can be captured through a known feature embedding, with the sample complexity scaling with the (intrinsic) dimension of these features. When the action space is finite, significantly more sophisticated results allow non-linear function approximation under appropriate structural constraints on the underlying RL problem, permitting for instance, the learning of good features instead of assuming access to them. In this work, we present the first result for non-linear function approximation which holds for general action spaces under a linear embeddability condition, which generalizes all linear and finite action settings. We design a novel optimistic posterior sampling strategy, TS$^3$ for such problems. We further show worst case sample complexity guarantees that scale with a rank parameter of the RL problem, the linear embedding dimension introduced here and standard measures of function class complexity.}
}
@InProceedings{pmlr-v178-liu22c,
title = {Learning GMMs with Nearly Optimal Robustness Guarantees},
author = {Liu, Allen and Moitra, Ankur},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2815--2895},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/liu22c/liu22c.pdf},
url = {https://proceedings.mlr.press/v178/liu22c.html},
abstract = {In this work we solve the problem of robustly learning a high-dimensional Gaussian mixture model with $k$ components from $\epsilon$-corrupted samples up to accuracy $\widetilde{O}(\epsilon)$ in total variation distance for any constant $k$ and with mild assumptions on the mixture. This robustness guarantee is optimal up to polylogarithmic factors. The main challenge is that most earlier works rely on learning individual components in the mixture, but this is impossible in our setting, at least for the types of strong robustness guarantees we are aiming for. Instead we introduce a new framework which we call {\em strong observability} that gives us a route to circumvent this obstacle.}
}
@InProceedings{pmlr-v178-balasubramanian22a,
title = {Towards a Theory of Non-Log-Concave Sampling:First-Order Stationarity Guarantees for Langevin Monte Carlo},
author = {Balasubramanian, Krishna and Chewi, Sinho and Erdogdu, Murat A and Salim, Adil and Zhang, Shunshi},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2896--2923},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/balasubramanian22a/balasubramanian22a.pdf},
url = {https://proceedings.mlr.press/v178/balasubramanian22a.html},
abstract = {For the task of sampling from a density $\pi \propto \exp(-V)$ on $\R^d$, where $V$ is possibly non-convex but $L$-gradient Lipschitz, we prove that averaged Langevin Monte Carlo outputs a sample with $\varepsilon$-relative Fisher information after $O(L^2 d^2/\varepsilon^2)$ iterations. This is the sampling analogue of complexity bounds for finding an $\varepsilon$-approximate first-order stationary points in non-convex optimization and therefore constitutes a first step towards the general theory of non-log-concave sampling. We discuss numerous extensions and applications of our result; in particular, it yields a new state-of-the-art guarantee for sampling from distributions which satisfy a Poincaré inequality.}
}
@InProceedings{pmlr-v178-jin22a,
title = {Understanding Riemannian Acceleration via a Proximal Extragradient Framework},
author = {Jin, Jikai and Sra, Suvrit},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2924--2962},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/jin22a/jin22a.pdf},
url = {https://proceedings.mlr.press/v178/jin22a.html},
abstract = {We contribute to advancing the understanding of Riemannian accelerated gradient methods. In particular, we revisit “\emph{Accelerated Hybrid Proximal Extragradient}” (A-HPE), a powerful framework for obtaining Euclidean accelerated methods \citep{monteiro2013accelerated}. Building on A-HPE, we then propose and analyze Riemannian A-HPE. The core of our analysis consists of two key components: (i) a set of new insights into Euclidean A-HPE itself; and (ii) a careful control of metric distortion caused by Riemannian geometry. We illustrate our framework by obtaining a few existing and new Riemannian accelerated gradient methods as special cases, while characterizing their acceleration as corollaries of our main results.}
}
@InProceedings{pmlr-v178-liu22d,
title = {On Almost Sure Convergence Rates of Stochastic Gradient Methods},
author = {Liu, Jun and Yuan, Ye},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2963--2983},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/liu22d/liu22d.pdf},
url = {https://proceedings.mlr.press/v178/liu22d.html},
abstract = {The vast majority of convergence rates analysis for stochastic gradient methods in the literature focus on convergence in expectation, whereas trajectory-wise almost sure convergence is clearly important to ensure that any instantiation of the stochastic algorithms would converge with probability one. Here we provide a unified almost sure convergence rates analysis for stochastic gradient descent (SGD), stochastic heavy-ball (SHB), and stochastic Nesterov’s accelerated gradient (SNAG) methods. We show, for the first time, that the almost sure convergence rates obtained for these stochastic gradient methods on strongly convex functions, are arbitrarily close to their optimal convergence rates possible. For non-convex objective functions, we not only show that a weighted average of the squared gradient norms converges to zero almost surely, but also the last iterates of the algorithms. We further provide last-iterate almost sure convergence rates analysis for stochastic gradient methods on weakly convex smooth functions, in contrast with most existing results in the literature that only provide convergence in expectation for a weighted average of the iterates.}
}
@InProceedings{pmlr-v178-chen22c,
title = {Improved analysis for a proximal algorithm for sampling},
author = {Chen, Yongxin and Chewi, Sinho and Salim, Adil and Wibisono, Andre},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {2984--3014},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/chen22c/chen22c.pdf},
url = {https://proceedings.mlr.press/v178/chen22c.html},
abstract = {We study the proximal sampler of Lee, Shen, and Tian (2021) and obtain new convergence guarantees under weaker assumptions than strong log-concavity: namely, our results hold for (1) weakly log-concave targets, and (2) targets satisfying isoperimetric assumptions which allow for non-log-concavity. We demonstrate our results by obtaining new state-of-the-art sampling guarantees for several classes of target distributions. We also strengthen the connection between the proximal sampler and the proximal method in optimization by interpreting the former as an entropically regularized Wasserstein gradient flow and the latter as the limit of one.}
}
@InProceedings{pmlr-v178-hopkins22a,
title = {Realizable Learning is All You Need},
author = {Hopkins, Max and Kane, Daniel M. and Lovett, Shachar and Mahajan, Gaurav},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3015--3069},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/hopkins22a/hopkins22a.pdf},
url = {https://proceedings.mlr.press/v178/hopkins22a.html},
abstract = {The equivalence of realizable and agnostic learnability is a fundamental phenomenon in learning theory. With variants ranging from classical settings like PAC learning and regression to recent trends such as adversarially robust and private learning, it’s surprising we still lack a unified theory; traditional proofs of the equivalence tend to be disparate, and rely on strong model-specific assumptions like uniform convergence and sample compression. In this work, we give the first model-independent framework explaining the equivalence of realizable and agnostic learnability: a three-line blackbox reduction that simplifies, unifies, and extends our understanding across a wide variety of settings. This includes models with no known characterization of learnability such as learning with arbitrary distributional assumptions or general loss, as well as a host of other popular settings such as robust learning, partial learning, fair learning, and the statistical query model. More generally, we argue that the equivalence of realizable and agnostic learning is actually a special case of a broader phenomenon we call property generalization: any desirable property of a learning algorithm (e.g. noise tolerance, privacy, stability) that can be satisfied over finite hypothesis classes extends (possibly in some variation) to any learnable hypothesis class.}
}
@InProceedings{pmlr-v178-makarychev22a,
title = {Streaming Algorithms for Ellipsoidal Approximation of Convex Polytopes},
author = {Makarychev, Yury and Manoj, Naren Sarayu and Ovsiankin, Max},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3070--3093},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/makarychev22a/makarychev22a.pdf},
url = {https://proceedings.mlr.press/v178/makarychev22a.html},
abstract = {We give efficient deterministic one-pass streaming algorithms for finding an ellipsoidal approximation of a symmetric convex polytope. The algorithms are near-optimal in that their approximation factors differ from that of the optimal offline solution only by a factor sub-logarithmic in the aspect ratio of the polytope.}
}
@InProceedings{pmlr-v178-liu22e,
title = {The Pareto Frontier of Instance-Dependent Guarantees in Multi-Player Multi-Armed Bandits with no Communication},
author = {Liu, Allen X and Sellke, Mark},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3094--3094},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/liu22e/liu22e.pdf},
url = {https://proceedings.mlr.press/v178/liu22e.html},
abstract = {We study the stochastic multi-player multi-armed bandit problem. In this problem, there are $m$ players and $K > m$ arms and the players cooperate to maximize their total reward. However the players cannot communicate and are penalized (e.g. receive no reward) if they pull the same arm at the same time. We ask whether it is possible to obtain optimal instance-dependent regret $\wt{O}(1/\Delta)$ where $\Delta$ is the gap between the $m$-th and $m+1$-st best arms. Such guarantees were recently achieved by \cite{pacchiano2021instance, huang2021towards} in a model in which the players are able to implicitly communicate through intentional collisions. We show that with no communication at all, such guarantees are, surprisingly, not achievable. In fact, obtaining the optimal $\wt{O}(1/\Delta)$ regret for some regimes of $\Delta$ necessarily implies strictly sub-optimal regret in other regimes. Our main result is a complete characterization of the Pareto optimal instance-dependent trade-offs that are possible with no communication. Our algorithm generalizes that of \cite{bubeck2021cooperative} and enjoys the same strong no-collision property, while our lower bound is completely new.}
}
@InProceedings{pmlr-v178-tang22a,
title = {Minimax Regret on Patterns Using Kullback-Leibler Divergence Covering},
author = {Tang, Jennifer},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3095--3112},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/tang22a/tang22a.pdf},
url = {https://proceedings.mlr.press/v178/tang22a.html},
abstract = {This paper considers the problem of finding a tighter upper bound on the minimax regret of patterns, a class used to study large-alphabet distributions which avoids infinite asymptotic regret and redundancy. Our method for finding upper bounds for minimax regret uses cover numbers with Kullback-Leibler (KL) divergence as the distance. Compared to existing results by Acharya et al. (2013), we are able to improve the power of the exponent on the logarithmic term, giving a minimax regret bound which matches the best known minimax redundancy bound on patterns.}
}
@InProceedings{pmlr-v178-gupta22a,
title = {Sharp Constants in Uniformity Testing via the Huber Statistic},
author = {Gupta, Shivam and Price, Eric},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3113--3192},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/gupta22a/gupta22a.pdf},
url = {https://proceedings.mlr.press/v178/gupta22a.html},
abstract = {Uniformity testing is one of the most well-studied problems in property testing, with many known test statistics, including ones based on counting collisions, singletons, and the empirical TV distance. It is known that the optimal sample complexity to distinguish the uniform distribution on $m$ elements from any $\eps$-far distribution with $1-\delta$ probability is $n = \Theta(\frac{\sqrt{m \log (1/\delta)}}{\eps^2} + \frac{\log (1/\delta)}{\eps^2})$, which is achieved by the empirical TV tester. Yet in simulation, these theoretical analyses are misleading: in many cases, they do not correctly rank order the performance of existing testers, even in an asymptotic regime of all parameters tending to $0$ or $\infty$. We explain this discrepancy by studying the \emph{constant factors} required by the algorithms. We show that the collisions tester achieves a sharp maximal constant in the number of standard deviations of separation between uniform and non-uniform inputs. We then introduce a new tester based on the Huber loss, and show that it not only matches this separation, but also has tails corresponding to a Gaussian with this separation. This leads to a sample complexity of $(1 + o(1))\frac{\sqrt{m \log (1/\delta)}}{\eps^2}$ in the regime where this term is dominant, unlike all other existing testers.}
}
@InProceedings{pmlr-v178-gopalan22a,
title = {Low-Degree Multicalibration},
author = {Gopalan, Parikshit and Kim, Michael P and Singhal, Mihir A and Zhao, Shengjia},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3193--3234},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/gopalan22a/gopalan22a.pdf},
url = {https://proceedings.mlr.press/v178/gopalan22a.html},
abstract = {Introduced as a notion of algorithmic fairness, multicalibration has proved to be a powerful and versatile concept with implications far beyond its original intent. This stringent notion—that predictions be well-calibrated across a rich class of intersecting subpopulations—provides its strong guarantees at a cost: the computational and sample complexity of learning multicalibrated predictors are high, and grow exponentially with the number of class labels. In contrast, the relaxed notion of multiaccuracy can be achieved more efficiently, yet many of the most desirable properties of multicalibration cannot be guaranteed assuming multiaccuracy alone. This tension raises a key question: \emph{Can we learn predictors with multicalibration-style guarantees at a cost commensurate with multiaccuracy?} In this work, we define and initiate the study of \emph{Low-Degree Multicalibration}. Low-Degree Multicalibration defines a hierarchy of increasingly-powerful multi-group fairness notions that spans multiaccuracy and the original formulation of multicalibration at the extremes. Our main technical contribution demonstrates that key properties of multicalibration, related to fairness and accuracy, actually manifest as low-degree properties. Importantly, we show that low-degree multicalibration can be significantly more efficient than full multicalibration. In the multi-class setting, the sample complexity to achieve low-degree multicalibration improves exponentially (in the number of classes) over full multicalibration. Our work presents compelling evidence that low-degree multicalibration represents a sweet spot, pairing computational and sample efficiency with strong fairness and accuracy guarantees.}
}
@InProceedings{pmlr-v178-kargin22a,
title = {Thompson Sampling Achieves $\tilde{O}(\sqrt{T})$ Regret in Linear Quadratic Control},
author = {Kargin, Taylan and Lale, Sahin and Azizzadenesheli, Kamyar and Anandkumar, Animashree and Hassibi, Babak},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3235--3284},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/kargin22a/kargin22a.pdf},
url = {https://proceedings.mlr.press/v178/kargin22a.html},
abstract = {Thompson Sampling (TS) is an efficient method for decision-making under uncertainty, where an action is sampled from a carefully prescribed distribution which is updated based on the observed data. In this work, we study the problem of adaptive control of stabilizable linear-quadratic regulators (LQRs) using TS, where the system dynamics are unknown. Previous works have established that $\tilde{O}(\sqrt{T})$ frequentist regret is optimal for the adaptive control of LQRs. However, the existing methods either work only in restrictive settings, require a priori known stabilizing controllers or utilize computationally intractable approaches. We propose an efficient TS algorithm for the adaptive control of LQRs, TS-based Adaptive Control, TSAC, that attains $\tilde{O}(\sqrt{T})$ regret, even for multidimensional systems, thereby solving the open problem posed in Abeille and Lazaric (2018). TSAC does not require a priori known stabilizing controller and achieves fast stabilization of the underlying system by effectively exploring the environment in the early stages. Our result hinges on developing a novel lower bound on the probability that the TS provides an optimistic sample. By carefully prescribing an early exploration strategy and a policy update rule, we show that TS achieves order-optimal regret in adaptive control of multidimensional stabilizable LQRs. We empirically demonstrate the performance and the efficiency of the proposed algorithm in several adaptive control tasks.}
}
@InProceedings{pmlr-v178-zimmert22b,
title = {Return of the bias: Almost minimax optimal high probability bounds for adversarial linear bandits},
author = {Zimmert, Julian and Lattimore, Tor},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3285--3312},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/zimmert22b/zimmert22b.pdf},
url = {https://proceedings.mlr.press/v178/zimmert22b.html},
abstract = {We introduce a modification of follow the regularised leader and combine it with the log determinant potential and suitable loss estimators to prove that the minimax regret for adaptive adversarial linear bandits is at most $O(d \sqrt{T \log(T)})$ where $d$ is the dimension and $T$ is the number of rounds. By using exponential weights, we improve this bound to $O(\sqrt{dT\log(kT)})$ when the action set has size $k$. These results confirms an old conjecture. We also show that follow the regularized leader with the entropic barrier and suitable loss estimators has regret against an adaptive adversary of at most $O(d^2 \sqrt{T} \log(T))$ and can be implement in polynomial time, which improves on the best known bound for an efficient algorithm of $O(d^{7/2} \sqrt{T} \poly(\log(T)))$ by Lee et al 2020.}
}
@InProceedings{pmlr-v178-attia22a,
title = {Uniform Stability for First-Order Empirical Risk Minimization},
author = {Attia, Amit and Koren, Tomer},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3313--3332},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/attia22a/attia22a.pdf},
url = {https://proceedings.mlr.press/v178/attia22a.html},
abstract = {We consider the problem of designing uniformly stable first-order optimization algorithms for empirical risk minimization. Uniform stability is often used to obtain generalization error bounds for optimization algorithms, and we are interested in a general approach to achieve it. For Euclidean geometry, we suggest a black-box conversion which given a smooth optimization algorithm, produces a uniformly stable version of the algorithm while maintaining its convergence rate up to logarithmic factors. Using this reduction we obtain a (nearly) optimal algorithm for smooth optimization with convergence rate $\tilde{O}(1/T^2)$ and uniform stability $O(T^2/n)$, resolving an open problem of Chen et al. (2018); Attia and Koren (2021). For more general geometries, we develop a variant of Mirror Descent for smooth optimization with convergence rate $\tilde{O}(1/T)$ and uniform stability $O(T/n)$, leaving open the question of devising a general conversion method as in the Euclidean case.}
}
@InProceedings{pmlr-v178-ziemann22a,
title = {Single Trajectory Nonparametric Learning of Nonlinear Dynamics},
author = {Ziemann, Ingvar M and Sandberg, Henrik and Matni, Nikolai},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3333--3364},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/ziemann22a/ziemann22a.pdf},
url = {https://proceedings.mlr.press/v178/ziemann22a.html},
abstract = {Given a single trajectory of a dynamical system, we analyze the performance of the nonparametric least squares estimator (LSE). More precisely, we give nonasymptotic expected $l^2$-distance bounds between the LSE and the true regression function, where expectation is evaluated on a fresh, counterfactual, trajectory. We leverage recently developed information-theoretic methods to establish the optimality of the LSE for nonparametric hypotheses classes in terms of supremum norm metric entropy and a subgaussian parameter. Next, we relate this subgaussian parameter to the stability of the underlying process using notions from dynamical systems theory. When combined, these developments lead to rate-optimal error bounds that scale as $T^{-1/(2+q)}$ for suitably stable processes and hypothesis classes with metric entropy growth of order $\delta^{-q}$. Here, $T$ is the length of the observed trajectory, $\delta \in \mathbb{R}_+$ is the packing granularity and $q\in (0,2)$ is a complexity term. Finally, we specialize our results to a number of scenarios of practical interest, such as Lipschitz dynamics, generalized linear models, and dynamics described by functions in certain classes of Reproducing Kernel Hilbert Spaces (RKHS).}
}
@InProceedings{pmlr-v178-sterkenburg22a,
title = {On characterizations of learnability with computable learners},
author = {Sterkenburg, Tom F.},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3365--3379},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/sterkenburg22a/sterkenburg22a.pdf},
url = {https://proceedings.mlr.press/v178/sterkenburg22a.html},
abstract = {We study computable PAC (CPAC) learning as introduced by Agarwal et al. (2020). First, we consider the main open question of finding characterizations of proper and improper CPAC learning. We give a characterization of a closely related notion of *strong* CPAC learning, and we provide a negative answer to the COLT open problem posed by Agarwal et al. (2021) whether all decidably representable PAC learnable classes are improperly CPAC learnable. Second, we consider undecidability of (computable) PAC learnability. We give a simple general argument to exhibit such undecidability, and we initiate a study of the arithmetical complexity of learnability. We briefly discuss the relation to the undecidability result of Ben-David et al. (2019), that motivated the work of Agarwal et al.}
}
@InProceedings{pmlr-v178-schliserman22a,
title = {Stability vs Implicit Bias of Gradient Methods on Separable Data and Beyond},
author = {Schliserman, Matan and Koren, Tomer},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3380--3394},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/schliserman22a/schliserman22a.pdf},
url = {https://proceedings.mlr.press/v178/schliserman22a.html},
abstract = {An influential line of recent work has focused on the generalization properties of unregularized gradient-based learning procedures applied to separable linear classification with exponentially-tailed loss functions. The ability of such methods to generalize well has been attributed to their implicit bias towards large margin predictors, both asymptotically as well as in finite time. We give an additional unified explanation for this generalization and relate it to two simple properties of the optimization objective, that we refer to as realizability and self-boundedness. We introduce a general setting of unconstrained stochastic convex optimization with these properties, and analyze generalization of gradient methods through the lens of algorithmic stability. In this broader setting, we obtain sharp stability bounds for gradient descent and stochastic gradient descent which apply even for a very large number of gradient steps, and use them to derive general generalization bounds for these algorithms. Finally, as direct applications of the general bounds, we return to the setting of linear classification with separable data and establish several novel test loss and test accuracy bounds for gradient descent and stochastic gradient descent for a variety of loss functions with different tail decay rates. In some of these cases, our bounds significantly improve upon the existing generalization error bounds in the literature.}
}
@InProceedings{pmlr-v178-hahn-klimroth22a,
title = {Near optimal efficient decoding from pooled data},
author = {Hahn-Klimroth, Max and M\"uller, Noela},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3395--3409},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/hahn-klimroth22a/hahn-klimroth22a.pdf},
url = {https://proceedings.mlr.press/v178/hahn-klimroth22a.html},
abstract = {Consider $n$ items, each of which is characterised by one of $d+1$ possible features in $\{0, \ldots, d\}$. We study the inference task of learning these types by queries on subsets, or pools, of the items that only reveal a form of coarsened information on the features - in our case, the sum of all the features in the pool. This is a realistic scenario in situations where one has memory or technical constraints in the data collection process, or where the data is subject to anonymisation. Related prominent problems are the quantitative group testing problem, of which it is a generalisation, as well as the compressed sensing problem, of which it is a special case. In the present article, we are interested in the minimum number of queries needed to efficiently infer the features, in the setting where the feature vector is chosen uniformly while fixing the frequencies, and one of the features, say $0$, is dominant in the sense that the number $k = n^{\theta}, \theta \in (0,1)$, of non-zero features among the items is much smaller than $n$. It is known that in this case, all features can be recovered in exponential time using no more than $O(k)$ queries. However, so far, all \emph{efficient} inference algorithms required at least $\Omega(k\ln n)$ queries, and it was unknown whether this gap is artificial or of a fundamental nature. Here we show that indeed, the previous gap between the information-theoretic and computational bounds is not inherent to the problem by providing an efficient algorithm that succeeds with high probability and employs no more than $O(k)$ measurements. This also solves a prominent open question for the quantitative group testing problem.}
}
@InProceedings{pmlr-v178-buchholz22a,
title = {Kernel interpolation in Sobolev spaces is not consistent in low dimensions},
author = {Buchholz, Simon},
booktitle = {Proceedings of Thirty Fifth Conference on Learning Theory},
pages = {3410--3440},
year = {2022},
editor = {Loh, Po-Ling and Raginsky, Maxim},
volume = {178},
series = {Proceedings of Machine Learning Research},
month = {02--05 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v178/buchholz22a/buchholz22a.pdf},
url = {https://proceedings.mlr.press/v178/buchholz22a.html},
abstract = {We consider kernel ridgeless ridge regression with kernels whose associated RKHS is a Sobolev space $H^s$. We show for $d/2