@Proceedings{COLT2017,
title = {Proceedings of the 2017 Conference on Learning Theory},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
editor = {Kale, Satyen and Shamir, Ohad},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = {65},
year = {2017}
}
@InProceedings{pmlr-v65-kale17a,
title = {Preface: Conference on Learning Theory (COLT), 2017},
author = {Kale, Satyen and Shamir, Ohad},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {1--3},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/kale17a/kale17a.pdf},
url = {https://proceedings.mlr.press/v65/kale17a.html}
}
@InProceedings{pmlr-v65-agarwal17a,
title = {Open Problem: First-Order Regret Bounds for Contextual Bandits},
author = {Agarwal, Alekh and Krishnamurthy, Akshay and Langford, John and Luo, Haipeng and Schapire, Robert E.},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {4--7},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/agarwal17a/agarwal17a.pdf},
url = {https://proceedings.mlr.press/v65/agarwal17a.html},
abstract = {We describe two open problems related to first order regret bounds for contextual bandits. The first asks for an algorithm with a regret bound of $\tilde{\mathcal{O}}(\sqrt{L_\star}K \ln N)$ where there are $K$ actions, $N$ policies, and $L_\star$ is the cumulative loss of the best policy. The second asks for an optimization-oracle-efficient algorithm with regret $\tilde{\mathcal{O}}(L_\star^{2/3}poly(K, \ln(N/\delta)))$. We describe some positive results, such as an inefficient algorithm for the second problem, and some partial negative results. }
}
@InProceedings{pmlr-v65-fish17a,
title = {Open Problem: Meeting Times for Learning Random Automata},
author = {Fish, Benjamin and Reyzin, Lev},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {8--11},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/fish17a/fish17a.pdf},
url = {https://proceedings.mlr.press/v65/fish17a.html},
abstract = {Learning automata is a foundational problem in computational learning theory. However, even efficiently learning random DFAs is hard. A natural restriction of this problem is to consider learning random DFAs under the uniform distribution. To date, this problem has no non-trivial lower bounds nor algorithms faster than brute force. In this note, we propose a method to find faster algorithms for this problem. We reduce the learning problem to a conjecture about meeting times of random walks over random DFAs, which may be of independent interest to prove.}
}
@InProceedings{pmlr-v65-agarwal17b,
title = {Corralling a Band of Bandit Algorithms},
author = {Agarwal, Alekh and Luo, Haipeng and Neyshabur, Behnam and Schapire, Robert E.},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {12--38},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/agarwal17b/agarwal17b.pdf},
url = {https://proceedings.mlr.press/v65/agarwal17b.html},
abstract = {We study the problem of combining multiple bandit algorithms (that is, online learning algorithms with partial feedback) with the goal of creating a master algorithm that performs almost as well as the best base algorithm \emph{if it were to be run on its own}. The main challenge is that when run with a master, base algorithms unavoidably receive much less feedback and it is thus critical that the master not starve a base algorithm that might perform uncompetitively initially but would eventually outperform others if given enough feedback. We address this difficulty by devising a version of Online Mirror Descent with a special mirror map together with a sophisticated learning rate scheme. We show that this approach manages to achieve a more delicate balance between exploiting and exploring base algorithms than previous works yielding superior regret bounds. Our results are applicable to many settings, such as multi-armed bandits, contextual bandits, and convex bandits. As examples, we present two main applications. The first is to create an algorithm that enjoys worst-case robustness while at the same time performing much better when the environment is relatively easy. The second is to create an algorithm that works simultaneously under different assumptions of the environment, such as different priors or different loss structures.}
}
@InProceedings{pmlr-v65-agarwal17c,
title = {Learning with Limited Rounds of Adaptivity: Coin Tossing, Multi-Armed Bandits, and Ranking from Pairwise Comparisons},
author = {Agarwal, Arpit and Agarwal, Shivani and Assadi, Sepehr and Khanna, Sanjeev},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {39--75},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/agarwal17c/agarwal17c.pdf},
url = {https://proceedings.mlr.press/v65/agarwal17c.html},
abstract = {In many learning settings, active/adaptive querying is possible, but the number of rounds of adaptivity is limited. We study the relationship between query complexity and adaptivity in identifying the $k$ most biased coins among a set of $n$ coins with unknown biases. This problem is a common abstraction of many well-studied problems, including the problem of identifying the $k$ best arms in a stochastic multi-armed bandit, and the problem of top-$k$ ranking from pairwise comparisons. An $r$-round adaptive algorithm for the $k$ most biased coins problem specifies in each round the set of coin tosses to be performed based on the observed outcomes in earlier rounds, and outputs the set of $k$ most biased coins at the end of $r$ rounds. When $r=1$, the algorithm is known as \emph{non-adaptive}; when $r$ is unbounded, the algorithm is known as \emph{fully adaptive}. While the power of adaptivity in reducing query complexity is well known, full adaptivity requires repeated interaction with the coin tossing (feedback generation) mechanism, and is highly sequential, since the set of coins to be tossed in each round can only be determined after we have observed the outcomes of the coin tosses from the previous round. In contrast, algorithms with only few rounds of adaptivity require fewer rounds of interaction with the feedback generation mechanism, and offer the benefits of parallelism in algorithmic decision-making. Motivated by these considerations, we consider the question of how much adaptivity is needed to realize the optimal worst case query complexity for identifying the $k$ most biased coins. Given any positive integer $r$, we derive essentially matching upper and lower bounds on the query complexity of $r$-round algorithms. We then show that $\Theta(\log^* n)$ rounds are both necessary and sufficient for achieving the optimal worst case query complexity for identifying the $k$ most biased coins. 
In particular, our algorithm achieves the optimal query complexity in at most $\log^* n$ rounds, which implies that on any realistic input, $5$ parallel rounds of exploration suffice to achieve the optimal worst-case sample complexity. The best known algorithm prior to our work required $\Theta(\log n)$ rounds to achieve the optimal worst case query complexity even for the special case of $k=1$. }
}
@InProceedings{pmlr-v65-agrawal17a,
title = {Thompson Sampling for the MNL-Bandit},
author = {Agrawal, Shipra and Avadhanula, Vashist and Goyal, Vineet and Zeevi, Assaf},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {76--78},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/agrawal17a/agrawal17a.pdf},
url = {https://proceedings.mlr.press/v65/agrawal17a.html},
abstract = {We consider a sequential subset selection problem under parameter uncertainty, where at each time step, the decision maker selects a subset of cardinality $K$ from $N$ possible items (arms), and observes a (bandit) feedback in the form of the index of one of the items in said subset, or none. Each item in the index set is ascribed a certain value (reward), and the feedback is governed by a Multinomial Logit (MNL) choice model whose parameters are a priori unknown. The objective of the decision maker is to maximize the expected cumulative rewards over a finite horizon $T$, or alternatively, minimize the regret relative to an oracle that knows the MNL parameters. We refer to this as the MNL-Bandit problem. This problem is representative of a larger family of exploration-exploitation problems that involve a combinatorial objective, and arise in several important application domains. We present an approach to adapt Thompson Sampling to this problem and show that it achieves near-optimal regret as well as attractive numerical performance. }
}
@InProceedings{pmlr-v65-anandkumar17a,
title = {Homotopy Analysis for Tensor PCA},
author = {Anandkumar, Anima and Deng, Yuan and Ge, Rong and Mobahi, Hossein},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {79--104},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/anandkumar17a/anandkumar17a.pdf},
url = {https://proceedings.mlr.press/v65/anandkumar17a.html},
abstract = {Developing efficient and guaranteed nonconvex algorithms has been an important challenge in modern machine learning. Algorithms with good empirical performance such as stochastic gradient descent often lack theoretical guarantees. In this paper, we analyze the class of homotopy or continuation methods for global optimization of nonconvex functions. These methods start from an objective function that is efficient to optimize (e.g. convex), and progressively modify it to obtain the required objective, and the solutions are passed along the homotopy path. For the challenging problem of tensor PCA, we prove global convergence of the homotopy method in the “high noise” regime. The signal-to-noise requirement for our algorithm is tight in the sense that it matches the recovery guarantee for the \emph{best} degree-$4$ sum-of-squares algorithm. In addition, we prove a phase transition along the homotopy path for tensor PCA. This allows us to simplify the homotopy method to a local search algorithm, viz., tensor power iterations, with a specific initialization and a noise injection procedure, while retaining the theoretical guarantees.}
}
@InProceedings{pmlr-v65-andoni17a,
title = {Correspondence retrieval},
author = {Andoni, Alexandr and Hsu, Daniel and Shi, Kevin and Sun, Xiaorui},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {105--126},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/andoni17a/andoni17a.pdf},
url = {https://proceedings.mlr.press/v65/andoni17a.html},
abstract = {This article studies the correspondence retrieval problem: a set of $k$ distinct but unknown points $\mathbf{x}_1, \mathbf{x}_2, \dotsc, \mathbf{x}_k \in \mathbb{R}^d$ are to be recovered from the unordered collection of projection values $\langle \mathbf{w}_i,\mathbf{x}_1 \rangle, \langle \mathbf{w}_i,\mathbf{x}_2 \rangle, \dotsc, \langle \mathbf{w}_i,\mathbf{x}_k \rangle$ onto $n$ known measurement vectors $\mathbf{w}_1, \mathbf{w}_2, \dotsc, \mathbf{w}_n$. Importantly, the correspondence of the $k$ projections $\{\langle \mathbf{w}_i,\mathbf{x}_j \rangle\}_{j=1}^k$ across different measurements is unknown. A special case of this problem is the well-studied problem of (real-valued) phase retrieval. In the case of independent standard Gaussian measurement vectors, the main algorithm proposed in this work requires $n = d+1$ measurements to correctly return the $k$ unknown points with high probability. This number of measurements is optimal, and it is smaller than the number of measurements required for a stronger “for all” guarantee even in the phase retrieval setting. The algorithm is based on reductions to the Shortest Vector Problem on certain random lattices, and employs the Lenstra, Lenstra, and Lovász (1982) basis reduction algorithm in a manner similar to the Lagarias \& Odlyzko (1985) algorithm for solving random instances of Subset Sum. Another algorithm, essentially due to Yi, Caramanis, \& Sanghavi (2016), based on higher-order moments and tensor decompositions is shown to work in a setting where the projection values are corrupted by additive Gaussian noise, but it requires a significantly larger number of measurements.}
}
@InProceedings{pmlr-v65-awasthi17a,
title = {Efficient PAC Learning from the Crowd},
author = {Awasthi, Pranjal and Blum, Avrim and Haghtalab, Nika and Mansour, Yishay},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {127--150},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/awasthi17a/awasthi17a.pdf},
url = {https://proceedings.mlr.press/v65/awasthi17a.html},
abstract = {In recent years crowdsourcing has become the method of choice for gathering labeled training data for learning algorithms. Standard approaches to crowdsourcing view the process of acquiring labeled data separately from the process of learning a classifier from the gathered data. This can give rise to computational and statistical challenges. For example, in most cases there are no known computationally efficient learning algorithms that are robust to the high level of noise that exists in crowdsourced data, and efforts to eliminate noise through voting often require a large number of queries per example. In this paper, we show how by interleaving the process of labeling and learning, we can attain computational efficiency with much less overhead in the labeling cost. In particular, we consider the \emph{realizable} setting where there exists a true target function in $\mathcal{F}$ and consider a pool of labelers. When a noticeable fraction of the labelers are \emph{perfect}, and the rest behave arbitrarily, we show that any $\mathcal{F}$ that can be efficiently learned in the traditional \emph{realizable} PAC model can be learned in a computationally efficient manner by querying the crowd, despite high amounts of noise in the responses. Moreover, we show that this can be done while each labeler only labels a constant number of examples and the number of labels requested per example, on average, is a constant. When no perfect labelers exist, a related task is to find a set of the labelers which are \emph{good} but not perfect. We show that we can identify all good labelers, when at least the majority of labelers are good. }
}
@InProceedings{pmlr-v65-bafna17a,
title = {The Price of Selection in Differential Privacy},
author = {Bafna, Mitali and Ullman, Jonathan},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {151--168},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/bafna17a/bafna17a.pdf},
url = {https://proceedings.mlr.press/v65/bafna17a.html},
abstract = {In the differentially private top-$k$ selection problem, we are given a dataset $X \in \{\pm 1\}^{n \times d}$, in which each row belongs to an individual and each column corresponds to some binary attribute, and our goal is to find a set of $k \ll d$ columns whose means are approximately as large as possible. Differential privacy requires that our choice of these $k$ columns does not depend too much on any one individual’s dataset. This problem can be solved using the well known exponential mechanism and composition properties of differential privacy. In the high-accuracy regime, where we require the error of the selection procedure to be smaller than the so-called sampling error $\alpha \approx \sqrt{\ln(d)/n}$, this procedure succeeds given a dataset of size $n \gtrsim k \ln(d)$. We prove a matching lower bound, showing that a dataset of size $n \gtrsim k \ln(d)$ is necessary for private top-$k$ selection in this high-accuracy regime. Our lower bound shows that selecting the $k$ largest columns requires more data than simply estimating the value of those $k$ columns, which can be done using a dataset of size just $n \gtrsim k$. }
}
@InProceedings{pmlr-v65-balakrishnan17a,
title = {Computationally Efficient Robust Sparse Estimation in High Dimensions},
author = {Balakrishnan, Sivaraman and Du, Simon S. and Li, Jerry and Singh, Aarti},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {169--212},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/balakrishnan17a/balakrishnan17a.pdf},
url = {https://proceedings.mlr.press/v65/balakrishnan17a.html},
abstract = {Many conventional statistical procedures are extremely sensitive to seemingly minor deviations from modeling assumptions. This problem is exacerbated in modern high-dimensional settings, where the problem dimension can grow with and possibly exceed the sample size. We consider the problem of robust estimation of sparse functionals, and provide a computationally and statistically efficient algorithm in the high-dimensional setting. Our theory identifies a unified set of deterministic conditions under which our algorithm guarantees accurate recovery. By further establishing that these deterministic conditions hold with high-probability for a wide range of statistical models, our theory applies to many problems of considerable interest including sparse mean and covariance estimation; sparse linear regression; and sparse generalized linear models. In certain settings, such as the detection and estimation of sparse principal components in the spiked covariance model, our general theory does not yield optimal sample complexity, and we provide a novel algorithm based on the same intuition which is able to take advantage of further structure of the problem to achieve nearly optimal rates.}
}
@InProceedings{pmlr-v65-balcan17a,
title = {Learning-Theoretic Foundations of Algorithm Configuration for Combinatorial Partitioning Problems},
author = {Balcan, Maria-Florina and Nagarajan, Vaishnavh and Vitercik, Ellen and White, Colin},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {213--274},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/balcan17a/balcan17a.pdf},
url = {https://proceedings.mlr.press/v65/balcan17a.html},
abstract = {Max-cut, clustering, and many other partitioning problems that are of significant importance to machine learning and other scientific fields are NP-hard, a reality that has motivated researchers to develop a wealth of approximation algorithms and heuristics. Although the best algorithm to use typically depends on the specific application domain, a worst-case analysis is often used to compare algorithms. This may be misleading if worst-case instances occur infrequently, and thus there is a demand for optimization methods which return the algorithm configuration best suited for the given application’s typical inputs. Recently, Gupta and Roughgarden introduced the first learning-theoretic framework to rigorously study this problem, using it to analyze classes of greedy heuristics, parameter tuning in gradient descent, and other problems. We study this algorithm configuration problem for clustering, max-cut, and other partitioning problems, such as integer quadratic programming, by designing computationally efficient and sample efficient learning algorithms which receive samples from an application-specific distribution over problem instances and learn a partitioning algorithm with high expected performance. Our algorithms learn over common integer quadratic programming and clustering algorithm families: SDP rounding algorithms and agglomerative clustering algorithms with dynamic programming. For our sample complexity analysis, we provide tight bounds on the pseudodimension of these algorithm classes, and show that surprisingly, even for classes of algorithms parameterized by a single parameter, the pseudo-dimension is superconstant. In this way, our work both contributes to the foundations of algorithm configuration and pushes the boundaries of learning theory, since the algorithm classes we analyze consist of multi-stage optimization procedures and are significantly more complex than classes typically studied in learning theory.}
}
@InProceedings{pmlr-v65-balkanski17a,
title = {The Sample Complexity of Optimizing a Convex Function},
author = {Balkanski, Eric and Singer, Yaron},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {275--301},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/balkanski17a/balkanski17a.pdf},
url = {https://proceedings.mlr.press/v65/balkanski17a.html},
abstract = {In this paper we study optimization from samples of convex functions. There are many scenarios in which we do not know the function we wish to optimize but can learn it from data. In such cases, we are interested in bounding the number of samples required to optimize the function. Our main result shows that in general, the number of samples required to obtain a non-trivial approximation to the optimum of a convex function is exponential in its dimension, even when the function is PAC-learnable. We also obtain strong lower bounds for strongly convex and Lipschitz continuous functions. On the positive side, we show that there are interesting classes of functions and distributions for which the sample complexity is polynomial in the dimension of the function.}
}
@InProceedings{pmlr-v65-blum17a,
title = {Efficient Co-Training of Linear Separators under Weak Dependence},
author = {Blum, Avrim and Mansour, Yishay},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {302--318},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/blum17a/blum17a.pdf},
url = {https://proceedings.mlr.press/v65/blum17a.html},
abstract = {We develop the first polynomial-time algorithm for co-training of homogeneous linear separators under \emph{weak dependence}, a relaxation of the condition of independence given the label. Our algorithm learns from purely unlabeled data, except for a single labeled example to break symmetry of the two classes, and works for any data distribution having an inverse-polynomial margin and with center of mass at the origin.}
}
@InProceedings{pmlr-v65-brosse17a,
title = {Sampling from a log-concave distribution with compact support with proximal Langevin Monte Carlo},
author = {Brosse, Nicolas and Durmus, Alain and Moulines, Éric and Pereyra, Marcelo},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {319--342},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/brosse17a/brosse17a.pdf},
url = {https://proceedings.mlr.press/v65/brosse17a.html},
abstract = { This paper presents a detailed theoretical analysis of the Langevin Monte Carlo sampling algorithm recently introduced in Durmus et al. (Efficient Bayesian computation by proximal Markov chain Monte Carlo: when Langevin meets Moreau, 2016) when applied to log-concave probability distributions that are restricted to a convex body $K$. This method relies on a regularisation procedure involving the Moreau-Yosida envelope of the indicator function associated with $K$. Explicit convergence bounds in total variation norm and in Wasserstein distance of order $1$ are established. In particular, we show that the complexity of this algorithm given a first order oracle is polynomial in the dimension of the state space. Finally, some numerical experiments are presented to compare our method with competing MCMC approaches from the literature.}
}
@InProceedings{pmlr-v65-brunel17a,
title = {Rates of estimation for determinantal point processes},
author = {Brunel, Victor-Emmanuel and Moitra, Ankur and Rigollet, Philippe and Urschel, John},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {343--345},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/brunel17a/brunel17a.pdf},
url = {https://proceedings.mlr.press/v65/brunel17a.html},
abstract = {Determinantal point processes (DPPs) have wide-ranging applications in machine learning, where they are used to enforce the notion of diversity in subset selection problems. Many estimators have been proposed, but surprisingly the basic properties of the maximum likelihood estimator (MLE) have received little attention. In this paper, we study the local geometry of the expected log-likelihood function to prove several rates of convergence for the MLE. We also give a complete characterization of the case where the MLE converges at a parametric rate. Even in the latter case, we also exhibit a potential curse of dimensionality where the asymptotic variance of the MLE is exponentially large in the dimension of the problem.}
}
@InProceedings{pmlr-v65-bshouty17a,
title = {Learning Disjunctions of Predicates},
author = {Bshouty, Nader H. and Drachsler-Cohen, Dana and Vechev, Martin and Yahav, Eran},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {346--369},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/bshouty17a/bshouty17a.pdf},
url = {https://proceedings.mlr.press/v65/bshouty17a.html},
abstract = {Let $\mathcal{F}$ be a set of boolean functions. We give an algorithm for learning $\mathcal{F}_\vee := \{\vee_{f \in S} f \mid S \subseteq \mathcal{F}\}$ from membership queries. Our algorithm asks at most $|\mathcal{F}| \cdot \mathrm{OPT}(\mathcal{F}_\vee)$ membership queries where $\mathrm{OPT}(\mathcal{F}_\vee)$ is the minimum worst case number of membership queries for learning $\mathcal{F}_\vee$. When $\mathcal{F}$ is a set of halfspaces over a constant dimension space or a set of variable inequalities, our algorithm runs in polynomial time. The problem we address has a practical importance in the field of program synthesis, where the goal is to synthesize a program meeting some requirements. Program synthesis has become popular especially in settings aimed to help end users. In such settings, the requirements are not provided upfront and the synthesizer can only learn them by posing membership queries to the end user. Our work completes such synthesizers with the ability to learn the exact requirements while bounding the number of membership queries. }
}
@InProceedings{pmlr-v65-canonne17a,
title = {Testing Bayesian Networks},
author = {Canonne, Clement L. and Diakonikolas, Ilias and Kane, Daniel M. and Stewart, Alistair},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {370--448},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/canonne17a/canonne17a.pdf},
url = {https://proceedings.mlr.press/v65/canonne17a.html},
abstract = {This work initiates a systematic investigation of testing \emph{high-dimensional structured distributions} by focusing on testing \emph{Bayesian networks} – the prototypical family of directed graphical models. A Bayesian network is defined by a directed acyclic graph, where we associate a random variable with each node. The value at any particular node is conditionally independent of all the other non-descendant nodes once its parents are fixed. Specifically, we study the properties of identity testing and closeness testing of Bayesian networks. Our main contribution is the first non-trivial efficient testing algorithms for these problems and corresponding information-theoretic lower bounds. For a wide range of parameter settings, our testing algorithms have sample complexity \emph{sublinear} in the dimension and are sample-optimal, up to constant factors.}
}
@InProceedings{pmlr-v65-casalaina-martin17a,
title = {Multi-Observation Elicitation},
author = {Casalaina-Martin, Sebastian and Frongillo, Rafael and Morgan, Tom and Waggoner, Bo},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {449--464},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/casalaina-martin17a/casalaina-martin17a.pdf},
url = {https://proceedings.mlr.press/v65/casalaina-martin17a.html},
abstract = {We study loss functions that measure the accuracy of a prediction based on multiple data points simultaneously. To our knowledge, such loss functions have not been studied before in the area of property elicitation or in machine learning more broadly. As compared to traditional loss functions that take only a single data point, these multi-observation loss functions can in some cases drastically reduce the dimensionality of the hypothesis required. In elicitation, this corresponds to requiring many fewer reports; in empirical risk minimization, it corresponds to algorithms on a hypothesis space of much smaller dimension. We explore some examples of the tradeoff between dimensionality and number of observations, give some geometric characterizations and intuition for relating loss functions and the properties that they elicit, and discuss some implications for both elicitation and machine-learning contexts.}
}
@InProceedings{pmlr-v65-cesa-bianchi17a,
title = {Algorithmic Chaining and the Role of Partial Feedback in Online Nonparametric Learning},
author = {Cesa-Bianchi, Nicolò and Gaillard, Pierre and Gentile, Claudio and Gerchinovitz, Sébastien},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {465--481},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/cesa-bianchi17a/cesa-bianchi17a.pdf},
url = {https://proceedings.mlr.press/v65/cesa-bianchi17a.html},
abstract = {We investigate contextual online learning with nonparametric (Lipschitz) comparison classes under different assumptions on losses and feedback information. For full information feedback and Lipschitz losses, we design the first explicit algorithm achieving the minimax regret rate (up to log factors). In a partial feedback model motivated by second-price auctions, we obtain algorithms for Lipschitz and semi-Lipschitz losses with regret bounds improving on the known bounds for standard bandit feedback. Our analysis combines novel results for contextual second-price auctions with a novel algorithmic approach based on chaining. When the context space is Euclidean, our chaining approach is efficient and delivers an even better regret bound.}
}
@InProceedings{pmlr-v65-chen17a,
title = {Nearly Optimal Sampling Algorithms for Combinatorial Pure Exploration},
author = {Chen, Lijie and Gupta, Anupam and Li, Jian and Qiao, Mingda and Wang, Ruosong},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {482--534},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/chen17a/chen17a.pdf},
url = {https://proceedings.mlr.press/v65/chen17a.html},
abstract = { We study the combinatorial pure exploration problem \textsc{Best-Set} in a stochastic multi-armed bandit game. In a \textsc{Best-Set} instance, we are given $n$ stochastic arms with unknown reward distributions, as well as a family $\mathcal{F}$ of feasible subsets over the arms. Let the weight of an arm be the mean of its reward distribution. Our goal is to identify the feasible subset in $\mathcal{F}$ with the maximum total weight, using as few samples as possible. The problem generalizes the classical best arm identification problem and the top-$k$ arm identification problem, both of which have attracted significant attention in recent years. We provide a novel \textit{instance-wise} lower bound for the sample complexity of the problem, as well as a nontrivial sampling algorithm, matching the lower bound up to a factor of $\ln|\mathcal{F}|$. For an important class of combinatorial families (including spanning trees, matchings, and path constraints), we also provide polynomial time implementation of the sampling algorithm, using the equivalence of separation and optimization for convex program, and the notion of approximate Pareto curves in multi-objective optimization (note that $|\mathcal{F}|$ can be exponential in $n$). We also show that the $\ln|\mathcal{F}|$ factor is inevitable in general, through a nontrivial lower bound construction utilizing a combinatorial structure resembling the Nisan-Wigderson design. Our results significantly improve several previous results for several important combinatorial constraints, and provide a tighter understanding of the general \textsc{Best-Set} problem. We further introduce an even more general problem, formulated in geometric terms. We are given $n$ Gaussian arms with unknown means and unit variance. Consider the $n$-dimensional Euclidean space $\mathbb{R}^n$, and a collection $\mathcal{O}$ of disjoint subsets. 
Our goal is to determine the subset in $\mathcal{O}$ that contains the mean profile (which is the $n$-dimensional vector of the means), using as few samples as possible. The problem generalizes most pure exploration bandit problems studied in the literature. We provide the first nearly optimal sample complexity upper and lower bounds for the problem. }
}
@InProceedings{pmlr-v65-chen17b,
title = {Towards Instance Optimal Bounds for Best Arm Identification},
author = {Chen, Lijie and Li, Jian and Qiao, Mingda},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {535--592},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/chen17b/chen17b.pdf},
url = {https://proceedings.mlr.press/v65/chen17b.html},
abstract = { In the classical best arm identification (Best-$1$-Arm) problem, we are given $n$ stochastic bandit arms, each associated with a reward distribution with an unknown mean. Upon each play of an arm, we can get a reward sampled i.i.d. from its reward distribution. We would like to identify the arm with the largest mean with probability at least $1-δ$, using as few samples as possible. The problem has a long history and understanding its sample complexity has attracted significant attention since the last decade. However, the optimal sample complexity of the problem is still unknown. Recently, Chen and Li (2016) made an interesting conjecture, called gap-entropy conjecture, concerning the instance optimal sample complexity of Best-$1$-Arm. Given a Best-$1$-Arm instance $I$ (i.e., a set of arms), let $\mu_{[i]}$ denote the $i$th largest mean and $\Delta_{[i]}=\mu_{[1]}-\mu_{[i]}$ denote the corresponding gap. $H(I)=\sum_{i=2}^n\Delta_{[i]}^{-2}$ denotes the complexity of the instance. The gap-entropy conjecture states that for any instance $I$, $Ω\left(H(I)⋅\left(\ln δ^{-1} + \mathsf{Ent}(I)\right)\right)$ is an instance lower bound, where $\mathsf{Ent}(I)$ is an entropy-like term determined by the gaps, and there is a $δ$-correct algorithm for Best-$1$-Arm with sample complexity $O\left(H(I)⋅\left(\ln δ^{-1} + \mathsf{Ent}(I)\right)+\Delta_{[2]}^{-2}\ln\ln\Delta_{[2]}^{-1}\right)$. We note that $Θ\left(\Delta_{[2]}^{-2}\ln\ln\Delta_{[2]}^{-1}\right)$ is necessary and sufficient to solve the two-arm instance with the best and second best arms. If the conjecture is true, we would have a complete understanding of the instance-wise sample complexity of Best-$1$-Arm (up to constant factors). In this paper, we make significant progress towards a complete resolution of the gap-entropy conjecture. 
For the upper bound, we provide a highly nontrivial algorithm which requires \[O\left(H(I)⋅\left(\ln δ^{-1} + \mathsf{Ent}(I)\right)+\Delta_{[2]}^{-2}\ln\ln\Delta_{[2]}^{-1}\mathrm{polylog}(n,δ^{-1})\right)\] samples in expectation for any instance $I$. For the lower bound, we show that for any Gaussian Best-$1$-Arm instance with gaps of the form $2^{-k}$, any $δ$-correct monotone algorithm requires at least \[Ω\left(H(I)⋅\left(\ln δ^{-1} + \mathsf{Ent}(I)\right)\right)\] samples in expectation. Here, a monotone algorithm is one which uses no more samples (in expectation) on $I'$ than on $I$, if $I'$ is a sub-instance of $I$ obtained by removing some sub-optimal arms. }
}
@InProceedings{pmlr-v65-cherapanamjeri17a,
title = {Thresholding Based Outlier Robust PCA},
author = {Cherapanamjeri, Yeshwanth and Jain, Prateek and Netrapalli, Praneeth},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {593--628},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/cherapanamjeri17a/cherapanamjeri17a.pdf},
url = {https://proceedings.mlr.press/v65/cherapanamjeri17a.html},
abstract = {We consider the problem of outlier robust PCA (\textbf{OR-PCA}) where the goal is to recover principal directions despite the presence of outlier data points. That is, given a data matrix $M^*$, where $(1-α)$ fraction of the points are noisy samples from a low-dimensional subspace while $α$ fraction of the points can be arbitrary outliers, the goal is to recover the subspace accurately. Existing results for \textbf{OR-PCA} have serious drawbacks: while some results are quite weak in the presence of noise, other results have runtime quadratic in dimension, rendering them impractical for large scale applications. In this work, we provide a novel thresholding based iterative algorithm with per-iteration complexity at most linear in the data size. Moreover, the fraction of outliers, $α$, that our method can handle is tight up to constants while providing nearly optimal computational complexity for a general noise setting. For the special case where the inliers are obtained from a low-dimensional subspace with additive Gaussian noise, we show that a modification of our thresholding based method leads to significant improvement in recovery error (of the subspace) even in the presence of a large fraction of outliers.}
}
@InProceedings{pmlr-v65-cohen17a,
title = {Tight Bounds for Bandit Combinatorial Optimization},
author = {Cohen, Alon and Hazan, Tamir and Koren, Tomer},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {629--642},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/cohen17a/cohen17a.pdf},
url = {https://proceedings.mlr.press/v65/cohen17a.html},
abstract = {We revisit the study of optimal regret rates in bandit combinatorial optimization—a fundamental framework for sequential decision making under uncertainty that abstracts numerous combinatorial prediction problems. We prove that the attainable regret in this setting grows as $\widetilde{Θ}(k^{3/2}\sqrt{dT})$ where $d$ is the dimension of the problem and $k$ is a bound over the maximal instantaneous loss, disproving a conjecture of Audibert, Bubeck, and Lugosi (2013) who argued that the optimal rate should be of the form $\widetilde{Θ}(k\sqrt{dT})$. Our bounds apply to several important instances of the framework, and in particular, imply a tight bound for the well-studied bandit shortest path problem. By that, we also resolve an open problem posed by Cesa-Bianchi and Lugosi (2012).}
}
@InProceedings{pmlr-v65-cutkosky17a,
title = {Online Learning Without Prior Information},
author = {Cutkosky, Ashok and Boahen, Kwabena},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {643--677},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/cutkosky17a/cutkosky17a.pdf},
url = {https://proceedings.mlr.press/v65/cutkosky17a.html},
abstract = {The vast majority of optimization and online learning algorithms today require some prior information about the data (often in the form of bounds on gradients or on the optimal parameter value). When this information is not available, these algorithms require laborious manual tuning of various hyperparameters, motivating the search for algorithms that can adapt to the data with no prior information. We describe a frontier of new lower bounds on the performance of such algorithms, reflecting a tradeoff between a term that depends on the optimal parameter value and a term that depends on the gradients’ rate of growth. Further, we construct a family of algorithms whose performance matches any desired point on this frontier, which no previous algorithm reaches.}
}
@InProceedings{pmlr-v65-dalalyan17a,
title = {Further and stronger analogy between sampling and optimization: Langevin Monte Carlo and gradient descent},
author = {Dalalyan, Arnak},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {678--689},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/dalalyan17a/dalalyan17a.pdf},
url = {https://proceedings.mlr.press/v65/dalalyan17a.html},
abstract = {In this paper, we revisit the recently established theoretical guarantees for the convergence of the Langevin Monte Carlo algorithm of sampling from a smooth and (strongly) log-concave density. We improve the existing results when the convergence is measured in the Wasserstein distance and provide further insights on the very tight relations between, on the one hand, the Langevin Monte Carlo for sampling and, on the other hand, the gradient descent for optimization. Finally, we also establish guarantees for the convergence of a version of the Langevin Monte Carlo algorithm that is based on noisy evaluations of the gradient.}
}
@InProceedings{pmlr-v65-daniely17a,
title = {Depth Separation for Neural Networks},
author = {Daniely, Amit},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {690--696},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/daniely17a/daniely17a.pdf},
url = {https://proceedings.mlr.press/v65/daniely17a.html},
abstract = {Let $f:\mathbb{S}^{d-1}\times \mathbb{S}^{d-1}\to\mathbb{R}$ be a function of the form $f(x,x') = g(⟨x,x'⟩)$ for $g:[-1,1]\to \mathbb{R}$. We give a simple proof that shows that poly-size depth two neural networks with (exponentially) bounded weights cannot approximate $f$ whenever $g$ cannot be approximated by a low degree polynomial. Moreover, for many $g$’s, such as $g(x)=\sin(\pi d^3x)$, the number of neurons must be $2^{Ω\left(d\log(d)\right)}$. Furthermore, the result holds w.r.t. the uniform distribution on $\mathbb{S}^{d-1}\times \mathbb{S}^{d-1}$. As many functions of the above form can be well approximated by poly-size depth three networks with poly-bounded weights, this establishes a separation between depth two and depth three networks w.r.t. the uniform distribution on $\mathbb{S}^{d-1}\times \mathbb{S}^{d-1}$.}
}
@InProceedings{pmlr-v65-daskalakis17a,
title = {Square Hellinger Subadditivity for Bayesian Networks and its Applications to Identity Testing},
author = {Daskalakis, Constantinos and Pan, Qinxuan},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {697--703},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/daskalakis17a/daskalakis17a.pdf},
url = {https://proceedings.mlr.press/v65/daskalakis17a.html},
abstract = {We show that the square Hellinger distance between two Bayesian networks on the same directed graph, $G$, is subadditive with respect to the neighborhoods of $G$. Namely, if $P$ and $Q$ are the probability distributions defined by two Bayesian networks on the same DAG, our inequality states that the square Hellinger distance, $H^2(P,Q)$, between $P$ and $Q$ is upper bounded by the sum, $\sum_v H^2(P_{v ∪ \Pi_v}, Q_{v ∪ \Pi_v})$, of the square Hellinger distances between the marginals of $P$ and $Q$ on every node $v$ and its parents $\Pi_v$ in the DAG. Importantly, our bound does not involve the conditionals but the marginals of $P$ and $Q$. We derive a similar inequality for more general Markov Random Fields. As an application of our inequality, we show that distinguishing whether two (unknown) Bayesian networks $P$ and $Q$ on the same (but potentially unknown) DAG satisfy $P=Q$ vs $d_{\mathrm{TV}}(P,Q)>ε$ can be performed from $\tilde{O}(|Σ|^{3/4(d+1)} ⋅n/ε^2)$ samples, where $d$ is the maximum in-degree of the DAG and $Σ$ the domain of each variable of the Bayesian networks. If $P$ and $Q$ are defined on potentially different and potentially unknown trees, the sample complexity becomes $\tilde{O}(|Σ|^{4.5} n/ε^2)$. In both cases the dependence of the sample complexity on $n, ε$ is optimal up to logarithmic factors. Lastly, if $P$ and $Q$ are product distributions over $\{0,1\}^n$ and $Q$ is known, the sample complexity becomes $O(\sqrt{n}/ε^2)$, which is optimal up to constant factors.}
}
@InProceedings{pmlr-v65-daskalakis17b,
title = {Ten Steps of EM Suffice for Mixtures of Two Gaussians},
author = {Daskalakis, Constantinos and Tzamos, Christos and Zampetakis, Manolis},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {704--710},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/daskalakis17b/daskalakis17b.pdf},
url = {https://proceedings.mlr.press/v65/daskalakis17b.html},
abstract = {The Expectation-Maximization (EM) algorithm is a widely used method for maximum likelihood estimation in models with latent variables. For estimating mixtures of Gaussians, its iteration can be viewed as a soft version of the k-means clustering algorithm. Despite its wide use and applications, there are essentially no known convergence guarantees for this method. We provide global convergence guarantees for mixtures of two Gaussians with known covariance matrices. We show that the population version of EM, where the algorithm is given access to infinitely many samples from the mixture, converges geometrically to the correct mean vectors, and provide simple, closed-form expressions for the convergence rate. As a simple illustration, we show that, in one dimension, ten steps of the EM algorithm initialized at infinity result in less than $1\%$ error estimation of the means. In the finite sample regime, we show that, under a random initialization, $\tilde{O}(d/ε^2)$ samples suffice to compute the unknown vectors to within $ε$ in Mahalanobis distance, where $d$ is the dimension. In particular, the error rate of the EM based estimator is $\tilde{O}\left(\sqrt{d \over n}\right)$ where $n$ is the number of samples, which is optimal up to logarithmic factors.}
}
@InProceedings{pmlr-v65-diakonikolas17a,
title = {Learning Multivariate Log-concave Distributions},
author = {Diakonikolas, Ilias and Kane, Daniel M. and Stewart, Alistair},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {711--727},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/diakonikolas17a/diakonikolas17a.pdf},
url = {https://proceedings.mlr.press/v65/diakonikolas17a.html},
abstract = {We study the problem of estimating multivariate log-concave probability density functions. We prove the first sample complexity upper bound for learning log-concave densities on $\mathbb{R}^d$, for all $d ≥1$. Prior to our work, no upper bound on the sample complexity of this learning problem was known for the case of $d>3$. In more detail, we give an estimator that, for any $d \ge 1$ and $ε>0$, draws $\tilde{O}_d \left( (1/ε)^{(d+5)/2} \right)$ samples from an unknown target log-concave density on $\mathbb{R}^d$, and outputs a hypothesis that (with high probability) is $ε$-close to the target, in total variation distance. Our upper bound on the sample complexity comes close to the known lower bound of $\Omega_d \left( (1/ε)^{(d+1)/2} \right)$ for this problem. }
}
@InProceedings{pmlr-v65-feldman17a,
title = {Generalization for Adaptively-chosen Estimators via Stable Median},
author = {Feldman, Vitaly and Steinke, Thomas},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {728--757},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/feldman17a/feldman17a.pdf},
url = {https://proceedings.mlr.press/v65/feldman17a.html},
abstract = {Datasets are often reused to perform multiple statistical analyses in an adaptive way, in which each analysis may depend on the outcomes of previous analyses on the same dataset. Standard statistical guarantees do not account for these dependencies and little is known about how to provably avoid overfitting and false discovery in the adaptive setting. We consider a natural formalization of this problem in which the goal is to design an algorithm that, given a limited number of i.i.d. samples from an unknown distribution, can answer adaptively-chosen queries about that distribution. We present an algorithm that estimates the expectations of $k$ arbitrary adaptively-chosen real-valued estimators using a number of samples that scales as $\sqrt{k}$. The answers given by our algorithm are essentially as accurate as if fresh samples were used to evaluate each estimator. In contrast, prior work yields error guarantees that scale with the worst-case sensitivity of each estimator. We also give a version of our algorithm that can be used to verify answers to such queries where the sample complexity depends logarithmically on the number of queries $k$ (as in the reusable holdout technique). Our algorithm is based on a simple approximate median algorithm that satisfies the strong stability guarantees of differential privacy. Our techniques provide a new approach for analyzing the generalization guarantees of differentially private algorithms.}
}
@InProceedings{pmlr-v65-feldman17b,
title = {Greed Is Good: Near-Optimal Submodular Maximization via Greedy Optimization},
author = {Feldman, Moran and Harshaw, Christopher and Karbasi, Amin},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {758--784},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/feldman17b/feldman17b.pdf},
url = {https://proceedings.mlr.press/v65/feldman17b.html},
abstract = {It is known that greedy methods perform well for maximizing \textit{monotone} submodular functions. At the same time, such methods perform poorly in the face of non-monotonicity. In this paper, we show—arguably, surprisingly—that invoking the classical greedy algorithm $O(\sqrt{k})$-times leads to the (currently) fastest deterministic algorithm, called RepeatedGreedy, for maximizing a general submodular function subject to $k$-independent system constraints. RepeatedGreedy achieves $(1 + O(1/\sqrt{k}))k$ approximation using $O(nr\sqrt{k})$ function evaluations (here, $n$ and $r$ denote the size of the ground set and the maximum size of a feasible solution, respectively). We then show that by a careful sampling procedure, we can run the greedy algorithm only \textit{once} and obtain the (currently) fastest randomized algorithm, called SampleGreedy, for maximizing a submodular function subject to $k$-extendible system constraints (a subclass of $k$-independent system constrains). SampleGreedy achieves $(k + 3)$-approximation with only $O(nr/k)$ function evaluations. Finally, we derive an almost matching lower bound, and show that no polynomial time algorithm can have an approximation ratio smaller than $ k + 1/2 - \varepsilon$. To further support our theoretical results, we compare the performance of RepeatedGreedy and SampleGreedy with prior art in a concrete application (movie recommendation). We consistently observe that while SampleGreedy achieves practically the same utility as the best baseline, it performs at least two orders of magnitude faster.}
}
@InProceedings{pmlr-v65-feldman17c,
title = {A General Characterization of the Statistical Query Complexity},
author = {Feldman, Vitaly},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {785--830},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/feldman17c/feldman17c.pdf},
url = {https://proceedings.mlr.press/v65/feldman17c.html},
abstract = {Statistical query (SQ) algorithms are algorithms that have access to an {\em SQ oracle} for the input distribution $D$ instead of i.i.d. samples from $D$. Given a query function $φ:X \to [-1,1]$, the oracle returns an estimate of $\mathbf{E}_{x∼D}[φ(x)]$ within some tolerance $\tau_φ$ that roughly corresponds to the number of samples. In this work we demonstrate that the complexity of solving an arbitrary statistical problem using SQ algorithms can be captured by a relatively simple notion of statistical dimension that we introduce. SQ algorithms capture a broad spectrum of algorithmic approaches used in theory and practice, most notably, convex optimization techniques. Hence our statistical dimension allows to investigate the power of a variety of algorithmic approaches by analyzing a single linear-algebraic parameter. Such characterizations were investigated over the past 20 years in learning theory but prior characterizations are restricted to the much simpler setting of classification problems relative to a fixed distribution on the domain. Our characterization is also the first to precisely characterize the necessary tolerance of queries. We give applications of our techniques to two open problems in learning theory and to algorithms that are subject to memory and communication constraints.}
}
@InProceedings{pmlr-v65-flammarion17a,
title = {Stochastic Composite Least-Squares Regression with Convergence Rate $O(1/n)$},
author = {Flammarion, Nicolas and Bach, Francis},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {831--875},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/flammarion17a/flammarion17a.pdf},
url = {https://proceedings.mlr.press/v65/flammarion17a.html},
abstract = {We consider the minimization of composite objective functions composed of the expectation of quadratic functions and an arbitrary convex function. We study the stochastic dual averaging algorithm with a constant step-size, showing that it leads to a convergence rate of O(1/n) without strong convexity assumptions. This thus extends earlier results on least-squares regression with the Euclidean geometry to (a) all convex regularizers and constraints, and (b) all geometries represented by a Bregman divergence. This is achieved by a new proof technique that relates stochastic and deterministic recursions}
}
@InProceedings{pmlr-v65-foster17a,
title = {ZigZag: A New Approach to Adaptive Online Learning},
author = {Foster, Dylan J. and Rakhlin, Alexander and Sridharan, Karthik},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {876--924},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/foster17a/foster17a.pdf},
url = {https://proceedings.mlr.press/v65/foster17a.html},
abstract = {We develop a new family of algorithms for the online learning setting with regret against any data sequence bounded by the empirical Rademacher complexity of that sequence. To develop a general theory of when this type of adaptive regret bound is achievable we establish a connection to the theory of decoupling inequalities for martingales in Banach spaces. When the hypothesis class is a set of linear functions bounded in some norm, such a regret bound is achievable if and only if the norm satisfies certain decoupling inequalities for martingales. Donald Burkholder’s celebrated geometric characterization of decoupling inequalities (1984) states that such an inequality holds if and only if there exists a special function called a Burkholder function satisfying certain restricted concavity properties. Our online learning algorithms are efficient in terms of queries to this function. We realize our general theory by giving new efficient and adaptive algorithms for classes including $\ell_p$ norms, group norms, and reproducing kernel Hilbert spaces. The empirical Rademacher complexity regret bound implies — when used in the i.i.d. setting — a data-dependent complexity bound for excess risk after online-to-batch conversion. To showcase the power of the empirical Rademacher complexity regret bound, we derive improved rates for a supervised learning generalization of the online learning with low rank experts task and for the online matrix prediction task. In addition to obtaining tight data-dependent regret bounds, our algorithms enjoy improved efficiency over previous techniques based on Rademacher complexity, automatically work in the infinite horizon setting, and adapt to scale. To obtain such adaptive methods, we introduce novel machinery, and the resulting algorithms are not based on the standard tools of online convex optimization. We conclude with a number of open problems and new directions, both algorithmic and information-theoretic.}
}
@InProceedings{pmlr-v65-frongillo17a,
title = {Memoryless Sequences for Differentiable Losses},
author = {Frongillo, Rafael and Nobel, Andrew},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {925--939},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/frongillo17a/frongillo17a.pdf},
url = {https://proceedings.mlr.press/v65/frongillo17a.html},
abstract = {One way to define the “randomness” of a fixed individual sequence is to ask how hard it is to predict. When prediction error is measured via squared loss, it has been established that memoryless sequences (which are, in a precise sense, hard to predict) have some of the stochastic attributes of truly random sequences. In this paper, we ask how changing the loss function used changes the set of memoryless sequences, and in particular, the stochastic attributes they possess. We answer this question for differentiable convex loss functions using tools from property elicitation, showing that the property elicited by the loss determines the stochastic attributes of the corresponding memoryless sequences. We apply our results to price calibration in prediction markets.}
}
@InProceedings{pmlr-v65-gamarnik17a,
title = {Matrix Completion from $O(n)$ Samples in Linear Time},
author = {Gamarnik, David and Li, Quan and Zhang, Hongyi},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {940--947},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/gamarnik17a/gamarnik17a.pdf},
url = {https://proceedings.mlr.press/v65/gamarnik17a.html},
abstract = {We consider the problem of reconstructing a rank-$k$ $n \times n$ matrix $M$ from a sampling of its entries. Under a certain incoherence assumption on $M$ and for the case when both the rank and the condition number of $M$ are bounded, it was shown in (Candès and Recht, 2009; Candès and Tao, 2010; Keshavan et al., 2010; Recht, 2011; Jain et al., 2012; Hardt, 2014) that $M$ can be recovered exactly or approximately (depending on some trade-off between accuracy and computational complexity) using $O(n \text{poly}(\log n))$ samples in super-linear time $O(n^a \text{poly}(\log n))$ for some constant $a ≥1$. In this paper, we propose a new matrix completion algorithm using a novel sampling scheme based on a union of independent sparse random regular bipartite graphs. We show that under the same conditions w.h.p. our algorithm recovers an $ε$-approximation of $M$ in terms of the Frobenius norm using $O(n \log^2(1/ε))$ samples and in linear time $O(n \log^2(1/ε))$. This provides the best known bounds both on the sample complexity and computational cost for reconstructing (approximately) an unknown low-rank matrix. The novelty of our algorithm is two new steps of thresholding singular values and rescaling singular vectors in the application of the “vanilla” alternating minimization algorithm. The structure of sparse random regular graphs is used heavily for controlling the impact of these regularization steps.}
}
@InProceedings{pmlr-v65-david17a,
title = {High Dimensional Regression with Binary Coefficients. Estimating Squared Error and a Phase Transtition},
author = {David, Gamarnik and Ilias, Zadik},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {948--953},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/david17a/david17a.pdf},
url = {https://proceedings.mlr.press/v65/david17a.html},
abstract = {We consider a sparse linear regression model $Y=Xβ^*+W$ where $X$ is $n\times p$ matrix Gaussian i.i.d. entries, $W$ is $n\times 1$ noise vector with i.i.d. mean zero Gaussian entries and standard deviation $σ$, and $β^*$ is $p\times 1$ binary vector with support size (sparsity) $k$. Using a novel conditional second moment method we obtain a tight up to a multiplicative constant approximation of the optimal squared error $\min_β\|Y-Xβ\|_2$, where the minimization is over all $k$-sparse binary vectors $β$. The approximation reveals interesting structural properties of the underlying regression problem. In particular, \beginenumerate \item [(a)] We establish that $n^*=2k\log p/\log (2k/σ^2+1)$ is a phase transition point with the following “all-or-nothing” property. When $n$ exceeds $n^*$, $(2k)^-1\|\beta_2-β^*\|_0≈0$, and when $n$ is below $n^*$, $(2k)^-1\|\beta_2-β^*\|_0≈1$, where $\beta_2$ is the optimal solution achieving the smallest squared error. As a corollary $n^*$ is the asymptotic threshold for recovering $β^*$ information theoretically. Note that $n^*$ is asymptotically below the threshold $n_\text{LASSO}/CS=(2k+σ^2)\log p$, above which the LASSO and Compressive Sensing methods are able to recover $β^*$. \item [(b)] We compute the squared error for an intermediate problem $\min_β\|Y-Xβ\|_2$ where the minimization is restricted to vectors $β$ with $\|β-β^*\|_0=2k ζ$, for some fixed ratio $ζ∈[0,1]$. We show that a lower bound part $Γ(ζ)$ of the estimate, which essentially corresponds to the estimate based on the first moment method, undergoes a phase transition at three different thresholds, namely $n_\text{inf},1=σ^2\log p$, which is information theoretic bound for recovering $β^*$ when $k=1$ and $σ$ is large, then at $n^*$ and finally at $n_\text{LASSO}/CS$. \item [(c)] We establish a certain Overlap Gap Property (OGP) on the space of all $k$-sparse binary vectors $β$ when $n\le ck\log p$ for sufficiently small constant $c$. 
By drawing a connection with a similar OGP exhibited by many randomly generated constraint satisfaction problems and statistical physics models, we conjecture that OGP is the source of algorithmic hardness of solving the minimization problem $\min_β\|Y-Xβ\|_2$ in the regime $n\sigma_0$, and the rest of the entries from $p_0$. The special rows with higher variance entries can be viewed as hidden higher-degree hubs. The problem we address is to identify the hubs efficiently. The planted Gaussian Submatrix Model is the special case where the higher variance entries must all lie in a $k \times k$ submatrix. If $k≥c\sqrt{n}\ln n$, just the row sums are sufficient to find $S$ in the general model. For the Gaussian submatrix problem (and the related planted clique problem), this can be improved by a $\sqrt\ln n$ factor to $k \ge c\sqrt{n}$ by spectral or combinatorial methods. We give a polynomial-time algorithm to identify all the hidden hubs with high probability for $k \ge n^0.5-δ$ for some $δ>0$, when $\sigma_1^2>2\sigma_0^2$. The algorithm extends to the setting where planted entries might have different variances, each at least $\sigma_1^2$. We also show a nearly matching lower bound: for $\sigma_1^2 \le 2\sigma_0^2$, there is no polynomial-time Statistical Query algorithm for distinguishing between a matrix whose entries are all from $N(0,\sigma_0^2)$ and a matrix with $k=n^0.5-δ$ hidden hubs for any $δ>0$. The lower bound as well as the algorithm are related to whether the chi-squared distance of the two distributions diverges. At the critical value $\sigma_1^2=2\sigma_0^2$, we show that the hidden hubs problem can be solved for $k≥c\sqrt n(\ln n)^1/4$, improving on the naive row sum-based method.}
}
@InProceedings{pmlr-v65-kearns17a,
title = {Predicting with Distributions},
author = {Kearns, Michael and Wu, Zhiwei Steven},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {1214--1241},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/kearns17a/kearns17a.pdf},
url = {https://proceedings.mlr.press/v65/kearns17a.html},
abstract = { We consider a new learning model in which a joint distribution over vector pairs $(x,y)$ is determined by an unknown function $c(x)$ that maps input vectors $x$ not to individual outputs, but to entire \emph{distributions} over output vectors $y$. Our main results take the form of rather general reductions from our model to algorithms for PAC learning the function class and the distribution class separately, and show that virtually every such combination yields an efficient algorithm in our model. Our methods include a randomized reduction to classification noise and an application of Le Cam's method to obtain robust learning algorithms.}
}
@InProceedings{pmlr-v65-koren17a,
title = {Bandits with Movement Costs and Adaptive Pricing},
author = {Koren, Tomer and Livni, Roi and Mansour, Yishay},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {1242--1268},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/koren17a/koren17a.pdf},
url = {https://proceedings.mlr.press/v65/koren17a.html},
abstract = {We extend the model of Multi-Armed Bandit with unit switching cost to incorporate a metric between the actions. We consider the case where the metric over the actions can be modeled by a complete binary tree, and the distance between two leaves is the size of the subtree of their least common ancestor, which abstracts the case that the actions are points on the continuous interval $[0,1]$ and the switching cost is their distance. In this setting, we give a new algorithm that establishes a regret of $\widetilde{O}(\sqrt{kT} + T/k)$, where $k$ is the number of actions and $T$ is the time horizon. When the set of actions corresponds to the whole $[0,1]$ interval we can exploit our method for the task of bandit learning with Lipschitz loss functions, where our algorithm achieves an optimal regret rate of $\widetilde{\Theta}(T^{2/3})$, which is the same rate one obtains when there is no penalty for movements. As our main application, we use our new algorithm to solve an adaptive pricing problem. Specifically, we consider the case of a single seller faced with a stream of patient buyers. Each buyer has a private value and a window of time in which they are interested in buying, and they buy at the lowest price in the window, if it is below their value. We show that with an appropriate discretization of the prices, the seller can achieve a regret of $\widetilde{O}(T^{2/3})$ compared to the best fixed price in hindsight, which outperforms the previous regret bound of $\widetilde{O}(T^{3/4})$ for the problem. }
}
@InProceedings{pmlr-v65-kwon17a,
title = {Sparse Stochastic Bandits},
author = {Kwon, Joon and Perchet, Vianney and Vernade, Claire},
booktitle = {Proceedings of the 2017 Conference on Learning Theory},
pages = {1269--1270},
year = {2017},
editor = {Kale, Satyen and Shamir, Ohad},
volume = {65},
series = {Proceedings of Machine Learning Research},
month = {07--10 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v65/kwon17a/kwon17a.pdf},
url = {https://proceedings.mlr.press/v65/kwon17a.html},
abstract = {In the classical multi-armed bandit problem, $d$ arms are available to the decision maker who pulls them sequentially in order to maximize his cumulative reward. Guarantees can be obtained on a relative quantity called regret, which scales linearly with $d$ (or with $\sqrt{d}$ in the minimax sense). We here consider the \emphsparse case of this classical problem in the sense that only a small number of arms, namely $s