@Proceedings{ICML2020,
title = {Proceedings of the 37th International Conference on Machine Learning},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
year = {2020},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = {119},
}
@InProceedings{pmlr-v119-abbas20a,
title = {Selective {D}yna-Style Planning Under Limited Model Capacity},
author = {Abbas, Zaheer and Sokota, Samuel and Talvitie, Erin and White, Martha},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1--10},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/abbas20a/abbas20a.pdf},
url = {http://proceedings.mlr.press/v119/abbas20a.html},
abstract = {In model-based reinforcement learning, planning with an imperfect model of the environment has the potential to harm learning progress. But even when a model is imperfect, it may still contain information that is useful for planning. In this paper, we investigate the idea of using an imperfect model selectively. The agent should plan in parts of the state space where the model would be helpful but refrain from using the model where it would be harmful. An effective selective planning mechanism requires estimating predictive uncertainty, which arises out of aleatoric uncertainty, parameter uncertainty, and model inadequacy, among other sources. Prior work has focused on parameter uncertainty for selective planning. In this work, we emphasize the importance of model inadequacy. We show that heteroscedastic regression can signal predictive uncertainty arising from model inadequacy that is complementary to that which is detected by methods designed for parameter uncertainty, indicating that considering both parameter uncertainty and model inadequacy may be a more promising direction for effective selective planning than either in isolation.}
}
@InProceedings{pmlr-v119-abdolmaleki20a,
title = {A distributional view on multi-objective policy optimization},
author = {Abdolmaleki, Abbas and Huang, Sandy and Hasenclever, Leonard and Neunert, Michael and Song, Francis and Zambelli, Martina and Martins, Murilo and Heess, Nicolas and Hadsell, Raia and Riedmiller, Martin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {11--22},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/abdolmaleki20a/abdolmaleki20a.pdf},
url = {http://proceedings.mlr.press/v119/abdolmaleki20a.html},
abstract = {Many real-world problems require trading off multiple competing objectives. However, these objectives are often in different units and/or scales, which can make it challenging for practitioners to express numerical preferences over objectives in their native units. In this paper we propose a novel algorithm for multi-objective reinforcement learning that enables setting desired preferences for objectives in a scale-invariant way. We propose to learn an action distribution for each objective, and we use supervised learning to fit a parametric policy to a combination of these distributions. We demonstrate the effectiveness of our approach on challenging high-dimensional real and simulated robotics tasks, and show that setting different preferences in our framework allows us to trace out the space of nondominated solutions.}
}
@InProceedings{pmlr-v119-abeille20a,
title = {Efficient Optimistic Exploration in Linear-Quadratic Regulators via {L}agrangian Relaxation},
author = {Abeille, Marc and Lazaric, Alessandro},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {23--31},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/abeille20a/abeille20a.pdf},
url = {http://proceedings.mlr.press/v119/abeille20a.html},
abstract = {We study the exploration-exploitation dilemma in the linear quadratic regulator (LQR) setting. Inspired by the extended value iteration algorithm used in optimistic algorithms for finite MDPs, we propose to relax the optimistic optimization of OFU-LQ and cast it into a constrained \emph{extended} LQR problem, where an additional control variable implicitly selects the system dynamics within a confidence interval. We then move to the corresponding Lagrangian formulation for which we prove strong duality. As a result, we show that an $\epsilon$-optimistic controller can be computed efficiently by solving at most $O\big(\log(1/\epsilon)\big)$ Riccati equations. Finally, we prove that relaxing the original OFU problem does not impact the learning performance, thus recovering the $\widetilde{O}(\sqrt{T})$ regret of OFU-LQ. To the best of our knowledge, this is the first computationally efficient confidence-based algorithm for LQR with worst-case optimal regret guarantees.}
}
@InProceedings{pmlr-v119-ablin20a,
title = {Super-efficiency of automatic differentiation for functions defined as a minimum},
author = {Ablin, Pierre and Peyr{\'e}, Gabriel and Moreau, Thomas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {32--41},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ablin20a/ablin20a.pdf},
url = {http://proceedings.mlr.press/v119/ablin20a.html},
abstract = {In min-min optimization or max-min optimization, one has to compute the gradient of a function defined as a minimum. In most cases, the minimum has no closed-form, and an approximation is obtained via an iterative algorithm. There are two usual ways of estimating the gradient of the function: using either an analytic formula obtained by assuming exactness of the approximation, or automatic differentiation through the algorithm. In this paper, we study the asymptotic error made by these estimators as a function of the optimization error. We find that the error of the automatic estimator is close to the square of the error of the analytic estimator, reflecting a super-efficiency phenomenon. The convergence of the automatic estimator greatly depends on the convergence of the Jacobian of the algorithm. We analyze it for gradient descent and stochastic gradient descent and derive convergence rates for the estimators in these cases. Our analysis is backed by numerical experiments on toy problems and on Wasserstein barycenter computation. Finally, we discuss the computational complexity of these estimators and give practical guidelines to chose between them.}
}
@InProceedings{pmlr-v119-abrol20a,
title = {A Geometric Approach to Archetypal Analysis via Sparse Projections},
author = {Abrol, Vinayak and Sharma, Pulkit},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {42--51},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/abrol20a/abrol20a.pdf},
url = {http://proceedings.mlr.press/v119/abrol20a.html},
abstract = {Archetypal analysis (AA) aims to extract patterns using self-expressive decomposition of data as convex combinations of extremal points (on the convex hull) of the data. This work presents a computationally efficient greedy AA (GAA) algorithm. GAA leverages the underlying geometry of AA, is scalable to larger datasets, and has significantly faster convergence rate. To achieve this, archetypes are learned via sparse projection of data. In the transformed space, GAA employs an iterative subset selection approach to identify archetypes based on the sparsity of convex representations. The work further presents the use of GAA algorithm for extended AA models such as robust and kernel AA. Experimental results show that GAA is considerably faster while performing comparable to existing methods for tasks such as classification, data visualization/categorization.}
}
@InProceedings{pmlr-v119-acharya20a,
title = {Context Aware Local Differential Privacy},
author = {Acharya, Jayadev and Bonawitz, Kallista and Kairouz, Peter and Ramage, Daniel and Sun, Ziteng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {52--62},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/acharya20a/acharya20a.pdf},
url = {http://proceedings.mlr.press/v119/acharya20a.html},
abstract = {Local differential privacy (LDP) is a strong notion of privacy that often leads to a significant drop in utility. The original definition of LDP assumes that all the elements in the data domain are equally sensitive. However, in many real-life applications, some elements are more sensitive than others. We propose a context-aware framework for LDP that allows the privacy level to vary across the data domain, enabling system designers to place privacy constraints where they matter without paying the cost where they do not. For binary data domains, we provide a universally optimal privatization scheme and highlight its connections to Warner’s randomized response and Mangat’s improved response. Motivated by geo-location and web search applications, for k-ary data domains, we consider two special cases of context-aware LDP: block-structured LDP and high-low LDP. We study minimax discrete distribution estimation under both cases and provide communication-efficient, sample-optimal schemes, and information-theoretic lower bounds. We show, using worst-case analyses and experiments on Gowalla’s 3.6 million check-ins to 43,750 locations, that context-aware LDP achieves a far better accuracy under the same number of samples.}
}
@InProceedings{pmlr-v119-addanki20a,
title = {Efficient Intervention Design for Causal Discovery with Latents},
author = {Addanki, Raghavendra and Kasiviswanathan, Shiva and McGregor, Andrew and Musco, Cameron},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {63--73},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/addanki20a/addanki20a.pdf},
url = {http://proceedings.mlr.press/v119/addanki20a.html},
abstract = {We consider recovering a causal graph in presence of latent variables, where we seek to minimize the cost of interventions used in the recovery process. We consider two intervention cost models: (1) a linear cost model where the cost of an intervention on a subset of variables has a linear form, and (2) an identity cost model where the cost of an intervention is the same, regardless of what variables it is on, i.e., the goal is just to minimize the number of interventions. Under the linear cost model, we give an algorithm to identify the ancestral relations of the underlying causal graph, achieving within a $2$-factor of the optimal intervention cost. This approximation factor can be improved to $1+\epsilon$ for any $\epsilon > 0$ under some mild restrictions. Under the identity cost model, we bound the number of interventions needed to recover the entire causal graph, including the latent variables, using a parameterization of the causal graph through a special type of colliders. In particular, we introduce the notion of $p$-colliders, that are colliders between pair of nodes arising from a specific type of conditioning in the causal graph, and provide an upper bound on the number of interventions as a function of the maximum number of $p$-colliders between any two nodes in the causal graph.}
}
@InProceedings{pmlr-v119-adlam20a,
title = {The Neural Tangent Kernel in High Dimensions: Triple Descent and a Multi-Scale Theory of Generalization},
author = {Adlam, Ben and Pennington, Jeffrey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {74--84},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/adlam20a/adlam20a.pdf},
url = {http://proceedings.mlr.press/v119/adlam20a.html},
abstract = {Modern deep learning models employ considerably more parameters than required to fit the training data. Whereas conventional statistical wisdom suggests such models should drastically overfit, in practice these models generalize remarkably well. An emerging paradigm for describing this unexpected behavior is in terms of a \emph{double descent} curve, in which increasing a model’s capacity causes its test error to first decrease, then increase to a maximum near the interpolation threshold, and then decrease again in the overparameterized regime. Recent efforts to explain this phenomenon theoretically have focused on simple settings, such as linear regression or kernel regression with unstructured random features, which we argue are too coarse to reveal important nuances of actual neural networks. We provide a precise high-dimensional asymptotic analysis of generalization under kernel regression with the Neural Tangent Kernel, which characterizes the behavior of wide neural networks optimized with gradient descent. Our results reveal that the test error has nonmonotonic behavior deep in the overparameterized regime and can even exhibit additional peaks and descents when the number of parameters scales quadratically with the dataset size.}
}
@InProceedings{pmlr-v119-agarwal20a,
title = {Rank Aggregation from Pairwise Comparisons in the Presence of Adversarial Corruptions},
author = {Agarwal, Arpit and Agarwal, Shivani and Khanna, Sanjeev and Patil, Prathamesh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {85--95},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/agarwal20a/agarwal20a.pdf},
url = {http://proceedings.mlr.press/v119/agarwal20a.html},
abstract = {Rank aggregation from pairwise preferences has widespread applications in recommendation systems and information retrieval. Given the enormous economic and societal impact of these applications, and the consequent incentives for malicious players to manipulate ranking outcomes in their favor, an important challenge is to make rank aggregation algorithms robust to adversarial manipulations in data. In this paper, we initiate the study of robustness in rank aggregation under the popular Bradley-Terry-Luce (BTL) model for pairwise comparisons. We consider a setting where pairwise comparisons are initially generated according to a BTL model, but a fraction of these comparisons are corrupted by an adversary prior to being reported to us. We consider a strong contamination model, where an adversary having complete knowledge of the initial truthful data and the underlying true BTL parameters, can subsequently corrupt the truthful data by inserting, deleting, or changing data points. The goal is to estimate the true score/weight of each item under the BTL model, even in the presence of these corruptions. We characterize the extent of adversarial corruption under which the true BTL parameters are uniquely identifiable. We also provide a novel pruning algorithm that provably cleans the data of adversarial corruption under reasonable conditions on data generation and corruption. We corroborate our theory with experiments on both synthetic as well as real data showing that previous algorithms are vulnerable to even small amounts of corruption, whereas our algorithm can clean a reasonably high amount of corruption.}
}
@InProceedings{pmlr-v119-agarwal20b,
title = {Boosting for Control of Dynamical Systems},
author = {Agarwal, Naman and Brukhim, Nataly and Hazan, Elad and Lu, Zhou},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {96--103},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/agarwal20b/agarwal20b.pdf},
url = {http://proceedings.mlr.press/v119/agarwal20b.html},
abstract = {We study the question of how to aggregate controllers for dynamical systems in order to improve their performance. To this end, we propose a framework of boosting for online control. Our main result is an efficient boosting algorithm that combines weak controllers into a provably more accurate one. Empirical evaluation on a host of control settings supports our theoretical findings.}
}
@InProceedings{pmlr-v119-agarwal20c,
title = {An Optimistic Perspective on Offline Reinforcement Learning},
author = {Agarwal, Rishabh and Schuurmans, Dale and Norouzi, Mohammad},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {104--114},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/agarwal20c/agarwal20c.pdf},
url = {http://proceedings.mlr.press/v119/agarwal20c.html},
abstract = {Off-policy reinforcement learning (RL) using a fixed offline dataset of logged interactions is an important consideration in real world applications. This paper studies offline RL using the DQN replay dataset comprising the entire replay experience of a DQN agent on 60 Atari 2600 games. We demonstrate that recent off-policy deep RL algorithms, even when trained solely on this fixed dataset, outperform the fully trained DQN agent. To enhance generalization in the offline setting, we present Random Ensemble Mixture (REM), a robust Q-learning algorithm that enforces optimal Bellman consistency on random convex combinations of multiple Q-value estimates. Offline REM trained on the DQN replay dataset surpasses strong RL baselines. Ablation studies highlight the role of offline dataset size and diversity as well as the algorithm choice in our positive results. Overall, the results here present an optimistic view that robust RL algorithms trained on sufficiently large and diverse offline datasets can lead to high quality policies. The DQN replay dataset can serve as an offline RL benchmark and is open-sourced.}
}
@InProceedings{pmlr-v119-agrawal20a,
title = {Optimal Bounds between f-Divergences and Integral Probability Metrics},
author = {Agrawal, Rohit and Horel, Thibaut},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {115--124},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/agrawal20a/agrawal20a.pdf},
url = {http://proceedings.mlr.press/v119/agrawal20a.html},
abstract = {The families of f-divergences (e.g. the Kullback-Leibler divergence) and Integral Probability Metrics (e.g. total variation distance or maximum mean discrepancies) are commonly used in optimization and estimation. In this work, we systematically study the relationship between these two families from the perspective of convex duality. Starting from a tight variational representation of the f-divergence, we derive a generalization of the moment generating function, which we show exactly characterizes the best lower bound of the f-divergence as a function of a given IPM. Using this characterization, we obtain new bounds on IPMs defined by classes of unbounded functions, while also recovering in a unified manner well-known results for bounded and subgaussian functions (e.g. Pinsker’s inequality and Hoeffding’s lemma).}
}
@InProceedings{pmlr-v119-ahmaditeshnizi20a,
title = {{L}azy{I}ter: A Fast Algorithm for Counting {M}arkov Equivalent {DAG}s and Designing Experiments},
author = {Ahmaditeshnizi, Ali and Salehkaleybar, Saber and Kiyavash, Negar},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {125--133},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ahmaditeshnizi20a/ahmaditeshnizi20a.pdf},
url = {http://proceedings.mlr.press/v119/ahmaditeshnizi20a.html},
abstract = {The causal relationships among a set of random variables are commonly represented by a Directed Acyclic Graph (DAG), where there is a directed edge from variable $X$ to variable $Y$ if $X$ is a direct cause of $Y$. From the purely observational data, the true causal graph can be identified up to a Markov Equivalence Class (MEC), which is a set of DAGs with the same conditional independencies between the variables. The size of an MEC is a measure of complexity for recovering the true causal graph by performing interventions. We propose a method for efficient iteration over possible MECs given intervention results. We utilize the proposed method for computing MEC sizes and experiment design in active and passive learning settings. Compared to previous work for computing the size of MEC, our proposed algorithm reduces the time complexity by a factor of $O(n)$ for sparse graphs where $n$ is the number of variables in the system. Additionally, integrating our approach with dynamic programming, we design an optimal algorithm for passive experiment design. Experimental results show that our proposed algorithms for both computing the size of MEC and experiment design outperform the state of the art.}
}
@InProceedings{pmlr-v119-ahn20a,
title = {Learning What to Defer for Maximum Independent Sets},
author = {Ahn, Sungsoo and Seo, Younggyo and Shin, Jinwoo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {134--144},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ahn20a/ahn20a.pdf},
url = {http://proceedings.mlr.press/v119/ahn20a.html},
abstract = {Designing efficient algorithms for combinatorial optimization appears ubiquitously in various scientific fields. Recently, deep reinforcement learning (DRL) frameworks have gained considerable attention as a new approach: they can automate the design of a solver while relying less on sophisticated domain knowledge of the target problem. However, the existing DRL solvers determine the solution using a number of stages proportional to the number of elements in the solution, which severely limits their applicability to large-scale graphs. In this paper, we seek to resolve this issue by proposing a novel DRL scheme, coined learning what to defer (LwD), where the agent adaptively shrinks or stretch the number of stages by learning to distribute the element-wise decisions of the solution at each stage. We apply the proposed framework to the maximum independent set (MIS) problem, and demonstrate its significant improvement over the current state-of-the-art DRL scheme. We also show that LwD can outperform the conventional MIS solvers on large-scale graphs having millions of vertices, under a limited time budget.}
}
@InProceedings{pmlr-v119-ahuja20a,
title = {Invariant Risk Minimization Games},
author = {Ahuja, Kartik and Shanmugam, Karthikeyan and Varshney, Kush and Dhurandhar, Amit},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {145--155},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ahuja20a/ahuja20a.pdf},
url = {http://proceedings.mlr.press/v119/ahuja20a.html},
abstract = {The standard risk minimization paradigm of machine learning is brittle when operating in environments whose test distributions are different from the training distribution due to spurious correlations. Training on data from many environments and finding invariant predictors reduces the effect of spurious features by concentrating models on features that have a causal relationship with the outcome. In this work, we pose such invariant risk minimization as finding the Nash equilibrium of an ensemble game among several environments. By doing so, we develop a simple training algorithm that uses best response dynamics and, in our experiments, yields similar or better empirical accuracy with much lower variance than the challenging bi-level optimization problem of Arjovsky et al. (2019). One key theoretical contribution is showing that the set of Nash equilibria for the proposed game are equivalent to the set of invariant predictors for any finite number of environments, even with nonlinear classifiers and transformations. As a result, our method also retains the generalization guarantees to a large set of environments shown in Arjovsky et al. (2019). The proposed algorithm adds to the collection of successful game-theoretic machine learning algorithms such as generative adversarial networks.}
}
@InProceedings{pmlr-v119-aitchison20a,
title = {Why bigger is not always better: on finite and infinite neural networks},
author = {Aitchison, Laurence},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {156--164},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/aitchison20a/aitchison20a.pdf},
url = {http://proceedings.mlr.press/v119/aitchison20a.html},
abstract = {Recent work has argued that neural networks can be understood theoretically by taking the number of channels to infinity, at which point the outputs become Gaussian process (GP) distributed. However, we note that infinite Bayesian neural networks lack a key facet of the behaviour of real neural networks: the fixed kernel, determined only by network hyperparameters, implies that they cannot do any form of representation learning. The lack of representation or equivalently kernel learning leads to less flexibility and hence worse performance, giving a potential explanation for the inferior performance of infinite networks observed in the literature (e.g. Novak et al. 2019). We give analytic results characterising the prior over representations and representation learning in finite deep linear networks. We show empirically that the representations in SOTA architectures such as ResNets trained with SGD are much closer to those suggested by our deep linear results than by the corresponding infinite network. This motivates the introduction of a new class of network: infinite networks with bottlenecks, which inherit the theoretical tractability of infinite networks while at the same time allowing representation learning.}
}
@InProceedings{pmlr-v119-alaa20a,
title = {Discriminative Jackknife: Quantifying Uncertainty in Deep Learning via Higher-Order Influence Functions},
author = {Alaa, Ahmed and van der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {165--174},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alaa20a/alaa20a.pdf},
url = {http://proceedings.mlr.press/v119/alaa20a.html},
abstract = {Deep learning models achieve high predictive accuracy across a broad spectrum of tasks, but rigorously quantifying their predictive uncertainty remains challenging. Usable estimates of predictive uncertainty should (1) cover the true prediction targets with high probability, and (2) discriminate between high- and low confidence prediction instances. Existing methods for uncertainty quantification are based predominantly on Bayesian neural networks; these may fall short of (1) and (2) {—} i.e., Bayesian credible intervals do not guarantee frequentist coverage, and approximate posterior inference undermines discriminative accuracy. In this paper, we develop the discriminative jackknife (DJ), a frequentist procedure that utilizes influence functions of a model’s loss functional to construct a jackknife (or leave one-out) estimator of predictive confidence intervals. The DJ satisfies (1) and (2), is applicable to a wide range of deep learning models, is easy to implement, and can be applied in a post-hoc fashion without interfering with model training or compromising its accuracy. Experiments demonstrate that DJ performs competitively compared to existing Bayesian and non-Bayesian regression baselines.}
}
@InProceedings{pmlr-v119-alaa20b,
title = {Frequentist Uncertainty in Recurrent Neural Networks via Blockwise Influence Functions},
author = {Alaa, Ahmed and van der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {175--190},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alaa20b/alaa20b.pdf},
url = {http://proceedings.mlr.press/v119/alaa20b.html},
abstract = {Recurrent neural networks (RNNs) are instrumental in modelling sequential and time-series data. Yet, when using RNNs to inform decision-making, predictions by themselves are not sufficient {—} we also need estimates of predictive uncertainty. Existing approaches for uncertainty quantification in RNNs are based predominantly on Bayesian methods; these are computationally prohibitive, and require major alterations to the RNN architecture and training. Capitalizing on ideas from classical jackknife resampling, we develop a frequentist alternative that: (a) does not interfere with model training or compromise its accuracy, (b) applies to any RNN architecture, and (c) provides theoretical coverage guarantees on the estimated uncertainty intervals. Our method derives predictive uncertainty from the variability of the (jackknife) sampling distribution of the RNN outputs, which is estimated by repeatedly deleting “blocks” of (temporally-correlated) training data, and collecting the predictions of the RNN re-trained on the remaining data. To avoid exhaustive re-training, we utilize influence functions to estimate the effect of removing training data blocks on the learned RNN parameters. Using data from a critical care setting, we demonstrate the utility of uncertainty quantification in sequential decision-making.}
}
@InProceedings{pmlr-v119-alacaoglu20a,
title = {Random extrapolation for primal-dual coordinate descent},
author = {Alacaoglu, Ahmet and Fercoq, Olivier and Cevher, Volkan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {191--201},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alacaoglu20a/alacaoglu20a.pdf},
url = {http://proceedings.mlr.press/v119/alacaoglu20a.html},
abstract = {We introduce a randomly extrapolated primal-dual coordinate descent method that adapts to sparsity of the data matrix and the favorable structures of the objective function. Our method updates only a subset of primal and dual variables with sparse data, and it uses large step sizes with dense data, retaining the benefits of the specific methods designed for each case. In addition to adapting to sparsity, our method attains fast convergence guarantees in favorable cases \emph{without any modifications}. In particular, we prove linear convergence under metric subregularity, which applies to strongly convex-strongly concave problems and piecewise linear quadratic functions. We show almost sure convergence of the sequence and optimal sublinear convergence rates for the primal-dual gap and objective values, in the general convex-concave case. Numerical evidence demonstrates the state-of-the-art empirical performance of our method in sparse and dense settings, matching and improving the existing methods.}
}
@InProceedings{pmlr-v119-alacaoglu20b,
title = {A new regret analysis for {Adam}-type algorithms},
author = {Alacaoglu, Ahmet and Malitsky, Yura and Mertikopoulos, Panayotis and Cevher, Volkan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {202--210},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alacaoglu20b/alacaoglu20b.pdf},
url = {http://proceedings.mlr.press/v119/alacaoglu20b.html},
abstract = {In this paper, we focus on a theory-practice gap for Adam and its variants (AMSGrad, AdamNC, etc.). In practice, these algorithms are used with a constant first-order moment parameter $\beta_{1}$ (typically between $0.9$ and $0.99$). In theory, regret guarantees for online convex optimization require a rapidly decaying $\beta_{1}\to0$ schedule. We show that this is an artifact of the standard analysis, and we propose a novel framework that allows us to derive optimal, data-dependent regret bounds with a constant $\beta_{1}$, without further assumptions. We also demonstrate the flexibility of our analysis on a wide range of different algorithms and settings.}
}
@InProceedings{pmlr-v119-alami20a,
title = {Restarted {Bayesian} Online Change-point Detector achieves Optimal Detection Delay},
author = {Alami, Reda and Maillard, Odalric and Feraud, Raphael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {211--221},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alami20a/alami20a.pdf},
url = {http://proceedings.mlr.press/v119/alami20a.html},
abstract = {We consider the problem of sequential change-point detection where both the change-points and the distributions before and after the change are assumed to be unknown. For this problem of primary importance in statistical and sequential learning theory, we derive a variant of the Bayesian Online Change Point Detector proposed by Fearnhead and Liu (2007) which is easier to analyze than the original version while keeping its powerful message-passing algorithm. We provide a non-asymptotic analysis of the false-alarm rate and the detection delay that matches the existing lower-bound. We further provide the first explicit high-probability control of the detection delay for such approach. Experiments on synthetic and real-world data show that this proposal outperforms the state-of-art change-point detection strategy, namely the Improved Generalized Likelihood Ratio (Improved GLR) while compares favorably with the original Bayesian Online Change Point Detection strategy.}
}
@InProceedings{pmlr-v119-alexandari20a,
title = {Maximum Likelihood with Bias-Corrected Calibration is Hard-To-Beat at Label Shift Adaptation},
author = {Alexandari, Amr and Kundaje, Anshul and Shrikumar, Avanti},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {222--232},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alexandari20a/alexandari20a.pdf},
url = {http://proceedings.mlr.press/v119/alexandari20a.html},
abstract = {Label shift refers to the phenomenon where the prior class probability p(y) changes between the training and test distributions, while the conditional probability p(x|y) stays fixed. Label shift arises in settings like medical diagnosis, where a classifier trained to predict disease given symptoms must be adapted to scenarios where the baseline prevalence of the disease is different. Given estimates of p(y|x) from a predictive model, Saerens et al. proposed an efficient maximum likelihood algorithm to correct for label shift that does not require model retraining, but a limiting assumption of this algorithm is that p(y|x) is calibrated, which is not true of modern neural networks. Recently, Black Box Shift Learning (BBSL) and Regularized Learning under Label Shifts (RLLS) have emerged as state-of-the-art techniques to cope with label shift when a classifier does not output calibrated probabilities, but both methods require model retraining with importance weights and neither has been benchmarked against maximum likelihood. Here we (1) show that combining maximum likelihood with a type of calibration we call bias-corrected calibration outperforms both BBSL and RLLS across diverse datasets and distribution shifts, (2) prove that the maximum likelihood objective is concave, and (3) introduce a principled strategy for estimating source-domain priors that improves robustness to poor calibration. This work demonstrates that the maximum likelihood with appropriate calibration is a formidable and efficient baseline for label shift adaptation; notebooks reproducing experiments available at https://github.com/kundajelab/labelshiftexperiments , video: https://youtu.be/ZBXjE9QTruE , blogpost: https://bit.ly/3kTds7J}
}
@InProceedings{pmlr-v119-ali20a,
title = {The Implicit Regularization of Stochastic Gradient Flow for Least Squares},
author = {Ali, Alnur and Dobriban, Edgar and Tibshirani, Ryan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {233--244},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ali20a/ali20a.pdf},
url = {http://proceedings.mlr.press/v119/ali20a.html},
abstract = {We study the implicit regularization of mini-batch stochastic gradient descent, when applied to the fundamental problem of least squares regression. We leverage a continuous-time stochastic differential equation having the same moments as stochastic gradient descent, which we call stochastic gradient flow. We give a bound on the excess risk of stochastic gradient flow at time $t$, over ridge regression with tuning parameter $\lambda = 1/t$. The bound may be computed from explicit constants (e.g., the mini-batch size, step size, number of iterations), revealing precisely how these quantities drive the excess risk. Numerical examples show the bound can be small, indicating a tight relationship between the two estimators. We give a similar result relating the coefficients of stochastic gradient flow and ridge. These results hold under no conditions on the data matrix $X$, and across the entire optimization path (not just at convergence).}
}
@InProceedings{pmlr-v119-alon20a,
title = {Structural Language Models of Code},
author = {Alon, Uri and Sadaka, Roy and Levy, Omer and Yahav, Eran},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {245--256},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/alon20a/alon20a.pdf},
url = {http://proceedings.mlr.press/v119/alon20a.html},
abstract = {We address the problem of any-code completion - generating a missing piece of source code in a given program without any restriction on the vocabulary or structure. We introduce a new approach to any-code completion that leverages the strict syntax of programming languages to model a code snippet as a tree - structural language modeling (SLM). SLM estimates the probability of the program's abstract syntax tree (AST) by decomposing it into a product of conditional probabilities over its nodes. We present a neural model that computes these conditional probabilities by considering all AST paths leading to a target node. Unlike previous techniques that have severely restricted the kinds of expressions that can be generated in this task, our approach can generate arbitrary code in any programming language. Our model significantly outperforms both seq2seq and a variety of structured approaches in generating Java and C\# code. Our code, data, and trained models are available at http://github.com/tech-srl/slm-code-generation/. An online demo is available at http://AnyCodeGen.org.}
}
@InProceedings{pmlr-v119-amin20a,
title = {{LowFER}: Low-rank Bilinear Pooling for Link Prediction},
author = {Amin, Saadullah and Varanasi, Stalin and Dunfield, Katherine Ann and Neumann, G{\"u}nter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {257--268},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/amin20a/amin20a.pdf},
url = {http://proceedings.mlr.press/v119/amin20a.html},
abstract = {Knowledge graphs are incomplete by nature, with only a limited number of observed facts from the world knowledge being represented as structured relations between entities. To partly address this issue, an important task in statistical relational learning is that of link prediction or knowledge graph completion. Both linear and non-linear models have been proposed to solve the problem. Bilinear models, while expressive, are prone to overfitting and lead to quadratic growth of parameters in number of relations. Simpler models have become more standard, with certain constraints on bilinear map as relation parameters. In this work, we propose a factorized bilinear pooling model, commonly used in multi-modal learning, for better fusion of entities and relations, leading to an efficient and constraint-free model. We prove that our model is fully expressive, providing bounds on the embedding dimensionality and factorization rank. Our model naturally generalizes Tucker decomposition based TuckER model, which has been shown to generalize other models, as efficient low-rank approximation without substantially compromising the performance. Due to low-rank approximation, the model complexity can be controlled by the factorization rank, avoiding the possible cubic growth of TuckER. Empirically, we evaluate on real-world datasets, reaching on par or state-of-the-art performance. At extreme low-ranks, model preserves the performance while staying parameter efficient.}
}
@InProceedings{pmlr-v119-amit20a,
title = {Discount Factor as a Regularizer in Reinforcement Learning},
author = {Amit, Ron and Meir, Ron and Ciosek, Kamil},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {269--278},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/amit20a/amit20a.pdf},
url = {http://proceedings.mlr.press/v119/amit20a.html},
abstract = {Specifying a Reinforcement Learning (RL) task involves choosing a suitable planning horizon, which is typically modeled by a discount factor. It is known that applying RL algorithms with a lower discount factor can act as a regularizer, improving performance in the limited data regime. Yet the exact nature of this regularizer has not been investigated. In this work, we fill in this gap. For several Temporal-Difference (TD) learning methods, we show an explicit equivalence between using a reduced discount factor and adding an explicit regularization term to the algorithm's loss. Motivated by the equivalence, we empirically study this technique compared to standard L2 regularization by extensive experiments in discrete and continuous domains, using tabular and functional representations. Our experiments suggest the regularization effectiveness is strongly related to properties of the available data, such as size, distribution, and mixing rate.}
}
@InProceedings{pmlr-v119-amizadeh20a,
title = {Neuro-Symbolic Visual Reasoning: Disentangling ``{Visual}'' from ``{Reasoning}''},
author = {Amizadeh, Saeed and Palangi, Hamid and Polozov, Alex and Huang, Yichen and Koishida, Kazuhito},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {279--290},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/amizadeh20a/amizadeh20a.pdf},
url = {http://proceedings.mlr.press/v119/amizadeh20a.html},
abstract = {Visual reasoning tasks such as visual question answering (VQA) require an interplay of visual perception with reasoning about the question semantics grounded in perception. However, recent advances in this area are still primarily driven by perception improvements (e.g. scene graph generation) rather than reasoning. Neuro-symbolic models such as Neural Module Networks bring the benefits of compositional reasoning to VQA, but they are still entangled with visual representation learning, and thus neural reasoning is hard to improve and assess on its own. To address this, we propose (1) a framework to isolate and evaluate the reasoning aspect of VQA separately from its perception, and (2) a novel top-down calibration technique that allows the model to answer reasoning questions even with imperfect perception. To this end, we introduce a Differentiable First-Order Logic formalism for VQA that explicitly decouples question answering from visual perception. On the challenging GQA dataset, this framework is used to perform in-depth, disentangled comparisons between well-known VQA models leading to informative insights regarding the participating models as well as the task.}
}
@InProceedings{pmlr-v119-amos20a,
title = {The Differentiable Cross-Entropy Method},
author = {Amos, Brandon and Yarats, Denis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {291--302},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/amos20a/amos20a.pdf},
url = {http://proceedings.mlr.press/v119/amos20a.html},
abstract = {We study the Cross-Entropy Method (CEM) for the non-convex optimization of a continuous and parameterized objective function and introduce a differentiable variant that enables us to differentiate the output of CEM with respect to the objective function's parameters. In the machine learning setting this brings CEM inside of the end-to-end learning pipeline where this has otherwise been impossible. We show applications in a synthetic energy-based structured prediction task and in non-convex continuous control. In the control setting we show how to embed optimal action sequences into a lower-dimensional space. This enables us to use policy optimization to fine-tune modeling components by differentiating through the CEM-based controller.}
}
@InProceedings{pmlr-v119-anand20a,
title = {Customizing {ML} Predictions for Online Algorithms},
author = {Anand, Keerti and Ge, Rong and Panigrahi, Debmalya},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {303--313},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/anand20a/anand20a.pdf},
url = {http://proceedings.mlr.press/v119/anand20a.html},
abstract = {A popular line of recent research incorporates ML advice in the design of online algorithms to improve their performance in typical instances. These papers treat the ML algorithm as a black-box, and redesign online algorithms to take advantage of ML predictions. In this paper, we ask the complementary question: can we redesign ML algorithms to provide better predictions for online algorithms? We explore this question in the context of the classic rent-or-buy problem, and show that incorporating optimization benchmarks in ML loss functions leads to significantly better performance, while maintaining a worst-case adversarial result when the advice is completely wrong. We support this finding both through theoretical bounds and numerical simulations.}
}
@InProceedings{pmlr-v119-anders20a,
title = {Fairwashing explanations with off-manifold detergent},
author = {Anders, Christopher and Pasliev, Plamen and Dombrowski, Ann-Kathrin and M{\"u}ller, Klaus-Robert and Kessel, Pan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {314--323},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/anders20a/anders20a.pdf},
url = {http://proceedings.mlr.press/v119/anders20a.html},
abstract = {Explanation methods promise to make black-box classifiers more transparent. As a result, it is hoped that they can act as proof for a sensible, fair and trustworthy decision-making process of the algorithm and thereby increase its acceptance by the end-users. In this paper, we show both theoretically and experimentally that these hopes are presently unfounded. Specifically, we show that, for any classifier $g$, one can always construct another classifier $\tilde{g}$ which has the same behavior on the data (same train, validation, and test error) but has arbitrarily manipulated explanation maps. We derive this statement theoretically using differential geometry and demonstrate it experimentally for various explanation methods, architectures, and datasets. Motivated by our theoretical insights, we then propose a modification of existing explanation methods which makes them significantly more robust.}
}
@InProceedings{pmlr-v119-angermueller20a,
title = {Population-Based Black-Box Optimization for Biological Sequence Design},
author = {Angermueller, Christof and Belanger, David and Gane, Andreea and Mariet, Zelda and Dohan, David and Murphy, Kevin and Colwell, Lucy and Sculley, D.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {324--334},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/angermueller20a/angermueller20a.pdf},
url = {http://proceedings.mlr.press/v119/angermueller20a.html},
abstract = {The use of black-box optimization for the design of new biological sequences is an emerging research area with potentially revolutionary impact. The cost and latency of wet-lab experiments requires methods that find good sequences in few experimental rounds of large batches of sequences --- a setting that off-the-shelf black-box optimization methods are ill-equipped to handle. We find that the performance of existing methods varies drastically across optimization tasks, posing a significant obstacle to real-world applications. To improve robustness, we propose Population-Based Black-Box Optimization (P3BO), which generates batches of sequences by sampling from an ensemble of methods. The number of sequences sampled from any method is proportional to the quality of sequences it previously proposed, allowing P3BO to combine the strengths of individual methods while hedging against their innate brittleness. Adapting the hyper-parameters of each of the methods online using evolutionary optimization further improves performance. Through extensive experiments on in-silico optimization tasks, we show that P3BO outperforms any single method in its population, proposing higher quality sequences as well as more diverse batches. As such, P3BO and Adaptive-P3BO are a crucial step towards deploying ML to real-world sequence design.}
}
@InProceedings{pmlr-v119-anokhin20a,
title = {Low-loss connection of weight vectors: distribution-based approaches},
author = {Anokhin, Ivan and Yarotsky, Dmitry},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {335--344},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/anokhin20a/anokhin20a.pdf},
url = {http://proceedings.mlr.press/v119/anokhin20a.html},
abstract = {Recent research shows that sublevel sets of the loss surfaces of overparameterized networks are connected, exactly or approximately. We describe and compare experimentally a panel of methods used to connect two low-loss points by a low-loss curve on this surface. Our methods vary in accuracy and complexity. Most of our methods are based on ``macroscopic'' distributional assumptions and are insensitive to the detailed properties of the points to be connected. Some methods require a prior training of a ``global connection model'' which can then be applied to any pair of points. The accuracy of the method generally correlates with its complexity and sensitivity to the endpoint detail.}
}
@InProceedings{pmlr-v119-antoniadis20a,
title = {Online metric algorithms with untrusted predictions},
author = {Antoniadis, Antonios and Coester, Christian and Elias, Marek and Polak, Adam and Simon, Bertrand},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {345--355},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/antoniadis20a/antoniadis20a.pdf},
url = {http://proceedings.mlr.press/v119/antoniadis20a.html},
abstract = {Machine-learned predictors, although achieving very good results for inputs resembling training data, cannot possibly provide perfect predictions in all situations. Still, decision-making systems that are based on such predictors need not only to benefit from good predictions but also to achieve a decent performance when the predictions are inadequate. In this paper, we propose a prediction setup for arbitrary metrical task systems (MTS) (e.g., caching, k-server and convex body chasing) and online matching on the line. We utilize results from the theory of online algorithms to show how to make the setup robust. Specifically for caching, we present an algorithm whose performance, as a function of the prediction error, is exponentially better than what is achievable for general MTS. Finally, we present an empirical evaluation of our methods on real world datasets, which suggests practicality.}
}
@InProceedings{pmlr-v119-ardywibowo20a,
title = {{NADS}: Neural Architecture Distribution Search for Uncertainty Awareness},
author = {Ardywibowo, Randy and Boluki, Shahin and Gong, Xinyu and Wang, Zhangyang and Qian, Xiaoning},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {356--366},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ardywibowo20a/ardywibowo20a.pdf},
url = {http://proceedings.mlr.press/v119/ardywibowo20a.html},
abstract = {Machine learning (ML) systems often encounter Out-of-Distribution (OoD) errors when dealing with testing data coming from a distribution different from training data. It becomes important for ML systems in critical applications to accurately quantify its predictive uncertainty and screen out these anomalous inputs. However, existing OoD detection approaches are prone to errors and even sometimes assign higher likelihoods to OoD samples. Unlike standard learning tasks, there is currently no well established guiding principle for designing OoD detection architectures that can accurately quantify uncertainty. To address these problems, we first seek to identify guiding principles for designing uncertainty-aware architectures, by proposing Neural Architecture Distribution Search (NADS). NADS searches for a distribution of architectures that perform well on a given task, allowing us to identify common building blocks among all uncertainty-aware architectures. With this formulation, we are able to optimize a stochastic OoD detection objective and construct an ensemble of models to perform OoD detection. We perform multiple OoD detection experiments and observe that our NADS performs favorably, with up to 57\% improvement in accuracy compared to state-of-the-art methods among 15 different testing configurations.}
}
@InProceedings{pmlr-v119-arora20a,
title = {Provable Representation Learning for Imitation Learning via Bi-level Optimization},
author = {Arora, Sanjeev and Du, Simon and Kakade, Sham and Luo, Yuping and Saunshi, Nikunj},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {367--376},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/arora20a/arora20a.pdf},
url = {http://proceedings.mlr.press/v119/arora20a.html},
abstract = {A common strategy in modern learning systems is to learn a representation that is useful for many tasks, a.k.a. representation learning. We study this strategy in the imitation learning setting for Markov decision processes (MDPs) where multiple experts' trajectories are available. We formulate representation learning as a bi-level optimization problem where the ``outer'' optimization tries to learn the joint representation and the ``inner'' optimization encodes the imitation learning setup and tries to learn task-specific parameters. We instantiate this framework for the imitation learning settings of behavior cloning and observation-alone. Theoretically, we show using our framework that representation learning can provide sample complexity benefits for imitation learning in both settings. We also provide proof-of-concept experiments to verify our theory.}
}
@InProceedings{pmlr-v119-arunachalam20a,
title = {Quantum Boosting},
author = {Arunachalam, Srinivasan and Maity, Reevu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {377--387},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/arunachalam20a/arunachalam20a.pdf},
url = {http://proceedings.mlr.press/v119/arunachalam20a.html},
abstract = {Boosting is a technique that boosts a weak and inaccurate machine learning algorithm into a strong accurate learning algorithm. The AdaBoost algorithm by Freund and Schapire (for which they were awarded the G{\"o}del prize in 2003) is one of the widely used boosting algorithms, with many applications in theory and practice. Suppose we have a gamma-weak learner for a Boolean concept class C that takes time R(C), then the time complexity of AdaBoost scales as VC(C)poly(R(C), 1/gamma), where VC(C) is the VC-dimension of C. In this paper, we show how quantum techniques can improve the time complexity of classical AdaBoost. To this end, suppose we have a gamma-weak quantum learning algorithm for a Boolean concept class C that takes time Q(C), we introduce a quantum boosting algorithm whose complexity scales as $\sqrt{VC(C)}$poly(Q(C),1/gamma); thereby achieving quadratic quantum improvement over classical AdaBoost in terms of VC(C).}
}
@InProceedings{pmlr-v119-ashtiani20a,
title = {Black-box Certification and Learning under Adversarial Perturbations},
author = {Ashtiani, Hassan and Pathak, Vinayak and Urner, Ruth},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {388--398},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ashtiani20a/ashtiani20a.pdf},
url = {http://proceedings.mlr.press/v119/ashtiani20a.html},
abstract = {We formally study the problem of classification under adversarial perturbations from a learner's perspective as well as a third-party who aims at certifying the robustness of a given black-box classifier. We analyze a PAC-type framework of semi-supervised learning and identify possibility and impossibility results for proper learning of VC-classes in this setting. We further introduce a new setting of black-box certification under limited query budget, and analyze this for various classes of predictors and perturbation. We also consider the viewpoint of a black-box adversary that aims at finding adversarial examples, showing that the existence of an adversary with polynomial query complexity can imply the existence of a sample efficient robust learner.}
}
@InProceedings{pmlr-v119-asim20a,
title = {Invertible generative models for inverse problems: mitigating representation error and dataset bias},
author = {Asim, Muhammad and Daniels, Max and Leong, Oscar and Ahmed, Ali and Hand, Paul},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {399--409},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/asim20a/asim20a.pdf},
url = {http://proceedings.mlr.press/v119/asim20a.html},
abstract = {Trained generative models have shown remarkable performance as priors for inverse problems in imaging -- for example, Generative Adversarial Network priors permit recovery of test images from 5-10x fewer measurements than sparsity priors. Unfortunately, these models may be unable to represent any particular image because of architectural choices, mode collapse, and bias in the training dataset. In this paper, we demonstrate that invertible neural networks, which have zero representation error by design, can be effective natural signal priors at inverse problems such as denoising, compressive sensing, and inpainting. Given a trained generative model, we study the empirical risk formulation of the desired inverse problem under a regularization that promotes high likelihood images, either directly by penalization or algorithmically by initialization. For compressive sensing, invertible priors can yield higher accuracy than sparsity priors across almost all undersampling ratios, and due to their lack of representation error, invertible priors can yield better reconstructions than GAN priors for images that have rare features of variation within the biased training set, including out-of-distribution natural images. We additionally compare performance for compressive sensing to unlearned methods, such as the deep decoder, and we establish theoretical bounds on expected recovery error in the case of a linear invertible model.}
}
@InProceedings{pmlr-v119-assran20a,
title = {On the Convergence of {Nesterov's} Accelerated Gradient Method in Stochastic Settings},
author = {Assran, Mahmoud and Rabbat, Mike},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {410--420},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/assran20a/assran20a.pdf},
url = {http://proceedings.mlr.press/v119/assran20a.html},
abstract = {We study Nesterov's accelerated gradient method with constant step-size and momentum parameters in the stochastic approximation setting (unbiased gradients with bounded variance) and the finite-sum setting (where randomness is due to sampling mini-batches). To build better insight into the behavior of Nesterov's method in stochastic settings, we focus throughout on objectives that are smooth, strongly-convex, and twice continuously differentiable. In the stochastic approximation setting, Nesterov's method converges to a neighborhood of the optimal point at the same accelerated rate as in the deterministic setting. Perhaps surprisingly, in the finite-sum setting, we prove that Nesterov's method may diverge with the usual choice of step-size and momentum, unless additional conditions on the problem related to conditioning and data coherence are satisfied. Our results shed light as to why Nesterov's method may fail to converge or achieve acceleration in the finite-sum setting.}
}
@InProceedings{pmlr-v119-atamturk20a,
title = {Safe screening rules for {L0}-regression from Perspective Relaxations},
author = {Atamturk, Alper and Gomez, Andres},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {421--430},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/atamturk20a/atamturk20a.pdf},
url = {http://proceedings.mlr.press/v119/atamturk20a.html},
abstract = {We give safe screening rules to eliminate variables from regression with $\ell_0$ regularization or cardinality constraint. These rules are based on guarantees that a feature may or may not be selected in an optimal solution. The screening rules can be computed from a convex relaxation solution in linear time, without solving the L0-optimization problem. Thus, they can be used in a preprocessing step to safely remove variables from consideration a priori. Numerical experiments on real and synthetic data indicate that a significant number of the variables can be removed quickly, hence reducing the computational burden for optimization substantially. Therefore, the proposed fast and effective screening rules extend the scope of algorithms for L0-regression to larger data sets.}
}
@InProceedings{pmlr-v119-awasthi20a,
title = {Adversarial Learning Guarantees for Linear Hypotheses and Neural Networks},
author = {Awasthi, Pranjal and Frank, Natalie and Mohri, Mehryar},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {431--441},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/awasthi20a/awasthi20a.pdf},
url = {http://proceedings.mlr.press/v119/awasthi20a.html},
abstract = {Adversarial or test time robustness measures the susceptibility of a classifier to perturbations to the test input. While there has been a flurry of recent work on designing defenses against such perturbations, the theory of adversarial robustness is not well understood. In order to make progress on this, we focus on the problem of understanding generalization in adversarial settings, via the lens of Rademacher complexity. We give upper and lower bounds for the adversarial empirical Rademacher complexity of linear hypotheses with adversarial perturbations measured in $l_r$-norm for an arbitrary $r \geq 1$. We then extend our analysis to provide Rademacher complexity lower and upper bounds for a single ReLU unit. Finally, we give adversarial Rademacher complexity bounds for feed-forward neural networks with one hidden layer.}
}
@InProceedings{pmlr-v119-axelrod20a,
title = {Sample Amplification: Increasing Dataset Size even when Learning is Impossible},
author = {Axelrod, Brian and Garg, Shivam and Sharan, Vatsal and Valiant, Gregory},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {442--451},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/axelrod20a/axelrod20a.pdf},
url = {http://proceedings.mlr.press/v119/axelrod20a.html},
abstract = {Given data drawn from an unknown distribution, D, to what extent is it possible to “amplify” this dataset and faithfully output an even larger set of samples that appear to have been drawn from D? We formalize this question as follows: an (n,m) amplification procedure takes as input n independent draws from an unknown distribution D, and outputs a set of m > n “samples” which must be indistinguishable from m samples drawn iid from D. We consider this sample amplification problem in two fundamental settings: the case where D is an arbitrary discrete distribution supported on k elements, and the case where D is a d-dimensional Gaussian with unknown mean, and fixed covariance matrix. Perhaps surprisingly, we show a valid amplification procedure exists for both of these settings, even in the regime where the size of the input dataset, n, is significantly less than what would be necessary to learn distribution D to non-trivial accuracy. We also show that our procedures are optimal up to constant factors. Beyond these results, we describe potential applications of such data amplification, and formalize a number of curious directions for future research along this vein.}
}
@InProceedings{pmlr-v119-axiotis20a,
title = {Sparse Convex Optimization via Adaptively Regularized Hard Thresholding},
author = {Axiotis, Kyriakos and Sviridenko, Maxim},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {452--462},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/axiotis20a/axiotis20a.pdf},
url = {http://proceedings.mlr.press/v119/axiotis20a.html},
abstract = {The goal of Sparse Convex Optimization is to optimize a convex function $f$ under a sparsity constraint $s\leq s^*\gamma$, where $s^*$ is the target number of non-zero entries in a feasible solution (sparsity) and $\gamma\geq 1$ is an approximation factor. There has been a lot of work to analyze the sparsity guarantees of various algorithms (LASSO, Orthogonal Matching Pursuit (OMP), Iterative Hard Thresholding (IHT)) in terms of the Restricted Condition Number $\kappa$. The best known algorithms guarantee to find an approximate solution of value $f(x^*)+\epsilon$ with the sparsity bound of $\gamma = O\left(\kappa\min\left\{\log \frac{f(x^0)-f(x^*)}{\epsilon}, \kappa\right\}\right)$, where $x^*$ is the target solution. We present a new Adaptively Regularized Hard Thresholding (ARHT) algorithm that makes significant progress on this problem by bringing the bound down to $\gamma=O(\kappa)$, which has been shown to be tight for a general class of algorithms including LASSO, OMP, and IHT. This is achieved without significant sacrifice in the runtime efficiency compared to the fastest known algorithms. We also provide a new analysis of OMP with Replacement (OMPR) for general $f$, under the condition $s > s^* \frac{\kappa^2}{4}$, which yields Compressed Sensing bounds under the Restricted Isometry Property (RIP). When compared to other Compressed Sensing approaches, it has the advantage of providing a strong tradeoff between the RIP condition and the solution sparsity, while working for any general function $f$ that meets the RIP condition.}
}
@InProceedings{pmlr-v119-ayoub20a,
title = {Model-Based Reinforcement Learning with Value-Targeted Regression},
author = {Ayoub, Alex and Jia, Zeyu and Szepesvari, Csaba and Wang, Mengdi and Yang, Lin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {463--474},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ayoub20a/ayoub20a.pdf},
url = {http://proceedings.mlr.press/v119/ayoub20a.html},
abstract = {This paper studies model-based reinforcement learning (RL) for regret minimization. We focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\mathcal{P}$, a special case of which is when models in $\mathcal{P}$ take the form of linear mixtures: $P_{\theta} = \sum_{i=1}^{d} \theta_{i}P_{i}$. We propose a model based RL algorithm that is based on the optimism principle: In each episode, the set of models that are ‘consistent’ with the data collected is constructed. The criterion of consistency is based on the total squared error that the model incurs on the task of predicting \emph{state values} as determined by the last value estimate along the transitions. The next value function is then chosen by solving the optimistic planning problem with the constructed set of models. We derive a bound on the regret, which, in the special case of linear mixtures, takes the form $\tilde{\mathcal{O}}(d\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $\theta$, respectively. In particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $\Omega(\sqrt{HdT})$. For a general model family $\mathcal{P}$, the regret bound is derived based on the Eluder dimension.}
}
@InProceedings{pmlr-v119-azencot20a,
title = {Forecasting Sequential Data Using Consistent {Koopman} Autoencoders},
author = {Azencot, Omri and Erichson, N. Benjamin and Lin, Vanessa and Mahoney, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {475--485},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/azencot20a/azencot20a.pdf},
url = {http://proceedings.mlr.press/v119/azencot20a.html},
abstract = {Recurrent neural networks are widely used on time series data, yet such models often ignore the underlying physical structures in such sequences. A new class of physics-based methods related to Koopman theory has been introduced, offering an alternative for processing nonlinear dynamical systems. In this work, we propose a novel Consistent Koopman Autoencoder model which, unlike the majority of existing work, leverages the forward and backward dynamics. Key to our approach is a new analysis which explores the interplay between consistent dynamics and their associated Koopman operators. Our network is directly related to the derived analysis, and its computational requirements are comparable to other baselines. We evaluate our method on a wide range of high-dimensional and short-term dependent problems, and it achieves accurate estimates for significant prediction horizons, while also being robust to noise.}
}
@InProceedings{pmlr-v119-bachmann20a,
title = {Constant Curvature Graph Convolutional Networks},
author = {Bachmann, Gregor and Becigneul, Gary and Ganea, Octavian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {486--496},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bachmann20a/bachmann20a.pdf},
url = {http://proceedings.mlr.press/v119/bachmann20a.html},
abstract = {Interest has been rising lately towards methods representing data in non-Euclidean spaces, e.g. hyperbolic or spherical that provide specific inductive biases useful for certain real-world data properties, e.g. scale-free, hierarchical or cyclical. However, the popular graph neural networks are currently limited in modeling data only via Euclidean geometry and associated vector space operations. Here, we bridge this gap by proposing mathematically grounded generalizations of graph convolutional networks (GCN) to (products of) constant curvature spaces. We do this by i) introducing a unified formalism permitting a differentiable interpolation between all geometries of constant curvature irrespective of their sign, ii) leveraging gyro-barycentric coordinates that generalize the classic Euclidean concept of the center of mass. Our class of models smoothly recover their Euclidean counterparts when the curvature goes to zero from either side. Empirically, we outperform Euclidean GCNs in the tasks of node classification and distortion minimization for symbolic data exhibiting non-Euclidean behavior, according to their discrete curvature.}
}
@InProceedings{pmlr-v119-backurs20a,
title = {Scalable Nearest Neighbor Search for Optimal Transport},
author = {Backurs, Arturs and Dong, Yihe and Indyk, Piotr and Razenshteyn, Ilya and Wagner, Tal},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {497--506},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/backurs20a/backurs20a.pdf},
url = {http://proceedings.mlr.press/v119/backurs20a.html},
abstract = {The Optimal Transport (a.k.a. Wasserstein) distance is an increasingly popular similarity measure for rich data domains, such as images or text documents. This raises the necessity for fast nearest neighbor search algorithms according to this distance, which poses a substantial computational bottleneck on massive datasets. In this work we introduce Flowtree, a fast and accurate approximation algorithm for the Wasserstein-1 distance. We formally analyze its approximation factor and running time. We perform extensive experimental evaluation of nearest neighbor search algorithms in the $W_1$ distance on real-world dataset. Our results show that compared to previous state of the art, Flowtree achieves up to 7.4 times faster running time.}
}
@InProceedings{pmlr-v119-badia20a,
title = {Agent57: Outperforming the {Atari} Human Benchmark},
author = {Badia, Adri{\`a} Puigdom{\`e}nech and Piot, Bilal and Kapturowski, Steven and Sprechmann, Pablo and Vitvitskyi, Alex and Guo, Zhaohan Daniel and Blundell, Charles},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {507--517},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/badia20a/badia20a.pdf},
url = {http://proceedings.mlr.press/v119/badia20a.html},
abstract = {Atari games have been a long-standing benchmark in the reinforcement learning (RL) community for the past decade. This benchmark was proposed to test general competency of RL algorithms. Previous work has achieved good average performance by doing outstandingly well on many games of the set, but very poorly in several of the most challenging games. We propose Agent57, the first deep RL agent that outperforms the standard human benchmark on all 57 Atari games. To achieve this result, we train a neural network which parameterizes a family of policies ranging from very exploratory to purely exploitative. We propose an adaptive mechanism to choose which policy to prioritize throughout the training process. Additionally, we utilize a novel parameterization of the architecture that allows for more consistent and stable learning.}
}
@InProceedings{pmlr-v119-bahar20a,
title = {Fiduciary Bandits},
author = {Bahar, Gal and Ben-Porat, Omer and Leyton-Brown, Kevin and Tennenholtz, Moshe},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {518--527},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bahar20a/bahar20a.pdf},
url = {http://proceedings.mlr.press/v119/bahar20a.html},
abstract = {Recommendation systems often face exploration-exploitation tradeoffs: the system can only learn about the desirability of new options by recommending them to some user. Such systems can thus be modeled as multi-armed bandit settings; however, users are self-interested and cannot be made to follow recommendations. We ask whether exploration can nevertheless be performed in a way that scrupulously respects agents’ interests—i.e., by a system that acts as a fiduciary. More formally, we introduce a model in which a recommendation system faces an exploration-exploitation tradeoff under the constraint that it can never recommend any action that it knows yields lower reward in expectation than an agent would achieve if it acted alone. Our main contribution is a positive result: an asymptotically optimal, incentive compatible, and ex-ante individually rational recommendation algorithm.}
}
@InProceedings{pmlr-v119-bahng20a,
title = {Learning De-biased Representations with Biased Representations},
author = {Bahng, Hyojin and Chun, Sanghyuk and Yun, Sangdoo and Choo, Jaegul and Oh, Seong Joon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {528--539},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bahng20a/bahng20a.pdf},
url = {http://proceedings.mlr.press/v119/bahng20a.html},
abstract = {Many machine learning algorithms are trained and evaluated by splitting data from a single source into training and test sets. While such focus on in-distribution learning scenarios has led to interesting advancement, it has not been able to tell if models are relying on dataset biases as shortcuts for successful prediction (e.g., using snow cues for recognising snowmobiles), resulting in biased models that fail to generalise when the bias shifts to a different class. The cross-bias generalisation problem has been addressed by de-biasing training data through augmentation or re-sampling, which are often prohibitive due to the data collection cost (e.g., collecting images of a snowmobile on a desert) and the difficulty of quantifying or expressing biases in the first place. In this work, we propose a novel framework to train a de-biased representation by encouraging it to be different from a set of representations that are biased by design. This tactic is feasible in many scenarios where it is much easier to define a set of biased representations than to define and quantify bias. We demonstrate the efficacy of our method across a variety of synthetic and real-world biases; our experiments show that the method discourages models from taking bias shortcuts, resulting in improved generalisation. Source code is available at https://github.com/clovaai/rebias.}
}
@InProceedings{pmlr-v119-bahri20a,
title = {Deep {k-NN} for Noisy Labels},
author = {Bahri, Dara and Jiang, Heinrich and Gupta, Maya},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {540--550},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bahri20a/bahri20a.pdf},
url = {http://proceedings.mlr.press/v119/bahri20a.html},
abstract = {Modern machine learning models are often trained on examples with noisy labels that hurt performance and are hard to identify. In this paper, we provide an empirical study showing that a simple $k$-nearest neighbor-based filtering approach on the logit layer of a preliminary model can remove mislabeled training data and produce more accurate models than many recently proposed methods. We also provide new statistical guarantees into its efficacy.}
}
@InProceedings{pmlr-v119-bai20a,
title = {Provable Self-Play Algorithms for Competitive Reinforcement Learning},
author = {Bai, Yu and Jin, Chi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {551--560},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bai20a/bai20a.pdf},
url = {http://proceedings.mlr.press/v119/bai20a.html},
abstract = {Self-play, where the algorithm learns by playing against itself without requiring any direct supervision, has become the new weapon in modern Reinforcement Learning (RL) for achieving superhuman performance in practice. However, the majority of existing theory in reinforcement learning only applies to the setting where the agent plays against a fixed environment; it remains largely open whether self-play algorithms can be provably effective, especially when it is necessary to manage the exploration/exploitation tradeoff. We study self-play in competitive reinforcement learning under the setting of Markov games, a generalization of Markov decision processes to the two-player case. We introduce a self-play algorithm—Value Iteration with Upper/Lower Confidence Bound (VI-ULCB)—and show that it achieves regret $\mathcal{\tilde{O}}(\sqrt{T})$ after playing $T$ steps of the game, where the regret is measured by the agent’s performance against a fully adversarial opponent who can exploit the agent’s strategy at any step. We also introduce an explore-then-exploit style algorithm, which achieves a slightly worse regret of $\mathcal{\tilde{O}}(T^{2/3})$, but is guaranteed to run in polynomial time even in the worst case. To the best of our knowledge, our work presents the first line of provably sample-efficient self-play algorithms for competitive reinforcement learning.}
}
@InProceedings{pmlr-v119-bai20b,
title = {Sparse Subspace Clustering with Entropy-Norm},
author = {Bai, Liang and Liang, Jiye},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {561--568},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bai20b/bai20b.pdf},
url = {http://proceedings.mlr.press/v119/bai20b.html},
abstract = {In this paper, we provide an explicit theoretical connection between Sparse subspace clustering (SSC) and spectral clustering (SC) from the perspective of learning a data similarity matrix. We show that spectral clustering with Gaussian kernel can be viewed as sparse subspace clustering with entropy-norm (SSC+E). Compared to SSC, SSC+E can obtain an analytical, symmetrical, nonnegative and nonlinearly-representational similarity matrix. Besides, SSC+E makes use of Gaussian kernel to compute the sparse similarity matrix of objects, which can avoid the complex computation of the sparse optimization program of SSC. Finally, we provide the experimental analysis to compare the efficiency and effectiveness of sparse subspace clustering and spectral clustering on ten benchmark data sets. The theoretical and experimental analysis can well help users for the selection of high-dimensional data clustering algorithms.}
}
@InProceedings{pmlr-v119-baker20a,
title = {Coresets for Clustering in Graphs of Bounded Treewidth},
author = {Baker, Daniel and Braverman, Vladimir and Huang, Lingxiao and Jiang, Shaofeng H.-C. and Krauthgamer, Robert and Wu, Xuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {569--579},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/baker20a/baker20a.pdf},
url = {http://proceedings.mlr.press/v119/baker20a.html},
abstract = {We initiate the study of coresets for clustering in graph metrics, i.e., the shortest-path metric of edge-weighted graphs. Such clustering problems are essential to data analysis and used for example in road networks and data visualization. A coreset is a compact summary of the data that approximately preserves the clustering objective for every possible center set, and it offers significant efficiency improvements in terms of running time, storage, and communication, including in streaming and distributed settings. Our main result is a near-linear time construction of a coreset for k-Median in a general graph $G$, with size $O_{\epsilon, k}(\mathrm{tw}(G))$ where $\mathrm{tw}(G)$ is the treewidth of $G$, and we complement the construction with a nearly-tight size lower bound. The construction is based on the framework of Feldman and Langberg [STOC 2011], and our main technical contribution, as required by this framework, is a uniform bound of $O(\mathrm{tw}(G))$ on the shattering dimension under any point weights. We validate our coreset on real-world road networks, and our scalable algorithm constructs tiny coresets with high accuracy, which translates to a massive speedup of existing approximation algorithms such as local search for graph k-Median.}
}
@InProceedings{pmlr-v119-balcan20a,
title = {Refined bounds for algorithm configuration: The knife-edge of dual class approximability},
author = {Balcan, Maria-Florina and Sandholm, Tuomas and Vitercik, Ellen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {580--590},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/balcan20a/balcan20a.pdf},
url = {http://proceedings.mlr.press/v119/balcan20a.html},
abstract = {Automating algorithm configuration is growing increasingly necessary as algorithms come with more and more tunable parameters. It is common to tune parameters using machine learning, optimizing algorithmic performance (runtime or solution quality, for example) using a training set of problem instances from the specific domain at hand. We investigate a fundamental question about these techniques: how large should the training set be to ensure that a parameter’s average empirical performance over the training set is close to its expected, future performance? We answer this question for algorithm configuration problems that exhibit a widely-applicable structure: the algorithm’s performance as a function of its parameters can be approximated by a “simple” function. We show that if this approximation holds under the L$\infty$-norm, we can provide strong sample complexity bounds, but if the approximation holds only under the Lp-norm for $p < \infty$, it is not possible to provide meaningful sample complexity bounds in the worst case. We empirically evaluate our bounds in the context of integer programming, obtaining sample complexity bounds that are up to 700 times smaller than the previously best-known bounds.}
}
@InProceedings{pmlr-v119-ball20a,
title = {{Ready Policy One}: World Building Through Active Learning},
author = {Ball, Philip and Parker-Holder, Jack and Pacchiano, Aldo and Choromanski, Krzysztof and Roberts, Stephen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {591--601},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ball20a/ball20a.pdf},
url = {http://proceedings.mlr.press/v119/ball20a.html},
abstract = {Model-Based Reinforcement Learning (MBRL) offers a promising direction for sample efficient learning, often achieving state of the art results for continuous control tasks. However many existing MBRL methods rely on combining greedy policies with exploration heuristics, and even those which utilize principled exploration bonuses construct dual objectives in an ad hoc fashion. In this paper we introduce Ready Policy One (RP1), a framework that views MBRL as an active learning problem, where we aim to improve the world model in the fewest samples possible. RP1 achieves this by utilizing a hybrid objective function, which crucially adapts during optimization, allowing the algorithm to trade off reward v.s. exploration at different stages of learning. In addition, we introduce a principled mechanism to terminate sample collection once we have a rich enough trajectory batch to improve the model. We rigorously evaluate our method on a variety of continuous control tasks, and demonstrate statistically significant gains over existing approaches.}
}
@InProceedings{pmlr-v119-ballu20a,
title = {Stochastic Optimization for Regularized {Wasserstein} Estimators},
author = {Ballu, Marin and Berthet, Quentin and Bach, Francis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {602--612},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ballu20a/ballu20a.pdf},
url = {http://proceedings.mlr.press/v119/ballu20a.html},
abstract = {Optimal transport is a foundational problem in optimization, that allows to compare probability distributions while taking into account geometric aspects. Its optimal objective value, the Wasserstein distance, provides an important loss between distributions that has been used in many applications throughout machine learning and statistics. Recent algorithmic progress on this problem and its regularized versions have made these tools increasingly popular. However, existing techniques require solving an optimization problem to obtain a single gradient of the loss, thus slowing down first-order methods to minimize the sum of losses, that require many such gradient computations. In this work, we introduce an algorithm to solve a regularized version of this problem of Wasserstein estimators, with a time per step which is sublinear in the natural dimensions of the problem. We introduce a dual formulation, and optimize it with stochastic gradient steps that can be computed directly from samples, without solving additional optimization problems at each step. Doing so, the estimation and computation tasks are performed jointly. We show that this algorithm can be extended to other tasks, including estimation of Wasserstein barycenters. We provide theoretical guarantees and illustrate the performance of our algorithm with experiments on synthetic data.}
}
@InProceedings{pmlr-v119-balseiro20a,
title = {Dual Mirror Descent for Online Allocation Problems},
author = {Balseiro, Santiago and Lu, Haihao and Mirrokni, Vahab},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {613--628},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/balseiro20a/balseiro20a.pdf},
url = {http://proceedings.mlr.press/v119/balseiro20a.html},
abstract = {We consider online allocation problems with concave revenue functions and resource constraints, which are central problems in revenue management and online advertising. In these settings, requests arrive sequentially during a finite horizon and, for each request, a decision maker needs to choose an action that consumes a certain amount of resources and generates revenue. The revenue function and resource consumption of each request are drawn independently and at random from a probability distribution that is unknown to the decision maker. The objective is to maximize cumulative revenues subject to a constraint on the total consumption of resources. We design a general class of algorithms that achieve sub-linear expected regret compared to the hindsight optimal allocation. Our algorithms operate in the Lagrangian dual space: they maintain a dual multiplier for each resource that is updated using online mirror descent. By choosing the reference function accordingly, we recover dual sub-gradient descent and dual exponential weights algorithm. The resulting algorithms are simple, efficient, and shown to attain the optimal order of regret when the length of the horizon and the initial number of resources are scaled proportionally. We discuss applications to online bidding in repeated auctions with budget constraints and online proportional matching with high entropy.}
}
@InProceedings{pmlr-v119-banerjee20a,
title = {Inductive-bias-driven Reinforcement Learning For Efficient Schedules in Heterogeneous Clusters},
author = {Banerjee, Subho and Jha, Saurabh and Kalbarczyk, Zbigniew and Iyer, Ravishankar},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {629--641},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/banerjee20a/banerjee20a.pdf},
url = {http://proceedings.mlr.press/v119/banerjee20a.html},
abstract = {The problem of scheduling of workloads onto heterogeneous processors (e.g., CPUs, GPUs, FPGAs) is of fundamental importance in modern data centers. Current system schedulers rely on application/system-specific heuristics that have to be built on a case-by-case basis. Recent work has demonstrated ML techniques for automating the heuristic search by using black-box approaches which require significant training data and time, which make them challenging to use in practice. This paper presents Symphony, a scheduling framework that addresses the challenge in two ways: (i) a domain-driven Bayesian reinforcement learning (RL) model for scheduling, which inherently models the resource dependencies identified from the system architecture; and (ii) a sampling-based technique to compute the gradients of a Bayesian model without performing full probabilistic inference. Together, these techniques reduce both the amount of training data and the time required to produce scheduling policies that significantly outperform black-box approaches by up to 2.2{\texttimes}.}
}
@InProceedings{pmlr-v119-bao20a,
title = {{UniLMv2}: Pseudo-Masked Language Models for Unified Language Model Pre-Training},
author = {Bao, Hangbo and Dong, Li and Wei, Furu and Wang, Wenhui and Yang, Nan and Liu, Xiaodong and Wang, Yu and Gao, Jianfeng and Piao, Songhao and Zhou, Ming and Hon, Hsiao-Wuen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {642--652},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bao20a/bao20a.pdf},
url = {http://proceedings.mlr.press/v119/bao20a.html},
abstract = {We propose to pre-train a unified language model for both autoencoding and partially autoregressive language modeling tasks using a novel training procedure, referred to as a pseudo-masked language model (PMLM). Given an input text with masked tokens, we rely on conventional masks to learn inter-relations between corrupted tokens and context via autoencoding, and pseudo masks to learn intra-relations between masked spans via partially autoregressive modeling. With well-designed position embeddings and self-attention masks, the context encodings are reused to avoid redundant computation. Moreover, conventional masks used for autoencoding provide global masking information, so that all the position embeddings are accessible in partially autoregressive language modeling. In addition, the two tasks pre-train a unified language model as a bidirectional encoder and a sequence-to-sequence decoder, respectively. Our experiments show that the unified language models pre-trained using PMLM achieve new state-of-the-art results on a wide range of language understanding and generation tasks across several widely used benchmarks. The code and pre-trained models are available at https://github.com/microsoft/unilm.}
}
@InProceedings{pmlr-v119-bao20b,
title = {Fast {OSCAR} and {OWL} Regression via Safe Screening Rules},
author = {Bao, Runxue and Gu, Bin and Huang, Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {653--663},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bao20b/bao20b.pdf},
url = {http://proceedings.mlr.press/v119/bao20b.html},
abstract = {Ordered Weighted $L_{1}$ (OWL) regularized regression is a new regression analysis for high-dimensional sparse learning. Proximal gradient methods are used as standard approaches to solve OWL regression. However, it is still a burning issue to solve OWL regression due to considerable computational cost and memory usage when the feature or sample size is large. In this paper, we propose the first safe screening rule for OWL regression by exploring the order of the primal solution with the unknown order structure via an iterative strategy, which overcomes the difficulties of tackling the non-separable regularizer. It effectively avoids the updates of the parameters whose coefficients must be zero during the learning process. More importantly, the proposed screening rule can be easily applied to standard and stochastic proximal gradient methods. Moreover, we prove that the algorithms with our screening rule are guaranteed to have identical results with the original algorithms. Experimental results on a variety of datasets show that our screening rule leads to a significant computational gain without any loss of accuracy, compared to existing competitive algorithms.}
}
@InProceedings{pmlr-v119-bar20a,
title = {Option Discovery in the Absence of Rewards with Manifold Analysis},
author = {Bar, Amitay and Talmon, Ronen and Meir, Ron},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {664--674},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bar20a/bar20a.pdf},
url = {http://proceedings.mlr.press/v119/bar20a.html},
abstract = {Options have been shown to be an effective tool in reinforcement learning, facilitating improved exploration and learning. In this paper, we present an approach based on spectral graph theory and derive an algorithm that systematically discovers options without access to a specific reward or task assignment. As opposed to the common practice used in previous methods, our algorithm makes full use of the spectrum of the graph Laplacian. Incorporating modes associated with higher graph frequencies unravels domain subtleties, which are shown to be useful for option discovery. Using geometric and manifold-based analysis, we present a theoretical justification for the algorithm. In addition, we showcase its performance in several domains, demonstrating clear improvements compared to competing methods.}
}
@InProceedings{pmlr-v119-bars20a,
title = {Learning the piece-wise constant graph structure of a varying {Ising} model},
author = {Le Bars, Batiste and Humbert, Pierre and Kalogeratos, Argyris and Vayatis, Nicolas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {675--684},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bars20a/bars20a.pdf},
url = {http://proceedings.mlr.press/v119/bars20a.html},
abstract = {This work focuses on the estimation of multiple change-points in a time-varying Ising model that evolves piece-wise constantly. The aim is to identify both the moments at which significant changes occur in the Ising model, as well as the underlying graph structures. For this purpose, we propose to estimate the neighborhood of each node by maximizing a penalized version of its conditional log-likelihood. The objective of the penalization is twofold: it imposes sparsity in the learned graphs and, thanks to a fused-type penalty, it also enforces them to evolve piece-wise constantly. Using few assumptions, we provide two change-points consistency theorems. Those are the first in the context of unknown number of change-points detection in time-varying Ising model. Finally, experimental results on several synthetic datasets and a real-world dataset demonstrate the performance of our method.}
}
@InProceedings{pmlr-v119-basri20a,
title = {Frequency Bias in Neural Networks for Input of Non-Uniform Density},
author = {Basri, Ronen and Galun, Meirav and Geifman, Amnon and Jacobs, David and Kasten, Yoni and Kritchman, Shira},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {685--694},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/basri20a/basri20a.pdf},
url = {http://proceedings.mlr.press/v119/basri20a.html},
abstract = {Recent works have partly attributed the generalization ability of over-parameterized neural networks to frequency bias -- networks trained with gradient descent on data drawn from a uniform distribution find a low frequency fit before high frequency ones. As realistic training sets are not drawn from a uniform distribution, we here use the Neural Tangent Kernel (NTK) model to explore the effect of variable density on training dynamics. Our results, which combine analytic and empirical observations, show that when learning a pure harmonic function of frequency $\kappa$, convergence at a point $x \in \mathbb{S}^{d-1}$ occurs in time $O(\kappa^d/p(x))$ where $p(x)$ denotes the local density at $x$. Specifically, for data in $\mathbb{S}^1$ we analytically derive the eigenfunctions of the kernel associated with the NTK for two-layer networks. We further prove convergence results for deep, fully connected networks with respect to the spectral decomposition of the NTK. Our empirical study highlights similarities and differences between deep and shallow networks in this model.}
}
@InProceedings{pmlr-v119-bassily20a,
title = {Private Query Release Assisted by Public Data},
author = {Bassily, Raef and Cheu, Albert and Moran, Shay and Nikolov, Aleksandar and Ullman, Jonathan and Wu, Steven},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {695--703},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bassily20a/bassily20a.pdf},
url = {http://proceedings.mlr.press/v119/bassily20a.html},
abstract = {We study the problem of differentially private query release assisted by access to public data. In this problem, the goal is to answer a large class $\mathcal{H}$ of statistical queries with error no more than $\alpha$ using a combination of public and private samples. The algorithm is required to satisfy differential privacy only with respect to the private samples. We study the limits of this task in terms of the private and public sample complexities. Our upper and lower bounds on the private sample complexity have matching dependence on the dual VC-dimension of $\mathcal{H}$. For a large category of query classes, our bounds on the public sample complexity have matching dependence on $\alpha$.}
}
@InProceedings{pmlr-v119-basu20a,
title = {{ECLIPSE}: An Extreme-Scale Linear Program Solver for Web-Applications},
author = {Basu, Kinjal and Ghoting, Amol and Mazumder, Rahul and Pan, Yao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {704--714},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/basu20a/basu20a.pdf},
url = {http://proceedings.mlr.press/v119/basu20a.html},
abstract = {Key problems arising in web applications (with millions of users and thousands of items) can be formulated as linear programs involving billions to trillions of decision variables and constraints. Despite the appeal of linear program (LP) formulations, solving problems at these scales appear to be well beyond the capabilities of existing LP solvers. Often ad-hoc decomposition rules are used to approximately solve these LPs, which have limited optimality guarantees and may lead to sub-optimal performance in practice. In this work, we propose a distributed solver that solves a perturbation of the LP problems at scale via a gradient-based algorithm on the smooth dual of the perturbed LP. The main workhorses of our algorithm are distributed matrix-vector multiplications (with load balancing) and efficient projection operations on distributed machines. Experiments on real-world data show that our proposed LP solver, ECLIPSE, can solve problems with $10^{12}$ decision variables -- well beyond the capabilities of current solvers.}
}
@InProceedings{pmlr-v119-basu20b,
title = {On Second-Order Group Influence Functions for Black-Box Predictions},
author = {Basu, Samyadeep and You, Xuchen and Feizi, Soheil},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {715--724},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/basu20b/basu20b.pdf},
url = {http://proceedings.mlr.press/v119/basu20b.html},
abstract = {With the rapid adoption of machine learning systems in sensitive applications, there is an increasing need to make black-box models explainable. Often we want to identify an influential group of training samples in a particular test prediction for a given machine learning model. Existing influence functions tackle this problem by using first-order approximations of the effect of removing a sample from the training set on model parameters. To compute the influence of a group of training samples (rather than an individual point) in model predictions, the change in optimal model parameters after removing that group from the training set can be large. Thus, in such cases, the first-order approximation can be loose. In this paper, we address this issue and propose second-order influence functions for identifying influential groups in test-time predictions. For linear models, across different sizes and types of groups, we show that using the proposed second-order influence function improves the correlation between the computed influence values and the ground truth ones. We also show that second-order influence functions could be used with optimization techniques to improve the selection of the most influential group for a test-sample.}
}
@InProceedings{pmlr-v119-belhadji20a,
title = {Kernel interpolation with continuous volume sampling},
author = {Belhadji, Ayoub and Bardenet, R{\'e}mi and Chainais, Pierre},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {725--735},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/belhadji20a/belhadji20a.pdf},
url = {http://proceedings.mlr.press/v119/belhadji20a.html},
abstract = {A fundamental task in kernel methods is to pick nodes and weights, so as to approximate a given function from an RKHS by the weighted sum of kernel translates located at the nodes. This is the crux of kernel density estimation, kernel quadrature, or interpolation from discrete samples. Furthermore, RKHSs offer a convenient mathematical and computational framework. We introduce and analyse continuous volume sampling (VS), the continuous counterpart -for choosing node locations- of a discrete distribution introduced in (Deshpande \& Vempala, 2006). Our contribution is theoretical: we prove almost optimal bounds for interpolation and quadrature under VS. While similar bounds already exist for some specific RKHSs using ad-hoc node constructions, VS offers bounds that apply to any Mercer kernel and depend on the spectrum of the associated integration operator. We emphasize that, unlike previous randomized approaches that rely on regularized leverage scores or determinantal point processes, evaluating the pdf of VS only requires pointwise evaluations of the kernel. VS is thus naturally amenable to MCMC samplers.}
}
@InProceedings{pmlr-v119-belilovsky20a,
title = {Decoupled Greedy Learning of {CNNs}},
author = {Belilovsky, Eugene and Eickenberg, Michael and Oyallon, Edouard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {736--745},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/belilovsky20a/belilovsky20a.pdf},
url = {http://proceedings.mlr.press/v119/belilovsky20a.html},
abstract = {A commonly cited inefficiency of neural network training by back-propagation is the update locking problem: each layer must wait for the signal to propagate through the network before updating. In recent years multiple authors have considered alternatives that can alleviate this issue. In this context, we consider a simpler, but more effective, substitute that uses minimal feedback, which we call Decoupled Greedy Learning (DGL). It is based on a greedy relaxation of the joint training objective, recently shown to be effective in the context of Convolutional Neural Networks (CNNs) on large-scale image classification. We consider an optimization of this objective that permits us to decouple the layer training, allowing for layers or modules in networks to be trained with a potentially linear parallelization in layers. We show theoretically and empirically that this approach converges. Then, we empirically find that it can lead to better generalization than sequential greedy optimization and sometimes end-to-end back-propagation. We show an extension of this approach to asynchronous settings, where modules can operate with large communication delays, is possible with the use of a replay buffer. We demonstrate the effectiveness of DGL on the CIFAR-10 dataset against alternatives and on the large-scale ImageNet dataset.}
}
@InProceedings{pmlr-v119-bellec20a,
title = {The Cost-free Nature of Optimally Tuning {Tikhonov} Regularizers and Other Ordered Smoothers},
author = {Bellec, Pierre and Yang, Dana},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {746--755},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bellec20a/bellec20a.pdf},
url = {http://proceedings.mlr.press/v119/bellec20a.html},
abstract = {We consider the problem of selecting the best estimator among a family of Tikhonov regularized estimators, or, alternatively, to select a linear combination of these regularizers that is as good as the best regularizer in the family. Our theory reveals that if the Tikhonov regularizers share the same penalty matrix with different tuning parameters, a convex procedure based on $Q$-aggregation achieves the mean square error of the best estimator, up to a small error term no larger than $C\sigma^2$, where $\sigma^2$ is the noise level and $C>0$ is an absolute constant. Remarkably, the error term does not depend on the penalty matrix or the number of estimators as long as they share the same penalty matrix, i.e., it applies to any grid of tuning parameters, no matter how large the cardinality of the grid is. This reveals the surprising ``cost-free'' nature of optimally tuning Tikhonov regularizers, in striking contrast with the existing literature on aggregation of estimators where one typically has to pay a cost of $\sigma^2\log(M)$ where $M$ is the number of estimators in the family. The result holds, more generally, for any family of ordered linear smoothers; this encompasses Ridge regression as well as Principal Component Regression. The result is extended to the problem of tuning Tikhonov regularizers with different penalty matrices.}
}
@InProceedings{pmlr-v119-bender20a,
title = {Defense Through Diverse Directions},
author = {Bender, Christopher and Li, Yang and Shi, Yifeng and Reiter, Michael K. and Oliva, Junier},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {756--766},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bender20a/bender20a.pdf},
url = {http://proceedings.mlr.press/v119/bender20a.html},
abstract = {In this work we develop a novel Bayesian neural network methodology to achieve strong adversarial robustness without the need for online adversarial training. Unlike previous efforts in this direction, we do not rely solely on the stochasticity of network weights by minimizing the divergence between the learned parameter distribution and a prior. Instead, we additionally require that the model maintain some expected uncertainty with respect to all input covariates. We demonstrate that by encouraging the network to distribute evenly across inputs, the network becomes less susceptible to localized, brittle features which imparts a natural robustness to targeted perturbations. We show empirical robustness on several benchmark datasets.}
}
@InProceedings{pmlr-v119-bengio20a,
title = {Interference and Generalization in Temporal Difference Learning},
author = {Bengio, Emmanuel and Pineau, Joelle and Precup, Doina},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {767--777},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bengio20a/bengio20a.pdf},
url = {http://proceedings.mlr.press/v119/bengio20a.html},
abstract = {We study the link between generalization and interference in temporal-difference (TD) learning. Interference is defined as the inner product of two different gradients, representing their alignment; this quantity emerges as being of interest from a variety of observations about neural networks, parameter sharing and the dynamics of learning. We find that TD easily leads to low-interference, under-generalizing parameters, while the effect seems reversed in supervised learning. We hypothesize that the cause can be traced back to the interplay between the dynamics of interference and bootstrapping. This is supported empirically by several observations: the negative relationship between the generalization gap and interference in TD, the negative effect of bootstrapping on interference and the local coherence of targets, and the contrast between the propagation rate of information in TD(0) versus TD($\lambda$) and regression tasks such as Monte-Carlo policy evaluation. We hope that these new findings can guide the future discovery of better bootstrapping methods.}
}
@InProceedings{pmlr-v119-bengs20a,
title = {Preselection Bandits},
author = {Bengs, Viktor and H{\"u}llermeier, Eyke},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {778--787},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bengs20a/bengs20a.pdf},
url = {http://proceedings.mlr.press/v119/bengs20a.html},
abstract = {In this paper, we introduce the Preselection Bandit problem, in which the learner preselects a subset of arms (choice alternatives) for a user, which then chooses the final arm from this subset. The learner is not aware of the user's preferences, but can learn them from observed choices. In our concrete setting, we allow these choices to be stochastic and model the user's actions by means of the Plackett-Luce model. The learner's main task is to preselect subsets that eventually lead to highly preferred choices. To formalize this goal, we introduce a reasonable notion of regret and derive lower bounds on the expected regret. Moreover, we propose algorithms for which the upper bound on expected regret matches the lower bound up to a logarithmic term of the time horizon.}
}
@InProceedings{pmlr-v119-bennett20a,
title = {Efficient Policy Learning from Surrogate-Loss Classification Reductions},
author = {Bennett, Andrew and Kallus, Nathan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {788--798},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bennett20a/bennett20a.pdf},
url = {http://proceedings.mlr.press/v119/bennett20a.html},
abstract = {Recent work on policy learning from observational data has highlighted the importance of efficient policy evaluation and has proposed reductions to weighted (cost-sensitive) classification. But, efficient policy evaluation need not yield efficient estimation of policy parameters. We consider the estimation problem given by a weighted surrogate-loss classification with any score function, either direct, inverse-propensity-weighted, or doubly robust. We show that, under a correct specification assumption, the weighted classification formulation need not be efficient for policy parameters. We draw a contrast to actual (possibly weighted) binary classification, where correct specification implies a parametric model, while for policy learning it only implies a semi-parametric model. In light of this, we instead propose an estimation approach based on generalized method of moments, which is efficient for the policy parameters. We propose a particular method based on recent developments on solving moment problems using neural networks and demonstrate the efficiency and regret benefits of this method empirically.}
}
@InProceedings{pmlr-v119-berrada20a,
title = {Training Neural Networks for and by Interpolation},
author = {Berrada, Leonard and Zisserman, Andrew and Kumar, M. Pawan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {799--809},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/berrada20a/berrada20a.pdf},
url = {http://proceedings.mlr.press/v119/berrada20a.html},
abstract = {In modern supervised learning, many deep neural networks are able to interpolate the data: the empirical loss can be driven to near zero on all samples simultaneously. In this work, we explicitly exploit this interpolation property for the design of a new optimization algorithm for deep learning, which we term Adaptive Learning-rates for Interpolation with Gradients (ALI-G). ALI-G retains the two main advantages of Stochastic Gradient Descent (SGD), which are (i) a low computational cost per iteration and (ii) good generalization performance in practice. At each iteration, ALI-G exploits the interpolation property to compute an adaptive learning-rate in closed form. In addition, ALI-G clips the learning-rate to a maximal value, which we prove to be helpful for non-convex problems. Crucially, in contrast to the learning-rate of SGD, the maximal learning-rate of ALI-G does not require a decay schedule. This makes ALI-G considerably easier to tune than SGD. We prove the convergence of ALI-G in various stochastic settings. Notably, we tackle the realistic case where the interpolation property is satisfied up to some tolerance. We also provide experiments on a variety of deep learning architectures and tasks: (i) learning a differentiable neural computer; (ii) training a wide residual network on the SVHN data set; (iii) training a Bi-LSTM on the SNLI data set; and (iv) training wide residual networks and densely connected networks on the CIFAR data sets. ALI-G produces state-of-the-art results among adaptive methods, and even yields comparable performance with SGD, which requires manually tuned learning-rate schedules. Furthermore, ALI-G is simple to implement in any standard deep learning framework and can be used as a drop-in replacement in existing code.}
}
@InProceedings{pmlr-v119-bertrand20a,
title = {Implicit differentiation of {Lasso}-type models for hyperparameter optimization},
author = {Bertrand, Quentin and Klopfenstein, Quentin and Blondel, Mathieu and Vaiter, Samuel and Gramfort, Alexandre and Salmon, Joseph},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {810--821},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bertrand20a/bertrand20a.pdf},
url = {http://proceedings.mlr.press/v119/bertrand20a.html},
abstract = {Setting regularization parameters for Lasso-type estimators is notoriously difficult, though crucial for obtaining the best accuracy. The most popular hyperparameter optimization approach is grid-search on a held-out dataset. However, grid-search requires to choose a predefined grid of parameters and scales exponentially in the number of parameters. Another class of approaches casts hyperparameter optimization as a bi-level optimization problem, typically solved by gradient descent. The key challenge for these approaches is the estimation of the gradient w.r.t. the hyperparameters. Computing that gradient via forward or backward automatic differentiation usually suffers from high memory consumption, while implicit differentiation typically involves solving a linear system which can be prohibitive and numerically unstable. In addition, implicit differentiation usually assumes smooth loss functions, which is not the case of Lasso-type problems. This work introduces an efficient implicit differentiation algorithm, without matrix inversion, tailored for Lasso-type problems. Our proposal scales to high-dimensional data by leveraging the sparsity of the solutions. Empirically, we demonstrate that the proposed method outperforms a large number of standard methods for hyperparameter optimization.}
}
@InProceedings{pmlr-v119-bhaskara20a,
title = {Online Learning with Imperfect Hints},
author = {Bhaskara, Aditya and Cutkosky, Ashok and Kumar, Ravi and Purohit, Manish},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {822--831},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bhaskara20a/bhaskara20a.pdf},
url = {http://proceedings.mlr.press/v119/bhaskara20a.html},
abstract = {We consider a variant of the classical online linear optimization problem in which at every step, the online player receives a ``hint'' vector before choosing the action for that round. Rather surprisingly, it was shown that if the hint vector is guaranteed to have a positive correlation with the cost vector, then the online player can achieve a regret of $O(\log T)$, thus significantly improving over the $O(\sqrt{T})$ regret in the general setting. However, the result and analysis require the correlation property at \emph{all} time steps, thus raising the natural question: can we design online learning algorithms that are resilient to bad hints? In this paper we develop algorithms and nearly matching lower bounds for online learning with imperfect hints. Our algorithms are oblivious to the quality of the hints, and the regret bounds interpolate between the always-correlated hints case and the no-hints case. Our results also generalize, simplify, and improve upon previous results on optimistic regret bounds, which can be viewed as an additive version of hints.}
}
@InProceedings{pmlr-v119-bhattacharjee20a,
title = {When are Non-Parametric Methods Robust?},
author = {Bhattacharjee, Robi and Chaudhuri, Kamalika},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {832--841},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bhattacharjee20a/bhattacharjee20a.pdf},
url = {http://proceedings.mlr.press/v119/bhattacharjee20a.html},
abstract = {A growing body of research has shown that many classifiers are susceptible to adversarial examples – small strategic modifications to test inputs that lead to misclassification. In this work, we study general non-parametric methods, with a view towards understanding when they are robust to these modifications. We establish general conditions under which non-parametric methods are r-consistent – in the sense that they converge to optimally robust and accurate classifiers in the large sample limit. Concretely, our results show that when data is well-separated, nearest neighbors and kernel classifiers are r-consistent, while histograms are not. For general data distributions, we prove that preprocessing by Adversarial Pruning (Yang et al., 2019) – that makes data well-separated – followed by nearest neighbors or kernel classifiers also leads to r-consistency.}
}
@InProceedings{pmlr-v119-bhattacharyya20a,
title = {Learning and Sampling of Atomic Interventions from Observations},
author = {Bhattacharyya, Arnab and Gayen, Sutanu and Kandasamy, Saravanan and Maran, Ashwin and Variyam, Vinodchandran N.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {842--853},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bhattacharyya20a/bhattacharyya20a.pdf},
url = {http://proceedings.mlr.press/v119/bhattacharyya20a.html},
abstract = {We study the problem of efficiently estimating the effect of an intervention on a single variable using observational samples. Our goal is to give algorithms with polynomial time and sample complexity in a non-parametric setting. Tian and Pearl (AAAI ’02) have exactly characterized the class of causal graphs for which causal effects of atomic interventions can be identified from observational data. We make their result quantitative. Suppose $\mathcal{P}$ is a causal model on a set V of n observable variables with respect to a given causal graph G, and let do(x) be an identifiable intervention on a variable X. We show that assuming that G has bounded in-degree and bounded c-components (k) and that the observational distribution satisfies a strong positivity condition: (i) [Evaluation] There is an algorithm that outputs with probability 2/3 an evaluator for a distribution $\hat{P}$ that satisfies $\mathrm{TV}(P(V \mid \mathrm{do}(x)), \hat{P}(V)) < \epsilon$ using $m = O(n/\epsilon^2)$ samples from P and O(mn) time. The evaluator can return in O(n) time the probability $\hat{P}(v)$ for any assignment v to V. (ii) [Sampling] There is an algorithm that outputs with probability 2/3 a sampler for a distribution $\hat{P}$ that satisfies $\mathrm{TV}(P(V \mid \mathrm{do}(x)), \hat{P}(V)) < \epsilon$ using $m = O(n/\epsilon^2)$ samples from P and O(mn) time. The sampler returns an iid sample from $\hat{P}$ with probability 1 in O(n) time. We extend our techniques to estimate $P(Y \mid \mathrm{do}(x))$ for a subset Y of variables of interest. We also show lower bounds for the sample complexity, demonstrating that our sample complexity has optimal dependence on the parameters n and $\epsilon$, as well as if k=1 on the strong positivity parameter.}
}
@InProceedings{pmlr-v119-bhattacharyya20b,
title = {Near-optimal sample complexity bounds for learning Latent {$k$}-polytopes and applications to Ad-Mixtures},
author = {Bhattacharyya, Chiranjib and Kannan, Ravindran},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {854--863},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bhattacharyya20b/bhattacharyya20b.pdf},
url = {http://proceedings.mlr.press/v119/bhattacharyya20b.html},
abstract = {Deriving Optimal bounds on Sample Complexity of Latent Variable models is an active area of research. Recently such bounds were obtained for Mixture of Gaussians [HSNCAY18], but no such results are known for Ad-mixtures, a generalization of Mixture distributions. In this paper we show that $O^*(dk/m)$ samples are sufficient to learn each of $k$ topic vectors of LDA, a popular Ad-mixture model, with vocabulary size $d$ and $m\in \Omega(1)$ words per document, to any constant error in $L_1$ norm. The result is a corollary of the major contribution of this paper: the first sample complexity upper bound for the problem (introduced in [BK20]) of learning the vertices of a Latent $k$-Polytope in $\mathbb{R}^d$, given perturbed points from it. The bound, $O^*(dk/\beta)$, is optimal and linear in number of parameters. It applies to many stochastic models including a broad class of Ad-mixtures. To demonstrate the generality of the approach we specialize the setting to Mixed Membership Stochastic Block Models (MMSB) and show for the first time that if an MMSB has $k$ blocks, the sample complexity is $O^*(k^2)$ under usual assumptions.}
}
@InProceedings{pmlr-v119-bhojanapalli20a,
title = {Low-Rank Bottleneck in Multi-head Attention Models},
author = {Bhojanapalli, Srinadh and Yun, Chulhee and Rawat, Ankit Singh and Reddi, Sashank and Kumar, Sanjiv},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {864--873},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bhojanapalli20a/bhojanapalli20a.pdf},
url = {http://proceedings.mlr.press/v119/bhojanapalli20a.html},
abstract = {Attention based Transformer architecture has enabled significant advances in the field of natural language processing. In addition to new pre-training techniques, recent improvements crucially rely on working with a relatively larger embedding dimension for tokens. Unfortunately, this leads to models that are prohibitively large to be employed in the downstream tasks. In this paper we identify one of the important factors contributing to the large embedding size requirement. In particular, our analysis highlights that the scaling between the number of heads and the size of each head in the current architecture gives rise to a low-rank bottleneck in attention heads, causing this limitation. We further validate this in our experiments. As a solution we propose to set the head size of an attention unit to input sequence length, and independent of the number of heads, resulting in multi-head attention layers with provably more expressive power. We empirically show that this allows us to train models with a relatively smaller embedding dimension and with better performance scaling.}
}
@InProceedings{pmlr-v119-bianchi20a,
title = {Spectral Clustering with Graph Neural Networks for Graph Pooling},
author = {Bianchi, Filippo Maria and Grattarola, Daniele and Alippi, Cesare},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {874--883},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bianchi20a/bianchi20a.pdf},
url = {http://proceedings.mlr.press/v119/bianchi20a.html},
abstract = {Spectral clustering (SC) is a popular clustering technique to find strongly connected communities on a graph. SC can be used in Graph Neural Networks (GNNs) to implement pooling operations that aggregate nodes belonging to the same cluster. However, the eigendecomposition of the Laplacian is expensive and, since clustering results are graph-specific, pooling methods based on SC must perform a new optimization for each new sample. In this paper, we propose a graph clustering approach that addresses these limitations of SC. We formulate a continuous relaxation of the normalized minCUT problem and train a GNN to compute cluster assignments that minimize this objective. Our GNN-based implementation is differentiable, does not require to compute the spectral decomposition, and learns a clustering function that can be quickly evaluated on out-of-sample graphs. From the proposed clustering method, we design a graph pooling operator that overcomes some important limitations of state-of-the-art graph pooling techniques and achieves the best performance in several supervised and unsupervised tasks.}
}
@InProceedings{pmlr-v119-bica20a,
title = {Time Series Deconfounder: Estimating Treatment Effects over Time in the Presence of Hidden Confounders},
author = {Bica, Ioana and Alaa, Ahmed and van der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {884--895},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bica20a/bica20a.pdf},
url = {http://proceedings.mlr.press/v119/bica20a.html},
abstract = {The estimation of treatment effects is a pervasive problem in medicine. Existing methods for estimating treatment effects from longitudinal observational data assume that there are no hidden confounders, an assumption that is not testable in practice and, if it does not hold, leads to biased estimates. In this paper, we develop the Time Series Deconfounder, a method that leverages the assignment of multiple treatments over time to enable the estimation of treatment effects in the presence of multi-cause hidden confounders. The Time Series Deconfounder uses a novel recurrent neural network architecture with multitask output to build a factor model over time and infer latent variables that render the assigned treatments conditionally independent; then, it performs causal inference using these latent variables that act as substitutes for the multi-cause unobserved confounders. We provide a theoretical analysis for obtaining unbiased causal effects of time-varying exposures using the Time Series Deconfounder. Using both simulated and real data we show the effectiveness of our method in deconfounding the estimation of treatment responses over time.}
}
@InProceedings{pmlr-v119-bielik20a,
title = {Adversarial Robustness for Code},
author = {Bielik, Pavol and Vechev, Martin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {896--907},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bielik20a/bielik20a.pdf},
url = {http://proceedings.mlr.press/v119/bielik20a.html},
abstract = {Machine learning and deep learning in particular has been recently used to successfully address many tasks in the domain of code such as finding and fixing bugs, code completion, decompilation, type inference and many others. However, the issue of adversarial robustness of models for code has gone largely unnoticed. In this work, we explore this issue by: (i) instantiating adversarial attacks for code (a domain with discrete and highly structured inputs), (ii) showing that, similar to other domains, neural models for code are vulnerable to adversarial attacks, and (iii) combining existing and novel techniques to improve robustness while preserving high accuracy.}
}
@InProceedings{pmlr-v119-bierkens20a,
title = {The Boomerang Sampler},
author = {Bierkens, Joris and Grazzi, Sebastiano and Kamatani, Kengo and Roberts, Gareth},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {908--918},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bierkens20a/bierkens20a.pdf},
url = {http://proceedings.mlr.press/v119/bierkens20a.html},
abstract = {This paper introduces the boomerang sampler as a novel class of continuous-time non-reversible Markov chain Monte Carlo algorithms. The methodology begins by representing the target density as a density, $e^{-U}$, with respect to a prescribed (usually) Gaussian measure and constructs a continuous trajectory consisting of a piecewise circular path. The method moves from one circular orbit to another according to a rate function which can be written in terms of $U$. We demonstrate that the method is easy to implement and demonstrate empirically that it can out-perform existing benchmark piecewise deterministic Markov processes such as the bouncy particle sampler and the Zig-Zag. In the Bayesian statistics context, these competitor algorithms are of substantial interest in the large data context due to the fact that they can adopt data subsampling techniques which are exact (i.e., induce no error in the stationary distribution). We demonstrate theoretically and empirically that we can also construct a control-variate subsampling boomerang sampler which is also exact, and which possesses remarkable scaling properties in the large data limit. We furthermore illustrate a factorised version on the simulation of diffusion bridges.}
}
@InProceedings{pmlr-v119-bilodeau20a,
title = {Tight Bounds on Minimax Regret under Logarithmic Loss via Self-Concordance},
author = {Bilodeau, Blair and Foster, Dylan and Roy, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {919--929},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bilodeau20a/bilodeau20a.pdf},
url = {http://proceedings.mlr.press/v119/bilodeau20a.html},
abstract = {We consider the classical problem of sequential probability assignment under logarithmic loss while competing against an arbitrary, potentially nonparametric class of experts. We obtain tight bounds on the minimax regret via a new approach that exploits the self-concordance property of the logarithmic loss. We show that for any expert class with (sequential) metric entropy $\mathcal{O}(\gamma^{-p})$ at scale $\gamma$, the minimax regret is $\mathcal{O}(n^{\frac{p}{p+1}})$, and that this rate cannot be improved without additional assumptions on the expert class under consideration. As an application of our techniques, we resolve the minimax regret for nonparametric Lipschitz classes of experts.}
}
@InProceedings{pmlr-v119-bistritz20a,
title = {My Fair Bandit: Distributed Learning of Max-Min Fairness with Multi-player Bandits},
author = {Bistritz, Ilai and Baharav, Tavor and Leshem, Amir and Bambos, Nicholas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {930--940},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bistritz20a/bistritz20a.pdf},
url = {http://proceedings.mlr.press/v119/bistritz20a.html},
abstract = {Consider N cooperative but non-communicating players where each plays one out of M arms for T turns. Players have different utilities for each arm, representable as an NxM matrix. These utilities are unknown to the players. In each turn players receive noisy observations of their utility for their selected arm. However, if any other players selected the same arm that turn, they will all receive zero utility due to the conflict. No other communication or coordination between the players is possible. Our goal is to design a distributed algorithm that learns the matching between players and arms that achieves max-min fairness while minimizing the regret. We present an algorithm and prove that it is regret optimal up to a $\log\log T$ factor. This is the first max-min fairness multi-player bandit algorithm with (near) order optimal regret.}
}
@InProceedings{pmlr-v119-blanc20a,
title = {Provable guarantees for decision tree induction: the agnostic setting},
author = {Blanc, Guy and Lange, Jane and Tan, Li-Yang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {941--949},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/blanc20a/blanc20a.pdf},
url = {http://proceedings.mlr.press/v119/blanc20a.html},
abstract = {We give strengthened provable guarantees on the performance of widely employed and empirically successful \emph{top-down decision tree learning heuristics}. While prior works have focused on the realizable setting, we consider the more realistic and challenging \emph{agnostic} setting. We show that for all monotone functions $f$ and $s\in \mathbb{N}$, these heuristics construct a decision tree of size $s^{\tilde{O}((\log s)/\varepsilon^2)}$ that achieves error $\le \mathsf{opt}_s + \varepsilon$, where $\mathsf{opt}_s$ denotes the error of the optimal size-$s$ decision tree for $f$. Previously such a guarantee was not known to be achievable by any algorithm, even one that is not based on top-down heuristics. We complement our algorithmic guarantee with a near-matching $s^{\tilde{\Omega}(\log s)}$ lower bound.}
}
@InProceedings{pmlr-v119-blondel20a,
title = {Fast Differentiable Sorting and Ranking},
author = {Blondel, Mathieu and Teboul, Olivier and Berthet, Quentin and Djolonga, Josip},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {950--959},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/blondel20a/blondel20a.pdf},
url = {http://proceedings.mlr.press/v119/blondel20a.html},
abstract = {The sorting operation is one of the most commonly used building blocks in computer programming. In machine learning, it is often used for robust statistics. However, seen as a function, it is piecewise linear and as a result includes many kinks where it is non-differentiable. More problematic is the related ranking operator, often used for order statistics and ranking metrics. It is a piecewise constant function, meaning that its derivatives are null or undefined. While numerous works have proposed differentiable proxies to sorting and ranking, they do not achieve the $O(n \log n)$ time complexity one would expect from sorting and ranking operations. In this paper, we propose the first differentiable sorting and ranking operators with $O(n \log n)$ time and $O(n)$ space complexity. Our proposal in addition enjoys exact computation and differentiation. We achieve this feat by constructing differentiable operators as projections onto the permutahedron, the convex hull of permutations, and using a reduction to isotonic optimization. Empirically, we confirm that our approach is an order of magnitude faster than existing approaches and showcase two novel applications: differentiable Spearman’s rank correlation coefficient and least trimmed squares.}
}
@InProceedings{pmlr-v119-blumenfeld20a,
title = {Beyond Signal Propagation: Is Feature Diversity Necessary in Deep Neural Network Initialization?},
author = {Blumenfeld, Yaniv and Gilboa, Dar and Soudry, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {960--969},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/blumenfeld20a/blumenfeld20a.pdf},
url = {http://proceedings.mlr.press/v119/blumenfeld20a.html},
abstract = {Deep neural networks are typically initialized with random weights, with variances chosen to facilitate signal propagation and stable gradients. It is also believed that diversity of features is an important property of these initializations. We construct a deep convolutional network with identical features by initializing almost all the weights to $0$. The architecture also enables perfect signal propagation and stable gradients, and achieves high accuracy on standard benchmarks. This indicates that random, diverse initializations are \emph{not} necessary for training neural networks. An essential element in training this network is a mechanism of symmetry breaking; we study this phenomenon and find that standard GPU operations, which are non-deterministic, can serve as a sufficient source of symmetry breaking to enable training.}
}
@InProceedings{pmlr-v119-bodin20a,
title = {Modulating Surrogates for {Bayesian} Optimization},
author = {Bodin, Erik and Kaiser, Markus and Kazlauskaite, Ieva and Dai, Zhenwen and Campbell, Neill and Ek, Carl Henrik},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {970--979},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bodin20a/bodin20a.pdf},
url = {http://proceedings.mlr.press/v119/bodin20a.html},
abstract = {Bayesian optimization (BO) methods often rely on the assumption that the objective function is well-behaved, but in practice, this is seldom true for real-world objectives even if noise-free observations can be collected. Common approaches, which try to model the objective as precisely as possible, often fail to make progress by spending too many evaluations modeling irrelevant details. We address this issue by proposing surrogate models that focus on the well-behaved structure in the objective function, which is informative for search, while ignoring detrimental structure that is challenging to model from few observations. First, we demonstrate that surrogate models with appropriate noise distributions can absorb challenging structures in the objective function by treating them as irreducible uncertainty. Secondly, we show that a latent Gaussian process is an excellent surrogate for this purpose, comparing with Gaussian processes with standard noise distributions. We perform numerous experiments on a range of BO benchmarks and find that our approach improves reliability and performance when faced with challenging objective functions.}
}
@InProceedings{pmlr-v119-boehmer20a,
title = {Deep Coordination Graphs},
author = {Boehmer, Wendelin and Kurin, Vitaly and Whiteson, Shimon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {980--991},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/boehmer20a/boehmer20a.pdf},
url = {http://proceedings.mlr.press/v119/boehmer20a.html},
abstract = {This paper introduces the deep coordination graph (DCG) for collaborative multi-agent reinforcement learning. DCG strikes a flexible trade-off between representational capacity and generalization by factoring the joint value function of all agents according to a coordination graph into payoffs between pairs of agents. The value can be maximized by local message passing along the graph, which allows training of the value function end-to-end with Q-learning. Payoff functions are approximated with deep neural networks that employ parameter sharing and low-rank approximations to significantly improve sample efficiency. We show that DCG can solve predator-prey tasks that highlight the relative overgeneralization pathology, as well as challenging StarCraft II micromanagement tasks.}
}
@InProceedings{pmlr-v119-bogatskiy20a,
title = {{Lorentz} Group Equivariant Neural Network for Particle Physics},
author = {Bogatskiy, Alexander and Anderson, Brandon and Offermann, Jan and Roussi, Marwah and Miller, David and Kondor, Risi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {992--1002},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bogatskiy20a/bogatskiy20a.pdf},
url = {http://proceedings.mlr.press/v119/bogatskiy20a.html},
abstract = {We present a neural network architecture that is fully equivariant with respect to transformations under the Lorentz group, a fundamental symmetry of space and time in physics. The architecture is based on the theory of the finite-dimensional representations of the Lorentz group and the equivariant nonlinearity involves the tensor product. For classification tasks in particle physics, we show that such an equivariant architecture leads to drastically simpler models that have relatively few learnable parameters and are much more physically interpretable than leading approaches that use CNNs and point cloud approaches. The performance of the network is tested on a public classification dataset [https://zenodo.org/record/2603256] for tagging top quark decays given energy-momenta of jet constituents produced in proton-proton collisions.}
}
@InProceedings{pmlr-v119-bojchevski20a,
title = {Efficient Robustness Certificates for Discrete Data: Sparsity-Aware Randomized Smoothing for Graphs, Images and More},
author = {Bojchevski, Aleksandar and Klicpera, Johannes and G{\"u}nnemann, Stephan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1003--1013},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bojchevski20a/bojchevski20a.pdf},
url = {http://proceedings.mlr.press/v119/bojchevski20a.html},
abstract = {Existing techniques for certifying the robustness of models for discrete data either work only for a small class of models or are general at the expense of efficiency or tightness. Moreover, they do not account for sparsity in the input which, as our findings show, is often essential for obtaining non-trivial guarantees. We propose a model-agnostic certificate based on the randomized smoothing framework which subsumes earlier work and is tight, efficient, and sparsity-aware. Its computational complexity does not depend on the number of discrete categories or the dimension of the input (e.g. the graph size), making it highly scalable. We show the effectiveness of our approach on a wide variety of models, datasets, and tasks – specifically highlighting its use for Graph Neural Networks. So far, obtaining provable guarantees for GNNs has been difficult due to the discrete and non-i.i.d. nature of graph data. Our method can certify any GNN and handles perturbations to both the graph structure and the node attributes.}
}
@InProceedings{pmlr-v119-boopathy20a,
title = {Proper Network Interpretability Helps Adversarial Robustness in Classification},
author = {Boopathy, Akhilan and Liu, Sijia and Zhang, Gaoyuan and Liu, Cynthia and Chen, Pin-Yu and Chang, Shiyu and Daniel, Luca},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1014--1023},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/boopathy20a/boopathy20a.pdf},
url = {http://proceedings.mlr.press/v119/boopathy20a.html},
abstract = {Recent works have empirically shown that there exist adversarial examples that can be hidden from neural network interpretability (namely, making network interpretation maps visually similar), or interpretability is itself susceptible to adversarial attacks. In this paper, we theoretically show that with a proper measurement of interpretation, it is actually difficult to prevent prediction-evasion adversarial attacks from causing interpretation discrepancy, as confirmed by experiments on MNIST, CIFAR-10 and Restricted ImageNet. Spurred by that, we develop an interpretability-aware defensive scheme built only on promoting robust interpretation (without the need for resorting to adversarial loss minimization). We show that our defense achieves both robust classification and robust interpretation, outperforming state-of-the-art adversarial training methods against attacks of large perturbation in particular.}
}
@InProceedings{pmlr-v119-bordelon20a,
title = {Spectrum Dependent Learning Curves in Kernel Regression and Wide Neural Networks},
author = {Bordelon, Blake and Canatar, Abdulkadir and Pehlevan, Cengiz},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1024--1034},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bordelon20a/bordelon20a.pdf},
url = {http://proceedings.mlr.press/v119/bordelon20a.html},
abstract = {We derive analytical expressions for the generalization performance of kernel regression as a function of the number of training samples using theoretical methods from Gaussian processes and statistical physics. Our expressions apply to wide neural networks due to an equivalence between training them and kernel regression with the Neural Tangent Kernel (NTK). By computing the decomposition of the total generalization error due to different spectral components of the kernel, we identify a new spectral principle: as the size of the training set grows, kernel machines and neural networks fit successively higher spectral modes of the target function. When data are sampled from a uniform distribution on a high-dimensional hypersphere, dot product kernels, including NTK, exhibit learning stages where different frequency modes of the target function are learned. We verify our theory with simulations on synthetic data and MNIST dataset.}
}
@InProceedings{pmlr-v119-bornschein20a,
title = {Small Data, Big Decisions: Model Selection in the Small-Data Regime},
author = {Bornschein, Jorg and Visin, Francesco and Osindero, Simon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1035--1044},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bornschein20a/bornschein20a.pdf},
url = {http://proceedings.mlr.press/v119/bornschein20a.html},
abstract = {Highly overparametrized neural networks can display curiously strong generalization performance – a phenomenon that has recently garnered a wealth of theoretical and empirical research in order to better understand it. In contrast to most previous work, which typically considers the performance as a function of the model size, in this paper we empirically study the generalization performance as the size of the training set varies over multiple orders of magnitude. These systematic experiments lead to some interesting and potentially very useful observations; perhaps most notably that training on smaller subsets of the data can lead to more reliable model selection decisions whilst simultaneously enjoying smaller computational overheads. Our experiments furthermore allow us to estimate Minimum Description Lengths for common datasets given modern neural network architectures, thereby paving the way for principled model selection taking into account Occam's razor.}
}
@InProceedings{pmlr-v119-bose20a,
title = {Latent Variable Modelling with Hyperbolic Normalizing Flows},
author = {Bose, Joey and Smofsky, Ariella and Liao, Renjie and Panangaden, Prakash and Hamilton, Will},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1045--1055},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bose20a/bose20a.pdf},
url = {http://proceedings.mlr.press/v119/bose20a.html},
abstract = {The choice of approximate posterior distributions plays a central role in stochastic variational inference (SVI). One effective solution is the use of normalizing flows to construct flexible posterior distributions. However, one key limitation of existing normalizing flows is that they are restricted to the Euclidean space and are ill-equipped to model data with an underlying hierarchical structure. To address this fundamental limitation, we present the first extension of normalizing flows to hyperbolic spaces. We first elevate normalizing flows to hyperbolic spaces using coupling transforms defined on the tangent bundle, termed Tangent Coupling ($\mathcal{TC}$). We further introduce Wrapped Hyperboloid Coupling ($\mathcal{W}\mathbb{H}C$), a fully invertible and learnable transformation that explicitly utilizes the geometric structure of hyperbolic spaces, allowing for expressive posteriors while being efficient to sample from. We demonstrate the efficacy of our novel normalizing flow over hyperbolic VAEs and Euclidean normalizing flows. Our approach achieves improved performance on density estimation, as well as reconstruction of real-world graph data, which exhibit a hierarchical structure. Finally, we show that our approach can be used to power a generative model over hierarchical data using hyperbolic latent variables.}
}
@InProceedings{pmlr-v119-bourel20a,
title = {Tightening Exploration in Upper Confidence Reinforcement Learning},
author = {Bourel, Hippolyte and Maillard, Odalric and Talebi, Mohammad Sadegh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1056--1066},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bourel20a/bourel20a.pdf},
url = {http://proceedings.mlr.press/v119/bourel20a.html},
abstract = {The upper confidence reinforcement learning (UCRL2) algorithm introduced in (Jaksch et al., 2010) is a popular method to perform regret minimization in unknown discrete Markov Decision Processes under the average-reward criterion. Despite its nice and generic theoretical regret guarantees, this algorithm and its variants have remained until now mostly theoretical as numerical experiments in simple environments exhibit long burn-in phases before the learning takes place. In pursuit of practical efficiency, we present UCRL3, following the lines of UCRL2, but with two key modifications: First, it uses state-of-the-art time-uniform concentration inequalities to compute confidence sets on the reward and (component-wise) transition distributions for each state-action pair. Furthermore, to tighten exploration, it uses an adaptive computation of the support of each transition distribution, which in turn enables us to revisit the extended value iteration procedure of UCRL2 to optimize over distributions with reduced support by disregarding low probability transitions, while still ensuring near-optimism. We demonstrate, through numerical experiments in standard environments, that reducing exploration this way yields a substantial numerical improvement compared to UCRL2 and its variants. On the theoretical side, these key modifications enable us to derive a regret bound for UCRL3 improving on UCRL2, that for the first time makes appear notions of local diameter and local effective support, thanks to variance-aware concentration bounds.}
}
@InProceedings{pmlr-v119-bower20a,
title = {Preference Modeling with Context-Dependent Salient Features},
author = {Bower, Amanda and Balzano, Laura},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1067--1077},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bower20a/bower20a.pdf},
url = {http://proceedings.mlr.press/v119/bower20a.html},
abstract = {We consider the problem of estimating a ranking on a set of items from noisy pairwise comparisons given item features. We address the fact that pairwise comparison data often reflects irrational choice, e.g. intransitivity. Our key observation is that two items compared in isolation from other items may be compared based on only a salient subset of features. Formalizing this framework, we propose the salient feature preference model and prove a finite sample complexity result for learning the parameters of our model and the underlying ranking with maximum likelihood estimation. We also provide empirical results that support our theoretical bounds and illustrate how our model explains systematic intransitivity. Finally we demonstrate strong performance of maximum likelihood estimation of our model on both synthetic data and two real data sets: the UT Zappos50K data set and comparison data about the compactness of legislative districts in the US.}
}
@InProceedings{pmlr-v119-bras20a,
title = {Adversarial Filters of Dataset Biases},
author = {Le Bras, Ronan and Swayamdipta, Swabha and Bhagavatula, Chandra and Zellers, Rowan and Peters, Matthew and Sabharwal, Ashish and Choi, Yejin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1078--1088},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bras20a/bras20a.pdf},
url = {http://proceedings.mlr.press/v119/bras20a.html},
abstract = {Large neural models have demonstrated human-level performance on language and vision benchmarks, while their performance degrades considerably on adversarial or out-of-distribution samples. This raises the question of whether these models have learned to solve a dataset rather than the underlying task by overfitting to spurious dataset biases. We investigate one recently proposed approach, AFLITE, which adversarially filters such dataset biases, as a means to mitigate the prevalent overestimation of machine performance. We provide a theoretical understanding for AFLITE, by situating it in the generalized framework for optimum bias reduction. We present extensive supporting evidence that AFLITE is broadly applicable for reduction of measurable dataset biases, and that models trained on the filtered datasets yield better generalization to out-of-distribution tasks. Finally, filtering results in a large drop in model performance (e.g., from 92\% to 62\% for SNLI), while human performance still remains high. Our work thus shows that such filtered datasets can pose new research challenges for robust generalization by serving as upgraded benchmarks.}
}
@InProceedings{pmlr-v119-braverman20a,
title = {Calibration, Entropy Rates, and Memory in Language Models},
author = {Braverman, Mark and Chen, Xinyi and Kakade, Sham and Narasimhan, Karthik and Zhang, Cyril and Zhang, Yi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1089--1099},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/braverman20a/braverman20a.pdf},
url = {http://proceedings.mlr.press/v119/braverman20a.html},
abstract = {Building accurate language models that capture meaningful long-term dependencies is a core challenge in natural language processing. Towards this end, we present a calibration-based approach to measure long-term discrepancies between a generative sequence model and the true distribution, and use these discrepancies to improve the model. Empirically, we show that state-of-the-art language models, including LSTMs and Transformers, are miscalibrated: the entropy rates of their generations drift dramatically upward over time. We then provide provable methods to mitigate this phenomenon. Furthermore, we show how this calibration-based approach can also be used to measure the amount of memory that language models use for prediction.}
}
@InProceedings{pmlr-v119-braverman20b,
title = {Schatten Norms in Matrix Streams: Hello Sparsity, Goodbye Dimension},
author = {Braverman, Vladimir and Krauthgamer, Robert and Krishnan, Aditya and Sinoff, Roi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1100--1110},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/braverman20b/braverman20b.pdf},
url = {http://proceedings.mlr.press/v119/braverman20b.html},
abstract = {Spectral functions of large matrices contains important structural information about the underlying data, and is thus becoming increasingly important. Many times, large matrices representing real-world data are sparse or doubly sparse (i.e., sparse in both rows and columns), and are accessed as a stream of updates, typically organized in row-order. In this setting, where space (memory) is the limiting resource, all known algorithms require space that is polynomial in the dimension of the matrix, even for sparse matrices. We address this challenge by providing the first algorithms whose space requirement is independent of the matrix dimension, assuming the matrix is doubly-sparse and presented in row-order. Our algorithms approximate the Schatten p-norms, which we use in turn to approximate other spectral functions, such as logarithm of the determinant, trace of matrix inverse, and Estrada index. We validate these theoretical performance bounds by numerical experiments on real-world matrices representing social networks. We further prove that multiple passes are unavoidable in this setting, and show extensions of our primary technique, including a trade-off between space requirements and number of passes.}
}
@InProceedings{pmlr-v119-brekelmans20a,
title = {All in the Exponential Family: {B}regman Duality in Thermodynamic Variational Inference},
author = {Brekelmans, Rob and Masrani, Vaden and Wood, Frank and Ver Steeg, Greg and Galstyan, Aram},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1111--1122},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/brekelmans20a/brekelmans20a.pdf},
url = {http://proceedings.mlr.press/v119/brekelmans20a.html},
abstract = {The recently proposed Thermodynamic Variational Objective (TVO) leverages thermodynamic integration to provide a family of variational inference objectives, which both tighten and generalize the ubiquitous Evidence Lower Bound (ELBO). However, the tightness of TVO bounds was not previously known, an expensive grid search was used to choose a ``schedule'' of intermediate distributions, and model learning suffered with ostensibly tighter bounds. In this work, we propose an exponential family interpretation of the geometric mixture curve underlying the TVO and various path sampling methods, which allows us to characterize the gap in TVO likelihood bounds as a sum of KL divergences. We propose to choose intermediate distributions using equal spacing in the moment parameters of our exponential family, which matches grid search performance and allows the schedule to adaptively update over the course of training. Finally, we derive a doubly reparameterized gradient estimator which improves model learning and allows the TVO to benefit from more refined bounds. To further contextualize our contributions, we provide a unified framework for understanding thermodynamic integration and the TVO using Taylor series remainders.}
}
@InProceedings{pmlr-v119-brennan20a,
title = {Estimating the Number and Effect Sizes of Non-null Hypotheses},
author = {Brennan, Jennifer and Vinayak, Ramya Korlakai and Jamieson, Kevin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1123--1133},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/brennan20a/brennan20a.pdf},
url = {http://proceedings.mlr.press/v119/brennan20a.html},
abstract = {We study the problem of estimating the distribution of effect sizes (the mean of the test statistic under the alternate hypothesis) in a multiple testing setting. Knowing this distribution allows us to calculate the power (type II error) of any experimental design. We show that it is possible to estimate this distribution using an inexpensive pilot experiment, which takes significantly fewer samples than would be required by an experiment that identified the discoveries. Our estimator can be used to guarantee the number of discoveries that will be made using a given experimental design in a future experiment. We prove that this simple and computationally efficient estimator enjoys a number of favorable theoretical properties, and demonstrate its effectiveness on data from a gene knockout experiment on influenza inhibition in Drosophila.}
}
@InProceedings{pmlr-v119-breuer20a,
title = {The {FAST} Algorithm for Submodular Maximization},
author = {Breuer, Adam and Balkanski, Eric and Singer, Yaron},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1134--1143},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/breuer20a/breuer20a.pdf},
url = {http://proceedings.mlr.press/v119/breuer20a.html},
abstract = {In this paper we describe a new parallel algorithm called Fast Adaptive Sequencing Technique (FAST) for maximizing a monotone submodular function under a cardinality constraint k. This algorithm achieves the optimal 1-1/e approximation guarantee and is orders of magnitude faster than the state-of-the-art on a variety of experiments over real-world data sets. Following recent work by Balkanski and Singer (2018), there has been a great deal of research on algorithms whose theoretical parallel runtime is exponentially faster than algorithms used for submodular maximization over the past 40 years. However, while these new algorithms are fast in terms of asymptotic worst-case guarantees, it is computationally infeasible to use them in practice even on small data sets because the number of rounds and queries they require depend on large constants and high-degree polynomials in terms of precision and confidence. The design principles behind the FAST algorithm we present here are a significant departure from those of recent theoretically fast algorithms. Rather than optimize for asymptotic theoretical guarantees, the design of FAST introduces several new techniques that achieve remarkable practical and theoretical parallel runtimes. The approximation guarantee obtained by FAST is arbitrarily close to 1 - 1/e, and its asymptotic parallel runtime (adaptivity) is $O(\log(n) \log^2(\log k))$ using $O(n \log \log(k))$ total queries. We show that FAST is orders of magnitude faster than any algorithm for submodular maximization we are aware of, including hyper-optimized parallel versions of state-of-the-art serial algorithms, by running experiments on large data sets.}
}
@InProceedings{pmlr-v119-brockschmidt20a,
title = {{GNN}-{F}i{LM}: Graph Neural Networks with Feature-wise Linear Modulation},
author = {Brockschmidt, Marc},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1144--1152},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/brockschmidt20a/brockschmidt20a.pdf},
url = {http://proceedings.mlr.press/v119/brockschmidt20a.html},
abstract = {This paper presents a new Graph Neural Network (GNN) type using feature-wise linear modulation (FiLM). Many standard GNN variants propagate information along the edges of a graph by computing messages based only on the representation of the source of each edge. In GNN-FiLM, the representation of the target node of an edge is used to compute a transformation that can be applied to all incoming messages, allowing feature-wise modulation of the passed information. Different GNN architectures are compared in extensive experiments on three tasks from the literature, using re-implementations of many baseline methods. Hyperparameters for all methods were found using extensive search, yielding somewhat surprising results: differences between state of the art models are much smaller than reported in the literature and well-known simple baselines that are often not compared to perform better than recently proposed GNN variants. Nonetheless, GNN-FiLM outperforms these methods on a regression task on molecular graphs and performs competitively on other tasks.}
}
@InProceedings{pmlr-v119-bronskill20a,
title = {{T}ask{N}orm: Rethinking Batch Normalization for Meta-Learning},
author = {Bronskill, John and Gordon, Jonathan and Requeima, James and Nowozin, Sebastian and Turner, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1153--1164},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bronskill20a/bronskill20a.pdf},
url = {http://proceedings.mlr.press/v119/bronskill20a.html},
abstract = {Modern meta-learning approaches for image classification rely on increasingly deep networks to achieve state-of-the-art performance, making batch normalization an essential component of meta-learning pipelines. However, the hierarchical nature of the meta-learning setting presents several challenges that can render conventional batch normalization ineffective, giving rise to the need to rethink normalization in this setting. We evaluate a range of approaches to batch normalization for meta-learning scenarios, and develop a novel approach that we call TaskNorm. Experiments on fourteen datasets demonstrate that the choice of batch normalization has a dramatic effect on both classification accuracy and training time for both gradient based- and gradient-free meta-learning approaches. Importantly, TaskNorm is found to consistently improve performance. Finally, we provide a set of best practices for normalization that will allow fair comparison of meta-learning algorithms.}
}
@InProceedings{pmlr-v119-brown20a,
title = {Safe Imitation Learning via Fast {B}ayesian Reward Inference from Preferences},
author = {Brown, Daniel and Coleman, Russell and Srinivasan, Ravi and Niekum, Scott},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1165--1177},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/brown20a/brown20a.pdf},
url = {http://proceedings.mlr.press/v119/brown20a.html},
abstract = {Bayesian reward learning from demonstrations enables rigorous safety and uncertainty analysis when performing imitation learning. However, Bayesian reward learning methods are typically computationally intractable for complex control problems. We propose Bayesian Reward Extrapolation (Bayesian REX), a highly efficient Bayesian reward learning algorithm that scales to high-dimensional imitation learning problems by pre-training a low-dimensional feature encoding via self-supervised tasks and then leveraging preferences over demonstrations to perform fast Bayesian inference. Bayesian REX can learn to play Atari games from demonstrations, without access to the game score and can generate 100,000 samples from the posterior over reward functions in only 5 minutes on a personal laptop. Bayesian REX also results in imitation learning performance that is competitive with or better than state-of-the-art methods that only learn point estimates of the reward function. Finally, Bayesian REX enables efficient high-confidence policy evaluation without having access to samples of the reward function. These high-confidence performance bounds can be used to rank the performance and risk of a variety of evaluation policies and provide a way to detect reward hacking behaviors.}
}
@InProceedings{pmlr-v119-brubach20a,
title = {A Pairwise Fair and Community-preserving Approach to k-Center Clustering},
author = {Brubach, Brian and Chakrabarti, Darshan and Dickerson, John and Khuller, Samir and Srinivasan, Aravind and Tsepenekas, Leonidas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1178--1189},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/brubach20a/brubach20a.pdf},
url = {http://proceedings.mlr.press/v119/brubach20a.html},
abstract = {Clustering is a foundational problem in machine learning with numerous applications. As machine learning increases in ubiquity as a backend for automated systems, concerns about fairness arise. Much of the current literature on fairness deals with discrimination against protected classes in supervised learning (group fairness). We define a different notion of fair clustering wherein the probability that two points (or a community of points) become separated is bounded by an increasing function of their pairwise distance (or community diameter). We capture the situation where data points represent people who gain some benefit from being clustered together. Unfairness arises when certain points are deterministically separated, either arbitrarily or by someone who intends to harm them as in the case of gerrymandering election districts. In response, we formally define two new types of fairness in the clustering setting, pairwise fairness and community preservation. To explore the practicality of our fairness goals, we devise an approach for extending existing $k$-center algorithms to satisfy these fairness constraints. Analysis of this approach proves that reasonable approximations can be achieved while maintaining fairness. In experiments, we compare the effectiveness of our approach to classical $k$-center algorithms/heuristics and explore the tradeoff between optimal clustering and fairness.}
}
@InProceedings{pmlr-v119-bruinsma20a,
title = {Scalable Exact Inference in Multi-Output {G}aussian Processes},
author = {Bruinsma, Wessel and Perim, Eric and Tebbutt, William and Hosking, Scott and Solin, Arno and Turner, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1190--1201},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bruinsma20a/bruinsma20a.pdf},
url = {http://proceedings.mlr.press/v119/bruinsma20a.html},
abstract = {Multi-output Gaussian processes (MOGPs) leverage the flexibility and interpretability of GPs while capturing structure across outputs, which is desirable, for example, in spatio-temporal modelling. The key problem with MOGPs is their computational scaling $O(n^3 p^3)$, which is cubic in the number of both inputs $n$ (e.g., time points or locations) and outputs $p$. For this reason, a popular class of MOGPs assumes that the data live around a low-dimensional linear subspace, reducing the complexity to $O(n^3 m^3)$. However, this cost is still cubic in the dimensionality of the subspace $m$, which is still prohibitively expensive for many applications. We propose the use of a sufficient statistic of the data to accelerate inference and learning in MOGPs with orthogonal bases. The method achieves linear scaling in $m$ in practice, allowing these models to scale to large $m$ without sacrificing significant expressivity or requiring approximation. This advance opens up a wide range of real-world tasks and can be combined with existing GP approximations in a plug-and-play way. We demonstrate the efficacy of the method on various synthetic and real-world data sets.}
}
@InProceedings{pmlr-v119-bu20a,
title = {Online Pricing with Offline Data: Phase Transition and Inverse Square Law},
author = {Bu, Jinzhi and Simchi-Levi, David and Xu, Yunzong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1202--1210},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/bu20a/bu20a.pdf},
url = {http://proceedings.mlr.press/v119/bu20a.html},
abstract = {This paper investigates the impact of pre-existing offline data on online learning, in the context of dynamic pricing. We study a single-product dynamic pricing problem over a selling horizon of T periods. The demand in each period is determined by the price of the product according to a linear demand model with unknown parameters. We assume that the seller already has some pre-existing offline data before the start of the selling horizon. The seller wants to utilize both the pre-existing offline data and the sequential online data to minimize the regret of the online learning process. We characterize the joint effect of the size, location and dispersion of the offline data on the optimal regret of the online learning process. Our results reveal surprising transformations of the optimal regret rate with respect to the size of the offline data, which we refer to as phase transitions. In addition, our results demonstrate that the location and dispersion of the offline data also have an intrinsic effect on the optimal regret, and we quantify this effect via the inverse-square law.}
}
@InProceedings{pmlr-v119-buhai20a,
title = {Empirical Study of the Benefits of Overparameterization in Learning Latent Variable Models},
author = {Buhai, Rares-Darius and Halpern, Yoni and Kim, Yoon and Risteski, Andrej and Sontag, David},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1211--1219},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/buhai20a/buhai20a.pdf},
url = {http://proceedings.mlr.press/v119/buhai20a.html},
abstract = {One of the most surprising and exciting discoveries in supervised learning was the benefit of overparameterization (i.e. training a very large model) to improving the optimization landscape of a problem, with minimal effect on statistical performance (i.e. generalization). In contrast, unsupervised settings have been under-explored, despite the fact that it was observed that overparameterization can be helpful as early as Dasgupta \& Schulman (2007). We perform an empirical study of different aspects of overparameterization in unsupervised learning of latent variable models via synthetic and semi-synthetic experiments. We discuss benefits to different metrics of success (recovering the parameters of the ground-truth model, held-out log-likelihood), sensitivity to variations of the training algorithm, and behavior as the amount of overparameterization increases. We find that across a variety of models (noisy-OR networks, sparse coding, probabilistic context-free grammars) and training algorithms (variational inference, alternating minimization, expectation-maximization), overparameterization can significantly increase the number of ground truth latent variables recovered.}
}
@InProceedings{pmlr-v119-buyl20a,
title = {{D}e{B}ayes: a {B}ayesian Method for Debiasing Network Embeddings},
author = {Buyl, Maarten and De Bie, Tijl},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1220--1229},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/buyl20a/buyl20a.pdf},
url = {http://proceedings.mlr.press/v119/buyl20a.html},
abstract = {As machine learning algorithms are increasingly deployed for high-impact automated decision making, ethical and increasingly also legal standards demand that they treat all individuals fairly, without discrimination based on their age, gender, race or other sensitive traits. In recent years much progress has been made on ensuring fairness and reducing bias in standard machine learning settings. Yet, for network embedding, with applications in vulnerable domains ranging from social network analysis to recommender systems, current options remain limited both in number and performance. We thus propose DeBayes: a conceptually elegant Bayesian method that is capable of learning debiased embeddings by using a biased prior. Our experiments show that these representations can then be used to perform link prediction that is significantly more fair in terms of popular metrics such as demographic parity and equalized opportunity.}
}
@InProceedings{pmlr-v119-cabannnes20a,
title = {Structured Prediction with Partial Labelling through the Infimum Loss},
author = {Cabannes, Vivien and Rudi, Alessandro and Bach, Francis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1230--1239},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cabannnes20a/cabannnes20a.pdf},
url = {http://proceedings.mlr.press/v119/cabannnes20a.html},
abstract = {Annotating datasets is one of the main costs in nowadays supervised learning. The goal of weak supervision is to enable models to learn using only forms of labelling which are cheaper to collect, as partial labelling. This is a type of incomplete annotation where, for each datapoint, supervision is cast as a set of labels containing the real one. The problem of supervised learning with partial labelling has been studied for specific instances such as classification, multi-label, ranking or segmentation, but a general framework is still missing. This paper provides a unified framework based on structured prediction and on the concept of \emph{infimum loss} to deal with partial labelling over a wide family of learning problems and loss functions. The framework leads naturally to explicit algorithms that can be easily implemented and for which proved statistical consistency and learning rates. Experiments confirm the superiority of the proposed approach over commonly used baselines.}
}
@InProceedings{pmlr-v119-caccia20a,
title = {Online Learned Continual Compression with Adaptive Quantization Modules},
author = {Caccia, Lucas and Belilovsky, Eugene and Caccia, Massimo and Pineau, Joelle},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1240--1250},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/caccia20a/caccia20a.pdf},
url = {http://proceedings.mlr.press/v119/caccia20a.html},
abstract = {We introduce and study the problem of Online Continual Compression, where one attempts to simultaneously learn to compress and store a representative dataset from a non i.i.d data stream, while only observing each sample once. A naive application of auto-encoder in this setting encounters a major challenge: representations derived from earlier encoder states must be usable by later decoder states. We show how to use discrete auto-encoders to effectively address this challenge and introduce Adaptive Quantization Modules (AQM) to control variation in the compression ability of the module at any given stage of learning. This enables selecting an appropriate compression for incoming samples, while taking into account overall memory constraints and current progress of the learned compression. Unlike previous methods, our approach does not require any pretraining, even on challenging datasets. We show that using AQM to replace standard episodic memory in continual learning settings leads to significant gains on continual learning benchmarks with images, LiDAR, and reinforcement learning agents.}
}
@InProceedings{pmlr-v119-cai20a,
title = {Boosted Histogram Transform for Regression},
author = {Cai, Yuchao and Hang, Hanyuan and Yang, Hanfang and Lin, Zhouchen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1251--1261},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cai20a/cai20a.pdf},
url = {http://proceedings.mlr.press/v119/cai20a.html},
abstract = {In this paper, we propose a boosting algorithm for regression problems called \emph{boosted histogram transform for regression} (BHTR) based on histogram transforms composed of random rotations, stretchings, and translations. From the theoretical perspective, we first prove fast convergence rates for BHTR under the assumption that the target function lies in the spaces $C^{0,\alpha}$. Moreover, if the target function resides in the subspace $C^{1,\alpha}$, by establishing the upper bound of the convergence rate for the boosted regressor, i.e. BHTR, and the lower bound for base regressors, i.e. histogram transform regressors (HTR), we manage to explain the benefits of the boosting procedure. In the experiments, compared with other state-of-the-art algorithms such as gradient boosted regression tree (GBRT), Breiman’s forest, and kernel-based methods, our BHTR algorithm shows promising performance on both synthetic and real datasets.}
}
@InProceedings{pmlr-v119-cai20b,
title = {On Validation and Planning of An Optimal Decision Rule with Application in Healthcare Studies},
author = {Cai, Hengrui and Lu, Wenbin and Song, Rui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1262--1270},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cai20b/cai20b.pdf},
url = {http://proceedings.mlr.press/v119/cai20b.html},
abstract = {In the current era of personalized recommendation, one major interest is to develop an optimal individualized decision rule that assigns individuals with the best treatment option according to their covariates. Estimation of optimal decision rules (ODR) has been extensively investigated recently, however, at present, no testing procedure is proposed to verify whether these ODRs are significantly better than the naive decision rule that always assigning individuals to a fixed treatment option. In this paper, we propose a testing procedure for detecting the existence of an ODR that is better than the naive decision rule under the randomized trials. We construct the proposed test based on the difference of estimated value functions using the augmented inverse probability weighted method. The asymptotic distributions of the proposed test statistic under the null and local alternative hypotheses are established. Based on the established asymptotic distributions, we further develop a sample size calculation formula for testing the existence of an ODR in designing A/B tests. Extensive simulations and a real data application to a schizophrenia clinical trial data are conducted to demonstrate the empirical validity of the proposed methods.}
}
@InProceedings{pmlr-v119-cai20c,
title = {Uncertainty quantification for nonconvex tensor completion: Confidence intervals, heteroscedasticity and optimality},
author = {Cai, Changxiao and Poor, H. Vincent and Chen, Yuxin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1271--1282},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cai20c/cai20c.pdf},
url = {http://proceedings.mlr.press/v119/cai20c.html},
abstract = {We study the distribution and uncertainty of nonconvex optimization for noisy tensor completion — the problem of estimating a low-rank tensor given incomplete and corrupted observations of its entries. Focusing on a two-stage nonconvex estimation algorithm proposed by (Cai et al., 2019), we characterize the distribution of this estimator down to fine scales. This distributional theory in turn allows one to construct valid and short confidence intervals for both the unseen tensor entries and its underlying tensor factors. The proposed inferential procedure enjoys several important features: (1) it is fully adaptive to noise heteroscedasticity, and (2) it is data-driven and adapts automatically to unknown noise distributions. Furthermore, our findings unveil the statistical optimality of nonconvex tensor completion: it attains un-improvable estimation accuracy — including both the rates and the pre-constants — under i.i.d. Gaussian noise.}
}
@InProceedings{pmlr-v119-cai20d,
title = {Provably Efficient Exploration in Policy Optimization},
author = {Cai, Qi and Yang, Zhuoran and Jin, Chi and Wang, Zhaoran},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1283--1294},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cai20d/cai20d.pdf},
url = {http://proceedings.mlr.press/v119/cai20d.html},
abstract = {While policy-based reinforcement learning (RL) achieves tremendous successes in practice, it is significantly less understood in theory, especially compared with value-based RL. In particular, it remains elusive how to design a provably efficient policy optimization algorithm that incorporates exploration. To bridge such a gap, this paper proposes an Optimistic variant of the Proximal Policy Optimization algorithm (OPPO), which follows an “optimistic version” of the policy gradient direction. This paper proves that, in the problem of episodic Markov decision process with linear function approximation, unknown transition, and adversarial reward with full-information feedback, OPPO achieves $\tilde{O}(\sqrt{d^2 H^3 T})$ regret. Here $d$ is the feature dimension, $H$ is the episode horizon, and $T$ is the total number of steps. To the best of our knowledge, OPPO is the first provably efficient policy optimization algorithm that explores.}
}
@InProceedings{pmlr-v119-calandriello20a,
title = {Near-linear time {Gaussian} process optimization with adaptive batching and resparsification},
author = {Calandriello, Daniele and Carratino, Luigi and Lazaric, Alessandro and Valko, Michal and Rosasco, Lorenzo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1295--1305},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/calandriello20a/calandriello20a.pdf},
url = {http://proceedings.mlr.press/v119/calandriello20a.html},
abstract = {Gaussian processes (GP) are one of the most successful frameworks to model uncertainty. However, GP optimization (e.g., GP-UCB) suffers from major scalability issues. Experimental time grows linearly with the number of evaluations, unless candidates are selected in batches (e.g., using GP-BUCB) and evaluated in parallel. Furthermore, computational cost is often prohibitive since algorithms such as GP-BUCB require a time at least quadratic in the number of dimensions and iterations to select each batch. In this paper, we introduce BBKB (Batch Budgeted Kernel Bandits), the first no-regret GP optimization algorithm that provably runs in near-linear time and selects candidates in batches. This is obtained with a new guarantee for the tracking of the posterior variances that allows BBKB to choose increasingly larger batches, improving over GP-BUCB. Moreover, we show that the same bound can be used to adaptively delay costly updates to the sparse GP approximation used by BBKB, achieving a near-constant per-step amortized cost. These findings are then confirmed in several experiments, where BBKB is much faster than state-of-the-art methods.}
}
@InProceedings{pmlr-v119-calder20a,
title = {{Poisson} Learning: Graph Based Semi-Supervised Learning At Very Low Label Rates},
author = {Calder, Jeff and Cook, Brendan and Thorpe, Matthew and Slepcev, Dejan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1306--1316},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/calder20a/calder20a.pdf},
url = {http://proceedings.mlr.press/v119/calder20a.html},
abstract = {We propose a new framework, called Poisson learning, for graph based semi-supervised learning at very low label rates. Poisson learning is motivated by the need to address the degeneracy of Laplacian semi-supervised learning in this regime. The method replaces the assignment of label values at training points with the placement of sources and sinks, and solves the resulting Poisson equation on the graph. The outcomes are provably more stable and informative than those of Laplacian learning. Poisson learning is efficient and simple to implement, and we present numerical experiments showing the method is superior to other recent approaches to semi-supervised learning at low label rates on MNIST, FashionMNIST, and Cifar-10. We also propose a graph-cut enhancement of Poisson learning, called Poisson MBO, that gives higher accuracy and can incorporate prior knowledge of relative class sizes.}
}
@InProceedings{pmlr-v119-campos20a,
title = {Explore, Discover and Learn: Unsupervised Discovery of State-Covering Skills},
author = {Campos, Victor and Trott, Alexander and Xiong, Caiming and Socher, Richard and Giro-i-Nieto, Xavier and Torres, Jordi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1317--1327},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/campos20a/campos20a.pdf},
url = {http://proceedings.mlr.press/v119/campos20a.html},
abstract = {Acquiring abilities in the absence of a task-oriented reward function is at the frontier of reinforcement learning research. This problem has been studied through the lens of empowerment, which draws a connection between option discovery and information theory. Information-theoretic skill discovery methods have garnered much interest from the community, but little research has been conducted in understanding their limitations. Through theoretical analysis and empirical evidence, we show that existing algorithms suffer from a common limitation – they discover options that provide a poor coverage of the state space. In light of this, we propose Explore, Discover and Learn (EDL), an alternative approach to information-theoretic skill discovery. Crucially, EDL optimizes the same information-theoretic objective derived from the empowerment literature, but addresses the optimization problem using different machinery. We perform an extensive evaluation of skill discovery methods on controlled environments and show that EDL offers significant advantages, such as overcoming the coverage problem, reducing the dependence of learned skills on the initial state, and allowing the user to define a prior over which behaviors should be learned.}
}
@InProceedings{pmlr-v119-cassel20a,
title = {Logarithmic Regret for Learning Linear Quadratic Regulators Efficiently},
author = {Cassel, Asaf and Cohen, Alon and Koren, Tomer},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1328--1337},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cassel20a/cassel20a.pdf},
url = {http://proceedings.mlr.press/v119/cassel20a.html},
abstract = {We consider the problem of learning in Linear Quadratic Control systems whose transition parameters are initially unknown. Recent results in this setting have demonstrated efficient learning algorithms with regret growing with the square root of the number of decision steps. We present new efficient algorithms that achieve, perhaps surprisingly, regret that scales only (poly-)logarithmically with the number of steps, in two scenarios: when only the state transition matrix A is unknown, and when only the state-action transition matrix B is unknown and the optimal policy satisfies a certain non-degeneracy condition. On the other hand, we give a lower bound which shows that when the latter condition is violated, square root regret is unavoidable.}
}
@InProceedings{pmlr-v119-cauwet20a,
title = {Fully Parallel Hyperparameter Search: Reshaped Space-Filling},
author = {Cauwet, Marie-Liesse and Couprie, Camille and Dehos, Julien and Luc, Pauline and Rapin, Jeremy and Riviere, Morgane and Teytaud, Fabien and Teytaud, Olivier and Usunier, Nicolas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1338--1348},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cauwet20a/cauwet20a.pdf},
url = {http://proceedings.mlr.press/v119/cauwet20a.html},
abstract = {Space-filling designs such as Low Discrepancy Sequence (LDS), Latin Hypercube Sampling (LHS) and Jittered Sampling (JS) were proposed for fully parallel hyperparameter search, and were shown to be more effective than random and grid search. We prove that LHS and JS outperform random search only by a constant factor. Consequently, we introduce a new sampling approach based on the reshaping of the search distribution, and we show both theoretically and numerically that it leads to significant gains over random search. Two methods are proposed for the reshaping: Recentering (when the distribution of the optimum is known), and Cauchy transformation (when the distribution of the optimum is unknown). The proposed methods are first validated on artificial experiments and simple real-world tests on clustering and Salmon mappings. Then we demonstrate that they drive performance improvement in a wide range of expensive artificial intelligence tasks, namely attend/infer/repeat, video next frame segmentation forecasting and progressive generative adversarial networks.}
}
@InProceedings{pmlr-v119-celis20a,
title = {Data preprocessing to mitigate bias: A maximum entropy based approach},
author = {Celis, L. Elisa and Keswani, Vijay and Vishnoi, Nisheeth},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1349--1359},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/celis20a/celis20a.pdf},
url = {http://proceedings.mlr.press/v119/celis20a.html},
abstract = {Data containing human or social attributes may over- or under-represent groups with respect to salient social attributes such as gender or race, which can lead to biases in downstream applications. This paper presents an algorithmic framework that can be used as a data preprocessing method towards mitigating such bias. Unlike prior work, it can efficiently learn distributions over large domains, controllably adjust the representation rates of protected groups and achieve target fairness metrics such as statistical parity, yet remains close to the empirical distribution induced by the given dataset. Our approach leverages the principle of maximum entropy {–} amongst all distributions satisfying a given set of constraints, we should choose the one closest in KL-divergence to a given prior. While maximum entropy distributions can succinctly encode distributions over large domains, they can be difficult to compute. Our main contribution is an instantiation of this framework for our set of constraints and priors, which encode our bias mitigation goals, and that runs in time polynomial in the dimension of the data. Empirically, we observe that samples from the learned distribution have desired representation rates and statistical rates, and when used for training a classifier incurs only a slight loss in accuracy while maintaining fairness properties.}
}
@InProceedings{pmlr-v119-cella20a,
title = {Meta-learning with Stochastic Linear Bandits},
author = {Cella, Leonardo and Lazaric, Alessandro and Pontil, Massimiliano},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1360--1370},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cella20a/cella20a.pdf},
url = {http://proceedings.mlr.press/v119/cella20a.html},
abstract = {We investigate meta-learning procedures in the setting of stochastic linear bandits tasks. The goal is to select a learning algorithm which works well on average over a class of bandits tasks, that are sampled from a task-distribution. Inspired by recent work on learning-to-learn linear regression, we consider a class of bandit algorithms that implement a regularized version of the well-known OFUL algorithm, where the regularization is a square euclidean distance to a bias vector. We first study the benefit of the biased OFUL algorithm in terms of regret minimization. We then propose two strategies to estimate the bias within the learning-to-learn setting. We show both theoretically and experimentally, that when the number of tasks grows and the variance of the task-distribution is small, our strategies have a significant advantage over learning the tasks in isolation.}
}
@InProceedings{pmlr-v119-chai20a,
title = {Description Based Text Classification with Reinforcement Learning},
author = {Chai, Duo and Wu, Wei and Han, Qinghong and Wu, Fei and Li, Jiwei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1371--1382},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chai20a/chai20a.pdf},
url = {http://proceedings.mlr.press/v119/chai20a.html},
abstract = {The task of text classification is usually divided into two stages: text feature extraction and classification. In this standard formalization, categories are merely represented as indexes in the label vocabulary, and the model lacks for explicit instructions on what to classify. Inspired by the current trend of formalizing NLP problems as question answering tasks, we propose a new framework for text classification, in which each category label is associated with a category description. Descriptions are generated by hand-crafted templates or using abstractive/extractive models from reinforcement learning. The concatenation of the description and the text is fed to the classifier to decide whether or not the current label should be assigned to the text. The proposed strategy forces the model to attend to the most salient texts with respect to the label, which can be regarded as a hard version of attention, leading to better performances. We observe significant performance boosts over strong baselines on a wide range of text classification tasks including single-label classification, multi-label classification and multi-aspect sentiment analysis.}
}
@InProceedings{pmlr-v119-chalasani20a,
title = {Concise Explanations of Neural Networks using Adversarial Training},
author = {Chalasani, Prasad and Chen, Jiefeng and Chowdhury, Amrita Roy and Wu, Xi and Jha, Somesh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1383--1391},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chalasani20a/chalasani20a.pdf},
url = {http://proceedings.mlr.press/v119/chalasani20a.html},
abstract = {We show new connections between adversarial learning and explainability for deep neural networks (DNNs). One form of explanation of the output of a neural network model in terms of its input features, is a vector of feature-attributions, which can be generated by various techniques such as Integrated Gradients (IG), DeepSHAP, LIME, and CXPlain. Two desirable characteristics of an attribution-based explanation are: (1) \emph{sparseness}: the attributions of irrelevant or weakly relevant features should be negligible, thus resulting in \emph{concise} explanations in terms of the significant features, and (2) \emph{stability}: it should not vary significantly within a small local neighborhood of the input. Our first contribution is a theoretical exploration of how these two properties (when using IG-based attributions) are related to adversarial training, for a class of 1-layer networks (which includes logistic regression models for binary and multi-class classification); for these networks we show that (a) adversarial training using an $\ell_\infty$-bounded adversary produces models with sparse attribution vectors, and (b) natural model-training while encouraging stable explanations (via an extra term in the loss function), is equivalent to adversarial training. Our second contribution is an empirical verification of phenomenon (a), which we show, somewhat surprisingly, occurs \emph{not only in 1-layer networks, but also DNNs trained on standard image datasets}, and extends beyond IG-based attributions, to those based on DeepSHAP: adversarial training with $\ell_\infty$-bounded perturbations yields significantly sparser attribution vectors, with little degradation in performance on natural test data, compared to natural training. Moreover, the sparseness of the attribution vectors is significantly better than that achievable via $\ell_1$-regularized natural training.}
}
@InProceedings{pmlr-v119-chan20a,
title = {Unlabelled Data Improves {Bayesian} Uncertainty Calibration under Covariate Shift},
author = {Chan, Alex and Alaa, Ahmed and Qian, Zhaozhi and van der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1392--1402},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chan20a/chan20a.pdf},
url = {http://proceedings.mlr.press/v119/chan20a.html},
abstract = {Modern neural networks have proven to be powerful function approximators, providing state-of-the-art performance in a multitude of applications. They however fall short in their ability to quantify confidence in their predictions — this is crucial in high-stakes applications that involve critical decision-making. Bayesian neural networks (BNNs) aim at solving this problem by placing a prior distribution over the network’s parameters, thereby inducing a posterior distribution that encapsulates predictive uncertainty. While existing variants of BNNs based on Monte Carlo dropout produce reliable (albeit approximate) uncertainty estimates over in-distribution data, they tend to exhibit over-confidence in predictions made on target data whose feature distribution differs from the training data, i.e., the covariate shift setup. In this paper, we develop an approximate Bayesian inference scheme based on posterior regularisation, wherein unlabelled target data are used as “pseudo-labels” of model confidence that are used to regularise the model’s loss on labelled source data. We show that this approach significantly improves the accuracy of uncertainty quantification on covariate-shifted data sets, with minimal modification to the underlying model architecture. We demonstrate the utility of our method in the context of transferring prognostic models of prostate cancer across globally diverse populations.}
}
@InProceedings{pmlr-v119-chan20b,
title = {Imputer: Sequence Modelling via Imputation and Dynamic Programming},
author = {Chan, William and Saharia, Chitwan and Hinton, Geoffrey and Norouzi, Mohammad and Jaitly, Navdeep},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1403--1413},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chan20b/chan20b.pdf},
url = {http://proceedings.mlr.press/v119/chan20b.html},
abstract = {This paper presents the Imputer, a neural sequence model that generates output sequences iteratively via imputations. The Imputer is an iterative generation model, requiring only a constant number of generation steps independent of the number of input or output tokens. The Imputer can be trained to approximately marginalize over all possible alignments between the input and output sequences, and all possible generation orders. We present a tractable dynamic programming training algorithm, which yields a lower bound on the log marginal likelihood. When applied to end-to-end speech recognition, the Imputer outperforms prior non-autoregressive models and achieves competitive results to autoregressive models. On LibriSpeech test-other, the Imputer achieves 11.1 WER, outperforming CTC at 13.0 WER and seq2seq at 12.5 WER.}
}
@InProceedings{pmlr-v119-chandak20a,
title = {Optimizing for the Future in Non-Stationary {MDPs}},
author = {Chandak, Yash and Theocharous, Georgios and Shankar, Shiv and White, Martha and Mahadevan, Sridhar and Thomas, Philip},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1414--1425},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chandak20a/chandak20a.pdf},
url = {http://proceedings.mlr.press/v119/chandak20a.html},
abstract = {Most reinforcement learning methods are based upon the key assumption that the transition dynamics and reward functions are fixed, that is, the underlying Markov decision process is stationary. However, in many real-world applications, this assumption is violated, and using existing algorithms may result in a performance lag. To proactively search for a good future policy, we present a policy gradient algorithm that maximizes a forecast of future performance. This forecast is obtained by fitting a curve to the counter-factual estimates of policy performance over time, without explicitly modeling the underlying non-stationarity. The resulting algorithm amounts to a non-uniform reweighting of past data, and we observe that minimizing performance over some of the data from past episodes can be beneficial when searching for a policy that maximizes future performance. We show that our algorithm, called Prognosticator, is more robust to non-stationarity than two online adaptation techniques, on three simulated problems motivated by real-world applications.}
}
@InProceedings{pmlr-v119-chang20a,
title = {Learning to Simulate and Design for Structural Engineering},
author = {Chang, Kai-Hung and Cheng, Chin-Yi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1426--1436},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chang20a/chang20a.pdf},
url = {http://proceedings.mlr.press/v119/chang20a.html},
abstract = {The structural design process for buildings is time-consuming and laborious. To automate this process, structural engineers combine optimization methods with simulation tools to find an optimal design with minimal building mass subject to building regulations. However, structural engineers in practice often avoid optimization and compromise on a suboptimal design for the majority of buildings, due to the large size of the design space, the iterative nature of the optimization methods, and the slow simulation tools. In this work, we formulate the building structures as graphs and create an end-to-end pipeline that can learn to propose the optimal cross-sections of columns and beams by training together with a pre-trained differentiable structural simulator. The performance of the proposed structural designs is comparable to the ones optimized by genetic algorithm (GA), with all the constraints satisfied. The optimal structural design with the reduced the building mass can not only lower the material cost, but also decrease the carbon footprint.}
}
@InProceedings{pmlr-v119-chang20b,
title = {Decentralized Reinforcement Learning: Global Decision-Making via Local Economic Transactions},
author = {Chang, Michael and Kaushik, Sid and Weinberg, S. Matthew and Griffiths, Tom and Levine, Sergey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1437--1447},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chang20b/chang20b.pdf},
url = {http://proceedings.mlr.press/v119/chang20b.html},
abstract = {This paper seeks to establish a framework for directing a society of simple, specialized, self-interested agents to solve what traditionally are posed as monolithic single-agent sequential decision problems. What makes it challenging to use a decentralized approach to collectively optimize a central objective is the difficulty in characterizing the equilibrium strategy profile of non-cooperative games. To overcome this challenge, we design a mechanism for defining the learning environment of each agent for which we know that the optimal solution for the global objective coincides with a Nash equilibrium strategy profile of the agents optimizing their own local objectives. The society functions as an economy of agents that learn the credit assignment process itself by buying and selling to each other the right to operate on the environment state. We derive a class of decentralized reinforcement learning algorithms that are broadly applicable not only to standard reinforcement learning but also for selecting options in semi-MDPs and dynamically composing computation graphs. Lastly, we demonstrate the potential advantages of a society’s inherent modular structure for more efficient transfer learning.}
}
@InProceedings{pmlr-v119-chang20c,
title = {Invariant Rationalization},
author = {Chang, Shiyu and Zhang, Yang and Yu, Mo and Jaakkola, Tommi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1448--1458},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chang20c/chang20c.pdf},
url = {http://proceedings.mlr.press/v119/chang20c.html},
abstract = {Selective rationalization improves neural network interpretability by identifying a small subset of input features {—} the rationale {—} that best explains or supports the prediction. A typical rationalization criterion, i.e. maximum mutual information (MMI), finds the rationale that maximizes the prediction performance based only on the rationale. However, MMI can be problematic because it picks up spurious correlations between the input features and the output. Instead, we introduce a game-theoretic invariant rationalization criterion where the rationales are constrained to enable the same predictor to be optimal across different environments. We show both theoretically and empirically that the proposed rationales can rule out spurious correlations and generalize better to different test scenarios. The resulting explanations also align better with human judgments. Our implementations are publicly available at https://github.com/code-terminator/invariant\_rationalization.}
}
@InProceedings{pmlr-v119-chatterjee20a,
title = {Circuit-Based Intrinsic Methods to Detect Overfitting},
author = {Chatterjee, Satrajit and Mishchenko, Alan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1459--1468},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chatterjee20a/chatterjee20a.pdf},
url = {http://proceedings.mlr.press/v119/chatterjee20a.html},
abstract = {The focus of this paper is on intrinsic methods to detect overfitting. By intrinsic methods, we mean methods that rely only on the model and the training data, as opposed to traditional methods (we call them extrinsic methods) that rely on performance on a test set or on bounds from model complexity. We propose a family of intrinsic methods called Counterfactual Simulation (CFS) which analyze the flow of training examples through the model by identifying and perturbing rare patterns. By applying CFS to logic circuits we get a method that has no hyper-parameters and works uniformly across different types of models such as neural networks, random forests and lookup tables. Experimentally, CFS can separate models with different levels of overfit using only their logic circuit representations without any access to the high level structure. By comparing lookup tables, neural networks, and random forests using CFS, we get insight into why neural networks generalize. In particular, we find that stochastic gradient descent in neural nets does not lead to ``brute force'' memorization, but finds common patterns (whether we train with actual or randomized labels), and neural networks are not unlike forests in this regard. Finally, we identify a limitation with our proposal that makes it unsuitable in an adversarial setting, but points the way to future work on robust intrinsic methods.}
}
@InProceedings{pmlr-v119-chatziafratis20a,
title = {Better depth-width trade-offs for neural networks through the lens of dynamical systems},
author = {Chatziafratis, Vaggos and Nagarajan, Sai Ganesh and Panageas, Ioannis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1469--1478},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chatziafratis20a/chatziafratis20a.pdf},
url = {http://proceedings.mlr.press/v119/chatziafratis20a.html},
abstract = {The expressivity of neural networks as a function of their depth, width and type of activation units has been an important question in deep learning theory. Recently, depth separation results for ReLU networks were obtained via a new connection with dynamical systems, using a generalized notion of fixed points of a continuous map $f$, called periodic points. In this work, we strengthen the connection with dynamical systems and we improve the existing width lower bounds along several aspects. Our first main result is period-specific width lower bounds that hold under the stronger notion of $L^1$-approximation error, instead of the weaker classification error. Our second contribution is that we provide sharper width lower bounds, still yielding meaningful exponential depth-width separations, in regimes where previous results wouldn’t apply. A byproduct of our results is that there exists a universal constant characterizing the depth-width trade-offs, as long as $f$ has odd periods. Technically, our results follow by unveiling a tighter connection between the following three quantities of a given function: its period, its Lipschitz constant and the growth rate of the number of oscillations arising under compositions of the function $f$ with itself.}
}
@InProceedings{pmlr-v119-chaudhary20a,
title = {Explainable and Discourse Topic-aware Neural Language Understanding},
author = {Chaudhary, Yatin and Schuetze, Hinrich and Gupta, Pankaj},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1479--1488},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chaudhary20a/chaudhary20a.pdf},
url = {http://proceedings.mlr.press/v119/chaudhary20a.html},
abstract = {Marrying topic models and language models exposes language understanding to a broader source of document-level context beyond sentences via topics. While introducing topical semantics in language models, existing approaches incorporate latent document topic proportions and ignore topical discourse in sentences of the document. This work extends the line of research by additionally introducing an explainable topic representation in language understanding, obtained from a set of key terms correspondingly for each latent topic of the proportion. Moreover, we retain sentence-topic association along with document-topic association by modeling topical discourse for every sentence in the document. We present a novel neural composite language modeling (NCLM) framework that exploits both the latent and explainable topics along with topical discourse at sentence-level in a joint learning framework of topic and language models. Experiments over a range of tasks such as language modeling, word sense disambiguation, document classification, retrieval and text generation demonstrate ability of the proposed model in improving language understanding.}
}
@InProceedings{pmlr-v119-chauhan20a,
title = {Uncertainty-Aware Lookahead Factor Models for Quantitative Investing},
author = {Chauhan, Lakshay and Alberg, John and Lipton, Zachary},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1489--1499},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chauhan20a/chauhan20a.pdf},
url = {http://proceedings.mlr.press/v119/chauhan20a.html},
abstract = {On a periodic basis, publicly traded companies report fundamentals, financial data including revenue, earnings, debt, among others. Quantitative finance research has identified several factors, functions of the reported data that historically correlate with stock market performance. In this paper, we first show through simulation that if we could select stocks via factors calculated on future fundamentals (via oracle), that our portfolios would far outperform standard factor models. Motivated by this insight, we train deep nets to forecast future fundamentals from a trailing 5-year history. We propose lookahead factor models which plug these predicted future fundamentals into traditional factors. Finally, we incorporate uncertainty estimates from both neural heteroscedastic regression and a dropout-based heuristic, improving performance by adjusting our portfolios to avert risk. In retrospective analysis, we leverage an industry-grade portfolio simulator (backtester) to show simultaneous improvement in annualized return and Sharpe ratio. Specifically, the simulated annualized return for the uncertainty-aware model is 17.7\% (vs 14.0\% for a standard factor model) and the Sharpe ratio is 0.84 (vs 0.52).}
}
@InProceedings{pmlr-v119-chen20a,
title = {Deep Reasoning Networks for Unsupervised Pattern De-mixing with Constraint Reasoning},
author = {Chen, Di and Bai, Yiwei and Zhao, Wenting and Ament, Sebastian and Gregoire, John and Gomes, Carla},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1500--1509},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20a/chen20a.pdf},
url = {http://proceedings.mlr.press/v119/chen20a.html},
abstract = {We introduce Deep Reasoning Networks (DRNets), an end-to-end framework that combines deep learning with constraint reasoning for solving pattern de-mixing problems, typically in an unsupervised or very-weakly-supervised setting. DRNets exploit problem structure and prior knowledge by tightly combining constraint reasoning with stochastic-gradient-based neural network optimization. Our motivating task is from materials discovery and concerns inferring crystal structures of materials from X-ray diffraction data (Crystal-Structure-Phase-Mapping). Given the complexity of its underlying scientific domain, we start by introducing DRNets on an analogous but much simpler task: de-mixing overlapping hand-written Sudokus (Multi-MNIST-Sudoku). On Multi-MNIST-Sudoku, DRNets almost perfectly recovered the mixed Sudokus’ digits, with 100\% digit accuracy, outperforming the supervised state-of-the-art MNIST de-mixing models. On Crystal-Structure-Phase-Mapping, DRNets significantly outperform the state of the art and experts’ capabilities, recovering more precise and physically meaningful crystal structures.}
}
@InProceedings{pmlr-v119-chen20b,
title = {Self-{PU}: Self Boosted and Calibrated Positive-Unlabeled Training},
author = {Chen, Xuxi and Chen, Wuyang and Chen, Tianlong and Yuan, Ye and Gong, Chen and Chen, Kewei and Wang, Zhangyang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1510--1519},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20b/chen20b.pdf},
url = {http://proceedings.mlr.press/v119/chen20b.html},
abstract = {Many real-world applications have to tackle the Positive-Unlabeled (PU) learning problem, i.e., learning binary classifiers from a large amount of unlabeled data and a few labeled positive examples. While current state-of-the-art methods employ importance reweighting to design various biased or unbiased risk estimators, they completely ignored the learning capability of the model itself, which could provide reliable supervision. This motivates us to propose a novel Self-PU learning framework, which seamlessly integrates PU learning and self-training. Self-PU highlights three “self”-oriented building blocks: a self-paced training algorithm that adaptively discovers and augments confident positive/negative examples as the training proceeds; a self-reweighted, instance-aware loss; and a self-distillation scheme that introduces teacher-students learning as an effective regularization for PU learning. We demonstrate the state-of-the-art performance of Self-PU on common PU learning benchmarks (MNIST and CIFAR10), which compare favorably against the latest competitors. Moreover, we study a real-world application of PU learning, i.e., classifying brain images of Alzheimer’s Disease. Self-PU obtains significantly improved results on the renowned Alzheimer’s Disease Neuroimaging Initiative (ADNI) database over existing methods.}
}
@InProceedings{pmlr-v119-chen20c,
title = {Learning To Stop While Learning To Predict},
author = {Chen, Xinshi and Dai, Hanjun and Li, Yu and Gao, Xin and Song, Le},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1520--1530},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20c/chen20c.pdf},
url = {http://proceedings.mlr.press/v119/chen20c.html},
abstract = {There is a recent surge of interest in designing deep architectures based on the update steps in traditional algorithms, or learning neural networks to improve and replace traditional algorithms. While traditional algorithms have certain stopping criteria for outputting results at different iterations, many algorithm-inspired deep models are restricted to a “fixed-depth” for all inputs. Similar to algorithms, the optimal depth of a deep architecture may be different for different input instances, either to avoid “over-thinking”, or because we want to compute less for operations converged already. In this paper, we tackle this varying depth problem using a steerable architecture, where a feed-forward deep model and a variational stopping policy are learned together to sequentially determine the optimal number of layers for each input instance. Training such architecture is very challenging. We provide a variational Bayes perspective and design a novel and effective training procedure which decomposes the task into an oracle model learning stage and an imitation stage. Experimentally, we show that the learned deep model along with the stopping policy improves the performances on a diverse set of tasks, including learning sparse recovery, few-shot meta learning, and computer vision tasks.}
}
@InProceedings{pmlr-v119-chen20d,
title = {Combinatorial Pure Exploration for Dueling Bandit},
author = {Chen, Wei and Du, Yihan and Huang, Longbo and Zhao, Haoyu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1531--1541},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20d/chen20d.pdf},
url = {http://proceedings.mlr.press/v119/chen20d.html},
abstract = {In this paper, we study combinatorial pure exploration for dueling bandits (CPE-DB): we have multiple candidates for multiple positions as modeled by a bipartite graph, and in each round we sample a duel of two candidates on one position and observe who wins in the duel, with the goal of finding the best candidate-position matching with high probability after multiple rounds of samples. CPE-DB is an adaptation of the original combinatorial pure exploration for multi-armed bandit (CPE-MAB) problem to the dueling bandit setting. We consider both the Borda winner and the Condorcet winner cases. For Borda winner, we establish a reduction of the problem to the original CPE-MAB setting and design PAC and exact algorithms that achieve both the sample complexity similar to that in the CPE-MAB setting (which is nearly optimal for a subclass of problems) and polynomial running time per round. For Condorcet winner, we first design a fully polynomial time approximation scheme (FPTAS) for the offline problem of finding the Condorcet winner with known winning probabilities, and then use the FPTAS as an oracle to design a novel pure exploration algorithm CAR-Cond with sample complexity analysis. CAR-Cond is the first algorithm with polynomial running time per round for identifying the Condorcet winner in CPE-DB.}
}
@InProceedings{pmlr-v119-chen20e,
title = {Graph Optimal Transport for Cross-Domain Alignment},
author = {Chen, Liqun and Gan, Zhe and Cheng, Yu and Li, Linjie and Carin, Lawrence and Liu, Jingjing},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1542--1553},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20e/chen20e.pdf},
url = {http://proceedings.mlr.press/v119/chen20e.html},
abstract = {Cross-domain alignment between two sets of entities (e.g., objects in an image, words in a sentence) is fundamental to both computer vision and natural language processing. Existing methods mainly focus on designing advanced attention mechanisms to simulate soft alignment, where no training signals are provided to explicitly encourage alignment. Plus, the learned attention matrices are often dense and difficult to interpret. We propose Graph Optimal Transport (GOT), a principled framework that builds upon recent advances in Optimal Transport (OT). In GOT, cross-domain alignment is formulated as a graph matching problem, by representing entities as a dynamically-constructed graph. Two types of OT distances are considered: (i) Wasserstein distance (WD) for node (entity) matching; and (ii) Gromov-Wasserstein distance (GWD) for edge (structure) matching. Both WD and GWD can be incorporated into existing neural network models, effectively acting as a drop-in regularizer. The inferred transport plan also yields sparse and self-normalized alignment, enhancing the interpretability of the learned model. Experiments show consistent outperformance of GOT over baselines across a wide range of tasks, including image-text retrieval, visual question answering, image captioning, machine translation, and text summarization.}
}
@InProceedings{pmlr-v119-chen20f,
title = {Stabilizing Differentiable Architecture Search via Perturbation-based Regularization},
author = {Chen, Xiangning and Hsieh, Cho-Jui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1554--1565},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20f/chen20f.pdf},
url = {http://proceedings.mlr.press/v119/chen20f.html},
abstract = {Differentiable architecture search (DARTS) is a prevailing NAS solution to identify architectures. Based on the continuous relaxation of the architecture space, DARTS learns a differentiable architecture weight and largely reduces the search cost. However, its stability has been challenged for yielding deteriorating architectures as the search proceeds. We find that the precipitous validation loss landscape, which leads to a dramatic performance drop when distilling the final architecture, is an essential factor that causes instability. Based on this observation, we propose a perturbation-based regularization - SmoothDARTS (SDARTS), to smooth the loss landscape and improve the generalizability of DARTS-based methods. In particular, our new formulations stabilize DARTS-based methods by either random smoothing or adversarial attack. The search trajectory on NAS-Bench-1Shot1 demonstrates the effectiveness of our approach and due to the improved stability, we achieve performance gain across various search spaces on 4 datasets. Furthermore, we mathematically show that SDARTS implicitly regularizes the Hessian norm of the validation loss, which accounts for a smoother loss landscape and improved performance.}
}
@InProceedings{pmlr-v119-chen20g,
title = {Mapping natural-language problems to formal-language solutions using structured neural representations},
author = {Chen, Kezhen and Huang, Qiuyuan and Palangi, Hamid and Smolensky, Paul and Forbus, Ken and Gao, Jianfeng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1566--1575},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20g/chen20g.pdf},
url = {http://proceedings.mlr.press/v119/chen20g.html},
abstract = {Generating formal-language programs represented by relational tuples, such as Lisp programs or mathematical operations, to solve problems stated in natural language is a challenging task because it requires explicitly capturing discrete symbolic structural information implicit in the input. However, most general neural sequence models do not explicitly capture such structural information, limiting their performance on these tasks. In this paper, we propose a new encoder-decoder model based on a structured neural representation, Tensor Product Representations (TPRs), for mapping Natural-language problems to Formal-language solutions, called TP-N2F. The encoder of TP-N2F employs TPR ‘binding’ to encode natural-language symbolic structure in vector space and the decoder uses TPR ‘unbinding’ to generate, in symbolic space, a sequential program represented by relational tuples, each consisting of a relation (or operation) and a number of arguments. TP-N2F considerably outperforms LSTM-based seq2seq models on two benchmarks and creates new state-of-the-art results. Ablation studies show that improvements can be attributed to the use of structured TPRs explicitly in both the encoder and decoder. Analysis of the learned structures shows how TPRs enhance the interpretability of TP-N2F.}
}
@InProceedings{pmlr-v119-chen20h,
title = {Convolutional Kernel Networks for Graph-Structured Data},
author = {Chen, Dexiong and Jacob, Laurent and Mairal, Julien},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1576--1586},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20h/chen20h.pdf},
url = {http://proceedings.mlr.press/v119/chen20h.html},
abstract = {We introduce a family of multilayer graph kernels and establish new links between graph convolutional neural networks and kernel methods. Our approach generalizes convolutional kernel networks to graph-structured data, by representing graphs as a sequence of kernel feature maps, where each node carries information about local graph substructures. On the one hand, the kernel point of view offers an unsupervised, expressive, and easy-to-regularize data representation, which is useful when limited samples are available. On the other hand, our model can also be trained end-to-end on large-scale data, leading to new types of graph convolutional neural networks. We show that our method achieves competitive performance on several graph classification benchmarks, while offering simple model interpretation. Our code is freely available at https://github.com/claying/GCKN.}
}
@InProceedings{pmlr-v119-chen20i,
title = {Learning Flat Latent Manifolds with {VAE}s},
author = {Chen, Nutan and Klushyn, Alexej and Ferroni, Francesco and Bayer, Justin and van der Smagt, Patrick},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1587--1596},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20i/chen20i.pdf},
url = {http://proceedings.mlr.press/v119/chen20i.html},
abstract = {Measuring the similarity between data points often requires domain knowledge, which can in parts be compensated by relying on unsupervised methods such as latent-variable models, where similarity/distance is estimated in a more compact latent space. Prevalent is the use of the Euclidean metric, which has the drawback of ignoring information about similarity of data stored in the decoder, as captured by the framework of Riemannian geometry. We propose an extension to the framework of variational auto-encoders allows learning flat latent manifolds, where the Euclidean metric is a proxy for the similarity between data points. This is achieved by defining the latent space as a Riemannian manifold and by regularising the metric tensor to be a scaled identity matrix. Additionally, we replace the compact prior typically used in variational auto-encoders with a recently presented, more expressive hierarchical one—and formulate the learning problem as a constrained optimisation problem. We evaluate our method on a range of data-sets, including a video-tracking benchmark, where the performance of our unsupervised approach nears that of state-of-the-art supervised approaches, while retaining the computational efficiency of straight-line-based approaches.}
}
@InProceedings{pmlr-v119-chen20j,
title = {A Simple Framework for Contrastive Learning of Visual Representations},
author = {Chen, Ting and Kornblith, Simon and Norouzi, Mohammad and Hinton, Geoffrey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1597--1607},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20j/chen20j.pdf},
url = {http://proceedings.mlr.press/v119/chen20j.html},
abstract = {This paper presents SimCLR: a simple framework for contrastive learning of visual representations. We simplify recently proposed contrastive self-supervised learning algorithms without requiring specialized architectures or a memory bank. In order to understand what enables the contrastive prediction tasks to learn useful representations, we systematically study the major components of our framework. We show that (1) composition of data augmentations plays a critical role in defining effective predictive tasks, (2) introducing a learnable nonlinear transformation between the representation and the contrastive loss substantially improves the quality of the learned representations, and (3) contrastive learning benefits from larger batch sizes and more training steps compared to supervised learning. By combining these findings, we are able to considerably outperform previous methods for self-supervised and semi-supervised learning on ImageNet. A linear classifier trained on self-supervised representations learned by SimCLR achieves 76.5\% top-1 accuracy, which is a 7\% relative improvement over previous state-of-the-art, matching the performance of a supervised ResNet-50. When fine-tuned on only 1\% of the labels, we achieve 85.8\% top-5 accuracy, outperforming AlexNet with 100X fewer labels.}
}
@InProceedings{pmlr-v119-chen20k,
title = {Retro*: Learning Retrosynthetic Planning with Neural Guided {A*} Search},
author = {Chen, Binghong and Li, Chengtao and Dai, Hanjun and Song, Le},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1608--1616},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20k/chen20k.pdf},
url = {http://proceedings.mlr.press/v119/chen20k.html},
abstract = {Retrosynthetic planning is a critical task in organic chemistry which identifies a series of reactions that can lead to the synthesis of a target product. The vast number of possible chemical transformations makes the size of the search space very big, and retrosynthetic planning is challenging even for experienced chemists. However, existing methods either require expensive return estimation by rollout with high variance, or optimize for search speed rather than the quality. In this paper, we propose Retro*, a neural-based A*-like algorithm that finds high-quality synthetic routes efficiently. It maintains the search as an AND-OR tree, and learns a neural search bias with off-policy data. Then guided by this neural network, it performs best-first search efficiently during new planning episodes. Experiments on benchmark USPTO datasets show that, our proposed method outperforms existing state-of-the-art with respect to both the success rate and solution quality, while being more efficient at the same time.}
}
@InProceedings{pmlr-v119-chen20l,
title = {Differentiable Product Quantization for End-to-End Embedding Compression},
author = {Chen, Ting and Li, Lala and Sun, Yizhou},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1617--1626},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20l/chen20l.pdf},
url = {http://proceedings.mlr.press/v119/chen20l.html},
abstract = {Embedding layers are commonly used to map discrete symbols into continuous embedding vectors that reflect their semantic meanings. Despite their effectiveness, the number of parameters in an embedding layer increases linearly with the number of symbols and poses a critical challenge on memory and storage constraints. In this work, we propose a generic and end-to-end learnable compression framework termed differentiable product quantization (DPQ). We present two instantiations of DPQ that leverage different approximation techniques to enable differentiability in end-to-end learning. Our method can readily serve as a drop-in alternative for any existing embedding layer. Empirically, DPQ offers significant compression ratios (14-238X) at negligible or no performance cost on 10 datasets across three different language tasks.}
}
@InProceedings{pmlr-v119-chen20m,
title = {On Efficient Constructions of Checkpoints},
author = {Chen, Yu and Liu, Zhenming and Ren, Bin and Jin, Xin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1627--1636},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20m/chen20m.pdf},
url = {http://proceedings.mlr.press/v119/chen20m.html},
abstract = {Efficient construction of checkpoints/snapshots is a critical tool for training and diagnosing deep learning models. In this paper, we propose a lossy compression scheme for checkpoint constructions (called LC-Checkpoint). LC-Checkpoint simultaneously maximizes the compression rate and optimizes the recovery speed, under the assumption that SGD is used to train the model. LC-Checkpoint uses quantization and priority promotion to store the most crucial information for SGD to recover, and then uses a Huffman coding to leverage the non-uniform distribution of the gradient scales. Our extensive experiments show that LC-Checkpoint achieves a compression rate up to 28{\texttimes} and recovery speedup up to 5.77{\texttimes} over a state-of-the-art algorithm (SCAR).}
}
@InProceedings{pmlr-v119-chen20n,
title = {Angular Visual Hardness},
author = {Chen, Beidi and Liu, Weiyang and Yu, Zhiding and Kautz, Jan and Shrivastava, Anshumali and Garg, Animesh and Anandkumar, Animashree},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1637--1648},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20n/chen20n.pdf},
url = {http://proceedings.mlr.press/v119/chen20n.html},
abstract = {Recent convolutional neural networks (CNNs) have led to impressive performance but often suffer from poor calibration. They tend to be overconfident, with the model confidence not always reflecting the underlying true ambiguity and hardness. In this paper, we propose angular visual hardness (AVH), a score given by the normalized angular distance between the sample feature embedding and the target classifier to measure sample hardness. We validate this score with an in-depth and extensive scientific study, and observe that CNN models with the highest accuracy also have the best AVH scores. This agrees with an earlier finding that state-of-art models improve on the classification of harder examples. We observe that the training dynamics of AVH is vastly different compared to the training loss. Specifically, AVH quickly reaches a plateau for all samples even though the training loss keeps improving. This suggests the need for designing better loss functions that can target harder examples more effectively. We also find that AVH has a statistically significant correlation with human visual hardness. Finally, we demonstrate the benefit of AVH to a variety of applications such as self-training for domain adaptation and domain generalization.}
}
@InProceedings{pmlr-v119-chen20o,
title = {Estimating the Error of Randomized {Newton} Methods: A Bootstrap Approach},
author = {Chen, Jessie X. T. and Lopes, Miles},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1649--1659},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20o/chen20o.pdf},
url = {http://proceedings.mlr.press/v119/chen20o.html},
abstract = {Randomized Newton methods have recently become the focus of intense research activity in large-scale and distributed optimization. In general, these methods are based on a ``computation-accuracy trade-off'', which allows the user to gain scalability in exchange for error in the solution. However, the user does not know how much error is created by the randomized approximation, which can be detrimental in two ways: On one hand, the user may try to assess the unknown error with theoretical worst-case error bounds, but this approach is impractical when the bounds involve unknown constants, and it often leads to excessive computation. On the other hand, the user may select the ``sketch size'' and stopping criteria in a heuristic manner, but this can lead to unreliable results. Motivated by these difficulties, we show how bootstrapping can be used to directly estimate the unknown error, which prevents excessive computation, and offers more confidence about the quality of a randomized solution.}
}
@InProceedings{pmlr-v119-chen20p,
title = {{VFlow}: More Expressive Generative Flows with Variational Data Augmentation},
author = {Chen, Jianfei and Lu, Cheng and Chenli, Biqi and Zhu, Jun and Tian, Tian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1660--1669},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20p/chen20p.pdf},
url = {http://proceedings.mlr.press/v119/chen20p.html},
abstract = {Generative flows are promising tractable models for density modeling that define probabilistic distributions with invertible transformations. However, tractability imposes architectural constraints on generative flows. In this work, we study a previously overlooked constraint that all the intermediate representations must have the same dimensionality with the data due to invertibility, limiting the width of the network. We propose VFlow to tackle this constraint on dimensionality. VFlow augments the data with extra dimensions and defines a maximum evidence lower bound (ELBO) objective for estimating the distribution of augmented data jointly with the variational data augmentation distribution. Under mild assumptions, we show that the maximum ELBO solution of VFlow is always better than the original maximum likelihood solution. For image density modeling on the CIFAR-10 dataset, VFlow achieves a new state-of-the-art 2.98 bits per dimension.}
}
@InProceedings{pmlr-v119-chen20q,
title = {More Data Can Expand The Generalization Gap Between Adversarially Robust and Standard Models},
author = {Chen, Lin and Min, Yifei and Zhang, Mingrui and Karbasi, Amin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1670--1680},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20q/chen20q.pdf},
url = {http://proceedings.mlr.press/v119/chen20q.html},
abstract = {Despite remarkable success in practice, modern machine learning models have been found to be susceptible to adversarial attacks that make human-imperceptible perturbations to the data, but result in serious and potentially dangerous prediction errors. To address this issue, practitioners often use adversarial training to learn models that are robust against such attacks at the cost of higher generalization error on unperturbed test sets. The conventional wisdom is that more training data should shrink the gap between the generalization error of adversarially-trained models and standard models. However, we study the training of robust classifiers for both Gaussian and Bernoulli models under $\ell_\infty$ attacks, and we prove that more data may actually increase this gap. Furthermore, our theoretical results identify if and when additional data will finally begin to shrink the gap. Lastly, we experimentally demonstrate that our results also hold for linear regression models, which may indicate that this phenomenon occurs more broadly.}
}
@InProceedings{pmlr-v119-chen20r,
title = {An Accelerated {DFO} Algorithm for Finite-sum Convex Functions},
author = {Chen, Yuwen and Orvieto, Antonio and Lucchi, Aurelien},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1681--1690},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20r/chen20r.pdf},
url = {http://proceedings.mlr.press/v119/chen20r.html},
abstract = {Derivative-free optimization (DFO) has recently gained a lot of momentum in machine learning, spawning interest in the community to design faster methods for problems where gradients are not accessible. While some attention has been given to the concept of acceleration in the DFO literature, existing stochastic algorithms for objective functions with a finite-sum structure have not been shown theoretically to achieve an accelerated rate of convergence. Algorithms that use acceleration in such a setting are prone to instabilities, making it difficult to reach convergence. In this work, we exploit the finite-sum structure of the objective in order to design a variance-reduced DFO algorithm that provably yields acceleration. We prove rates of convergence for both smooth convex and strongly-convex finite-sum objective functions. Finally, we validate our theoretical results empirically on several tasks and datasets.}
}
@InProceedings{pmlr-v119-chen20s,
title = {Generative Pretraining From Pixels},
author = {Chen, Mark and Radford, Alec and Child, Rewon and Wu, Jeffrey and Jun, Heewoo and Luan, David and Sutskever, Ilya},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1691--1703},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20s/chen20s.pdf},
url = {http://proceedings.mlr.press/v119/chen20s.html},
abstract = {Inspired by progress in unsupervised representation learning for natural language, we examine whether similar models can learn useful representations for images. We train a sequence Transformer to auto-regressively predict pixels, without incorporating knowledge of the 2D input structure. Despite training on low-resolution ImageNet without labels, we find that a GPT-2 scale model learns strong image representations as measured by linear probing, fine-tuning, and low-data classification. On CIFAR-10, we achieve 96.3\% accuracy with a linear probe, outperforming a supervised Wide ResNet, and 99.0\% accuracy with full fine-tuning, matching the top supervised pre-trained models. We are also competitive with self-supervised benchmarks on ImageNet when substituting pixels for a VQVAE encoding, achieving 69.0\% top-1 accuracy on a linear probe of our features.}
}
@InProceedings{pmlr-v119-chen20t,
title = {Negative Sampling in Semi-Supervised Learning},
author = {Chen, John and Shah, Vatsal and Kyrillidis, Anastasios},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1704--1714},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20t/chen20t.pdf},
url = {http://proceedings.mlr.press/v119/chen20t.html},
abstract = {We introduce Negative Sampling in Semi-Supervised Learning (NS$^3$L), a simple, fast, easy to tune algorithm for semi-supervised learning (SSL). NS$^3$L is motivated by the success of negative sampling/contrastive estimation. We demonstrate that adding the NS$^3$L loss to state-of-the-art SSL algorithms, such as the Virtual Adversarial Training (VAT), significantly improves upon vanilla VAT and its variant, VAT with Entropy Minimization. By adding the NS$^3$L loss to MixMatch, the current state-of-the-art approach on semi-supervised tasks, we observe significant improvements over vanilla MixMatch. We conduct extensive experiments on the CIFAR10, CIFAR100, SVHN and STL10 benchmark datasets. Finally, we perform an ablation study for NS$^3$L regarding its hyperparameter tuning.}
}
@InProceedings{pmlr-v119-chen20u,
title = {Optimization from Structured Samples for Coverage Functions},
author = {Chen, Wei and Sun, Xiaoming and Zhang, Jialin and Zhang, Zhijie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1715--1724},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20u/chen20u.pdf},
url = {http://proceedings.mlr.press/v119/chen20u.html},
abstract = {We revisit the optimization from samples (OPS) model, which studies the problem of optimizing objective functions directly from the sample data. Previous results showed that we cannot obtain a constant approximation ratio for the maximum coverage problem using polynomially many independent samples of the form $\{S_i, f(S_i)\}_{i=1}^t$ (Balkanski et al., 2017), even if coverage functions are $(1 - \epsilon)$-PMAC learnable using these samples (Badanidiyuru et al., 2012), which means most of the function values can be approximately learned very well with high probability. In this work, to circumvent the impossibility result of OPS, we propose a stronger model called optimization from structured samples (OPSS) for coverage functions, where the data samples encode the structural information of the functions. We show that under three general assumptions on the sample distributions, we can design efficient OPSS algorithms that achieve a constant approximation for the maximum coverage problem. We further prove a constant lower bound under these assumptions, which is tight when not considering computational efficiency. Moreover, we also show that if we remove any one of the three assumptions, OPSS for the maximum coverage problem has no constant approximation.}
}
@InProceedings{pmlr-v119-chen20v,
title = {Simple and Deep Graph Convolutional Networks},
author = {Chen, Ming and Wei, Zhewei and Huang, Zengfeng and Ding, Bolin and Li, Yaliang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1725--1735},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20v/chen20v.pdf},
url = {http://proceedings.mlr.press/v119/chen20v.html},
abstract = {Graph convolutional networks (GCNs) are a powerful deep learning approach for graph-structured data. Recently, GCNs and subsequent variants have shown superior performance in various application areas on real-world datasets. Despite their success, most of the current GCN models are shallow, due to the \emph{over-smoothing} problem. In this paper, we study the problem of designing and analyzing deep graph convolutional networks. We propose the GCNII, an extension of the vanilla GCN model with two simple yet effective techniques: \emph{Initial residual} and \emph{Identity mapping}. We provide theoretical and empirical evidence that the two techniques effectively relieves the problem of over-smoothing. Our experiments show that the deep GCNII model outperforms the state-of-the-art methods on various semi- and full-supervised tasks.}
}
@InProceedings{pmlr-v119-chen20w,
title = {On Breaking Deep Generative Model-based Defenses and Beyond},
author = {Chen, Yanzhi and Xie, Renjie and Zhu, Zhanxing},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1736--1745},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20w/chen20w.pdf},
url = {http://proceedings.mlr.press/v119/chen20w.html},
abstract = {Deep neural networks have been proven to be vulnerable to the so-called adversarial attacks. Recently there have been efforts to defend such attacks with deep generative models. These defenses often predict by inverting the deep generative models rather than simple feedforward propagation. Such defenses are difficult to attack due to the obfuscated gradients caused by inversion. In this work, we propose a new white-box attack to break these defenses. The idea is to view the inversion phase as a dynamical system, through which we extract the gradient w.r.t the image by backtracking its trajectory. An amortized strategy is also developed to accelerate the attack. Experiments show that our attack better breaks state-of-the-art defenses (e.g DefenseGAN, ABS) than other attacks (e.g BPDA). Additionally, our empirical results provide insights for understanding the weaknesses of deep generative model defenses.}
}
@InProceedings{pmlr-v119-chen20x,
title = {Automated Synthetic-to-Real Generalization},
author = {Chen, Wuyang and Yu, Zhiding and Wang, Zhangyang and Anandkumar, Animashree},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1746--1756},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20x/chen20x.pdf},
url = {http://proceedings.mlr.press/v119/chen20x.html},
abstract = {Models trained on synthetic images often face degraded generalization to real data. As a convention, these models are often initialized with ImageNet pretrained representation. Yet the role of ImageNet knowledge is seldom discussed despite common practices that leverage this knowledge to maintain the generalization ability. An example is the careful hand-tuning of early stopping and layer-wise learning rates, which is shown to improve synthetic-to-real generalization but is also laborious and heuristic. In this work, we explicitly encourage the synthetically trained model to maintain similar representations with the ImageNet pretrained model, and propose a \emph{learning-to-optimize (L2O)} strategy to automate the selection of layer-wise learning rates. We demonstrate that the proposed framework can significantly improve the synthetic-to-real generalization performance without seeing and training on real data, while also benefiting downstream tasks such as domain adaptation. Code is available at: https://github.com/NVlabs/ASG.}
}
@InProceedings{pmlr-v119-chen20y,
title = {({Locally}) Differentially Private Combinatorial Semi-Bandits},
author = {Chen, Xiaoyu and Zheng, Kai and Zhou, Zixin and Yang, Yunchang and Chen, Wei and Wang, Liwei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1757--1767},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chen20y/chen20y.pdf},
url = {http://proceedings.mlr.press/v119/chen20y.html},
abstract = {In this paper, we study Combinatorial Semi-Bandits (CSB) that is an extension of classic Multi-Armed Bandits (MAB) under Differential Privacy (DP) and stronger Local Differential Privacy (LDP) setting. Since the server receives more information from users in CSB, it usually causes additional dependence on the dimension of data, which is a notorious side-effect for privacy preserving learning. However for CSB under two common smoothness assumptions, we show it is possible to remove this side-effect. In detail, for $B_{\infty}$-bounded smooth CSB under either $\varepsilon$-LDP or $\varepsilon$-DP, we prove the optimal regret bound is $\Theta(\frac{mB^2_{\infty}\ln T } {\Delta\varepsilon^2})$ or $\tilde{\Theta}(\frac{mB^2_{\infty}\ln T} { \Delta\varepsilon})$ respectively, where $T$ is time period, $\Delta$ is the gap of rewards and $m$ is the number of base arms, by proposing novel algorithms and matching lower bounds. For $B_1$-bounded smooth CSB under $\varepsilon$-DP, we also prove the optimal regret bound is $\tilde{\Theta}(\frac{mKB^2_1\ln T} {\Delta\varepsilon})$ with both upper bound and lower bound, where $K$ is the maximum number of feedback in each round. All above results nearly match corresponding non-private optimal rates, which imply there is no additional price for (locally) differentially private CSB in above common settings.}
}
@InProceedings{pmlr-v119-cheng20a,
title = {High-dimensional Robust Mean Estimation via Gradient Descent},
author = {Cheng, Yu and Diakonikolas, Ilias and Ge, Rong and Soltanolkotabi, Mahdi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1768--1778},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20a/cheng20a.pdf},
url = {http://proceedings.mlr.press/v119/cheng20a.html},
abstract = {We study the problem of high-dimensional robust mean estimation in the presence of a constant fraction of adversarial outliers. A recent line of work has provided sophisticated polynomial-time algorithms for this problem with dimension-independent error guarantees for a range of natural distribution families. In this work, we show that a natural non-convex formulation of the problem can be solved directly by gradient descent. Our approach leverages a novel structural lemma, roughly showing that any approximate stationary point of our non-convex objective gives a near-optimal solution to the underlying robust estimation task. Our work establishes an intriguing connection between algorithmic high-dimensional robust statistics and non-convex optimization, which may have broader applications to other robust estimation tasks.}
}
@InProceedings{pmlr-v119-cheng20b,
title = {{CLUB}: A Contrastive Log-ratio Upper Bound of Mutual Information},
author = {Cheng, Pengyu and Hao, Weituo and Dai, Shuyang and Liu, Jiachang and Gan, Zhe and Carin, Lawrence},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1779--1788},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20b/cheng20b.pdf},
url = {http://proceedings.mlr.press/v119/cheng20b.html},
abstract = {Mutual information (MI) minimization has gained considerable interests in various machine learning tasks. However, estimating and minimizing MI in high-dimensional spaces remains a challenging problem, especially when only samples, rather than distribution forms, are accessible. Previous works mainly focus on MI lower bound approximation, which is not applicable to MI minimization problems. In this paper, we propose a novel Contrastive Log-ratio Upper Bound (CLUB) of mutual information. We provide a theoretical analysis of the properties of CLUB and its variational approximation. Based on this upper bound, we introduce a MI minimization training scheme and further accelerate it with a negative sampling strategy. Simulation studies on Gaussian distributions show the reliable estimation ability of CLUB. Real-world MI minimization experiments, including domain adaptation and information bottleneck, demonstrate the effectiveness of the proposed method. The code is at https://github.com/Linear95/CLUB.}
}
@InProceedings{pmlr-v119-cheng20c,
title = {Learning with Bounded Instance and Label-dependent Label Noise},
author = {Cheng, Jiacheng and Liu, Tongliang and Ramamohanarao, Kotagiri and Tao, Dacheng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1789--1799},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20c/cheng20c.pdf},
url = {http://proceedings.mlr.press/v119/cheng20c.html},
abstract = {Instance- and Label-dependent label Noise (ILN) widely exists in real-world datasets but has been rarely studied. In this paper, we focus on Bounded Instance- and Label-dependent label Noise (BILN), a particular case of ILN where the label noise rates---the probabilities that the true labels of examples flip into the corrupted ones---have upper bound less than $1$. Specifically, we introduce the concept of distilled examples, i.e. examples whose labels are identical with the labels assigned for them by the Bayes optimal classifier, and prove that under certain conditions classifiers learnt on distilled examples will converge to the Bayes optimal classifier. Inspired by the idea of learning with distilled examples, we then propose a learning algorithm with theoretical guarantees for its robustness to BILN. At last, empirical evaluations on both synthetic and real-world datasets show effectiveness of our algorithm in learning with BILN.}
}
@InProceedings{pmlr-v119-cheng20d,
title = {Mutual Transfer Learning for Massive Data},
author = {Cheng, Ching-Wei and Qiao, Xingye and Cheng, Guang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1800--1809},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20d/cheng20d.pdf},
url = {http://proceedings.mlr.press/v119/cheng20d.html},
abstract = {In the transfer learning problem, the target and the source data domains are typically known. In this article, we study a new paradigm called mutual transfer learning where among many heterogeneous data domains, every data domain could potentially be the target of interest, and it could also be a useful source to help the learning in other data domains. However, it is important to note that given a target not every data domain can be a successful source; only data sets that are similar enough to be thought as from the same population can be useful sources for each other. Under this mutual learnability assumption, a confidence distribution fusion approach is proposed to recover the mutual learnability relation in the transfer learning regime. Our proposed method achieves the same oracle statistical inferential accuracy as if the true learnability structure were known. It can be implemented in an efficient parallel fashion to deal with large-scale data. Simulated and real examples are analyzed to illustrate the usefulness of the proposed method.}
}
@InProceedings{pmlr-v119-cheng20e,
title = {Stochastic Gradient and {Langevin} Processes},
author = {Cheng, Xiang and Yin, Dong and Bartlett, Peter and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1810--1819},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20e/cheng20e.pdf},
url = {http://proceedings.mlr.press/v119/cheng20e.html},
abstract = {We prove quantitative convergence rates at which discrete Langevin-like processes converge to the invariant distribution of a related stochastic differential equation. We study the setup where the additive noise can be non-Gaussian and state-dependent and the potential function can be non-convex. We show that the key properties of these processes depend on the potential function and the second moment of the additive noise. We apply our theoretical findings to studying the convergence of Stochastic Gradient Descent (SGD) for non-convex problems and corroborate them with experiments using SGD to train deep neural networks on the CIFAR-10 dataset.}
}
@InProceedings{pmlr-v119-cherian20a,
title = {Representation Learning via Adversarially-Contrastive Optimal Transport},
author = {Cherian, Anoop and Aeron, Shuchin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1820--1830},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cherian20a/cherian20a.pdf},
url = {http://proceedings.mlr.press/v119/cherian20a.html},
abstract = {In this paper, we study the problem of learning compact (low-dimensional) representations for sequential data that captures its implicit spatio-temporal cues. To maximize extraction of such informative cues from the data, we set the problem within the context of contrastive representation learning and to that end propose a novel objective via optimal transport. Specifically, our formulation seeks a low-dimensional subspace representation of the data that jointly (i) maximizes the distance of the data (embedded in this subspace) from an adversarial data distribution under the optimal transport, a.k.a. the Wasserstein distance, (ii) captures the temporal order, and (iii) minimizes the data distortion. To generate the adversarial distribution, we propose a novel framework connecting Wasserstein GANs with a classifier, allowing a principled mechanism for producing good negative distributions for contrastive learning, which is currently a challenging problem. Our full objective is cast as a subspace learning problem on the Grassmann manifold and solved via Riemannian optimization. To empirically study our formulation, we provide experiments on the task of human action recognition in video sequences. Our results demonstrate competitive performance against challenging baselines.}
}
@InProceedings{pmlr-v119-cherief-abdellatif20a,
title = {Convergence Rates of Variational Inference in Sparse Deep Learning},
author = {Ch{\'e}rief-Abdellatif, Badr-Eddine},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1831--1842},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cherief-abdellatif20a/cherief-abdellatif20a.pdf},
url = {http://proceedings.mlr.press/v119/cherief-abdellatif20a.html},
abstract = {Variational inference is becoming more and more popular for approximating intractable posterior distributions in Bayesian statistics and machine learning. Meanwhile, a few recent works have provided theoretical justification and new insights on deep neural networks for estimating smooth functions in usual settings such as nonparametric regression. In this paper, we show that variational inference for sparse deep learning retains precisely the same generalization properties than exact Bayesian inference. In particular, we show that a wise choice of the neural network architecture leads to near-minimax rates of convergence for H{\"o}lder smooth functions. Additionally, we show that the model selection framework over the architecture of the network via ELBO maximization does not overfit and adaptively achieves the optimal rate of convergence.}
}
@InProceedings{pmlr-v119-cheung20a,
title = {Reinforcement Learning for Non-Stationary {Markov} Decision Processes: The Blessing of ({More}) Optimism},
author = {Cheung, Wang Chi and Simchi-Levi, David and Zhu, Ruihao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1843--1854},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheung20a/cheung20a.pdf},
url = {http://proceedings.mlr.press/v119/cheung20a.html},
abstract = {We consider un-discounted reinforcement learning (RL) in Markov decision processes (MDPs) under drifting non-stationarity, i.e., both the reward and state transition distributions are allowed to evolve over time, as long as their respective total variations, quantified by suitable metrics, do not exceed certain \emph{variation budgets}. We first develop the Sliding Window Upper-Confidence bound for Reinforcement Learning with Confidence Widening (\texttt{SWUCRL2-CW}) algorithm, and establish its dynamic regret bound when the variation budgets are known. In addition, we propose the Bandit-over-Reinforcement Learning (\texttt{BORL}) algorithm to adaptively tune the \texttt{SWUCRL2-CW} algorithm to achieve the same dynamic regret bound, but in a \emph{parameter-free} manner, i.e., without knowing the variation budgets. Notably, learning drifting MDPs via conventional optimistic exploration presents a unique challenge absent in existing (non-stationary) bandit learning settings. We overcome the challenge by a novel confidence widening technique that incorporates additional optimism.}
}
@InProceedings{pmlr-v119-chhaya20a,
title = {Streaming Coresets for Symmetric Tensor Factorization},
author = {Chhaya, Rachit and Choudhari, Jayesh and Dasgupta, Anirban and Shit, Supratim},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1855--1865},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chhaya20a/chhaya20a.pdf},
url = {http://proceedings.mlr.press/v119/chhaya20a.html},
abstract = {Factorizing tensors has recently become an important optimization module in a number of machine learning pipelines, especially in latent variable models. We show how to do this efficiently in the streaming setting. Given a set of $n$ vectors, each in $\mathbb{R}^d$, we present algorithms to select a sublinear number of these vectors as coreset, while guaranteeing that the CP decomposition of the $p$-moment tensor of the coreset approximates the corresponding decomposition of the $p$-moment tensor computed from the full data. We introduce two novel algorithmic techniques: online filtering and kernelization. Using these two, we present four algorithms that achieve different tradeoffs of coreset size, update time and working space, beating or matching various state of the art algorithms. In the case of matrices (2-ordered tensor), our online row sampling algorithm guarantees $(1 \pm \epsilon)$ relative error spectral approximation. We show applications of our algorithms in learning single topic modeling.}
}
@InProceedings{pmlr-v119-chhaya20b,
title = {On Coresets for Regularized Regression},
author = {Chhaya, Rachit and Dasgupta, Anirban and Shit, Supratim},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1866--1876},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chhaya20b/chhaya20b.pdf},
url = {http://proceedings.mlr.press/v119/chhaya20b.html},
abstract = {We study the effect of norm based regularization on the size of coresets for regression problems. Specifically, given a matrix $ \mathbf{A} \in {\mathbb{R}}^{n \times d}$ with $n\gg d$ and a vector $\mathbf{b} \in \mathbb{R} ^ n $ and $\lambda > 0$, we analyze the size of coresets for regularized versions of regression of the form $\|\mathbf{Ax}-\mathbf{b}\|_p^r + \lambda\|{\mathbf{x}}\|_q^s$. Prior work has shown that for ridge regression (where $p,q,r,s=2$) we can obtain a coreset that is smaller than the coreset for the unregularized counterpart i.e. least squares regression (Avron et al., 2017). We show that when $r \neq s$, no coreset for regularized regression can have size smaller than the optimal coreset of the unregularized version. The well known lasso problem falls under this category and hence does not allow a coreset smaller than the one for least squares regression. We propose a modified version of the lasso problem and obtain for it a coreset of size smaller than the least square regression. We empirically show that the modified version of lasso also induces sparsity in solution, similar to the original lasso. We also obtain smaller coresets for $\ell_p$ regression with $\ell_p$ regularization. We extend our methods to multi response regularized regression. Finally, we empirically demonstrate the coreset performance for the modified lasso and the $\ell_1$ regression with $\ell_1$ regularization.}
}
@InProceedings{pmlr-v119-chiplunkar20a,
title = {How to Solve Fair k-Center in Massive Data Models},
author = {Chiplunkar, Ashish and Kale, Sagar and Ramamoorthy, Sivaramakrishnan Natarajan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1877--1886},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chiplunkar20a/chiplunkar20a.pdf},
url = {http://proceedings.mlr.press/v119/chiplunkar20a.html},
abstract = {Fueled by massive data, important decision making is being automated with the help of algorithms, therefore, fairness in algorithms has become an especially important research topic. In this work, we design new streaming and distributed algorithms for the fair k-center problem that models fair data summarization. The streaming and distributed models of computation have an attractive feature of being able to handle massive data sets that do not fit into main memory. Our main contributions are: (a) the first distributed algorithm; which has provably constant approximation ratio and is extremely parallelizable, and (b) a two-pass streaming algorithm with a provable approximation guarantee matching the best known algorithm (which is not a streaming algorithm). Our algorithms have the advantages of being easy to implement in practice, being fast with linear running times, having very small working memory and communication, and outperforming existing algorithms on several real and synthetic data sets. To complement our distributed algorithm, we also give a hardness result for natural distributed algorithms, which holds for even the special case of k-center.}
}
@InProceedings{pmlr-v119-choi20a,
title = {Fair Generative Modeling via Weak Supervision},
author = {Choi, Kristy and Grover, Aditya and Singh, Trisha and Shu, Rui and Ermon, Stefano},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1887--1898},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/choi20a/choi20a.pdf},
url = {http://proceedings.mlr.press/v119/choi20a.html},
abstract = {Real-world datasets are often biased with respect to key demographic factors such as race and gender. Due to the latent nature of the underlying factors, detecting and mitigating bias is especially challenging for unsupervised machine learning. We present a weakly supervised algorithm for overcoming dataset bias for deep generative models. Our approach requires access to an additional small, unlabeled reference dataset as the supervision signal, thus sidestepping the need for explicit labels on the underlying bias factors. Using this supplementary dataset, we detect the bias in existing datasets via a density ratio technique and learn generative models which efficiently achieve the twin goals of: 1) data efficiency by using training examples from both biased and reference datasets for learning; and 2) data generation close in distribution to the reference dataset at test time. Empirically, we demonstrate the efficacy of our approach which reduces bias w.r.t. latent factors by an average of up to 34.6\% over baselines for comparable image generation using generative adversarial networks.}
}
@InProceedings{pmlr-v119-choi20b,
title = {Encoding Musical Style with Transformer Autoencoders},
author = {Choi, Kristy and Hawthorne, Curtis and Simon, Ian and Dinculescu, Monica and Engel, Jesse},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1899--1908},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/choi20b/choi20b.pdf},
url = {http://proceedings.mlr.press/v119/choi20b.html},
abstract = {We consider the problem of learning high-level controls over the global structure of generated sequences, particularly in the context of symbolic music generation with complex language models. In this work, we present the Transformer autoencoder, which aggregates encodings of the input data across time to obtain a global representation of style from a given performance. We show it is possible to combine this global representation with other temporally distributed embeddings, enabling improved control over the separate aspects of performance style and melody. Empirically, we demonstrate the effectiveness of our method on various music generation tasks on the MAESTRO dataset and a YouTube dataset with 10,000+ hours of piano performances, where we achieve improvements in terms of log-likelihood and mean listening scores as compared to baselines.}
}
@InProceedings{pmlr-v119-choo20a,
title = {k-means++: few more steps yield constant approximation},
author = {Choo, Davin and Grunau, Christoph and Portmann, Julian and Rozhon, Vaclav},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1909--1917},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/choo20a/choo20a.pdf},
url = {http://proceedings.mlr.press/v119/choo20a.html},
abstract = {The k-means++ algorithm of Arthur and Vassilvitskii (SODA 2007) is a state-of-the-art algorithm for solving the k-means clustering problem and is known to give an O(log k) approximation. Recently, Lattanzi and Sohler (ICML 2019) proposed augmenting k-means++ with O(k log log k) local search steps to yield a constant approximation (in expectation) to the k-means clustering problem. In this paper, we improve their analysis to show that, for any arbitrarily small constant epsilon > 0, with only epsilon * k additional local search steps, one can achieve a constant approximation guarantee (with high probability in k), resolving an open problem in their paper.}
}
@InProceedings{pmlr-v119-choromanski20a,
title = {Stochastic Flows and Geometric Optimization on the Orthogonal Group},
author = {Choromanski, Krzysztof and Cheikhi, David and Davis, Jared and Likhosherstov, Valerii and Nazaret, Achille and Bahamou, Achraf and Song, Xingyou and Akarte, Mrugank and Parker-Holder, Jack and Bergquist, Jacob and Gao, Yuan and Pacchiano, Aldo and Sarlos, Tamas and Weller, Adrian and Sindhwani, Vikas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1918--1928},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/choromanski20a/choromanski20a.pdf},
url = {http://proceedings.mlr.press/v119/choromanski20a.html},
abstract = {We present a new class of stochastic, geometrically-driven optimization algorithms on the orthogonal group O(d) and naturally reductive homogeneous manifolds obtained from the action of the rotation group SO(d). We theoretically and experimentally demonstrate that our methods can be applied in various fields of machine learning including deep, convolutional and recurrent neural networks, reinforcement learning, normalizing flows and metric learning. We show an intriguing connection between efficient stochastic optimization on the orthogonal group and graph theory (e.g. matching problem, partition functions over graphs, graph-coloring). We leverage the theory of Lie groups and provide theoretical results for the designed class of algorithms. We demonstrate broad applicability of our methods by showing strong performance on the seemingly unrelated tasks of learning world models to obtain stable policies for the most difficult Humanoid agent from OpenAI Gym and improving convolutional neural networks.}
}
@InProceedings{pmlr-v119-chou20a,
title = {Unbiased Risk Estimators Can Mislead: A Case Study of Learning with Complementary Labels},
author = {Chou, Yu-Ting and Niu, Gang and Lin, Hsuan-Tien and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1929--1938},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chou20a/chou20a.pdf},
url = {http://proceedings.mlr.press/v119/chou20a.html},
abstract = {In weakly supervised learning, unbiased risk estimator (URE) is a powerful tool for training classifiers when training and test data are drawn from different distributions. Nevertheless, UREs lead to overfitting in many problem settings when the models are complex like deep networks. In this paper, we investigate reasons for such overfitting by studying a weakly supervised problem called learning with complementary labels. We argue the quality of gradient estimation matters more in risk minimization. Theoretically, we show that a URE gives an unbiased gradient estimator (UGE). Practically, however, UGEs may suffer from huge variance, which causes empirical gradients to be usually far away from true gradients during minimization. To this end, we propose a novel surrogate complementary loss (SCL) framework that trades zero bias with reduced variance and makes empirical gradients more aligned with true gradients in the direction. Thanks to this characteristic, SCL successfully mitigates the overfitting issue and improves URE-based methods.}
}
@InProceedings{pmlr-v119-chowdhury20a,
title = {Data-Dependent Differentially Private Parameter Learning for Directed Graphical Models},
author = {Chowdhury, Amrita Roy and Rekatsinas, Theodoros and Jha, Somesh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1939--1951},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chowdhury20a/chowdhury20a.pdf},
url = {http://proceedings.mlr.press/v119/chowdhury20a.html},
abstract = {Directed graphical models (DGMs) are a class of probabilistic models that are widely used for predictive analysis in sensitive domains such as medical diagnostics. In this paper, we present an algorithm for differentially-private learning of the parameters of a DGM. Our solution optimizes for the utility of inference queries over the DGM and \emph{adds noise that is customized to the properties of the private input dataset and the graph structure of the DGM}. To the best of our knowledge, this is the first explicit data-dependent privacy budget allocation algorithm in the context of DGMs. We compare our algorithm with a standard data-independent approach over a diverse suite of benchmarks and demonstrate that our solution requires a privacy budget that is roughly $3\times$ smaller to obtain the same or higher utility.}
}
@InProceedings{pmlr-v119-chrysakis20a,
title = {Online Continual Learning from Imbalanced Data},
author = {Chrysakis, Aristotelis and Moens, Marie-Francine},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1952--1961},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chrysakis20a/chrysakis20a.pdf},
url = {http://proceedings.mlr.press/v119/chrysakis20a.html},
abstract = {A well-documented weakness of neural networks is the fact that they suffer from catastrophic forgetting when trained on data provided by a non-stationary distribution. Recent work in the field of continual learning attempts to understand and overcome this issue. Unfortunately, the majority of relevant work embraces the implicit assumption that the distribution of observed data is perfectly balanced, despite the fact that, in the real world, humans and animals learn from observations that are temporally correlated and severely imbalanced. Motivated by this remark, we aim to evaluate memory population methods that are used in online continual learning, when dealing with highly imbalanced and temporally correlated streams of data. More importantly, we introduce a new memory population approach, which we call class-balancing reservoir sampling (CBRS). We demonstrate that CBRS outperforms the state-of-the-art memory population algorithms in a considerably challenging learning setting, over a range of different datasets, and for multiple architectures.}
}
@InProceedings{pmlr-v119-chu20a,
title = {Distance Metric Learning with Joint Representation Diversification},
author = {Chu, Xu and Lin, Yang and Wang, Yasha and Wang, Xiting and Yu, Hailong and Gao, Xin and Tong, Qi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1962--1973},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chu20a/chu20a.pdf},
url = {http://proceedings.mlr.press/v119/chu20a.html},
abstract = {Distance metric learning (DML) is to learn a representation space equipped with a metric, such that similar examples are closer than dissimilar examples concerning the metric. The recent success of DNNs motivates many DML losses that encourage the intra-class compactness and inter-class separability. The trade-off between inter-class compactness and inter-class separability shapes the DML representation space by determining how much information of the original inputs to retain. In this paper, we propose a Distance Metric Learning with Joint Representation Diversification (JRD) that allows a better balancing point between intra-class compactness and inter-class separability. Specifically, we propose a Joint Representation Similarity regularizer that captures different abstract levels of invariant features and diversifies the joint distributions of representations across multiple layers. Experiments on three deep DML benchmark datasets demonstrate the effectiveness of the proposed approach.}
}
@InProceedings{pmlr-v119-chu20b,
title = {Semismooth {N}ewton Algorithm for Efficient Projections onto {$\ell_{1, \infty}$}-norm Ball},
author = {Chu, Dejun and Zhang, Changshui and Sun, Shiliang and Tao, Qing},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1974--1983},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chu20b/chu20b.pdf},
url = {http://proceedings.mlr.press/v119/chu20b.html},
abstract = {The structured sparsity-inducing $\ell_{1, \infty}$-norm, as a generalization of the classical $\ell_1$-norm, plays an important role in jointly sparse models which select or remove simultaneously all the variables forming a group. However, its resulting problem is more difficult to solve than the conventional $\ell_1$-norm constrained problem. In this paper, we propose an efficient algorithm for Euclidean projection onto $\ell_{1, \infty}$-norm ball. We tackle the projection problem via semismooth Newton algorithm to solve the system of semismooth equations. Meanwhile, exploiting the structure of the Jacobian matrix via LU decomposition yields an equivalent algorithm which is proved to terminate after a finite number of iterations. Empirical studies demonstrate that our proposed algorithm outperforms the existing state-of-the-art solver and is promising for the optimization of learning problems with the $\ell_{1, \infty}$-norm ball constraint.}
}
@InProceedings{pmlr-v119-chuang20a,
title = {Estimating Generalization under Distribution Shifts via Domain-Invariant Representations},
author = {Chuang, Ching-Yao and Torralba, Antonio and Jegelka, Stefanie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1984--1994},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chuang20a/chuang20a.pdf},
url = {http://proceedings.mlr.press/v119/chuang20a.html},
abstract = {When machine learning models are deployed on a test distribution different from the training distribution, they can perform poorly, but overestimate their performance. In this work, we aim to better estimate a model’s performance under distribution shift, without supervision. To do so, we use a set of domain-invariant predictors as a proxy for the unknown, true target labels. Since the error of the resulting risk estimate depends on the target risk of the proxy model, we study generalization of domain-invariant representations and show that the complexity of the latent representation has a significant influence on the target risk. Empirically, our approach (1) enables self-tuning of domain adaptation models, and (2) accurately estimates the target error of given models under distribution shift. Other applications include model selection, deciding early stopping and error detection.}
}
@InProceedings{pmlr-v119-chumbalov20a,
title = {Scalable and Efficient Comparison-based Search without Features},
author = {Chumbalov, Daniyar and Maystre, Lucas and Grossglauser, Matthias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1995--2005},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chumbalov20a/chumbalov20a.pdf},
url = {http://proceedings.mlr.press/v119/chumbalov20a.html},
abstract = {We consider the problem of finding a target object t using pairwise comparisons, by asking an oracle questions of the form “Which object from the pair (i,j) is more similar to t?”. Objects live in a space of latent features, from which the oracle generates noisy answers. First, we consider the non-blind setting where these features are accessible. We propose a new Bayesian comparison-based search algorithm with noisy answers; it has low computational complexity yet is efficient in the number of queries. We provide theoretical guarantees, deriving the form of the optimal query and proving almost sure convergence to the target t. Second, we consider the blind setting, where the object features are hidden from the search algorithm. In this setting, we combine our search method and a new distributional triplet embedding algorithm into one scalable learning framework called Learn2Search. We show that the query complexity of our approach on two real-world datasets is on par with the non-blind setting, which is not achievable using any of the current state-of-the-art embedding methods. Finally, we demonstrate the efficacy of our framework by conducting a movie actors search experiment with real users.}
}
@InProceedings{pmlr-v119-chung20a,
title = {Feature-map-level Online Adversarial Knowledge Distillation},
author = {Chung, Inseop and Park, Seonguk and Kim, Jangho and Kwak, Nojun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2006--2015},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/chung20a/chung20a.pdf},
url = {http://proceedings.mlr.press/v119/chung20a.html},
abstract = {Feature maps contain rich information about image intensity and spatial correlation. However, previous online knowledge distillation methods only utilize the class probabilities. Thus in this paper, we propose an online knowledge distillation method that transfers not only the knowledge of the class probabilities but also that of the feature map using the adversarial training framework. We train multiple networks simultaneously by employing discriminators to distinguish the feature map distributions of different networks. Each network has its corresponding discriminator which discriminates the feature map from its own as fake while classifying that of the other network as real. By training a network to fool the corresponding discriminator, it can learn the other network’s feature map distribution. We show that our method performs better than the conventional direct alignment method such as L1 and is more suitable for online distillation. Also, we propose a novel cyclic learning scheme for training more than two networks together. We have applied our method to various network architectures on the classification task and discovered a significant improvement of performance especially in the case of training a pair of a small network and a large one.}
}
@InProceedings{pmlr-v119-cicalese20a,
title = {Teaching with Limited Information on the Learner’s Behaviour},
author = {Cicalese, Ferdinando and Filho, Sergio and Laber, Eduardo and Molinaro, Marco},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2016--2026},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cicalese20a/cicalese20a.pdf},
url = {http://proceedings.mlr.press/v119/cicalese20a.html},
abstract = {Machine Teaching studies how efficiently a Teacher can guide a Learner to a target hypothesis. We focus on the model of Machine Teaching with a black box learner introduced in [Dasgupta et al., ICML 2019], where the teaching is done interactively without having any knowledge of the Learner’s algorithm and class of hypotheses, apart from the fact that it contains the target hypothesis $h^*$. We first refine some existing results for this model and, then, we study new variants of it. Motivated by the realistic possibility that $h^*$ is not available to the learner, we consider the case where the teacher can only aim at having the learner converge to a best available approximation of $h^*$. We also consider weaker black box learners, where, in each round, the choice of the consistent hypothesis returned to the Teacher is not adversarial, and in particular, we show that better provable bounds can be obtained for a type of Learner that moves to the next hypothesis smoothly, preferring hypotheses that are close to the current one; and for another type of Learner that can provide to the Teacher hypotheses chosen at random among those consistent with the examples received so far. Finally, we present an empirical evaluation of our basic interactive teacher on real datasets.}
}
@InProceedings{pmlr-v119-cilingir20a,
title = {Deep Divergence Learning},
author = {Cilingir, Hatice Kubra and Manzelli, Rachel and Kulis, Brian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2027--2037},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cilingir20a/cilingir20a.pdf},
url = {http://proceedings.mlr.press/v119/cilingir20a.html},
abstract = {Classical linear metric learning methods have recently been extended along two distinct lines: deep metric learning methods for learning embeddings of the data using neural networks, and Bregman divergence learning approaches for extending learning Euclidean distances to more general divergence measures such as divergences over distributions. In this paper, we introduce deep Bregman divergences, which are based on learning and parameterizing functional Bregman divergences using neural networks, and which unify and extend these existing lines of work. We show in particular how deep metric learning formulations, kernel metric learning, Mahalanobis metric learning, and moment-matching functions for comparing distributions arise as special cases of these divergences in the symmetric setting. We then describe a deep learning framework for learning general functional Bregman divergences, and show in experiments that this method yields superior performance on benchmark datasets as compared to existing deep metric learning approaches. We also discuss novel applications, including a semi-supervised distributional clustering problem, and a new loss function for unsupervised data generation.}
}
@InProceedings{pmlr-v119-claici20a,
title = {Model Fusion with {K}ullback-{L}eibler Divergence},
author = {Claici, Sebastian and Yurochkin, Mikhail and Ghosh, Soumya and Solomon, Justin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2038--2047},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/claici20a/claici20a.pdf},
url = {http://proceedings.mlr.press/v119/claici20a.html},
abstract = {We propose a method to fuse posterior distributions learned from heterogeneous datasets. Our algorithm relies on a mean field assumption for both the fused model and the individual dataset posteriors and proceeds using a simple assign-and-average approach. The components of the dataset posteriors are assigned to the proposed global model components by solving a regularized variant of the assignment problem. The global components are then updated based on these assignments by their mean under a KL divergence. For exponential family variational distributions, our formulation leads to an efficient non-parametric algorithm for computing the fused model. Our algorithm is easy to describe and implement, efficient, and competitive with state-of-the-art on motion capture analysis, topic modeling, and federated learning of Bayesian neural networks.}
}
@InProceedings{pmlr-v119-cobbe20a,
title = {Leveraging Procedural Generation to Benchmark Reinforcement Learning},
author = {Cobbe, Karl and Hesse, Chris and Hilton, Jacob and Schulman, John},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2048--2056},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cobbe20a/cobbe20a.pdf},
url = {http://proceedings.mlr.press/v119/cobbe20a.html},
abstract = {We introduce Procgen Benchmark, a suite of 16 procedurally generated game-like environments designed to benchmark both sample efficiency and generalization in reinforcement learning. We believe that the community will benefit from increased access to high quality training environments, and we provide detailed experimental protocols for using this benchmark. We empirically demonstrate that diverse environment distributions are essential to adequately train and evaluate RL agents, thereby motivating the extensive use of procedural content generation. We then use this benchmark to investigate the effects of scaling model size, finding that larger models significantly improve both sample efficiency and generalization.}
}
@InProceedings{pmlr-v119-cohen20a,
title = {Composable Sketches for Functions of Frequencies: Beyond the Worst Case},
author = {Cohen, Edith and Geri, Ofir and Pagh, Rasmus},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2057--2067},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cohen20a/cohen20a.pdf},
url = {http://proceedings.mlr.press/v119/cohen20a.html},
abstract = {Recently there has been increased interest in using machine learning techniques to improve classical algorithms. In this paper we study when it is possible to construct compact, composable sketches for weighted sampling and statistics estimation according to functions of data frequencies. Such structures are now central components of large-scale data analytics and machine learning pipelines. However, many common functions, such as thresholds and $p$th frequency moments with $p>2$, are known to require polynomial size sketches in the worst case. We explore performance beyond the worst case under two different types of assumptions. The first is having access to noisy \emph{advice} on item frequencies. This continues the line of work of Hsu et al. (ICLR 2019), who assume predictions are provided by a machine learning model. The second is providing guaranteed performance on a restricted class of input frequency distributions that are better aligned with what is observed in practice. This extends the work on heavy hitters under Zipfian distributions in a seminal paper of Charikar et al. (ICALP 2002). Surprisingly, we show analytically and empirically that ``in practice'' small polylogarithmic-size sketches provide accuracy for ``hard'' functions.}
}
@InProceedings{pmlr-v119-cohen20b,
title = {Healing Products of {Gaussian} Process Experts},
author = {Cohen, Samuel and Mbuvha, Rendani and Marwala, Tshilidzi and Deisenroth, Marc},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2068--2077},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cohen20b/cohen20b.pdf},
url = {http://proceedings.mlr.press/v119/cohen20b.html},
abstract = {Gaussian processes (GPs) are nonparametric Bayesian models that have been applied to regression and classification problems. One of the approaches to alleviate their cubic training cost is the use of local GP experts trained on subsets of the data. In particular, product-of-expert models combine the predictive distributions of local experts through a tractable product operation. While these expert models allow for massively distributed computation, their predictions typically suffer from erratic behaviour of the mean or uncalibrated uncertainty quantification. By calibrating predictions via a tempered softmax weighting, we provide a solution to these problems for multiple product-of-expert models, including the generalised product of experts and the robust Bayesian committee machine. Furthermore, we leverage the optimal transport literature and propose a new product-of-expert model that combines predictions of local experts by computing their Wasserstein barycenter, which can be applied to both regression and classification.}
}
@InProceedings{pmlr-v119-cohen-addad20a,
title = {On Efficient Low Distortion Ultrametric Embedding},
author = {Cohen-Addad, Vincent and S., Karthik C. and Lagarde, Guillaume},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2078--2088},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cohen-addad20a/cohen-addad20a.pdf},
url = {http://proceedings.mlr.press/v119/cohen-addad20a.html},
abstract = {A classic problem in unsupervised learning and data analysis is to find simpler and easy-to-visualize representations of the data that preserve its essential properties. A widely-used method to preserve the underlying hierarchical structure of the data while reducing its complexity is to find an embedding of the data into a tree or an ultrametric, but computing such an embedding on a data set of $n$ points in $\Omega(\log n)$ dimensions incurs a quite prohibitive running time of $\Theta(n^2)$. In this paper, we provide a new algorithm which takes as input a set of points $P$ in $\mathbb{R}^d$, and for every $c\ge 1$, runs in time $n^{1+\frac{\rho}{c^2}}$ (for some universal constant $\rho>1$) to output an ultrametric $\Delta$ such that for any two points $u,v$ in $P$, we have $\Delta(u,v)$ is within a multiplicative factor of $5c$ to the distance between $u$ and $v$ in the best ultrametric representation of $P$. Here, the best ultrametric is the ultrametric $\tilde\Delta$ that minimizes the maximum distance distortion with respect to the $\ell_2$ distance, namely that minimizes $\underset{u,v \in P}{\max} \nicefrac{\tilde\Delta(u,v)}{\|u-v\|_2}$. We complement the above result by showing that under popular complexity theoretic assumptions, for every constant $\varepsilon>0$, no algorithm with running time $n^{2-\varepsilon}$ can distinguish between inputs in $\ell_\infty$-metric that admit isometric embedding and those that incur a distortion of $\nicefrac{3}{2}$. Finally, we present empirical evaluation on classic machine learning datasets and show that the output of our algorithm is comparable to the output of the linkage algorithms while achieving a much faster running time.}
}
@InProceedings{pmlr-v119-coleman20a,
title = {Sub-linear Memory Sketches for Near Neighbor Search on Streaming Data},
author = {Coleman, Benjamin and Baraniuk, Richard and Shrivastava, Anshumali},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2089--2099},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/coleman20a/coleman20a.pdf},
url = {http://proceedings.mlr.press/v119/coleman20a.html},
abstract = {We present the first sublinear memory sketch that can be queried to find the nearest neighbors in a dataset. Our online sketching algorithm compresses an N element dataset to a sketch of size $O(N^b \log^3 N)$ in $O(N^{(b+1)} \log^3 N)$ time, where $b < 1$. This sketch can correctly report the nearest neighbors of any query that satisfies a stability condition parameterized by $b$. We achieve sublinear memory performance on stable queries by combining recent advances in locality sensitive hash (LSH)-based estimators, online kernel density estimation, and compressed sensing. Our theoretical results shed new light on the memory-accuracy tradeoff for nearest neighbor search, and our sketch, which consists entirely of short integer arrays, has a variety of attractive features in practice. We evaluate the memory-recall tradeoff of our method on a friend recommendation task in the Google plus social media network. We obtain orders of magnitude better compression than the random projection based alternative while retaining the ability to report the nearest neighbors of practical queries.}
}
@InProceedings{pmlr-v119-collobert20a,
title = {Word-Level Speech Recognition With a Letter to Word Encoder},
author = {Collobert, Ronan and Hannun, Awni and Synnaeve, Gabriel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2100--2110},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/collobert20a/collobert20a.pdf},
url = {http://proceedings.mlr.press/v119/collobert20a.html},
abstract = {We propose a direct-to-word sequence model which uses a word network to learn word embeddings from letters. The word network can be integrated seamlessly with arbitrary sequence models including Connectionist Temporal Classification and encoder-decoder models with attention. We show our direct-to-word model can achieve word error rate gains over sub-word level models for speech recognition. We also show that our direct-to-word approach retains the ability to predict words not seen at training time without any retraining. Finally, we demonstrate that a word-level model can use a larger stride than a sub-word level model while maintaining accuracy. This makes the model more efficient both for training and inference.}
}
@InProceedings{pmlr-v119-combettes20a,
title = {Boosting {Frank-Wolfe} by Chasing Gradients},
author = {Combettes, Cyrille and Pokutta, Sebastian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2111--2121},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/combettes20a/combettes20a.pdf},
url = {http://proceedings.mlr.press/v119/combettes20a.html},
abstract = {The Frank-Wolfe algorithm has become a popular first-order optimization algorithm for it is simple and projection-free, and it has been successfully applied to a variety of real-world problems. Its main drawback however lies in its convergence rate, which can be excessively slow due to naive descent directions. We propose to speed up the Frank-Wolfe algorithm by better aligning the descent direction with that of the negative gradient via a subroutine. This subroutine chases the negative gradient direction in a matching pursuit-style while still preserving the projection-free property. Although the approach is reasonably natural, it produces very significant results. We derive convergence rates $\mathcal{O}(1/t)$ to $\mathcal{O}(e^{-\omega t})$ of our method and we demonstrate its competitive advantage both per iteration and in CPU time over the state-of-the-art in a series of computational experiments.}
}
@InProceedings{pmlr-v119-conitzer20a,
title = {Learning Opinions in Social Networks},
author = {Conitzer, Vincent and Panigrahi, Debmalya and Zhang, Hanrui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2122--2132},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/conitzer20a/conitzer20a.pdf},
url = {http://proceedings.mlr.press/v119/conitzer20a.html},
abstract = {We study the problem of learning opinions in social networks. The learner observes the states of some sample nodes from a social network, and tries to infer the states of other nodes, based on the structure of the network. We show that sample-efficient learning is impossible when the network exhibits strong noise, and give a polynomial-time algorithm for the problem with nearly optimal sample complexity when the network is sufficiently stable.}
}
@InProceedings{pmlr-v119-cornish20a,
title = {Relaxing Bijectivity Constraints with Continuously Indexed Normalising Flows},
author = {Cornish, Rob and Caterini, Anthony and Deligiannidis, George and Doucet, Arnaud},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2133--2143},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cornish20a/cornish20a.pdf},
url = {http://proceedings.mlr.press/v119/cornish20a.html},
abstract = {We show that normalising flows become pathological when used to model targets whose supports have complicated topologies. In this scenario, we prove that a flow must become arbitrarily numerically noninvertible in order to approximate the target closely. This result has implications for all flow-based models, and especially residual flows (ResFlows), which explicitly control the Lipschitz constant of the bijection used. To address this, we propose continuously indexed flows (CIFs), which replace the single bijection used by normalising flows with a continuously indexed family of bijections, and which can intuitively "clean up" mass that would otherwise be misplaced by a single bijection. We show theoretically that CIFs are not subject to the same topological limitations as normalising flows, and obtain better empirical performance on a variety of models and benchmarks.}
}
@InProceedings{pmlr-v119-cortes20a,
title = {Adaptive Region-Based Active Learning},
author = {Cortes, Corinna and Desalvo, Giulia and Gentile, Claudio and Mohri, Mehryar and Zhang, Ningshan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2144--2153},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cortes20a/cortes20a.pdf},
url = {http://proceedings.mlr.press/v119/cortes20a.html},
abstract = {We present a new active learning algorithm that adaptively partitions the input space into a finite number of regions, and subsequently seeks a distinct predictor for each region, while actively requesting labels. We prove theoretical guarantees for both the generalization error and the label complexity of our algorithm, and analyze the number of regions defined by the algorithm under some mild assumptions. We also report the results of an extensive suite of experiments on several real-world datasets demonstrating substantial empirical benefits over existing single-region and non-adaptive region-based active learning baselines.}
}
@InProceedings{pmlr-v119-cortes20b,
title = {Online Learning with Dependent Stochastic Feedback Graphs},
author = {Cortes, Corinna and Desalvo, Giulia and Gentile, Claudio and Mohri, Mehryar and Zhang, Ningshan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2154--2163},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cortes20b/cortes20b.pdf},
url = {http://proceedings.mlr.press/v119/cortes20b.html},
abstract = {A general framework for online learning with partial information is one where feedback graphs specify which losses can be observed by the learner. We study a challenging scenario where feedback graphs vary stochastically with time and, more importantly, where graphs and losses are dependent. This scenario appears in several real-world applications that we describe where the outcome of actions are correlated. We devise a new algorithm for this setting that exploits the stochastic properties of the graphs and that benefits from favorable regret guarantees. We present a detailed theoretical analysis of this algorithm, and also report the result of a series of experiments on real-world datasets, which show that our algorithm outperforms standard baselines for online learning with feedback graphs.}
}
@InProceedings{pmlr-v119-cosentino20a,
title = {Learnable Group Transform For Time-Series},
author = {Cosentino, Romain and Aazhang, Behnaam},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2164--2173},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cosentino20a/cosentino20a.pdf},
url = {http://proceedings.mlr.press/v119/cosentino20a.html},
abstract = {We propose a novel approach to filter bank learning for time-series by considering spectral decompositions of signals defined as a Group Transform. This framework allows us to generalize classical time-frequency transformations such as the Wavelet Transform, and to efficiently learn the representation of signals. While the creation of the wavelet transform filter-bank relies on affine transformations of a mother filter, our approach allows for non-linear transformations. The transformations induced by such maps enable us to span a larger class of signal representations, from wavelet to chirplet-like filters. We propose a parameterization of such a non-linear map such that its sampling can be optimized for a specific task and signal. The Learnable Group Transform can be cast into a Deep Neural Network. The experiments on diverse time-series datasets demonstrate the expressivity of this framework, which competes with state-of-the-art performances.}
}
@InProceedings{pmlr-v119-crane20a,
title = {{DINO}: Distributed {Newton}-Type Optimization Method},
author = {Crane, Rixon and Roosta, Fred},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2174--2184},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/crane20a/crane20a.pdf},
url = {http://proceedings.mlr.press/v119/crane20a.html},
abstract = {We present a novel communication-efficient Newton-type algorithm for finite-sum optimization over a distributed computing environment. Our method, named DINO, overcomes both theoretical and practical shortcomings of similar existing methods. Under minimal assumptions, we guarantee global sub-linear convergence of DINO to a first-order stationary point for general non-convex functions and arbitrary data distribution over the network. Furthermore, for functions satisfying Polyak-Lojasiewicz (PL) inequality, we show that DINO enjoys a linear convergence rate. Our proposed algorithm is practically parameter free, in that it will converge regardless of the selected hyper-parameters, which are easy to tune. Additionally, its sub-problems are simple linear least-squares, for which efficient solvers exist, and numerical simulations demonstrate the efficiency of DINO as compared with similar alternatives.}
}
@InProceedings{pmlr-v119-creager20a,
title = {Causal Modeling for Fairness In Dynamical Systems},
author = {Creager, Elliot and Madras, David and Pitassi, Toniann and Zemel, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2185--2195},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/creager20a/creager20a.pdf},
url = {http://proceedings.mlr.press/v119/creager20a.html},
abstract = {In many applications areas—lending, education, and online recommenders, for example—fairness and equity concerns emerge when a machine learning system interacts with a dynamically changing environment to produce both immediate and long-term effects for individuals and demographic groups. We discuss causal directed acyclic graphs (DAGs) as a unifying framework for the recent literature on fairness in such dynamical systems. We show that this formulation affords several new directions of inquiry to the modeler, where sound causal assumptions can be expressed and manipulated. We emphasize the importance of computing interventional quantities in the dynamical fairness setting, and show how causal assumptions enable simulation (when environment dynamics are known) and estimation by adjustment (when dynamics are unknown) of intervention on short- and long-term outcomes, at both the group and individual levels.}
}
@InProceedings{pmlr-v119-croce20a,
title = {Minimally distorted Adversarial Examples with a Fast Adaptive Boundary Attack},
author = {Croce, Francesco and Hein, Matthias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2196--2205},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/croce20a/croce20a.pdf},
url = {http://proceedings.mlr.press/v119/croce20a.html},
abstract = {The evaluation of robustness against adversarial manipulation of neural networks-based classifiers is mainly tested with empirical attacks as methods for the exact computation, even when available, do not scale to large networks. We propose in this paper a new white-box adversarial attack wrt the $l_p$-norms for $p \in \{1,2,\infty\}$ aiming at finding the minimal perturbation necessary to change the class of a given input. It has an intuitive geometric meaning, yields quickly high quality results, minimizes the size of the perturbation (so that it returns the robust accuracy at every threshold with a single run). It performs better or similar to state-of-the-art attacks which are partially specialized to one $l_p$-norm, and is robust to the phenomenon of gradient obfuscation.}
}
@InProceedings{pmlr-v119-croce20b,
title = {Reliable evaluation of adversarial robustness with an ensemble of diverse parameter-free attacks},
author = {Croce, Francesco and Hein, Matthias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2206--2216},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/croce20b/croce20b.pdf},
url = {http://proceedings.mlr.press/v119/croce20b.html},
abstract = {The field of defense strategies against adversarial attacks has significantly grown over the last years, but progress is hampered as the evaluation of adversarial defenses is often insufficient and thus gives a wrong impression of robustness. Many promising defenses could be broken later on, making it difficult to identify the state-of-the-art. Frequent pitfalls in the evaluation are improper tuning of hyperparameters of the attacks, gradient obfuscation or masking. In this paper we first propose two extensions of the PGD-attack overcoming failures due to suboptimal step size and problems of the objective function. We then combine our novel attacks with two complementary existing ones to form a parameter-free, computationally affordable and user-independent ensemble of attacks to test adversarial robustness. We apply our ensemble to over 50 models from papers published at recent top machine learning and computer vision venues. In all except one of the cases we achieve lower robust test accuracy than reported in these papers, often by more than $10\%$, identifying several broken defenses.}
}
@InProceedings{pmlr-v119-croissant20a,
title = {Real-Time Optimisation for Online Learning in Auctions},
author = {Croissant, Lorenzo and Abeille, Marc and Calauzenes, Clement},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2217--2226},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/croissant20a/croissant20a.pdf},
url = {http://proceedings.mlr.press/v119/croissant20a.html},
abstract = {In display advertising, a small group of sellers and bidders face each other in up to $10^{12}$ auctions a day. In this context, revenue maximisation via monopoly price learning is a high-value problem for sellers. By nature, these auctions are online and produce a very high frequency stream of data. This results in a computational strain that requires algorithms be real-time. Unfortunately, existing methods inherited from the batch setting suffer $O(\sqrt{t})$ time/memory complexity at each update, prohibiting their use. In this paper, we provide the first algorithm for online learning of monopoly prices in online auctions whose update is constant in time and memory.}
}
@InProceedings{pmlr-v119-cummings20a,
title = {Privately detecting changes in unknown distributions},
author = {Cummings, Rachel and Krehbiel, Sara and Lut, Yuliia and Zhang, Wanrong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2227--2237},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cummings20a/cummings20a.pdf},
url = {http://proceedings.mlr.press/v119/cummings20a.html},
abstract = {The change-point detection problem seeks to identify distributional changes in streams of data. Increasingly, tools for change-point detection are applied in settings where data may be highly sensitive and formal privacy guarantees are required, such as identifying disease outbreaks based on hospital records, or IoT devices detecting activity within a home. Differential privacy has emerged as a powerful technique for enabling data analysis while preventing information leakage about individuals. Much of the prior work on change-point detection{—}including the only private algorithms for this problem{—}requires complete knowledge of the pre-change and post-change distributions, which is an unrealistic assumption for many practical applications of interest. This work develops differentially private algorithms for solving the change-point detection problem when the data distributions are unknown. Additionally, the data may be sampled from distributions that change smoothly over time, rather than fixed pre-change and post-change distributions. We apply our algorithms to detect changes in the linear trends of such data streams. Finally, we also provide experimental results to empirically validate the performance of our algorithms.}
}
@InProceedings{pmlr-v119-curtis20a,
title = {Flexible and Efficient Long-Range Planning Through Curious Exploration},
author = {Curtis, Aidan and Xin, Minjian and Arumugam, Dilip and Feigelis, Kevin and Yamins, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2238--2249},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/curtis20a/curtis20a.pdf},
url = {http://proceedings.mlr.press/v119/curtis20a.html},
abstract = {Identifying algorithms that flexibly and efficiently discover temporally-extended multi-phase plans is an essential step for the advancement of robotics and model-based reinforcement learning. The core problem of long-range planning is finding an efficient way to search through the tree of possible action sequences. Existing non-learned planning solutions from the Task and Motion Planning (TAMP) literature rely on the existence of logical descriptions for the effects and preconditions for actions. This constraint allows TAMP methods to efficiently reduce the tree search problem but limits their ability to generalize to unseen and complex physical environments. In contrast, deep reinforcement learning (DRL) methods use flexible neural-network-based function approximators to discover policies that generalize naturally to unseen circumstances. However, DRL methods struggle to handle the very sparse reward landscapes inherent to long-range multi-step planning situations. Here, we propose the Curious Sample Planner (CSP), which fuses elements of TAMP and DRL by combining a curiosity-guided sampling strategy with imitation learning to accelerate planning. We show that CSP can efficiently discover interesting and complex temporally-extended plans for solving a wide range of physically realistic 3D tasks. In contrast, standard planning and learning methods often fail to solve these tasks at all or do so only with a huge and highly variable number of training samples. We explore the use of a variety of curiosity metrics with CSP and analyze the types of solutions that CSP discovers. Finally, we show that CSP supports task transfer so that the exploration policies learned during experience with one task can help improve efficiency on related tasks.}
}
@InProceedings{pmlr-v119-cutkosky20a,
title = {Parameter-free, Dynamic, and Strongly-Adaptive Online Learning},
author = {Cutkosky, Ashok},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2250--2259},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cutkosky20a/cutkosky20a.pdf},
url = {http://proceedings.mlr.press/v119/cutkosky20a.html},
abstract = {We provide a new online learning algorithm that for the first time combines several disparate notions of adaptivity. First, our algorithm obtains a “parameter-free” regret bound that adapts to the norm of the comparator and the squared norm of the size of the gradients it observes. Second, it obtains a “strongly-adaptive” regret bound, so that for any given interval of length $N$, the regret over the interval is $\tilde O(\sqrt{N})$. Finally, our algorithm obtains an optimal “dynamic” regret bound: for any sequence of comparators with path-length $P$, our algorithm obtains regret $\tilde O(\sqrt{PN})$ over intervals of length $N$. Our primary technique for achieving these goals is a new method of combining constrained online learning regret bounds that does not rely on an expert meta-algorithm to aggregate learners.}
}
@InProceedings{pmlr-v119-cutkosky20b,
title = {Momentum Improves Normalized {SGD}},
author = {Cutkosky, Ashok and Mehta, Harsh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2260--2268},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cutkosky20b/cutkosky20b.pdf},
url = {http://proceedings.mlr.press/v119/cutkosky20b.html},
abstract = {We provide an improved analysis of normalized SGD showing that adding momentum provably removes the need for large batch sizes on non-convex objectives. Then, we consider the case of objectives with bounded second derivative and show that in this case a small tweak to the momentum formula allows normalized SGD with momentum to find an $\epsilon$-critical point in $O(1/\epsilon^{3.5})$ iterations, matching the best-known rates without accruing any logarithmic factors or dependence on dimension. We provide an adaptive learning rate schedule that automatically improves convergence rates when the variance in the gradients is small. Finally, we show that our method is effective when employed on popular large scale tasks such as ResNet-50 and BERT pretraining, matching the performance of the disparate methods used to get state-of-the-art results on both tasks.}
}
@InProceedings{pmlr-v119-cuturi20a,
title = {Supervised Quantile Normalization for Low Rank Matrix Factorization},
author = {Cuturi, Marco and Teboul, Olivier and Niles-Weed, Jonathan and Vert, Jean-Philippe},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2269--2279},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cuturi20a/cuturi20a.pdf},
url = {http://proceedings.mlr.press/v119/cuturi20a.html},
abstract = {Low rank matrix factorization is a fundamental building block in machine learning, used for instance to summarize gene expression profile data or word-document counts. To be robust to outliers and differences in scale across features, a matrix factorization step is usually preceded by ad-hoc feature normalization steps, such as tf-idf scaling or data whitening. We propose in this work to learn these normalization operators jointly with the factorization itself. More precisely, given a $d\times n$ matrix $X$ of $d$ features measured on $n$ individuals, we propose to learn the parameters of quantile normalization operators that can operate row-wise on the values of $X$ and/or of its factorization $UV$ to improve the quality of the low-rank representation of $X$ itself. This optimization is facilitated by the introduction of a new differentiable quantile normalization operator built using optimal transport, providing new results on top of existing work by Cuturi et al. (2019). We demonstrate the applicability of these techniques on synthetic and genomics datasets.}
}
@InProceedings{pmlr-v119-d-ascoli20a,
title = {Double Trouble in Double Descent: Bias and Variance(s) in the Lazy Regime},
author = {D'Ascoli, St{\'e}phane and Refinetti, Maria and Biroli, Giulio and Krzakala, Florent},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2280--2290},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/d-ascoli20a/d-ascoli20a.pdf},
url = {http://proceedings.mlr.press/v119/d-ascoli20a.html},
abstract = {Deep neural networks can achieve remarkable generalization performances while interpolating the training data. Rather than the U-curve emblematic of the bias-variance trade-off, their test error often follows a "double descent"—a mark of the beneficial role of overparametrization. In this work, we develop a quantitative theory for this phenomenon in the so-called lazy learning regime of neural networks, by considering the problem of learning a high-dimensional function with random features regression. We obtain a precise asymptotic expression for the bias-variance decomposition of the test error, and show that the bias displays a phase transition at the interpolation threshold, beyond it which it remains constant. We disentangle the variances stemming from the sampling of the dataset, from the additive noise corrupting the labels, and from the initialization of the weights. We demonstrate that the latter two contributions are the crux of the double descent: they lead to the overfitting peak at the interpolation threshold and to the decay of the test error upon overparametrization. We quantify how they are suppressed by ensembling the outputs of $K$ independently initialized estimators. For $K\rightarrow \infty$, the test error is monotonously decreasing and remains constant beyond the interpolation threshold. We further compare the effects of overparametrizing, ensembling and regularizing. Finally, we present numerical experiments on classic deep learning setups to show that our results hold qualitatively in realistic lazy learning scenarios.}
}
@InProceedings{pmlr-v119-dai20a,
title = {{R2-B2}: Recursive Reasoning-Based {B}ayesian Optimization for No-Regret Learning in Games},
author = {Dai, Zhongxiang and Chen, Yizhou and Low, Bryan Kian Hsiang and Jaillet, Patrick and Ho, Teck-Hua},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2291--2301},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dai20a/dai20a.pdf},
url = {http://proceedings.mlr.press/v119/dai20a.html},
abstract = {This paper presents a recursive reasoning formalism of Bayesian optimization (BO) to model the reasoning process in the interactions between boundedly rational, self-interested agents with unknown, complex, and costly-to-evaluate payoff functions in repeated games, which we call Recursive Reasoning-Based BO (R2-B2). Our R2-B2 algorithm is general in that it does not constrain the relationship among the payoff functions of different agents and can thus be applied to various types of games such as constant-sum, general-sum, and common-payoff games. We prove that by reasoning at level 2 or more and at one level higher than the other agents, our R2-B2 agent can achieve faster asymptotic convergence to no regret than that without utilizing recursive reasoning. We also propose a computationally cheaper variant of R2-B2 called R2-B2-Lite at the expense of a weaker convergence guarantee. The performance and generality of our R2-B2 algorithm are empirically demonstrated using synthetic games, adversarial machine learning, and multi-agent reinforcement learning.}
}
@InProceedings{pmlr-v119-dai20b,
title = {Scalable Deep Generative Modeling for Sparse Graphs},
author = {Dai, Hanjun and Nazi, Azade and Li, Yujia and Dai, Bo and Schuurmans, Dale},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2302--2312},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dai20b/dai20b.pdf},
url = {http://proceedings.mlr.press/v119/dai20b.html},
abstract = {Learning graph generative models is a challenging task for deep learning and has wide applicability to a range of domains like chemistry, biology and social science. However current deep neural methods suffer from limited scalability: for a graph with n nodes and m edges, existing deep neural methods require Omega(n^2) complexity by building up the adjacency matrix. On the other hand, many real world graphs are actually sparse in the sense that m << n^2. Based on this, we develop a novel autoregressive model, named BiGG, that utilizes this sparsity to avoid generating the full adjacency matrix, and importantly reduces the graph generation time complexity to O((n + m) log n). Furthermore, during training this autoregressive model can be parallelized with O(log n) synchronization stages, which makes it much more efficient than other autoregressive models that require Omega(n). Experiments on several benchmarks show that the proposed approach not only scales to orders of magnitude larger graphs than previously possible with deep autoregressive graph generative models, but also yields better graph generation quality.}
}
@InProceedings{pmlr-v119-dai20c,
title = {The Usual Suspects? {R}eassessing Blame for {VAE} Posterior Collapse},
author = {Dai, Bin and Wang, Ziyu and Wipf, David},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2313--2322},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dai20c/dai20c.pdf},
url = {http://proceedings.mlr.press/v119/dai20c.html},
abstract = {In narrow asymptotic settings Gaussian VAE models of continuous data have been shown to possess global optima aligned with ground-truth distributions. Even so, it is well known that poor solutions whereby the latent posterior collapses to an uninformative prior are sometimes obtained in practice. However, contrary to conventional wisdom that largely assigns blame for this phenomena on the undue influence of KL-divergence regularization, we will argue that posterior collapse is, at least in part, a direct consequence of bad local minima inherent to the loss surface of deep autoencoder networks. In particular, we prove that even small nonlinear perturbations of affine VAE decoder models can produce such minima, and in deeper models, analogous minima can force the VAE to behave like an aggressive truncation operator, provably discarding information along all latent dimensions in certain circumstances. Regardless, the underlying message here is not meant to undercut valuable existing explanations of posterior collapse, but rather, to refine the discussion and elucidate alternative risk factors that may have been previously underappreciated.}
}
@InProceedings{pmlr-v119-dalmasso20a,
title = {Confidence Sets and Hypothesis Testing in a Likelihood-Free Inference Setting},
author = {Dalmasso, Niccolo and Izbicki, Rafael and Lee, Ann},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2323--2334},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dalmasso20a/dalmasso20a.pdf},
url = {http://proceedings.mlr.press/v119/dalmasso20a.html},
abstract = {Parameter estimation, statistical tests and confidence sets are the cornerstones of classical statistics that allow scientists to make inferences about the underlying process that generated the observed data. A key question is whether one can still construct hypothesis tests and confidence sets with proper coverage and high power in a so-called likelihood-free inference (LFI) setting; that is, a setting where the likelihood is not explicitly known but one can forward-simulate observable data according to a stochastic model. In this paper, we present ACORE (Approximate Computation via Odds Ratio Estimation), a frequentist approach to LFI that first formulates the classical likelihood ratio test (LRT) as a parametrized classification problem, and then uses the equivalence of tests and confidence sets to build confidence regions for parameters of interest. We also present a goodness-of-fit procedure for checking whether the constructed tests and confidence regions are valid. ACORE is based on the key observation that the LRT statistic, the rejection probability of the test, and the coverage of the confidence set are conditional distribution functions which often vary smoothly as a function of the parameters of interest. Hence, instead of relying solely on samples simulated at fixed parameter settings (as is the convention in standard Monte Carlo solutions), one can leverage machine learning tools and data simulated in the neighborhood of a parameter to improve estimates of quantities of interest. We demonstrate the efficacy of ACORE with both theoretical and empirical results. Our implementation is available on Github.}
}
@InProceedings{pmlr-v119-dan20a,
title = {Goodness-of-Fit Tests for Inhomogeneous Random Graphs},
author = {Dan, Soham and Bhattacharya, Bhaswar B.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2335--2344},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dan20a/dan20a.pdf},
url = {http://proceedings.mlr.press/v119/dan20a.html},
abstract = {Hypothesis testing of random networks is an emerging area of modern research, especially in the high-dimensional regime, where the number of samples is smaller or comparable to the size of the graph. In this paper we consider the goodness-of-fit testing problem for large inhomogeneous random (IER) graphs, where given a (known) reference symmetric matrix $Q \in [0, 1]^{n \times n}$ and $m$ independent samples from an IER graph given by an unknown symmetric matrix $P \in [0, 1]^{n \times n}$, the goal is to test the hypothesis $P=Q$ versus $||P-Q|| \geq \varepsilon$, where $||\cdot||$ is some specified norm on symmetric matrices. Building on recent related work on two-sample testing for IER graphs, we derive the optimal minimax sample complexities for the goodness-of-fit problem in various natural norms, such as the Frobenius norm and the operator norm. We also propose practical implementations of natural test statistics, using their asymptotic distributions and through the parametric bootstrap. We compare the performances of the different tests in simulations, and show that the proposed tests outperform the baseline tests across various natural random graphs models.}
}
@InProceedings{pmlr-v119-dan20b,
title = {Sharp Statistical Guarantees for Adversarially Robust {G}aussian Classification},
author = {Dan, Chen and Wei, Yuting and Ravikumar, Pradeep},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2345--2355},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dan20b/dan20b.pdf},
url = {http://proceedings.mlr.press/v119/dan20b.html},
abstract = {Adversarial robustness has become a fundamental requirement in modern machine learning applications. Yet, there has been surprisingly little statistical understanding so far. In this paper, we provide the first result of the \emph{optimal} minimax guarantees for the excess risk for adversarially robust classification, under Gaussian mixture model proposed by Schmidt et al. (2018). The results are stated in terms of the \emph{Adversarial Signal-to-Noise Ratio (AdvSNR)}, which generalizes a similar notion for standard linear classification to the adversarial setting. For the Gaussian mixtures with AdvSNR value of $r$, we prove an excess risk lower bound of order $\Theta(e^{-(\frac{1}{2}+o(1)) r^2} \frac{d}{n})$ and design a computationally efficient estimator that achieves this optimal rate. Our results built upon minimal assumptions while cover a wide spectrum of adversarial perturbations including $\ell_p$ balls for any $p \ge 1$.}
}
@InProceedings{pmlr-v119-dang-nhu20a,
title = {Adversarial Attacks on Probabilistic Autoregressive Forecasting Models},
author = {Dang-Nhu, Rapha{\"e}l and Singh, Gagandeep and Bielik, Pavol and Vechev, Martin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2356--2365},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dang-nhu20a/dang-nhu20a.pdf},
url = {http://proceedings.mlr.press/v119/dang-nhu20a.html},
abstract = {We develop an effective generation of adversarial attacks on neural models that output a sequence of probability distributions rather than a sequence of single values. This setting includes the recently proposed deep probabilistic autoregressive forecasting models that estimate the probability distribution of a time series given its past and achieve state-of-the-art results in a diverse set of application domains. The key technical challenge we address is how to effectively differentiate through the Monte-Carlo estimation of statistics of the output sequence joint distribution. Additionally, we extend prior work on probabilistic forecasting to the Bayesian setting which allows conditioning on future observations, instead of only on past observations. We demonstrate that our approach can successfully generate attacks with small input perturbations in two challenging tasks where robust decision making is crucial – stock market trading and prediction of electricity consumption.}
}
@InProceedings{pmlr-v119-dar20a,
title = {Subspace Fitting Meets Regression: The Effects of Supervision and Orthonormality Constraints on Double Descent of Generalization Errors},
author = {Dar, Yehuda and Mayer, Paul and Luzi, Lorenzo and Baraniuk, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2366--2375},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dar20a/dar20a.pdf},
url = {http://proceedings.mlr.press/v119/dar20a.html},
abstract = {We study the linear subspace fitting problem in the overparameterized setting, where the estimated subspace can perfectly interpolate the training examples. Our scope includes the least-squares solutions to subspace fitting tasks with varying levels of supervision in the training data (i.e., the proportion of input-output examples of the desired low-dimensional mapping) and orthonormality of the vectors defining the learned operator. This flexible family of problems connects standard, unsupervised subspace fitting that enforces strict orthonormality with a corresponding regression task that is fully supervised and does not constrain the linear operator structure. This class of problems is defined over a supervision-orthonormality plane, where each coordinate induces a problem instance with a unique pair of supervision level and softness of orthonormality constraints. We explore this plane and show that the generalization errors of the corresponding subspace fitting problems follow double descent trends as the settings become more supervised and less orthonormally constrained.}
}
@InProceedings{pmlr-v119-das20a,
title = {Probing Emergent Semantics in Predictive Agents via Question Answering},
author = {Das, Abhishek and Carnevale, Federico and Merzic, Hamza and Rimell, Laura and Schneider, Rosalia and Abramson, Josh and Hung, Alden and Ahuja, Arun and Clark, Stephen and Wayne, Greg and Hill, Felix},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2376--2391},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/das20a/das20a.pdf},
url = {http://proceedings.mlr.press/v119/das20a.html},
abstract = {Recent work has shown how predictive modeling can endow agents with rich knowledge of their surroundings, improving their ability to act in complex environments. We propose question-answering as a general paradigm to decode and understand the representations that such agents develop, applying our method to two recent approaches to predictive modelling - action-conditional CPC (Guo et al., 2018) and SimCore (Gregor et al., 2019). After training agents with these predictive objectives in a visually-rich, 3D environment with an assortment of objects, colors, shapes, and spatial configurations, we probe their internal state representations with a host of synthetic (English) questions, without backpropagating gradients from the question-answering decoder into the agent. The performance of different agents when probed in this way reveals that they learn to encode factual, and seemingly compositional, information about objects, properties and spatial relations from their physical environment. Our approach is intuitive, i.e. humans can easily interpret the responses of the model as opposed to inspecting continuous vectors, and model-agnostic, i.e. applicable to any modeling approach. By revealing the implicit knowledge of objects, quantities, properties and relations acquired by agents as they learn, question-conditional agent probing can stimulate the design and development of stronger predictive learning objectives.}
}
@InProceedings{pmlr-v119-davis20a,
title = {Low-Variance and Zero-Variance Baselines for Extensive-Form Games},
author = {Davis, Trevor and Schmid, Martin and Bowling, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2392--2401},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/davis20a/davis20a.pdf},
url = {http://proceedings.mlr.press/v119/davis20a.html},
abstract = {Extensive-form games (EFGs) are a common model of multi-agent interactions with imperfect information. State-of-the-art algorithms for solving these games typically perform full walks of the game tree that can prove prohibitively slow in large games. Alternatively, sampling-based methods such as Monte Carlo Counterfactual Regret Minimization walk one or more trajectories through the tree, touching only a fraction of the nodes on each iteration, at the expense of requiring more iterations to converge due to the variance of sampled values. In this paper, we extend recent work that uses baseline estimates to reduce this variance. We introduce a framework of baseline-corrected values in EFGs that generalizes the previous work. Within our framework, we propose new baseline functions that result in significantly reduced variance compared to existing techniques. We show that one particular choice of such a function — predictive baseline — is provably optimal under certain sampling schemes. This allows for efficient computation of zero-variance value estimates even along sampled trajectories.}
}
@InProceedings{pmlr-v119-de-avila-belbute-peres20a,
title = {Combining Differentiable {PDE} Solvers and Graph Neural Networks for Fluid Flow Prediction},
author = {De Avila Belbute-Peres, Filipe and Economon, Thomas and Kolter, Zico},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2402--2411},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/de-avila-belbute-peres20a/de-avila-belbute-peres20a.pdf},
url = {http://proceedings.mlr.press/v119/de-avila-belbute-peres20a.html},
abstract = {Solving large complex partial differential equations (PDEs), such as those that arise in computational fluid dynamics (CFD), is a computationally expensive process. This has motivated the use of deep learning approaches to approximate the PDE solutions, yet the simulation results predicted from these approaches typically do not generalize well to truly novel scenarios. In this work, we develop a hybrid (graph) neural network that combines a traditional graph convolutional network with an embedded differentiable fluid dynamics simulator inside the network itself. By combining an actual CFD simulator (run on a much coarser resolution representation of the problem) with the graph network, we show that we can both generalize well to new situations and benefit from the substantial speedup of neural network CFD predictions, while also substantially outperforming the coarse CFD simulation alone.}
}
@InProceedings{pmlr-v119-debenedetto20a,
title = {Representing Unordered Data Using Complex-Weighted Multiset Automata},
author = {{DeBenedetto}, Justin and Chiang, David},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2412--2420},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/debenedetto20a/debenedetto20a.pdf},
url = {http://proceedings.mlr.press/v119/debenedetto20a.html},
abstract = {Unordered, variable-sized inputs arise in many settings across
multiple fields. The ability for set- and multiset-oriented neural
networks to handle this type of input has been the focus of much
work in recent years. We propose to represent multisets using
complex-weighted \emph{multiset automata} and show how the
multiset representations of certain existing neural architectures
can be viewed as special cases of ours. Namely, (1) we provide a new
theoretical and intuitive justification for the Transformer model’s
representation of positions using sinusoidal functions, and (2) we
extend the DeepSets model to use complex numbers, enabling it to
outperform the existing model on an extension of one of their tasks.
}
}
@InProceedings{pmlr-v119-decarolis20a,
title = {An end-to-end Differentially Private Latent {D}irichlet Allocation Using a Spectral Algorithm},
author = {Decarolis, Chris and Ram, Mukul and Esmaeili, Seyed and Wang, Yu-Xiang and Huang, Furong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2421--2431},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/decarolis20a/decarolis20a.pdf},
url = {http://proceedings.mlr.press/v119/decarolis20a.html},
abstract = {We provide an end-to-end differentially private spectral algorithm for learning LDA, based on matrix/tensor decompositions, and establish theoretical guarantees on utility/consistency of the estimated model parameters. We represent the spectral algorithm as a computational graph. Noise can be injected along the edges of this graph to obtain differential privacy. We identify subsets of edges, named “configurations”, such that adding noise to all edges in such a subset guarantees differential privacy of the end-to-end spectral algorithm. We characterize the sensitivity of the edges with respect to the input and thus estimate the amount of noise to be added to each edge for any required privacy level. We then characterize the utility loss for each configuration as a function of injected noise. Overall, by combining the sensitivity and utility characterization, we obtain an end-to-end differentially private spectral algorithm for LDA and identify which configurations outperform others under specific regimes. We are the first to achieve utility guarantees under a required level of differential privacy for learning in LDA. We additionally show that our method systematically outperforms differentially private variational inference.}
}
@InProceedings{pmlr-v119-degenne20a,
title = {Gamification of Pure Exploration for Linear Bandits},
author = {Degenne, R{\'e}my and Menard, Pierre and Shang, Xuedong and Valko, Michal},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2432--2442},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/degenne20a/degenne20a.pdf},
url = {http://proceedings.mlr.press/v119/degenne20a.html},
abstract = {We investigate an active \emph{pure-exploration} setting, that includes \emph{best-arm identification}, in the context of \emph{linear stochastic bandits}. While asymptotically optimal algorithms exist for standard \emph{multi-armed bandits}, the existence of such algorithms for the best-arm identification in linear bandits has been elusive despite several attempts to address it. First, we provide a thorough comparison and new insight over different notions of optimality in the linear case, including G-optimality, transductive optimality from optimal experimental design and asymptotic optimality. Second, we design the first asymptotically optimal algorithm for fixed-confidence pure exploration in linear bandits. As a consequence, our algorithm naturally bypasses the pitfall caused by a simple but difficult instance, that most prior algorithms had to be engineered to deal with explicitly. Finally, we avoid the need to fully solve an optimal design problem by providing an approach that entails an efficient implementation.}
}
@InProceedings{pmlr-v119-degenne20b,
title = {Structure Adaptive Algorithms for Stochastic Bandits},
author = {Degenne, R{\'e}my and Shao, Han and Koolen, Wouter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2443--2452},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/degenne20b/degenne20b.pdf},
url = {http://proceedings.mlr.press/v119/degenne20b.html},
abstract = {We study reward maximisation in a wide class of structured stochastic multi-armed bandit problems, where the mean rewards of arms satisfy some given structural constraints, e.g. linear, unimodal, sparse, etc. Our aim is to develop methods that are \emph{flexible} (in that they easily adapt to different structures), \emph{powerful} (in that they perform well empirically and/or provably match instance-dependent lower bounds) and \emph{efficient} in that the per-round computational burden is small. We develop asymptotically optimal algorithms from instance-dependent lower-bounds using iterative saddle-point solvers. Our approach generalises recent iterative methods for pure exploration to reward maximisation, where a major challenge arises from the estimation of the sub-optimality gaps and their reciprocals. Still we manage to achieve all the above desiderata. Notably, our technique avoids the computational cost of the full-blown saddle point oracle employed by previous work, while at the same time enabling finite-time regret bounds. Our experiments reveal that our method successfully leverages the structural assumptions, while its regret is at worst comparable to that of vanilla UCB.}
}
@InProceedings{pmlr-v119-delbridge20a,
title = {Randomly Projected Additive {G}aussian Processes for Regression},
author = {Delbridge, Ian and Bindel, David and Wilson, Andrew Gordon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2453--2463},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/delbridge20a/delbridge20a.pdf},
url = {http://proceedings.mlr.press/v119/delbridge20a.html},
abstract = {Gaussian processes (GPs) provide flexible distributions over functions, with inductive biases controlled by a kernel. However, in many applications Gaussian processes can struggle with even moderate input dimensionality. Learning a low dimensional projection can help alleviate this curse of dimensionality, but introduces many trainable hyperparameters, which can be cumbersome, especially in the small data regime. We use additive sums of kernels for GP regression, where each kernel operates on a different random projection of its inputs. Surprisingly, we find that as the number of random projections increases, the predictive performance of this approach quickly converges to the performance of a kernel operating on the original full dimensional inputs, over a wide range of data sets, even if we are projecting into a single dimension. As a consequence, many problems can remarkably be reduced to one dimensional input spaces, without learning a transformation. We prove this convergence and its rate, and additionally propose a deterministic approach that converges more quickly than purely random projections. Moreover, we demonstrate our approach can achieve faster inference and improved predictive accuracy for high-dimensional inputs compared to kernels in the original input space.}
}
@InProceedings{pmlr-v119-deng20a,
title = {Interpreting Robust Optimization via Adversarial Influence Functions},
author = {Deng, Zhun and Dwork, Cynthia and Wang, Jialiang and Zhang, Linjun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2464--2473},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/deng20a/deng20a.pdf},
url = {http://proceedings.mlr.press/v119/deng20a.html},
abstract = {Robust optimization has been widely used in nowadays data science, especially in adversarial training. However, little research has been done to quantify how robust optimization changes the optimizers and the prediction losses comparing to standard training. In this paper, inspired by the influence function in robust statistics, we introduce the Adversarial Influence Function (AIF) as a tool to investigate the solution produced by robust optimization. The proposed AIF enjoys a closed-form and can be calculated efficiently. To illustrate the usage of AIF, we apply it to study model sensitivity — a quantity defined to capture the change of prediction losses on the natural data after implementing robust optimization. We use AIF to analyze how model complexity and randomized smoothing affect the model sensitivity with respect to specific models. We further derive AIF for kernel regressions, with a particular application to neural tangent kernels, and experimentally demonstrate the effectiveness of the proposed AIF. Lastly, the theories of AIF will be extended to distributional robust optimization.}
}
@InProceedings{pmlr-v119-deng20b,
title = {Non-convex Learning via Replica Exchange Stochastic Gradient {MCMC}},
author = {Deng, Wei and Feng, Qi and Gao, Liyao and Liang, Faming and Lin, Guang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2474--2483},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/deng20b/deng20b.pdf},
url = {http://proceedings.mlr.press/v119/deng20b.html},
abstract = {Replica exchange Monte Carlo (reMC), also known as parallel tempering, is an important technique for accelerating the convergence of the conventional Markov Chain Monte Carlo (MCMC) algorithms. However, such a method requires the evaluation of the energy function based on the full dataset and is not scalable to big data. The naïve implementation of reMC in mini-batch settings introduces large biases, which cannot be directly extended to the stochastic gradient MCMC (SGMCMC), the standard sampling method for simulating from deep neural networks (DNNs). In this paper, we propose an adaptive replica exchange SGMCMC (reSGMCMC) to automatically correct the bias and study the corresponding properties. The analysis implies an acceleration-accuracy trade-off in the numerical discretization of a Markov jump process in a stochastic environment. Empirically, we test the algorithm through extensive experiments on various setups and obtain the state-of-the-art results on CIFAR10, CIFAR100, and SVHN in both supervised learning and semi-supervised learning tasks.}
}
@InProceedings{pmlr-v119-deng20c,
title = {Towards Understanding the Dynamics of the First-Order Adversaries},
author = {Deng, Zhun and He, Hangfeng and Huang, Jiaoyang and Su, Weijie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2484--2493},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/deng20c/deng20c.pdf},
url = {http://proceedings.mlr.press/v119/deng20c.html},
abstract = {An acknowledged weakness of neural networks is their vulnerability to adversarial perturbations to the inputs. To improve the robustness of these models, one of the most popular defense mechanisms is to alternatively maximize the loss over the constrained perturbations (or called adversaries) on the inputs using projected gradient ascent and minimize over weights. In this paper, we analyze the dynamics of the maximization step towards understanding the experimentally observed effectiveness of this defense mechanism. Specifically, we investigate the non-concave landscape of the adversaries for a two-layer neural network with a quadratic loss. Our main result proves that projected gradient ascent finds a local maximum of this non-concave problem in a polynomial number of iterations with high probability. To our knowledge, this is the first work that provides a convergence analysis of the first-order adversaries. Moreover, our analysis demonstrates that, in the initial phase of adversarial training, the scale of the inputs matters in the sense that a smaller input scale leads to faster convergence of adversarial training and a “more regular” landscape. Finally, we show that these theoretical findings are in excellent agreement with a series of experiments.}
}
@InProceedings{pmlr-v119-deng20d,
title = {Robust Pricing in Dynamic Mechanism Design},
author = {Deng, Yuan and Lahaie, Sebastien and Mirrokni, Vahab},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2494--2503},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/deng20d/deng20d.pdf},
url = {http://proceedings.mlr.press/v119/deng20d.html},
abstract = {Motivated by the repeated sale of online ads via auctions, optimal pricing in repeated auctions has attracted a large body of research. While dynamic mechanisms offer powerful techniques to improve on both revenue and efficiency by optimizing auctions across different items, their reliance on exact distributional information of buyers’ valuations (present and future) limits their use in practice. In this paper, we propose robust dynamic mechanism design. We develop a new framework to design dynamic mechanisms that are robust to both estimation errors in value distributions and strategic behavior. We apply the framework in learning environments, leading to the first policy that achieves provably low regret against the optimal dynamic mechanism in contextual auctions, where the dynamic benchmark has full and accurate distributional information.}
}
@InProceedings{pmlr-v119-dhouib20a,
title = {A {Swiss Army} Knife for Minimax Optimal Transport},
author = {Dhouib, Sofien and Redko, Ievgen and Kerdoncuff, Tanguy and Emonet, R{\'e}mi and Sebban, Marc},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2504--2513},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dhouib20a/dhouib20a.pdf},
url = {http://proceedings.mlr.press/v119/dhouib20a.html},
abstract = {The Optimal transport (OT) problem and its associated Wasserstein distance have recently become a topic of great interest in the machine learning community. However, the underlying optimization problem is known to have two major restrictions: (i) it largely depends on the choice of the cost function and (ii) its sample complexity scales exponentially with the dimension. In this paper, we propose a general formulation of a minimax OT problem that can tackle these restrictions by jointly optimizing the cost matrix and the transport plan, allowing us to define a robust distance between distributions. We propose to use a cutting-set method to solve this general problem and show its links and advantages compared to other existing minimax OT approaches. Additionally, we use this method to define a notion of stability allowing us to select the most robust cost matrix. Finally, we provide an experimental study highlighting the efficiency of our approach.}
}
@InProceedings{pmlr-v119-dhouib20b,
title = {Margin-aware Adversarial Domain Adaptation with Optimal Transport},
author = {Dhouib, Sofien and Redko, Ievgen and Lartizien, Carole},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2514--2524},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dhouib20b/dhouib20b.pdf},
url = {http://proceedings.mlr.press/v119/dhouib20b.html},
abstract = {In this paper, we propose a new theoretical analysis of unsupervised domain adaptation that relates notions of large margin separation, adversarial learning and optimal transport. This analysis generalizes previous work on the subject by providing a bound on the target margin violation rate, thus reflecting a better control of the quality of separation between classes in the target domain than bounding the misclassification rate. The bound also highlights the benefit of a large margin separation on the source domain for adaptation and introduces an optimal transport (OT) based distance between domains that has the virtue of being task-dependent, contrary to other approaches. From the obtained theoretical results, we derive a novel algorithmic solution for domain adaptation that introduces a novel shallow OT-based adversarial approach and outperforms other OT-based DA baselines on several simulated and real-world classification tasks.}
}
@InProceedings{pmlr-v119-dhurandhar20a,
title = {Enhancing Simple Models by Exploiting What They Already Know},
author = {Dhurandhar, Amit and Shanmugam, Karthikeyan and Luss, Ronny},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2525--2534},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dhurandhar20a/dhurandhar20a.pdf},
url = {http://proceedings.mlr.press/v119/dhurandhar20a.html},
abstract = {There has been recent interest in improving performance of simple models for multiple reasons such as interpretability, robust learning from small data, deployment in memory constrained settings as well as environmental considerations. In this paper, we propose a novel method SRatio that can utilize information from high performing complex models (viz. deep neural networks, boosted trees, random forests) to reweight a training dataset for a potentially low performing simple model of much lower complexity such as a decision tree or a shallow network enhancing its performance. Our method also leverages the per sample hardness estimate of the simple model which is not the case with the prior works which primarily consider the complex model’s confidences/predictions and is thus conceptually novel. Moreover, we generalize and formalize the concept of attaching probes to intermediate layers of a neural network to other commonly used classifiers and incorporate this into our method. The benefit of these contributions is witnessed in the experiments where on 6 UCI datasets and CIFAR-10 we outperform competitors in a majority (16 out of 27) of the cases and tie for best performance in the remaining cases. In fact, in a couple of cases, we even approach the complex model’s performance. We also conduct further experiments to validate assertions and intuitively understand why our method works. Theoretically, we motivate our approach by showing that the weighted loss minimized by simple models using our weighting upper bounds the loss of the complex model.}
}
@InProceedings{pmlr-v119-ding20a,
title = {Spectral {Frank-Wolfe} Algorithm: Strict Complementarity and Linear Convergence},
author = {Ding, Lijun and Fei, Yingjie and Xu, Qiantong and Yang, Chengrun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2535--2544},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ding20a/ding20a.pdf},
url = {http://proceedings.mlr.press/v119/ding20a.html},
abstract = {We develop a novel variant of the classical Frank-Wolfe algorithm, which we call spectral Frank-Wolfe, for convex optimization over a spectrahedron. The spectral Frank-Wolfe algorithm has a novel ingredient: it computes a few eigenvectors of the gradient and solves a small-scale subproblem in each iteration. Such a procedure overcomes the slow convergence of the classical Frank-Wolfe algorithm due to ignoring eigenvalue coalescence. We demonstrate that strict complementarity of the optimization problem is key to proving linear convergence of various algorithms, such as the spectral Frank-Wolfe algorithm as well as the projected gradient method and its accelerated version. We showcase that the strict complementarity is equivalent to the eigengap assumption on the gradient at the optimal solution considered in the literature. As a byproduct of this observation, we also develop a generalized block Frank-Wolfe algorithm and prove its linear convergence.}
}
@InProceedings{pmlr-v119-ding20b,
title = {Generalization Guarantees for Sparse Kernel Approximation with Entropic Optimal Features},
author = {Ding, Liang and Tuo, Rui and Shahrampour, Shahin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2545--2555},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ding20b/ding20b.pdf},
url = {http://proceedings.mlr.press/v119/ding20b.html},
abstract = {Despite their success, kernel methods suffer from a massive computational cost in practice. In this paper, in lieu of commonly used kernel expansion with respect to $N$ inputs, we develop a novel optimal design maximizing the entropy among kernel features. This procedure results in a kernel expansion with respect to entropic optimal features (EOF), improving the data representation dramatically due to features dissimilarity. Under mild technical assumptions, our generalization bound shows that with only $O(N^{\frac{1}{4}})$ features (disregarding logarithmic factors), we can achieve the optimal statistical accuracy (i.e., $O(1/\sqrt{N})$). The salient feature of our design is its sparsity that significantly reduces the time and space costs. Our numerical experiments on benchmark datasets verify the superiority of EOF over the state-of-the-art in kernel approximation.}
}
@InProceedings{pmlr-v119-ding20c,
title = {Layered Sampling for Robust Optimization Problems},
author = {Ding, Hu and Wang, Zixiu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2556--2566},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ding20c/ding20c.pdf},
url = {http://proceedings.mlr.press/v119/ding20c.html},
abstract = {In real world, our datasets often contain outliers. Most existing algorithms for handling outliers take high time complexities (\emph{e.g.} quadratic or cubic complexity). \emph{Coreset} is a popular approach for compressing data so as to speed up the optimization algorithms. However, the current coreset methods cannot be easily extended to handle the case with outliers. In this paper, we propose a new variant of coreset technique, \emph{layered sampling}, to deal with two fundamental robust optimization problems: \emph{$k$-median/means clustering with outliers} and \emph{linear regression with outliers}. This new coreset method is in particular suitable to speed up the iterative algorithms (which often improve the solution within a local range) for those robust optimization problems.}
}
@InProceedings{pmlr-v119-djuric20a,
title = {Growing Adaptive Multi-hyperplane Machines},
author = {Djuric, Nemanja and Wang, Zhuang and Vucetic, Slobodan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2567--2576},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/djuric20a/djuric20a.pdf},
url = {http://proceedings.mlr.press/v119/djuric20a.html},
abstract = {Adaptive Multi-hyperplane Machine (AMM) is an online algorithm for learning Multi-hyperplane Machine (MM), a classification model which allows multiple hyperplanes per class. AMM is based on Stochastic Gradient Descent (SGD), with training time comparable to linear Support Vector Machine (SVM) and significantly higher accuracy. On the other hand, empirical results indicate there is a large accuracy gap between AMM and non-linear SVMs. In this paper we show that this performance gap is not due to limited representability of the MM model, as it can represent arbitrary concepts. We set to explain the connection between the AMM and Learning Vector Quantization (LVQ) algorithms, and introduce a novel Growing AMM (GAMM) classifier motivated by Growing LVQ, that imputes duplicate hyperplanes into the MM model during SGD training. We provide theoretical results showing that GAMM has favorable convergence properties, and analyze the generalization bound of the MM models. Experiments indicate that GAMM achieves significantly improved accuracy on non-linear problems, with only slightly slower training compared to AMM. On some tasks GAMM comes close to non-linear SVM, and outperforms other popular classifiers such as Neural Networks and Random Forests.}
}
@InProceedings{pmlr-v119-doikov20a,
title = {Inexact Tensor Methods with Dynamic Accuracies},
author = {Doikov, Nikita and Nesterov, Yurii},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2577--2586},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/doikov20a/doikov20a.pdf},
url = {http://proceedings.mlr.press/v119/doikov20a.html},
abstract = {In this paper, we study inexact high-order Tensor Methods for solving convex optimization problems with composite objective. At every step of such methods, we use approximate solution of the auxiliary problem, defined by the bound for the residual in function value. We propose two dynamic strategies for choosing the inner accuracy: the first one is decreasing as $1/k^{p + 1}$, where $p \geq 1$ is the order of the method and $k$ is the iteration counter, and the second approach is using for the inner accuracy the last progress in the target objective. We show that inexact Tensor Methods with these strategies achieve the same global convergence rate as in the error-free case. For the second approach we also establish local superlinear rates (for $p \geq 2$), and propose the accelerated scheme. Lastly, we present computational results on a variety of machine learning problems for several methods and different accuracy policies.}
}
@InProceedings{pmlr-v119-domke20a,
title = {Provable Smoothness Guarantees for Black-Box Variational Inference},
author = {Domke, Justin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2587--2596},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/domke20a/domke20a.pdf},
url = {http://proceedings.mlr.press/v119/domke20a.html},
abstract = {Black-box variational inference tries to approximate a complex target distribution through a gradient-based optimization of the parameters of a simpler distribution. Provable convergence guarantees require structural properties of the objective. This paper shows that for location-scale family approximations, if the target is M-Lipschitz smooth, then so is the “energy” part of the variational objective. The key proof idea is to describe gradients in a certain inner-product space, thus permitting the use of Bessel’s inequality. This result gives bounds on the location of the optimal parameters, and is a key ingredient for convergence guarantees.}
}
@InProceedings{pmlr-v119-dong20a,
title = {Optimal Differential Privacy Composition for Exponential Mechanisms},
author = {Dong, Jinshuo and Durfee, David and Rogers, Ryan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2597--2606},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20a/dong20a.pdf},
url = {http://proceedings.mlr.press/v119/dong20a.html},
abstract = {Composition is one of the most important properties of differential privacy (DP), as it allows algorithm designers to build complex private algorithms from DP primitives. We consider precise composition bounds of the overall privacy loss for exponential mechanisms, one of the fundamental classes of mechanisms in DP. Exponential mechanism has also become a fundamental building block in private machine learning, e.g. private PCA and hyper-parameter selection. We give explicit formulations of the optimal privacy loss for both the adaptive and non-adaptive composition of exponential mechanism. For the non-adaptive setting in which each mechanism has the same privacy parameter, we give an efficiently computable formulation of the optimal privacy loss. In the adaptive case, we derive a recursive formula and an efficiently computable upper bound. These precise understandings about the problem lead to a 40\% saving of the privacy budget in a practical application. Furthermore, the algorithm-specific analysis shows a difference in privacy parameters of adaptive and non-adaptive composition, which was widely believed to not exist based on the evidence from general analysis.}
}
@InProceedings{pmlr-v119-dong20b,
title = {Multinomial Logit Bandit with Low Switching Cost},
author = {Dong, Kefan and Li, Yingkai and Zhang, Qin and Zhou, Yuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2607--2615},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20b/dong20b.pdf},
url = {http://proceedings.mlr.press/v119/dong20b.html},
abstract = {We study multinomial logit bandit with limited adaptivity, where the algorithms change their exploration actions as infrequently as possible when achieving almost optimal minimax regret. We propose two measures of adaptivity: the assortment switching cost and the more fine-grained item switching cost. We present an anytime algorithm (AT-DUCB) with $O(N \log T)$ assortment switches, almost matching the lower bound $\Omega(\frac{N \log T}{ \log \log T})$. In the fixed-horizon setting, our algorithm FH-DUCB incurs $O(N \log \log T)$ assortment switches, matching the asymptotic lower bound. We also present the ESUCB algorithm with item switching cost $O(N \log^2 T)$.}
}
@InProceedings{pmlr-v119-dong20c,
title = {Towards Adaptive Residual Network Training: A Neural-{ODE} Perspective},
author = {Dong, Chengyu and Liu, Liyuan and Li, Zichao and Shang, Jingbo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2616--2626},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20c/dong20c.pdf},
url = {http://proceedings.mlr.press/v119/dong20c.html},
abstract = {In pursuit of resource-economical machine learning, attempts have been made to dynamically adjust computation workloads in different training stages, i.e., starting with a shallow network and gradually increasing the model depth (and computation workloads) during training. However, there is neither guarantee nor guidance on designing such network grow, due to the lack of its theoretical underpinnings. In this work, to explore the theory behind, we conduct theoretical analyses from an ordinary differential equation perspective. Specifically, we illustrate the dynamics of network growth and propose a novel performance measure specific to the depth increase. Illuminated by our analyses, we move towards theoretically sound growing operations and schedulers, giving rise to an adaptive training algorithm for residual networks, LipGrow, which automatically increases network depth thus accelerates training. In our experiments, it achieves comparable performance while reducing $\sim$ 50\% of training time.}
}
@InProceedings{pmlr-v119-dong20d,
title = {On the Expressivity of Neural Networks for Deep Reinforcement Learning},
author = {Dong, Kefan and Luo, Yuping and Yu, Tianhe and Finn, Chelsea and Ma, Tengyu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2627--2637},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20d/dong20d.pdf},
url = {http://proceedings.mlr.press/v119/dong20d.html},
abstract = {We compare the model-free reinforcement learning with the model-based approaches through the lens of the expressive power of neural networks for policies, Q-functions, and dynamics. We show, theoretically and empirically, that even for one-dimensional continuous state space, there are many MDPs whose optimal Q-functions and policies are much more complex than the dynamics. For these MDPs, model-based planning is a favorable algorithm, because the resulting policies can approximate the optimal policy significantly better than a neural network parameterization can, and model-free or model-based policy optimization rely on policy parameterization. Motivated by the theory, we apply a simple multi-step model-based bootstrapping planner (BOOTS) to bootstrap a weak Q-function into a stronger policy. Empirical results show that applying BOOTS on top of model-based or model-free policy optimization algorithms at the test time improves the performance on benchmark tasks.}
}
@InProceedings{pmlr-v119-dong20e,
title = {Collapsed Amortized Variational Inference for Switching Nonlinear Dynamical Systems},
author = {Dong, Zhe and Seybold, Bryan and Murphy, Kevin and Bui, Hung},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2638--2647},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20e/dong20e.pdf},
url = {http://proceedings.mlr.press/v119/dong20e.html},
abstract = {We propose an efficient inference method for switching nonlinear dynamical systems. The key idea is to learn an inference network which can be used as a proposal distribution for the continuous latent variables, while performing exact marginalization of the discrete latent variables. This allows us to use the reparameterization trick, and apply end-to-end training with stochastic gradient descent. We show that the proposed method can successfully segment time series data, including videos and 3D human pose, into meaningful “regimes” by using the piece-wise nonlinear dynamics.}
}
@InProceedings{pmlr-v119-dong20f,
title = {Expert Learning through Generalized Inverse Multiobjective Optimization: Models, Insights, and Algorithms},
author = {Dong, Chaosheng and Zeng, Bo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2648--2657},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dong20f/dong20f.pdf},
url = {http://proceedings.mlr.press/v119/dong20f.html},
abstract = {We consider a new unsupervised learning task of inferring parameters of a multiobjective decision making model, based on a set of observed decisions from the human expert. This setting is important in applications (such as the task of portfolio management) where it may be difficult to obtain the human expert’s intrinsic decision making model. We formulate such a learning problem as an inverse multiobjective optimization problem (IMOP) and propose its first sophisticated model with statistical guarantees. Then, we reveal several fundamental connections between IMOP, K-means clustering, and manifold learning. Leveraging these critical insights and connections, we propose two algorithms to solve IMOP through manifold learning and clustering. Numerical results confirm the effectiveness of our model and the computational efficacy of algorithms.}
}
@InProceedings{pmlr-v119-drori20a,
title = {The Complexity of Finding Stationary Points with Stochastic Gradient Descent},
author = {Drori, Yoel and Shamir, Ohad},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2658--2667},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/drori20a/drori20a.pdf},
url = {http://proceedings.mlr.press/v119/drori20a.html},
abstract = {We study the iteration complexity of stochastic gradient descent (SGD) for minimizing the gradient norm of smooth, possibly nonconvex functions. We provide several results, implying that the classical $\mathcal{O}(\epsilon^{-4})$ upper bound (for making the average gradient norm less than $\epsilon$) cannot be improved upon, unless a combination of additional assumptions is made. Notably, this holds even if we limit ourselves to convex quadratic functions. We also show that for nonconvex functions, the feasibility of minimizing gradients with SGD is surprisingly sensitive to the choice of optimality criteria.}
}
@InProceedings{pmlr-v119-drutsa20a,
title = {Optimal Non-parametric Learning in Repeated Contextual Auctions with Strategic Buyer},
author = {Drutsa, Alexey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2668--2677},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/drutsa20a/drutsa20a.pdf},
url = {http://proceedings.mlr.press/v119/drutsa20a.html},
abstract = {We study learning algorithms that optimize revenue in repeated contextual posted-price auctions where a seller interacts with a single strategic buyer that seeks to maximize his cumulative discounted surplus. The buyer’s valuation of a good is a fixed private function of a $d$-dimensional context (feature) vector that describes the good being sold. In contrast to existing studies on repeated contextual auctions with strategic buyer, in our work, the seller is not assumed to know the parametric model that underlies this valuation function. We introduce a novel non-parametric learning algorithm that is horizon-independent and has tight strategic regret upper bound of $\Theta(T^{d/(d+1)})$. We also non-trivially generalize several value-localization techniques of non-contextual repeated auctions to make them effective in the considered contextual non-parametric learning of the buyer valuation function.}
}
@InProceedings{pmlr-v119-drutsa20b,
title = {Reserve Pricing in Repeated Second-Price Auctions with Strategic Bidders},
author = {Drutsa, Alexey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2678--2689},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/drutsa20b/drutsa20b.pdf},
url = {http://proceedings.mlr.press/v119/drutsa20b.html},
abstract = {We study revenue optimization learning algorithms for repeated second-price auctions with reserve where a seller interacts with multiple strategic bidders each of which holds a fixed private valuation for a good and seeks to maximize his expected future cumulative discounted surplus. We propose a novel algorithm that has strategic regret upper bound of $O(\log\log T)$ for worst-case valuations. This pricing is based on our novel transformation that upgrades an algorithm designed for the setup with a single buyer to the multi-buyer case. We provide theoretical guarantees on the ability of a transformed algorithm to learn the valuation of a strategic buyer, which has uncertainty about the future due to the presence of rivals.}
}
@InProceedings{pmlr-v119-duan20a,
title = {{NGBoost}: Natural Gradient Boosting for Probabilistic Prediction},
author = {Duan, Tony and Avati, Anand and Ding, Daisy Yi and Thai, Khanh K. and Basu, Sanjay and Ng, Andrew and Schuler, Alejandro},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2690--2700},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/duan20a/duan20a.pdf},
url = {http://proceedings.mlr.press/v119/duan20a.html},
abstract = {We present Natural Gradient Boosting (NGBoost), an algorithm for generic probabilistic prediction via gradient boosting. Typical regression models return a point estimate, conditional on covariates, but probabilistic regression models output a full probability distribution over the outcome space, conditional on the covariates. This allows for predictive uncertainty estimation - crucial in applications like healthcare and weather forecasting. NGBoost generalizes gradient boosting to probabilistic regression by treating the parameters of the conditional distribution as targets for a multiparameter boosting algorithm. Furthermore, we show how the Natural Gradient is required to correct the training dynamics of our multiparameter boosting approach. NGBoost can be used with any base learner, any family of distributions with continuous parameters, and any scoring rule. NGBoost matches or exceeds the performance of existing methods for probabilistic prediction while offering additional benefits in flexibility, scalability, and usability. An open-source implementation is available at github.com/stanfordmlgroup/ngboost.}
}
@InProceedings{pmlr-v119-duan20b,
title = {Minimax-Optimal Off-Policy Evaluation with Linear Function Approximation},
author = {Duan, Yaqi and Jia, Zeyu and Wang, Mengdi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2701--2709},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/duan20b/duan20b.pdf},
url = {http://proceedings.mlr.press/v119/duan20b.html},
abstract = {This paper studies the statistical theory of off-policy evaluation with function approximation in batch data reinforcement learning problem. We consider a regression-based fitted Q-iteration method, show that it is equivalent to a model-based method that estimates a conditional mean embedding of the transition operator, and prove that this method is information-theoretically optimal and has nearly minimal estimation error. In particular, by leveraging contraction property of Markov processes and martingale concentration, we establish a finite-sample instance-dependent error upper bound and a nearly-matching minimax lower bound. The policy evaluation error depends sharply on a restricted $\chi^2$-divergence over the function class between the long-term distribution of target policy and the distribution of past data. This restricted $\chi^2$-divergence characterizes the statistical limit of off-policy evaluation and is both instance-dependent and function-class-dependent. Further, we provide an easily computable confidence bound for the policy evaluator, which may be useful for optimistic planning and safe policy improvement.}
}
@InProceedings{pmlr-v119-duan20c,
title = {Online {Bayesian} Moment Matching based {SAT} Solver Heuristics},
author = {Duan, Haonan and Nejati, Saeed and Trimponias, George and Poupart, Pascal and Ganesh, Vijay},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2710--2719},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/duan20c/duan20c.pdf},
url = {http://proceedings.mlr.press/v119/duan20c.html},
abstract = {In this paper, we present a Bayesian Moment Matching (BMM) based method aimed at solving the initialization problem in Boolean SAT solvers. The initialization problem can be stated as follows: given a SAT formula $\phi$, compute an initial order over the variables of $\phi$ and values/polarity for these variables such that the runtime of SAT solvers on input $\phi$ is minimized. At the start of a solver run, our BMM-based methods compute a posterior probability distribution for an assignment to the variables of the input formula after analyzing its clauses, which will then be used by the solver to initialize its search. We perform extensive experiments to evaluate the efficacy of our BMM-based heuristic against 4 other initialization methods (random, survey propagation, Jeroslow-Wang, and default) in state-of-the-art solvers, MapleCOMSPS and MapleLCMDistChronotBT over the SAT competition 2018 application benchmark, as well as the best-known solvers in the cryptographic category, namely, CryptoMiniSAT, Glucose, and MapleSAT. On the cryptographic benchmark, BMM-based solvers out-perform all other initialization methods. Further, the BMM-based MapleCOMSPS significantly out-perform the same solver using all other initialization methods by 12 additional instances solved and better average runtime, over the SAT 2018 competition benchmark.}
}
@InProceedings{pmlr-v119-duan20d,
title = {Familywise Error Rate Control by Interactive Unmasking},
author = {Duan, Boyan and Ramdas, Aaditya and Wasserman, Larry},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2720--2729},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/duan20d/duan20d.pdf},
url = {http://proceedings.mlr.press/v119/duan20d.html},
abstract = {We propose a method for multiple hypothesis testing with familywise error rate (FWER) control, called the i-FWER test. Most testing methods are predefined algorithms that do not allow modifications after observing the data. However, in practice, analysts tend to choose a promising algorithm after observing the data; unfortunately, this violates the validity of the conclusion. The i-FWER test allows much flexibility: a human (or a computer program acting on the human’s behalf) may adaptively guide the algorithm in a data-dependent manner. We prove that our test controls FWER if the analysts adhere to a particular protocol of masking and unmasking. We demonstrate via numerical experiments the power of our test under structured non-nulls, and then explore new forms of masking.}
}
@InProceedings{pmlr-v119-dubey20a,
title = {Cooperative Multi-Agent Bandits with Heavy Tails},
author = {Dubey, Abhimanyu and Pentland, Alex `Sandy'},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2730--2739},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dubey20a/dubey20a.pdf},
url = {http://proceedings.mlr.press/v119/dubey20a.html},
abstract = {We study the heavy-tailed stochastic bandit problem in the cooperative multi-agent setting, where a group of agents interact with a common bandit problem, while communicating on a network with delays. Existing algorithms for the stochastic bandit in this setting utilize confidence intervals arising from an averaging-based communication protocol known as running consensus, that does not lend itself to robust estimation for heavy-tailed settings. We propose MP-UCB, a decentralized multi-agent algorithm for the cooperative stochastic bandit that incorporates robust estimation with a message-passing protocol. We prove optimal regret bounds for MP-UCB for several problem settings, and also demonstrate its superiority to existing methods. Furthermore, we establish the first lower bounds for the cooperative bandit problem, in addition to providing efficient algorithms for robust bandit estimation of location.}
}
@InProceedings{pmlr-v119-dubey20b,
title = {Kernel Methods for Cooperative Multi-Agent Contextual Bandits},
author = {Dubey, Abhimanyu and Pentland, Alex `Sandy'},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2740--2750},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dubey20b/dubey20b.pdf},
url = {http://proceedings.mlr.press/v119/dubey20b.html},
abstract = {Cooperative multi-agent decision making involves a group of agents cooperatively solving learning problems while communicating over a network with delays. In this paper, we consider the kernelised contextual bandit problem, where the reward obtained by an agent is an arbitrary linear function of the contexts’ images in the related reproducing kernel Hilbert space (RKHS), and a group of agents must cooperate to collectively solve their unique decision problems. For this problem, we propose Coop-KernelUCB, an algorithm that provides near-optimal bounds on the per-agent regret, and is both computationally and communicatively efficient. For special cases of the cooperative problem, we also provide variants of Coop-KernelUCB that provides optimal per-agent regret. In addition, our algorithm generalizes several existing results in the multi-agent bandit setting. Finally, on a series of both synthetic and real-world multi-agent network benchmarks, we demonstrate that our algorithm significantly outperforms existing benchmarks.}
}
@InProceedings{pmlr-v119-dukler20a,
title = {Optimization Theory for {ReLU} Neural Networks Trained with Normalization Layers},
author = {Dukler, Yonatan and Gu, Quanquan and Montufar, Guido},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2751--2760},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dukler20a/dukler20a.pdf},
url = {http://proceedings.mlr.press/v119/dukler20a.html},
abstract = {The current paradigm of deep neural networks has been successful in part due to the use of normalization layers. Normalization layers like Batch Normalization, Layer Normalization and Weight Normalization are ubiquitous in practice as they improve the generalization performance and training speed of neural networks significantly. Nonetheless, the vast majority of current deep learning theory and non-convex optimization literature focuses on the un-normalized setting. We bridge this gap by providing the first global convergence result for 2 layer non-linear neural networks with ReLU activations trained with a normalization layer, namely Weight Normalization. The analysis shows how the introduction of normalization layers changes the optimization landscape and in some settings enables faster convergence as compared with un-normalized neural networks.}
}
@InProceedings{pmlr-v119-dupont20a,
title = {Equivariant Neural Rendering},
author = {Dupont, Emilien and Martin, Miguel Bautista and Colburn, Alex and Sankar, Aditya and Susskind, Josh and Shan, Qi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2761--2770},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dupont20a/dupont20a.pdf},
url = {http://proceedings.mlr.press/v119/dupont20a.html},
abstract = {We propose a framework for learning neural scene representations directly from images, without 3D supervision. Our key insight is that 3D structure can be imposed by ensuring that the learned representation transforms like a real 3D scene. Specifically, we introduce a loss which enforces equivariance of the scene representation with respect to 3D transformations. Our formulation allows us to infer and render scenes in real time while achieving comparable results to models requiring minutes for inference. In addition, we introduce two challenging new datasets for scene representation and neural rendering, including scenes with complex lighting and backgrounds. Through experiments, we show that our model achieves compelling results on these datasets as well as on standard ShapeNet benchmarks.}
}
@InProceedings{pmlr-v119-durkan20a,
title = {On Contrastive Learning for Likelihood-free Inference},
author = {Durkan, Conor and Murray, Iain and Papamakarios, George},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2771--2781},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/durkan20a/durkan20a.pdf},
url = {http://proceedings.mlr.press/v119/durkan20a.html},
abstract = {Likelihood-free methods perform parameter inference in stochastic simulator models where evaluating the likelihood is intractable but sampling synthetic data is possible. One class of methods for this likelihood-free problem uses a classifier to distinguish between pairs of parameter-observation samples generated using the simulator and pairs sampled from some reference distribution, which implicitly learns a density ratio proportional to the likelihood. Another popular class of methods fits a conditional distribution to the parameter posterior directly, and a particular recent variant allows for the use of flexible neural density estimators for this task. In this work, we show that both of these approaches can be unified under a general contrastive learning scheme, and clarify how they should be run and compared.}
}
@InProceedings{pmlr-v119-dusenberry20a,
title = {Efficient and Scalable {Bayesian} Neural Nets with Rank-1 Factors},
author = {Dusenberry, Michael and Jerfel, Ghassen and Wen, Yeming and Ma, Yian and Snoek, Jasper and Heller, Katherine and Lakshminarayanan, Balaji and Tran, Dustin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2782--2792},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dusenberry20a/dusenberry20a.pdf},
url = {http://proceedings.mlr.press/v119/dusenberry20a.html},
abstract = {Bayesian neural networks (BNNs) demonstrate promising success in improving the robustness and uncertainty quantification of modern deep learning. However, they generally struggle with underfitting at scale and parameter efficiency. On the other hand, deep ensembles have emerged as alternatives for uncertainty quantification that, while outperforming BNNs on certain problems, also suffer from efficiency issues. It remains unclear how to combine the strengths of these two approaches and remediate their common issues. To tackle this challenge, we propose a rank-1 parameterization of BNNs, where each weight matrix involves only a distribution on a rank-1 subspace. We also revisit the use of mixture approximate posteriors to capture multiple modes, where unlike typical mixtures, this approach admits a significantly smaller memory increase (e.g., only a 0.4\% increase for a ResNet-50 mixture of size 10). We perform a systematic empirical study on the choices of prior, variational posterior, and methods to improve training. For ResNet-50 on ImageNet, Wide ResNet 28-10 on CIFAR-10/100, and an RNN on MIMIC-III, rank-1 BNNs achieve state-of-the-art performance across log-likelihood, accuracy, and calibration on the test sets and out-of-distribution variants.}
}
@InProceedings{pmlr-v119-dutordoir20a,
title = {Sparse {Gaussian} Processes with Spherical Harmonic Features},
author = {Dutordoir, Vincent and Durrande, Nicolas and Hensman, James},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2793--2802},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dutordoir20a/dutordoir20a.pdf},
url = {http://proceedings.mlr.press/v119/dutordoir20a.html},
abstract = {We introduce a new class of inter-domain variational Gaussian processes (GP) where data is mapped onto the unit hypersphere in order to use spherical harmonic representations. Our inference scheme is comparable to variational Fourier features, but it does not suffer from the curse of dimensionality, and leads to diagonal covariance matrices between inducing variables. This enables a speed-up in inference, because it bypasses the need to invert large covariance matrices. Our experiments show that our model is able to fit a regression model for a dataset with 6 million entries two orders of magnitude faster compared to standard sparse GPs, while retaining state of the art accuracy. We also demonstrate competitive performance on classification with non-conjugate likelihoods.}
}
@InProceedings{pmlr-v119-dutta20a,
title = {Is There a Trade-Off Between Fairness and Accuracy? {A} Perspective Using Mismatched Hypothesis Testing},
author = {Dutta, Sanghamitra and Wei, Dennis and Yueksel, Hazar and Chen, Pin-Yu and Liu, Sijia and Varshney, Kush},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2803--2813},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dutta20a/dutta20a.pdf},
url = {http://proceedings.mlr.press/v119/dutta20a.html},
abstract = {A trade-off between accuracy and fairness is almost taken as a given in the existing literature on fairness in machine learning. Yet, it is not preordained that accuracy should decrease with increased fairness. Novel to this work, we examine fair classification through the lens of mismatched hypothesis testing: trying to find a classifier that distinguishes between two ideal distributions when given two mismatched distributions that are biased. Using Chernoff information, a tool in information theory, we theoretically demonstrate that, contrary to popular belief, there always exist ideal distributions such that optimal fairness and accuracy (with respect to the ideal distributions) are achieved simultaneously: there is no trade-off. Moreover, the same classifier yields the lack of a trade-off with respect to ideal distributions while yielding a trade-off when accuracy is measured with respect to the given (possibly biased) dataset. To complement our main result, we formulate an optimization to find ideal distributions and derive fundamental limits to explain why a trade-off exists on the given biased dataset. We also derive conditions under which active data collection can alleviate the fairness-accuracy trade-off in the real world. Our results lead us to contend that it is problematic to measure accuracy with respect to data that reflects bias, and instead, we should be considering accuracy with respect to ideal, unbiased data.}
}
@InProceedings{pmlr-v119-dvurechensky20a,
title = {Self-Concordant Analysis of {Frank-Wolfe} Algorithms},
author = {Dvurechensky, Pavel and Ostroukhov, Petr and Safin, Kamil and Shtern, Shimrit and Staudigl, Mathias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2814--2824},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/dvurechensky20a/dvurechensky20a.pdf},
url = {http://proceedings.mlr.press/v119/dvurechensky20a.html},
abstract = {Projection-free optimization via different variants of the Frank-Wolfe (FW), a.k.a. Conditional Gradient method has become one of the cornerstones in optimization for machine learning since in many cases the linear minimization oracle is much cheaper to implement than projections and some sparsity needs to be preserved. In a number of applications, e.g. Poisson inverse problems or quantum state tomography, the loss is given by a self-concordant (SC) function having unbounded curvature, implying absence of theoretical guarantees for the existing FW methods. We use the theory of SC functions to provide a new adaptive step size for FW methods and prove global convergence rate O(1/k) after k iterations. If the problem admits a stronger local linear minimization oracle, we construct a novel FW method with linear convergence rate for SC functions.}
}
@InProceedings{pmlr-v119-edwards20a,
title = {Estimating {$Q(s,s')$} with Deep Deterministic Dynamics Gradients},
author = {Edwards, Ashley and Sahni, Himanshu and Liu, Rosanne and Hung, Jane and Jain, Ankit and Wang, Rui and Ecoffet, Adrien and Miconi, Thomas and Isbell, Charles and Yosinski, Jason},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2825--2835},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/edwards20a/edwards20a.pdf},
url = {http://proceedings.mlr.press/v119/edwards20a.html},
abstract = {In this paper, we introduce a novel form of value function, $Q(s, s')$, that expresses the utility of transitioning from a state $s$ to a neighboring state $s'$ and then acting optimally thereafter. In order to derive an optimal policy, we develop a forward dynamics model that learns to make next-state predictions that maximize this value. This formulation decouples actions from values while still learning off-policy. We highlight the benefits of this approach in terms of value function transfer, learning within redundant action spaces, and learning off-policy from state observations generated by sub-optimal or completely random policies. Code and videos are available at http://sites.google.com/view/qss-paper.}
}
@InProceedings{pmlr-v119-eftekhari20a,
title = {Training Linear Neural Networks: Non-Local Convergence and Complexity Results},
author = {Eftekhari, Armin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2836--2847},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/eftekhari20a/eftekhari20a.pdf},
url = {http://proceedings.mlr.press/v119/eftekhari20a.html},
abstract = {Linear networks provide valuable insights into the workings of neural networks in general. This paper identifies conditions under which the gradient flow provably trains a linear network, in spite of the non-strict saddle points present in the optimization landscape. This paper also provides the computational complexity of training linear networks with gradient flow. To achieve these results, this work develops a machinery to provably identify the stable set of gradient flow, which then enables us to improve over the state of the art in the literature of linear networks (Bah et al., 2019; Arora et al., 2018a). Crucially, our results appear to be the first to break away from the lazy training regime which has dominated the literature of neural networks. This work requires the network to have a layer with one neuron, which subsumes the networks with a scalar output, but extending the results of this theoretical work to all linear networks remains a challenging open problem.}
}
@InProceedings{pmlr-v119-el-bouri20a,
title = {Student-Teacher Curriculum Learning via Reinforcement Learning: Predicting Hospital Inpatient Admission Location},
author = {El-Bouri, Rasheed and Eyre, David and Watkinson, Peter and Zhu, Tingting and Clifton, David},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2848--2857},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/el-bouri20a/el-bouri20a.pdf},
url = {http://proceedings.mlr.press/v119/el-bouri20a.html},
abstract = {Accurate and reliable prediction of hospital admission location is important due to resource-constraints and space availability in a clinical setting, particularly when dealing with patients who come from the emergency department. In this work we propose a student-teacher network via reinforcement learning to deal with this specific problem. A representation of the weights of the student network is treated as the state and is fed as an input to the teacher network. The teacher network’s action is to select the most appropriate batch of data to train the student network on from a training set sorted according to entropy. By validating on three datasets, not only do we show that our approach outperforms state-of-the-art methods on tabular data and performs competitively on image recognition, but also that novel curricula are learned by the teacher network. We demonstrate experimentally that the teacher network can actively learn about the student network and guide it to achieve better performance than if trained alone.}
}
@InProceedings{pmlr-v119-elmachtoub20a,
title = {Decision Trees for Decision-Making under the Predict-then-Optimize Framework},
author = {Elmachtoub, Adam and Liang, Jason Cheuk Nam and Mcnellis, Ryan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2858--2867},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/elmachtoub20a/elmachtoub20a.pdf},
url = {http://proceedings.mlr.press/v119/elmachtoub20a.html},
abstract = {We consider the use of decision trees for decision-making problems under the predict-then-optimize framework. That is, we would like to first use a decision tree to predict unknown input parameters of an optimization problem, and then make decisions by solving the optimization problem using the predicted parameters. A natural loss function in this framework is to measure the suboptimality of the decisions induced by the predicted input parameters, as opposed to measuring loss using input parameter prediction error. This natural loss function is known in the literature as the Smart Predict-then-Optimize (SPO) loss, and we propose a tractable methodology called SPO Trees (SPOTs) for training decision trees under this loss. SPOTs benefit from the interpretability of decision trees, providing an interpretable segmentation of contextual features into groups with distinct optimal solutions to the optimization problem of interest. We conduct several numerical experiments on synthetic and real data including the prediction of travel times for shortest path problems and predicting click probabilities for news article recommendation. We demonstrate on these datasets that SPOTs simultaneously provide higher quality decisions and significantly lower model complexity than other machine learning approaches (e.g., CART) trained to minimize prediction error.}
}
@InProceedings{pmlr-v119-elsayed20a,
title = {Revisiting Spatial Invariance with Low-Rank Local Connectivity},
author = {Elsayed, Gamaleldin and Ramachandran, Prajit and Shlens, Jonathon and Kornblith, Simon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2868--2879},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/elsayed20a/elsayed20a.pdf},
url = {http://proceedings.mlr.press/v119/elsayed20a.html},
abstract = {Convolutional neural networks are among the most successful architectures in deep learning with this success at least partially attributable to the efficacy of spatial invariance as an inductive bias. Locally connected layers, which differ from convolutional layers only in their lack of spatial invariance, usually perform poorly in practice. However, these observations still leave open the possibility that some degree of relaxation of spatial invariance may yield a better inductive bias than either convolution or local connectivity. To test this hypothesis, we design a method to relax the spatial invariance of a network layer in a controlled manner; we create a \emph{low-rank} locally connected layer, where the filter bank applied at each position is constructed as a linear combination of basis set of filter banks with spatially varying combining weights. By varying the number of basis filter banks, we can control the degree of relaxation of spatial invariance. In experiments with small convolutional networks, we find that relaxing spatial invariance improves classification accuracy over both convolution and locally connected layers across MNIST, CIFAR-10, and CelebA datasets, thus suggesting that spatial invariance may be an overly restrictive prior.}
}
@InProceedings{pmlr-v119-elthakeb20a,
title = {Divide and Conquer: Leveraging Intermediate Feature Representations for Quantized Training of Neural Networks},
author = {Elthakeb, Ahmed Taha and Pilligundla, Prannoy and Mireshghallah, Fatemeh and Cloninger, Alexander and Esmaeilzadeh, Hadi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2880--2891},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/elthakeb20a/elthakeb20a.pdf},
url = {http://proceedings.mlr.press/v119/elthakeb20a.html},
abstract = {The deep layers of modern neural networks extract a rather rich set of features as an input propagates through the network, this paper sets out to harvest these rich intermediate representations for quantization with minimal accuracy loss while significantly reducing the memory footprint and compute intensity of the DNN. This paper utilizes knowledge distillation through teacher-student paradigm (Hinton et al., 2015) in a novel setting that exploits the feature extraction capability of DNNs for higher accuracy quantization. As such, our algorithm logically divides a pretrained full-precision DNN to multiple sections, each of which exposes intermediate features to train a team of students independently in the quantized domain and simply stitching them afterwards. This divide and conquer strategy, makes the training of each student section possible in isolation, speeding up training by enabling parallelization. Experiments on various DNNs (AlexNet, LeNet, MobileNet, ResNet-18, ResNet-20, SVHN and VGG-11) show that, this approach{—}called DCQ (Divide and Conquer Quantization){—}on average, improves the performance of a state-of-the-art quantized training technique, DoReFa-Net (Zhou et al., 2016) by 21.6\% and 9.3\% for binary and ternary quantization, respectively. Additionally, we show that incorporating DCQ to existing quantized training methods leads to improved accuracies as compared to previously reported by multiple state-of-the-art quantized training methods.}
}
@InProceedings{pmlr-v119-emami20a,
title = {Generalization Error of Generalized Linear Models in High Dimensions},
author = {Emami, Melikasadat and Sahraee-Ardakan, Mojtaba and Pandit, Parthe and Rangan, Sundeep and Fletcher, Alyson},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2892--2901},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/emami20a/emami20a.pdf},
url = {http://proceedings.mlr.press/v119/emami20a.html},
abstract = {At the heart of machine learning lies the question of generalizability of learned rules over previously unseen data. While over-parameterized models based on neural networks are now ubiquitous in machine learning applications, our understanding of their generalization capabilities is incomplete and this task is made harder by the non-convexity of the underlying learning problems. We provide a general framework to characterize the asymptotic generalization error for single-layer neural networks (i.e., generalized linear models) with arbitrary non-linearities, making it applicable to regression as well as classification problems. This framework enables analyzing the effect of (i) over-parameterization and non-linearity during modeling; (ii) choices of loss function, initialization, and regularizer during learning; and (iii) mismatch between training and test distributions. As examples, we analyze a few special cases, namely linear regression and logistic regression. We are also able to rigorously and analytically explain the \emph{double descent} phenomenon in generalized linear models.}
}
@InProceedings{pmlr-v119-ene20a,
title = {Parallel Algorithm for Non-Monotone {DR}-Submodular Maximization},
author = {Ene, Alina and Nguyen, Huy},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2902--2911},
year = {2020},
editor = {Daumé, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ene20a/ene20a.pdf},
url = {http://proceedings.mlr.press/v119/ene20a.html},
abstract = {In this work, we give a new parallel algorithm for the problem of maximizing a non-monotone diminishing returns submodular function subject to a cardinality constraint. For any desired accuracy $\epsilon$, our algorithm achieves a $1/e - \epsilon$ approximation using $O(\log{n} \log(1/\epsilon) / \epsilon^3)$ parallel rounds of function evaluations. The approximation guarantee nearly matches the best approximation guarantee known for the problem in the sequential setting and the number of parallel rounds is nearly-optimal for any constant $\epsilon$. Previous algorithms achieve worse approximation guarantees using $\Omega(\log^2{n})$ parallel rounds. Our experimental evaluation suggests that our algorithm obtains solutions whose objective value nearly matches the value obtained by the state of the art sequential algorithms, and it outperforms previous parallel algorithms in number of parallel rounds, iterations, and solution quality.}
}
@InProceedings{pmlr-v119-engelmann20a,
title = {Continuous Time {B}ayesian Networks with Clocks},
author = {Engelmann, Nicolai and Linzner, Dominik and Koeppl, Heinz},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2912--2921},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/engelmann20a/engelmann20a.pdf},
url = {http://proceedings.mlr.press/v119/engelmann20a.html},
abstract = {Structured stochastic processes evolving in continuous time present a widely adopted framework to model phenomena occurring in nature and engineering. However, such models are often chosen to satisfy the Markov property to maintain tractability. One of the more popular of such memoryless models are Continuous Time Bayesian Networks (CTBNs). In this work, we lift its restriction to exponential survival times to arbitrary distributions. Current extensions achieve this via auxiliary states, which hinder tractability. To avoid that, we introduce a set of node-wise clocks to construct a collection of graph-coupled semi-Markov chains. We provide algorithms for parameter and structure inference, which make use of local dependencies and conduct experiments on synthetic data and a data-set generated through a benchmark tool for gene regulatory networks. In doing so, we point out advantages compared to current CTBN extensions.}
}
@InProceedings{pmlr-v119-engstrom20a,
title = {Identifying Statistical Bias in Dataset Replication},
author = {Engstrom, Logan and Ilyas, Andrew and Santurkar, Shibani and Tsipras, Dimitris and Steinhardt, Jacob and Madry, Aleksander},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2922--2932},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/engstrom20a/engstrom20a.pdf},
url = {http://proceedings.mlr.press/v119/engstrom20a.html},
abstract = {Dataset replication is a useful tool for assessing whether improvements in test accuracy on a specific benchmark correspond to improvements in models’ ability to generalize reliably. In this work, we present unintuitive yet significant ways in which standard approaches to dataset replication introduce statistical bias, skewing the resulting observations. We study ImageNet-v2, a replication of the ImageNet dataset on which models exhibit a significant (11-14\%) drop in accuracy, even after controlling for selection frequency, a human-in-the-loop measure of data quality. We show that after remeasuring selection frequencies and correcting for statistical bias, only an estimated 3.6\% of the original 11.7\% accuracy drop remains unaccounted for. We conclude with concrete recommendations for recognizing and avoiding bias in dataset replication. Code for our study is publicly available: https://git.io/data-rep-analysis.}
}
@InProceedings{pmlr-v119-eshraghi20a,
title = {Distributed Online Optimization over a Heterogeneous Network with Any-Batch Mirror Descent},
author = {Eshraghi, Nima and Liang, Ben},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2933--2942},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/eshraghi20a/eshraghi20a.pdf},
url = {http://proceedings.mlr.press/v119/eshraghi20a.html},
abstract = {In distributed online optimization over a computing network with heterogeneous nodes, slow nodes can adversely affect the progress of fast nodes, leading to drastic slowdown of the overall convergence process. To address this issue, we consider a new algorithm termed Distributed Any-Batch Mirror Descent (DABMD), which is based on distributed Mirror Descent but uses a fixed per-round computing time to limit the waiting by fast nodes to receive information updates from slow nodes. DABMD is characterized by varying minibatch sizes across nodes. It is applicable to a broader range of problems compared with existing distributed online optimization methods such as those based on dual averaging, and it accommodates time-varying network topology. We study two versions of DABMD, depending on whether the computing nodes average their primal variables via single or multiple consensus iterations. We show that both versions provide strong theoretical performance guarantee, by deriving upperbounds on their expected dynamic regret, which capture the variability in minibatch sizes. Our experimental results show substantial reduction in cost and acceleration in convergence compared with the known best alternative.}
}
@InProceedings{pmlr-v119-evci20a,
title = {Rigging the Lottery: Making All Tickets Winners},
author = {Evci, Utku and Gale, Trevor and Menick, Jacob and Castro, Pablo Samuel and Elsen, Erich},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2943--2952},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/evci20a/evci20a.pdf},
url = {http://proceedings.mlr.press/v119/evci20a.html},
abstract = {Many applications require sparse neural networks due to space or inference time restrictions. There is a large body of work on training dense networks to yield sparse networks for inference, but this limits the size of the largest trainable sparse model to that of the largest trainable dense model. In this paper we introduce a method to train sparse neural networks with a fixed parameter count and a fixed computational cost throughout training, without sacrificing accuracy relative to existing dense-to-sparse training methods. Our method updates the topology of the sparse network during training by using parameter magnitudes and infrequent gradient calculations. We show that this approach requires fewer floating-point operations (FLOPs) to achieve a given level of accuracy compared to prior techniques. We demonstrate state-of-the-art sparse training results on a variety of networks and datasets, including ResNet-50, MobileNets on Imagenet-2012, and RNNs on WikiText-103. Finally, we provide some insights into why allowing the topology to change during the optimization can overcome local minima encountered when the topology remains static.}
}
@InProceedings{pmlr-v119-fahrbach20a,
title = {Faster Graph Embeddings via Coarsening},
author = {Fahrbach, Matthew and Goranci, Gramoz and Peng, Richard and Sachdeva, Sushant and Wang, Chi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2953--2963},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fahrbach20a/fahrbach20a.pdf},
url = {http://proceedings.mlr.press/v119/fahrbach20a.html},
abstract = {Graph embeddings are a ubiquitous tool for machine learning tasks, such as node classification and link prediction, on graph-structured data. However, computing the embeddings for large-scale graphs is prohibitively inefficient even if we are interested only in a small subset of relevant vertices. To address this, we present an efficient graph coarsening approach, based on Schur complements, for computing the embedding of the relevant vertices. We prove that these embeddings are preserved exactly by the Schur complement graph that is obtained via Gaussian elimination on the non-relevant vertices. As computing Schur complements is expensive, we give a nearly-linear time algorithm that generates a coarsened graph on the relevant vertices that provably matches the Schur complement in expectation in each iteration. Our experiments involving prediction tasks on graphs demonstrate that computing embeddings on the coarsened graph, rather than the entire graph, leads to significant time savings without sacrificing accuracy.}
}
@InProceedings{pmlr-v119-fajtl20a,
title = {Latent {B}ernoulli Autoencoder},
author = {Fajtl, Jiri and Argyriou, Vasileios and Monekosso, Dorothy and Remagnino, Paolo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2964--2974},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fajtl20a/fajtl20a.pdf},
url = {http://proceedings.mlr.press/v119/fajtl20a.html},
abstract = {In this work, we pose the question whether it is possible to design and train an autoencoder model in an end-to-end fashion to learn representations in the multivariate Bernoulli latent space, and achieve performance comparable with the state-of-the-art variational methods. Moreover, we investigate how to generate novel samples and perform smooth interpolation and attributes modification in the binary latent space. To meet our objective, we propose a simplified, deterministic model with a straight-through gradient estimator to learn the binary latents and show its competitiveness with the latest VAE methods. Furthermore, we propose a novel method based on a random hyperplane rounding for sampling and smooth interpolation in the latent space. Our method performs on a par or better than the current state-of-the-art methods on common CelebA, CIFAR-10 and MNIST datasets.}
}
@InProceedings{pmlr-v119-falahatgar20a,
title = {Optimal Sequential Maximization: One Interview is Enough!},
author = {Falahatgar, Moein and Orlitsky, Alon and Pichapati, Venkatadheeraj},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2975--2984},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/falahatgar20a/falahatgar20a.pdf},
url = {http://proceedings.mlr.press/v119/falahatgar20a.html},
abstract = {Maximum selection under probabilistic queries \emph{(probabilistic maximization)} is a fundamental algorithmic problem arising in numerous theoretical and practical contexts. We derive the first query-optimal sequential algorithm for probabilistic-maximization. Departing from previous assumptions, the algorithm and performance guarantees apply even for infinitely many items, hence in particular do not require a-priori knowledge of the number of items. The algorithm has linear query complexity, and is optimal also in the streaming setting. To derive these results we consider a probabilistic setting where several candidates for a position are asked multiple questions with the goal of finding who has the highest probability of answering interview questions correctly. Previous work minimized the total number of questions asked by alternating back and forth between the best performing candidates, in a sense, inviting them to multiple interviews. We show that the same order-wise selection accuracy can be achieved by querying the candidates sequentially, never returning to a previously queried candidate. Hence one interview is enough!}
}
@InProceedings{pmlr-v119-fan20a,
title = {Spectral Graph Matching and Regularized Quadratic Relaxations: Algorithm and Theory},
author = {Fan, Zhou and Mao, Cheng and Wu, Yihong and Xu, Jiaming},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2985--2995},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fan20a/fan20a.pdf},
url = {http://proceedings.mlr.press/v119/fan20a.html},
abstract = {Graph matching, also known as network alignment, aims at recovering the latent vertex correspondence between two unlabeled, edge-correlated weighted graphs. To tackle this task, we propose a spectral method, GRAph Matching by Pairwise eigen-Alignments (GRAMPA), which first constructs a similarity matrix as a weighted sum of outer products between all pairs of eigenvectors of the two graphs, and then outputs a matching by a simple rounding procedure. For a universality class of correlated Wigner models, GRAMPA achieves exact recovery of the latent matching between two graphs with edge correlation $1 - 1/\mathrm{polylog}(n)$ and average degree at least $\mathrm{polylog}(n)$. This matches the state-of-the-art guarantees for polynomial-time algorithms established for correlated Erdős-Rényi graphs, and significantly improves over existing spectral methods. The superiority of GRAMPA is also demonstrated on a variety of synthetic and real datasets, in terms of both statistical accuracy and computational efficiency.}
}
@InProceedings{pmlr-v119-fan20b,
title = {On hyperparameter tuning in general clustering problems},
author = {Fan, Xinjie and Yue, Yuguang and Sarkar, Purnamrita and Wang, Y. X. Rachel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {2996--3007},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fan20b/fan20b.pdf},
url = {http://proceedings.mlr.press/v119/fan20b.html},
abstract = {Tuning hyperparameters for unsupervised learning problems is difficult in general due to the lack of ground truth for validation. However, the success of most clustering methods depends heavily on the correct choice of the involved hyperparameters. Take for example the Lagrange multipliers of penalty terms in semidefinite programming (SDP) relaxations of community detection in networks, or the bandwidth parameter needed in the Gaussian kernel used to construct similarity matrices for spectral clustering. Despite the popularity of these clustering algorithms, there are not many provable methods for tuning these hyperparameters. In this paper, we provide an overarching framework with provable guarantees for tuning hyperparameters in the above class of problems under two different models. Our framework can be augmented with a cross validation procedure to do model selection as well. In a variety of simulation and real data experiments, we show that our framework outperforms other widely used tuning procedures in a broad range of parameter settings.}
}
@InProceedings{pmlr-v119-fang20a,
title = {Online mirror descent and dual averaging: keeping pace in the dynamic case},
author = {Fang, Huang and Harvey, Nick and Portella, Victor and Friedlander, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3008--3017},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fang20a/fang20a.pdf},
url = {http://proceedings.mlr.press/v119/fang20a.html},
abstract = {Online mirror descent (OMD) and dual averaging (DA)—two fundamental algorithms for online convex optimization—are known to have very similar (and sometimes identical) performance guarantees when used with a fixed learning rate. Under dynamic learning rates, however, OMD is provably inferior to DA and suffers a linear regret, even in common settings such as prediction with expert advice. We modify the OMD algorithm through a simple technique that we call stabilization. We give essentially the same abstract regret bound for OMD with stabilization and for DA by modifying the classical OMD convergence analysis in a careful and modular way that allows for straightforward and flexible proofs. Simple corollaries of these bounds show that OMD with stabilization and DA enjoy the same performance guarantees in many applications—even under dynamic learning rates. We also shed light on the similarities between OMD and DA and show simple conditions under which stabilized-OMD and DA generate the same iterates.}
}
@InProceedings{pmlr-v119-farina20a,
title = {Stochastic Regret Minimization in Extensive-Form Games},
author = {Farina, Gabriele and Kroer, Christian and Sandholm, Tuomas},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3018--3028},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/farina20a/farina20a.pdf},
url = {http://proceedings.mlr.press/v119/farina20a.html},
abstract = {Monte-Carlo counterfactual regret minimization (MCCFR) is the state-of-the-art algorithm for solving sequential games that are too large for full tree traversals. It works by using gradient estimates that can be computed via sampling. However, stochastic methods for sequential games have not been investigated extensively beyond MCCFR. In this paper we develop a new framework for developing stochastic regret minimization methods. This framework allows us to use any regret-minimization algorithm, coupled with any gradient estimator. The MCCFR algorithm can be analyzed as a special case of our framework, and this analysis leads to significantly stronger theoretical guarantees on convergence, while simultaneously yielding a simplified proof. Our framework allows us to instantiate several new stochastic methods for solving sequential games. We show extensive experiments on five games, where some variants of our methods outperform MCCFR.}
}
@InProceedings{pmlr-v119-farnia20a,
title = {Do {GAN}s always have {N}ash equilibria?},
author = {Farnia, Farzan and Ozdaglar, Asuman},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3029--3039},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/farnia20a/farnia20a.pdf},
url = {http://proceedings.mlr.press/v119/farnia20a.html},
abstract = {Generative adversarial networks (GANs) represent a zero-sum game between two machine players, a generator and a discriminator, designed to learn the distribution of data. While GANs have achieved state-of-the-art performance in several benchmark learning tasks, GAN minimax optimization still poses great theoretical and empirical challenges. GANs trained using first-order optimization methods commonly fail to converge to a stable solution where the players cannot improve their objective, i.e., the Nash equilibrium of the underlying game. Such issues raise the question of the existence of Nash equilibria in GAN zero-sum games. In this work, we show through theoretical and numerical results that indeed GAN zero-sum games may have no Nash equilibria. To characterize an equilibrium notion applicable to GANs, we consider the equilibrium of a new zero-sum game with an objective function given by a proximal operator applied to the original objective, a solution we call the proximal equilibrium. Unlike the Nash equilibrium, the proximal equilibrium captures the sequential nature of GANs, in which the generator moves first followed by the discriminator. We prove that the optimal generative model in Wasserstein GAN problems provides a proximal equilibrium. Inspired by these results, we propose a new approach, which we call proximal training, for solving GAN problems. We perform several numerical experiments indicating the existence of proximal equilibria in GANs.}
}
@InProceedings{pmlr-v119-farquhar20a,
title = {Growing Action Spaces},
author = {Farquhar, Gregory and Gustafson, Laura and Lin, Zeming and Whiteson, Shimon and Usunier, Nicolas and Synnaeve, Gabriel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3040--3051},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/farquhar20a/farquhar20a.pdf},
url = {http://proceedings.mlr.press/v119/farquhar20a.html},
abstract = {In complex tasks, such as those with large combinatorial action spaces, random exploration may be too inefficient to achieve meaningful learning progress. In this work, we use a curriculum of progressively growing action spaces to accelerate learning. We assume the environment is out of our control, but that the agent may set an internal curriculum by initially restricting its action space. Our approach uses off-policy reinforcement learning to estimate optimal value functions for multiple action spaces simultaneously and efficiently transfers data, value estimates, and state representations from restricted action spaces to the full task. We show the efficacy of our approach in proof-of-concept control tasks and on challenging large-scale StarCraft micromanagement tasks with large, multi-agent action spaces.}
}
@InProceedings{pmlr-v119-faury20a,
title = {Improved Optimistic Algorithms for Logistic Bandits},
author = {Faury, Louis and Abeille, Marc and Calauzenes, Clement and Fercoq, Olivier},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3052--3060},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/faury20a/faury20a.pdf},
url = {http://proceedings.mlr.press/v119/faury20a.html},
abstract = {The generalized linear bandit framework has attracted a lot of attention in recent years by extending the well-understood linear setting and allowing to model richer reward structures. It notably covers the logistic model, widely used when rewards are binary. For logistic bandits, the frequentist regret guarantees of existing algorithms are $\tilde{\mathcal{O}}(\kappa \sqrt{T})$, where $\kappa$ is a problem-dependent constant. Unfortunately, $\kappa$ can be arbitrarily large as it scales exponentially with the size of the decision set. This may lead to significantly loose regret bounds and poor empirical performance. In this work, we study the logistic bandit with a focus on the prohibitive dependencies introduced by $\kappa$. We propose a new optimistic algorithm based on a finer examination of the non-linearities of the reward function. We show that it enjoys a $\tilde{\mathcal{O}}(\sqrt{T})$ regret with no dependency in $\kappa$, but for a second order term. Our analysis is based on a new tail-inequality for self-normalized martingales, of independent interest.}
}
@InProceedings{pmlr-v119-fedus20a,
title = {Revisiting Fundamentals of Experience Replay},
author = {Fedus, William and Ramachandran, Prajit and Agarwal, Rishabh and Bengio, Yoshua and Larochelle, Hugo and Rowland, Mark and Dabney, Will},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3061--3071},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fedus20a/fedus20a.pdf},
url = {http://proceedings.mlr.press/v119/fedus20a.html},
abstract = {Experience replay is central to off-policy algorithms in deep reinforcement learning (RL), but there remain significant gaps in our understanding. We therefore present a systematic and extensive analysis of experience replay in Q-learning methods, focusing on two fundamental properties: the replay capacity and the ratio of learning updates to experience collected (replay ratio). Our additive and ablative studies upend conventional wisdom around experience replay {—} greater capacity is found to substantially increase the performance of certain algorithms, while leaving others unaffected. Counterintuitively we show that theoretically ungrounded, uncorrected n-step returns are uniquely beneficial while other techniques confer limited benefit for sifting through larger memory. Separately, by directly controlling the replay ratio we contextualize previous observations in the literature and empirically measure its importance across a variety of deep RL algorithms. Finally, we conclude by testing a set of hypotheses on the nature of these performance benefits.}
}
@InProceedings{pmlr-v119-feng20a,
title = {Learning with Multiple Complementary Labels},
author = {Feng, Lei and Kaneko, Takuo and Han, Bo and Niu, Gang and An, Bo and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3072--3081},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/feng20a/feng20a.pdf},
url = {http://proceedings.mlr.press/v119/feng20a.html},
abstract = {A complementary label (CL) simply indicates an incorrect class of an example, but learning with CLs results in multi-class classifiers that can predict the correct class. Unfortunately, the problem setting only allows a single CL for each example, which notably limits its potential since our labelers may easily identify multiple CLs (MCLs) to one example. In this paper, we propose a novel problem setting to allow MCLs for each example and two ways for learning with MCLs. In the first way, we design two wrappers that decompose MCLs into many single CLs, so that we could use any method for learning with CLs. However, the supervision information that MCLs hold is conceptually diluted after decomposition. Thus, in the second way, we derive an unbiased risk estimator; minimizing it processes each set of MCLs as a whole and possesses an estimation error bound. We further improve the second way into minimizing properly chosen upper bounds. Experiments show that the former way works well for learning with MCLs but the latter is even better.}
}
@InProceedings{pmlr-v119-feng20b,
title = {Global Concavity and Optimization in a Class of Dynamic Discrete Choice Models},
author = {Feng, Yiding and Khmelnitskaya, Ekaterina and Nekipelov, Denis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3082--3091},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/feng20b/feng20b.pdf},
url = {http://proceedings.mlr.press/v119/feng20b.html},
abstract = {Discrete choice models with unobserved heterogeneity are commonly used Econometric models for dynamic Economic behavior which have been adopted in practice to predict behavior of individuals and firms from schooling and job choices to strategic decisions in market competition. These models feature optimizing agents who choose among a finite set of options in a sequence of periods and receive choice-specific payoffs that depend on both variables that are observed by the agent and recorded in the data and variables that are only observed by the agent but not recorded in the data. Existing work in Econometrics assumes that optimizing agents are fully rational and requires finding a functional fixed point to find the optimal policy. We show that in an important class of discrete choice models the value function is globally concave in the policy. That means that simple algorithms that do not require fixed point computation, such as the policy gradient algorithm, globally converge to the optimal policy. This finding can both be used to relax behavioral assumption regarding the optimizing agents and to facilitate Econometric analysis of dynamic behavior. In particular, we demonstrate significant computational advantages in using a simple implementation policy gradient algorithm over existing “nested fixed point” algorithms used in Econometrics.}
}
@InProceedings{pmlr-v119-feng20c,
title = {The Intrinsic Robustness of Stochastic Bandits to Strategic Manipulation},
author = {Feng, Zhe and Parkes, David and Xu, Haifeng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3092--3101},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/feng20c/feng20c.pdf},
url = {http://proceedings.mlr.press/v119/feng20c.html},
abstract = {Motivated by economic applications such as recommender systems, we study the behavior of stochastic bandits algorithms under \emph{strategic behavior} conducted by rational actors, i.e., the arms. Each arm is a \emph{self-interested} strategic player who can modify its own reward whenever pulled, subject to a cross-period budget constraint, in order to maximize its own expected number of times of being pulled. We analyze the robustness of three popular bandit algorithms: UCB, $\varepsilon$-Greedy, and Thompson Sampling. We prove that all three algorithms achieve a regret upper bound $\mathcal{O}(\max \{ B, K\ln T\})$ where $B$ is the total budget across arms, $K$ is the total number of arms and $T$ is the running time of the algorithms. This regret guarantee holds for \emph{arbitrary adaptive} manipulation strategy of arms. Our second set of main results shows that this regret bound is \emph{tight}— in fact, for UCB, it is tight even when we restrict the arms’ manipulation strategies to form a \emph{Nash equilibrium}. We do so by characterizing the Nash equilibrium of the game induced by arms’ strategic manipulations and show a regret lower bound of $\Omega(\max \{ B, K\ln T\})$ at the equilibrium.}
}
@InProceedings{pmlr-v119-feng20d,
title = {Accountable Off-Policy Evaluation With Kernel {B}ellman Statistics},
author = {Feng, Yihao and Ren, Tongzheng and Tang, Ziyang and Liu, Qiang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3102--3111},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/feng20d/feng20d.pdf},
url = {http://proceedings.mlr.press/v119/feng20d.html},
abstract = {We consider off-policy evaluation (OPE), which evaluates the performance of a new policy from observed data collected from previous experiments, without requiring the execution of the new policy. This finds important applications in areas with high execution cost or safety concerns, such as medical diagnosis, recommendation systems and robotics. In practice, due to the limited information from off-policy data, it is highly desirable to construct rigorous confidence intervals, not just point estimation, for the policy performance. In this work, we propose a new variational framework which reduces the problem of calculating tight confidence bounds in OPE into an optimization problem on a feasible set that catches the true state-action value function with high probability. The feasible set is constructed by leveraging statistical properties of a recently proposed kernel Bellman loss (Feng et al., 2019). We design an efficient computational approach for calculating our bounds, and extend it to perform post-hoc diagnosis and correction for existing estimators. Empirical results show that our method yields tight confidence intervals in different settings.}
}
@InProceedings{pmlr-v119-fernandez20a,
title = {Kernelized {Stein} Discrepancy Tests of Goodness-of-fit for Time-to-Event Data},
author = {Fernandez, Tamara and Rivera, Nicolas and Xu, Wenkai and Gretton, Arthur},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3112--3122},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fernandez20a/fernandez20a.pdf},
url = {http://proceedings.mlr.press/v119/fernandez20a.html},
abstract = {Survival Analysis and Reliability Theory are concerned with the analysis of time-to-event data, in which observations correspond to waiting times until an event of interest such as death from a particular disease or failure of a component in a mechanical system. This type of data is unique due to the presence of censoring, a type of missing data that occurs when we do not observe the actual time of the event of interest but, instead, we have access to an approximation for it given by random interval in which the observation is known to belong. Most traditional methods are not designed to deal with censoring, and thus we need to adapt them to censored time-to-event data. In this paper, we focus on non-parametric goodness-of-fit testing procedures based on combining the Stein's method and kernelized discrepancies. While for uncensored data, there is a natural way of implementing a kernelized Stein discrepancy test, for censored data there are several options, each of them with different advantages and disadvantages. In this paper, we propose a collection of kernelized Stein discrepancy tests for time-to-event data, and we study each of them theoretically and empirically; our experimental results show that our proposed methods perform better than existing tests, including previous tests based on a kernelized maximum mean discrepancy.}
}
@InProceedings{pmlr-v119-ferragina20a,
title = {Why Are Learned Indexes So Effective?},
author = {Ferragina, Paolo and Lillo, Fabrizio and Vinciguerra, Giorgio},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3123--3132},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ferragina20a/ferragina20a.pdf},
url = {http://proceedings.mlr.press/v119/ferragina20a.html},
abstract = {A recent trend in algorithm design consists of augmenting classic data structures with machine learning models, which are better suited to reveal and exploit patterns and trends in the input data so to achieve outstanding practical improvements in space occupancy and time efficiency. This is especially known in the context of indexing data structures where, despite few attempts in evaluating their asymptotic efficiency, theoretical results are yet missing in showing that learned indexes are provably better than classic indexes, such as B+ trees and their variants. In this paper, we present the first mathematically-grounded answer to this open problem. We obtain this result by discovering and exploiting a link between the original problem and a mean exit time problem over a proper stochastic process which, we show, is related to the space and time occupancy of those learned indexes. Our general result is then specialised to five well-known distributions: Uniform, Lognormal, Pareto, Exponential, and Gamma; and it is corroborated in precision and robustness by a large set of experiments.}
}
@InProceedings{pmlr-v119-fiez20a,
title = {Implicit Learning Dynamics in {Stackelberg} Games: Equilibria Characterization, Convergence Analysis, and Empirical Study},
author = {Fiez, Tanner and Chasnov, Benjamin and Ratliff, Lillian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3133--3144},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fiez20a/fiez20a.pdf},
url = {http://proceedings.mlr.press/v119/fiez20a.html},
abstract = {Contemporary work on learning in continuous games has commonly overlooked the hierarchical decision-making structure present in machine learning problems formulated as games, instead treating them as simultaneous play games and adopting the Nash equilibrium solution concept. We deviate from this paradigm and provide a comprehensive study of learning in Stackelberg games. This work provides insights into the optimization landscape of zero-sum games by establishing connections between Nash and Stackelberg equilibria along with the limit points of simultaneous gradient descent. We derive novel gradient-based learning dynamics emulating the natural structure of a Stackelberg game using the implicit function theorem and provide convergence analysis for deterministic and stochastic updates for zero-sum and general-sum games. Notably, in zero-sum games using deterministic updates, we show the only critical points the dynamics converge to are Stackelberg equilibria and provide a local convergence rate. Empirically, our learning dynamics mitigate rotational behavior and exhibit benefits for training generative adversarial networks compared to simultaneous gradient descent.}
}
@InProceedings{pmlr-v119-filos20a,
title = {Can Autonomous Vehicles Identify, Recover From, and Adapt to Distribution Shifts?},
author = {Filos, Angelos and Tigkas, Panagiotis and McAllister, Rowan and Rhinehart, Nicholas and Levine, Sergey and Gal, Yarin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3145--3153},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/filos20a/filos20a.pdf},
url = {http://proceedings.mlr.press/v119/filos20a.html},
abstract = {Out-of-training-distribution (OOD) scenarios are a common challenge of learning agents at deployment, typically leading to arbitrary deductions and poorly-informed decisions. In principle, detection of and adaptation to OOD scenes can mitigate their adverse effects. In this paper, we highlight the limitations of current approaches to novel driving scenes and propose an epistemic uncertainty-aware planning method, called \emph{robust imitative planning} (RIP). Our method can detect and recover from some distribution shifts, reducing the overconfident and catastrophic extrapolations in OOD scenes. If the model's uncertainty is too great to suggest a safe course of action, the model can instead query the expert driver for feedback, enabling sample-efficient online adaptation, a variant of our method we term \emph{adaptive robust imitative planning} (AdaRIP). Our methods outperform current state-of-the-art approaches in the nuScenes \emph{prediction} challenge, but since no benchmark evaluating OOD detection and adaption currently exists to assess \emph{control}, we introduce an autonomous car novel-scene benchmark, \texttt{CARNOVEL}, to evaluate the robustness of driving agents to a suite of tasks with distribution shifts, where our methods outperform all the baselines.}
}
@InProceedings{pmlr-v119-finlay20a,
title = {How to Train Your Neural {ODE}: the World of {Jacobian} and Kinetic Regularization},
author = {Finlay, Chris and Jacobsen, Joern-Henrik and Nurbekyan, Levon and Oberman, Adam},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3154--3164},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/finlay20a/finlay20a.pdf},
url = {http://proceedings.mlr.press/v119/finlay20a.html},
abstract = {Training neural ODEs on large datasets has not been tractable due to the necessity of allowing the adaptive numerical ODE solver to refine its step size to very small values. In practice this leads to dynamics equivalent to many hundreds or even thousands of layers. In this paper, we overcome this apparent difficulty by introducing a theoretically-grounded combination of both optimal transport and stability regularizations which encourage neural ODEs to prefer simpler dynamics out of all the dynamics that solve a problem well. Simpler dynamics lead to faster convergence and to fewer discretizations of the solver, considerably decreasing wall-clock time without loss in performance. Our approach allows us to train neural ODE-based generative models to the same performance as the unregularized dynamics, with significant reductions in training time. This brings neural ODEs closer to practical relevance in large-scale applications.}
}
@InProceedings{pmlr-v119-finzi20a,
title = {Generalizing Convolutional Neural Networks for Equivariance to {Lie} Groups on Arbitrary Continuous Data},
author = {Finzi, Marc and Stanton, Samuel and Izmailov, Pavel and Wilson, Andrew Gordon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3165--3176},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/finzi20a/finzi20a.pdf},
url = {http://proceedings.mlr.press/v119/finzi20a.html},
abstract = {The translation equivariance of convolutional layers enables CNNs to generalize well on image problems. While translation equivariance provides a powerful inductive bias for images, we often additionally desire equivariance to other transformations, such as rotations, especially for non-image data. We propose a general method to construct a convolutional layer that is equivariant to transformations from any specified Lie group with a surjective exponential map. Incorporating equivariance to a new group requires implementing only the group exponential and logarithm maps, enabling rapid prototyping. Showcasing the simplicity and generality of our method, we apply the same model architecture to images, ball-and-stick molecular data, and Hamiltonian dynamical systems. For Hamiltonian systems, the equivariance of our models is especially impactful, leading to exact conservation of linear and angular momentum.}
}
@InProceedings{pmlr-v119-fischer20a,
title = {Information Particle Filter Tree: An Online Algorithm for {POMDPs} with Belief-Based Rewards on Continuous Domains},
author = {Fischer, Johannes and Tas, {\"O}mer Sahin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3177--3187},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fischer20a/fischer20a.pdf},
url = {http://proceedings.mlr.press/v119/fischer20a.html},
abstract = {Planning in Partially Observable Markov Decision Processes (POMDPs) inherently gathers the information necessary to act optimally under uncertainties. The framework can be extended to model pure information gathering tasks by considering belief-based rewards. This allows us to use reward shaping to guide POMDP planning to informative beliefs by using a weighted combination of the original reward and the expected information gain as the objective. In this work we propose a novel online algorithm, Information Particle Filter Tree (IPFT), to solve problems with belief-dependent rewards on continuous domains. It simulates particle-based belief trajectories in a Monte Carlo Tree Search (MCTS) approach to construct a search tree in the belief space. The evaluation shows that the consideration of information gain greatly improves the performance in problems where information gathering is an essential part of the optimal policy.}
}
@InProceedings{pmlr-v119-fisher20a,
title = {Topic Modeling via Full Dependence Mixtures},
author = {Fisher, Dan and Kozdoba, Mark and Mannor, Shie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3188--3198},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fisher20a/fisher20a.pdf},
url = {http://proceedings.mlr.press/v119/fisher20a.html},
abstract = {In this paper we introduce a new approach to topic modelling that scales to large datasets by using a compact representation of the data and by leveraging the GPU architecture. In this approach, topics are learned directly from the co-occurrence data of the corpus. In particular, we introduce a novel mixture model which we term the Full Dependence Mixture (FDM) model. FDMs model second moment under general generative assumptions on the data. While there is previous work on topic modeling using second moments, we develop a direct stochastic optimization procedure for fitting an FDM with a single Kullback Leibler objective. Moment methods in general have the benefit that an iteration no longer needs to scale with the size of the corpus. Our approach allows us to leverage standard optimizers and GPUs for the problem of topic modeling. In particular, we evaluate the approach on two large datasets, NeurIPS papers and a Twitter corpus, with a large number of topics, and show that the approach performs comparably or better than the standard benchmarks.}
}
@InProceedings{pmlr-v119-foster20a,
title = {Beyond {UCB}: Optimal and Efficient Contextual Bandits with Regression Oracles},
author = {Foster, Dylan and Rakhlin, Alexander},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3199--3210},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/foster20a/foster20a.pdf},
url = {http://proceedings.mlr.press/v119/foster20a.html},
abstract = {A fundamental challenge in contextual bandits is to develop flexible, general-purpose algorithms with computational requirements no worse than classical supervised learning tasks such as classification and regression. Algorithms based on regression have shown promising empirical success, but theoretical guarantees have remained elusive except in special cases. We provide the first universal and optimal reduction from contextual bandits to online regression. We show how to transform any oracle for online regression with a given value function class into an algorithm for contextual bandits with the induced policy class, with no overhead in runtime or memory requirements. We characterize the minimax rates for contextual bandits with general, potentially nonparametric function classes, and show that our algorithm is minimax optimal whenever the oracle obtains the optimal rate for regression. Compared to previous results, our algorithm requires no distributional assumptions beyond realizability, and works even when contexts are chosen adversarially.}
}
@InProceedings{pmlr-v119-foster20b,
title = {Logarithmic Regret for Adversarial Online Control},
author = {Foster, Dylan and Simchowitz, Max},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3211--3221},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/foster20b/foster20b.pdf},
url = {http://proceedings.mlr.press/v119/foster20b.html},
abstract = {We introduce a new algorithm for online linear-quadratic control in a known system subject to adversarial disturbances. Existing regret bounds for this setting scale as $\sqrt{T}$ unless strong stochastic assumptions are imposed on the disturbance process. We give the first algorithm with logarithmic regret for arbitrary adversarial disturbance sequences, provided the state and control costs are given by known quadratic functions. Our algorithm and analysis use a characterization for the optimal offline control law to reduce the online control problem to (delayed) online learning with approximate advantage functions. Compared to previous techniques, our approach does not need to control movement costs for the iterates, leading to logarithmic regret.}
}
@InProceedings{pmlr-v119-fountoulakis20a,
title = {{$p$}-Norm Flow Diffusion for Local Graph Clustering},
author = {Fountoulakis, Kimon and Wang, Di and Yang, Shenghao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3222--3232},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fountoulakis20a/fountoulakis20a.pdf},
url = {http://proceedings.mlr.press/v119/fountoulakis20a.html},
abstract = {Local graph clustering and the closely related seed set expansion problem are primitives on graphs that are central to a wide range of analytic and learning tasks such as local clustering, community detection, semi-supervised learning, nodes ranking and feature inference. Prior work on local graph clustering mostly falls into two categories with numerical and combinatorial roots respectively, in this work we draw inspiration from both fields and propose a family of convex optimization formulations based on the idea of diffusion with $p$-norm network flow for $p\in (1,\infty)$. In the context of local clustering, we characterize the optimal solutions for these optimization problems and show their usefulness in finding low conductance cuts around input seed set. In particular, we achieve quadratic approximation of conductance in the case of $p=2$ similar to the Cheeger-type bounds of spectral methods, constant factor approximation when $p\rightarrow\infty$ similar to max-flow based methods, and a smooth transition for general $p$ values in between. Thus, our optimization formulation can be viewed as bridging the numerical and combinatorial approaches, and we can achieve the best of both worlds in terms of speed and noise robustness. We show that the proposed problem can be solved in strongly local running time for $p\ge 2$ and conduct empirical evaluations on both synthetic and real-world graphs to illustrate our approach compares favorably with existing methods.}
}
@InProceedings{pmlr-v119-franceschi20a,
title = {Stochastic Latent Residual Video Prediction},
author = {Franceschi, Jean-Yves and Delasalles, Edouard and Chen, Mickael and Lamprier, Sylvain and Gallinari, Patrick},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3233--3246},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/franceschi20a/franceschi20a.pdf},
url = {http://proceedings.mlr.press/v119/franceschi20a.html},
abstract = {Designing video prediction models that account for the inherent uncertainty of the future is challenging. Most works in the literature are based on stochastic image-autoregressive recurrent networks, which raises several performance and applicability issues. An alternative is to use fully latent temporal models which untie frame synthesis and temporal dynamics. However, no such model for stochastic video prediction has been proposed in the literature yet, due to design and training difficulties. In this paper, we overcome these difficulties by introducing a novel stochastic temporal model whose dynamics are governed in a latent space by a residual update rule. This first-order scheme is motivated by discretization schemes of differential equations. It naturally models video dynamics as it allows our simpler, more interpretable, latent model to outperform prior state-of-the-art methods on challenging datasets.}
}
@InProceedings{pmlr-v119-frank20a,
title = {Leveraging Frequency Analysis for Deep Fake Image Recognition},
author = {Frank, Joel and Eisenhofer, Thorsten and Sch{\"o}nherr, Lea and Fischer, Asja and Kolossa, Dorothea and Holz, Thorsten},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3247--3258},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/frank20a/frank20a.pdf},
url = {http://proceedings.mlr.press/v119/frank20a.html},
abstract = {Deep neural networks can generate images that are astonishingly realistic, so much so that it is often hard for humans to distinguish them from actual photos. These achievements have been largely made possible by Generative Adversarial Networks (GANs). While deep fake images have been thoroughly investigated in the image domain---a classical approach from the area of image forensics---an analysis in the frequency domain has been missing so far. In this paper, we address this shortcoming and our results reveal that in frequency space, GAN-generated images exhibit severe artifacts that can be easily identified. We perform a comprehensive analysis, showing that these artifacts are consistent across different neural network architectures, data sets, and resolutions. In a further investigation, we demonstrate that these artifacts are caused by upsampling operations found in all current GAN architectures, indicating a structural and fundamental problem in the way images are generated via GANs. Based on this analysis, we demonstrate how the frequency representation can be used to identify deep fake images in an automated way, surpassing state-of-the-art methods.}
}
@InProceedings{pmlr-v119-frankle20a,
title = {Linear Mode Connectivity and the Lottery Ticket Hypothesis},
author = {Frankle, Jonathan and Dziugaite, Gintare Karolina and Roy, Daniel and Carbin, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3259--3269},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/frankle20a/frankle20a.pdf},
url = {http://proceedings.mlr.press/v119/frankle20a.html},
abstract = {We study whether a neural network optimizes to the same, linearly connected minimum under different samples of SGD noise (e.g., random data order and augmentation). We find that standard vision models become stable to SGD noise in this way early in training. From then on, the outcome of optimization is determined to a linearly connected region. We use this technique to study iterative magnitude pruning (IMP), the procedure used by work on the lottery ticket hypothesis to identify subnetworks that could have trained in isolation to full accuracy. We find that these subnetworks only reach full accuracy when they are stable to SGD noise, which either occurs at initialization for small-scale settings (MNIST) or early in training for large-scale settings (ResNet-50 and Inception-v3 on ImageNet).}
}
@InProceedings{pmlr-v119-freeman20a,
title = {No-Regret and Incentive-Compatible Online Learning},
author = {Freeman, Rupert and Pennock, David and Podimata, Chara and Vaughan, Jennifer Wortman},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3270--3279},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/freeman20a/freeman20a.pdf},
url = {http://proceedings.mlr.press/v119/freeman20a.html},
abstract = {We study online learning settings in which experts act strategically to maximize their influence on the learning algorithm's predictions by potentially misreporting their beliefs about a sequence of binary events. Our goal is twofold. First, we want the learning algorithm to be no-regret with respect to the best-fixed expert in hindsight. Second, we want incentive compatibility, a guarantee that each expert's best strategy is to report his true beliefs about the realization of each event. To achieve this goal, we build on the literature on wagering mechanisms, a type of multi-agent scoring rule. We provide algorithms that achieve no regret and incentive compatibility for myopic experts for both the full and partial information settings. In experiments on datasets from FiveThirtyEight, our algorithms have regret comparable to classic no-regret algorithms, which are not incentive-compatible. Finally, we identify an incentive-compatible algorithm for forward-looking strategic agents that exhibits diminishing regret in practice.}
}
@InProceedings{pmlr-v119-fu20a,
title = {Fast and Three-rious: Speeding Up Weak Supervision with Triplet Methods},
author = {Fu, Daniel and Chen, Mayee and Sala, Frederic and Hooper, Sarah and Fatahalian, Kayvon and R{\'e}, Christopher},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3280--3291},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fu20a/fu20a.pdf},
url = {http://proceedings.mlr.press/v119/fu20a.html},
abstract = {Weak supervision is a popular method for building machine learning models without relying on ground truth annotations. Instead, it generates probabilistic training labels by estimating the accuracies of multiple noisy labeling sources (e.g., heuristics, crowd workers). Existing approaches use latent variable estimation to model the noisy sources, but these methods can be computationally expensive, scaling superlinearly in the data. In this work, we show that, for a class of latent variable models highly applicable to weak supervision, we can find a closed-form solution to model parameters, obviating the need for iterative solutions like stochastic gradient descent (SGD). We use this insight to build FlyingSquid, a weak supervision framework that runs orders of magnitude faster than previous weak supervision approaches and requires fewer assumptions. In particular, we prove bounds on generalization error without assuming that the latent variable model can exactly parameterize the underlying data distribution. Empirically, we validate FlyingSquid on benchmark weak supervision datasets and find that it achieves the same or higher quality compared to previous approaches without the need to tune an SGD procedure, recovers model parameters 170 times faster on average, and enables new video analysis and online learning applications.}
}
@InProceedings{pmlr-v119-fu20b,
title = {{AutoGAN}-Distiller: Searching to Compress Generative Adversarial Networks},
author = {Fu, Yonggan and Chen, Wuyang and Wang, Haotao and Li, Haoran and Lin, Yingyan and Wang, Zhangyang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3292--3303},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fu20b/fu20b.pdf},
url = {http://proceedings.mlr.press/v119/fu20b.html},
abstract = {The compression of Generative Adversarial Networks (GANs) has lately drawn attention, due to the increasing demand for deploying GANs into mobile devices for numerous applications such as image translation, enhancement and editing. However, compared to the substantial efforts to compressing other deep models, the research on compressing GANs (usually the generators) remains at its infancy stage. Existing GAN compression algorithms are limited to handling specific GAN architectures and losses. Inspired by the recent success of AutoML in deep compression, we introduce AutoML to GAN compression and develop an AutoGAN-Distiller (AGD) framework. Starting with a specifically designed efficient search space, AGD performs an end-to-end discovery for new efficient generators, given the target computational resource constraints. The search is guided by the original GAN model via knowledge distillation, therefore fulfilling the compression. AGD is fully automatic, standalone (i.e., needing no trained discriminators), and generically applicable to various GAN models. We evaluate AGD in two representative GAN tasks: image translation and super resolution. Without bells and whistles, AGD yields remarkably lightweight yet more competitive compressed models, that largely outperform existing alternatives. Our codes and pretrained models are available at: https://github.com/TAMU-VITA/AGD.}
}
@InProceedings{pmlr-v119-fu20c,
title = {Don't Waste Your Bits! {Squeeze} Activations and Gradients for Deep Neural Networks via {TinyScript}},
author = {Fu, Fangcheng and Hu, Yuzheng and He, Yihan and Jiang, Jiawei and Shao, Yingxia and Zhang, Ce and Cui, Bin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3304--3314},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fu20c/fu20c.pdf},
url = {http://proceedings.mlr.press/v119/fu20c.html},
abstract = {Recent years have witnessed intensive research interests on training deep neural networks (DNNs) more efficiently by quantization-based compression methods, which facilitate DNNs training in two ways: (1) activations are quantized to shrink the memory consumption, and (2) gradients are quantized to decrease the communication cost. However, existing methods mostly use a uniform mechanism that quantizes the values evenly. Such a scheme may cause a large quantization variance and slow down the convergence in practice. In this work, we introduce TinyScript, which applies a non-uniform quantization algorithm to both activations and gradients. TinyScript models the original values by a family of Weibull distributions and searches for ``quantization knobs'' that minimize quantization variance. We also discuss the convergence of the non-uniform quantization algorithm on DNNs with varying depths, shedding light on the number of bits required for convergence. Experiments show that TinyScript always obtains lower quantization variance, and achieves comparable model qualities against full precision training using 1-2 bits less than the uniform-based counterpart.}
}
@InProceedings{pmlr-v119-fu20d,
title = {{DessiLBI}: Exploring Structural Sparsity of Deep Networks via Differential Inclusion Paths},
author = {Fu, Yanwei and Liu, Chen and Li, Donghao and Sun, Xinwei and Zeng, Jinshan and Yao, Yuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3315--3326},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fu20d/fu20d.pdf},
url = {http://proceedings.mlr.press/v119/fu20d.html},
abstract = {Over-parameterization is ubiquitous nowadays in training neural networks to benefit both optimization in seeking global optima and generalization in reducing prediction error. However, compressive networks are desired in many real world applications and direct training of small networks may be trapped in local optima. In this paper, instead of pruning or distilling over-parameterized models to compressive ones, we propose a new approach based on differential inclusions of inverse scale spaces. Specifically, it generates a family of models from simple to complex ones that couples a pair of parameters to simultaneously train over-parameterized deep models and structural sparsity on weights of fully connected and convolutional layers. Such a differential inclusion scheme has a simple discretization, proposed as Deep structurally splitting Linearized Bregman Iteration (DessiLBI), whose global convergence analysis in deep learning is established that from any initializations, algorithmic iterations converge to a critical point of empirical risks. Experimental evidence shows that DessiLBI achieve comparable and even better performance than the competitive optimizers in exploring the structural sparsity of several widely used backbones on the benchmark datasets. Remarkably, with early stopping, DessiLBI unveils ``winning tickets'' in early epochs: the effective sparse structure with comparable test accuracy to fully trained over-parameterized models.}
}
@InProceedings{pmlr-v119-fujii20a,
title = {Approximation Guarantees of Local Search Algorithms via Localizability of Set Functions},
author = {Fujii, Kaito},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3327--3336},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/fujii20a/fujii20a.pdf},
url = {http://proceedings.mlr.press/v119/fujii20a.html},
abstract = {This paper proposes a new framework for providing approximation guarantees of local search algorithms. Local search is a basic algorithm design technique and is widely used for various combinatorial optimization problems. To analyze local search algorithms for set function maximization, we propose a new notion called \emph{localizability} of set functions, which measures how effective local improvement is. Moreover, we provide approximation guarantees of standard local search algorithms under various combinatorial constraints in terms of localizability. The main application of our framework is sparse optimization, for which we show that restricted strong concavity and restricted smoothness of the objective function imply localizability, and further develop accelerated versions of local search algorithms. We conduct experiments in sparse regression and structure learning of graphical models to confirm the practical efficiency of the proposed local search algorithms.}
}
@InProceedings{pmlr-v119-futami20a,
title = {Accelerating the diffusion-based ensemble sampling by non-reversible dynamics},
author = {Futami, Futoshi and Sato, Issei and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3337--3347},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/futami20a/futami20a.pdf},
url = {http://proceedings.mlr.press/v119/futami20a.html},
abstract = {Posterior distribution approximation is a central task in Bayesian inference. Stochastic gradient Langevin dynamics (SGLD) and its extensions have been practically used and theoretically studied. While SGLD updates a single particle at a time, ensemble methods that update multiple particles simultaneously have been recently gathering attention. Compared with the naive parallel-chain SGLD that updates multiple particles independently, ensemble methods update particles with their interactions. Thus, these methods are expected to be more particle-efficient than the naive parallel-chain SGLD because particles can be aware of other particles’ behavior through their interactions. Although ensemble methods numerically demonstrated their superior performance, no theoretical guarantee exists to assure such particle-efficiency and it is unclear whether those ensemble methods are really superior to the naive parallel-chain SGLD in the non-asymptotic settings. To cope with this problem, we propose a novel ensemble method that uses a non-reversible Markov chain for the interaction, and we present a non-asymptotic theoretical analysis for our method. Our analysis shows that, for the first time, the interaction causes a faster convergence rate than the naive parallel-chain SGLD in the non-asymptotic setting if the discretization error is appropriately controlled. Numerical experiments show that we can control the discretization error by tuning the interaction appropriately.}
}
@InProceedings{pmlr-v119-gael20a,
title = {Stochastic bandits with arm-dependent delays},
author = {Manegueu, Anne Gael and Vernade, Claire and Carpentier, Alexandra and Valko, Michal},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3348--3356},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gael20a/gael20a.pdf},
url = {http://proceedings.mlr.press/v119/gael20a.html},
abstract = {Significant work has been recently dedicated to the stochastic delayed bandits because of its relevance in applications. The applicability of existing algorithms is however restricted by the fact that strong assumptions are often made on the delay distributions, such as full observability, restrictive shape constraints, or uniformity over arms. In this work, we weaken them significantly and only assume that there is a bound on the tail of the delay. In particular, we cover the important case where the delay distributions vary across arms, and the case where the delays are heavy-tailed. Addressing these difficulties, we propose a simple but efficient UCB-based algorithm called the PatientBandits. We provide both problems-dependent and problems-independent bounds on the regret as well as performance lower bounds.}
}
@InProceedings{pmlr-v119-gain20a,
title = {Abstraction Mechanisms Predict Generalization in Deep Neural Networks},
author = {Gain, Alex and Siegelmann, Hava},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3357--3366},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gain20a/gain20a.pdf},
url = {http://proceedings.mlr.press/v119/gain20a.html},
abstract = {A longstanding problem for Deep Neural Networks (DNNs) is understanding their puzzling ability to generalize well. We approach this problem through the unconventional angle of \emph{cognitive abstraction mechanisms}, drawing inspiration from recent neuroscience work, allowing us to define the Cognitive Neural Activation metric (CNA) for DNNs, which is the correlation between information complexity (entropy) of given input and the concentration of higher activation values in deeper layers of the network. The CNA is highly predictive of generalization ability, outperforming norm-and-sharpness-based generalization metrics on an extensive evaluation of close to 200 network instances comprising a breadth of dataset-architecture combinations, especially in cases where additive noise is present and/or training labels are corrupted. These strong empirical results show the usefulness of the CNA as a generalization metric and encourage further research on the connection between information complexity and representations in the deeper layers of networks in order to better understand the generalization capabilities of DNNs.}
}
@InProceedings{pmlr-v119-gao20a,
title = {A Free-Energy Principle for Representation Learning},
author = {Gao, Yansong and Chaudhari, Pratik},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3367--3376},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gao20a/gao20a.pdf},
url = {http://proceedings.mlr.press/v119/gao20a.html},
abstract = {This paper employs a formal connection of machine learning with thermodynamics to characterize the quality of learnt representations for transfer learning. We discuss how information-theoretic functionals such as rate, distortion and classification loss of a model lie on a convex, so-called equilibrium surface. We prescribe dynamical processes to traverse this surface under constraints, e.g., an iso-classification process that trades off rate and distortion to keep the classification loss unchanged. We demonstrate how this process can be used for transferring representations from a source dataset to a target dataset while keeping the classification loss constant. Experimental validation of the theoretical results is provided on standard image-classification datasets.}
}
@InProceedings{pmlr-v119-gao20b,
title = {Can Stochastic Zeroth-Order {Frank-Wolfe} Method Converge Faster for Non-Convex Problems?},
author = {Gao, Hongchang and Huang, Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3377--3386},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gao20b/gao20b.pdf},
url = {http://proceedings.mlr.press/v119/gao20b.html},
abstract = {Frank-Wolfe algorithm is an efficient method for optimizing non-convex constrained problems. However, most of existing methods focus on the first-order case. In real-world applications, the gradient is not always available. To address the problem of lacking gradient in many applications, we propose two new stochastic zeroth-order Frank-Wolfe algorithms and theoretically proved that they have a faster convergence rate than existing methods for non-convex problems. Specifically, the function queries oracle of the proposed faster zeroth-order Frank-Wolfe (FZFW) method is $O(\frac{n^{1/2}d}{\epsilon^2})$ which can match the iteration complexity of the first-order counterpart approximately. As for the proposed faster zeroth-order conditional gradient sliding (FZCGS) method, its function queries oracle is improved to $O(\frac{n^{1/2}d}{\epsilon})$, indicating that its iteration complexity is even better than that of its first-order counterpart NCGS-VR. In other words, the iteration complexity of the accelerated first-order Frank-Wolfe method NCGS-VR is suboptimal. Then, we proposed a new algorithm to improve its IFO (incremental first-order oracle) to $O(\frac{n^{1/2}}{\epsilon})$. At last, the empirical studies on benchmark datasets validate our theoretical results.}
}
@InProceedings{pmlr-v119-garber20a,
title = {Online Convex Optimization in the Random Order Model},
author = {Garber, Dan and Korcia, Gal and Levy, Kfir},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3387--3396},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/garber20a/garber20a.pdf},
url = {http://proceedings.mlr.press/v119/garber20a.html},
abstract = {Online Convex Optimization (OCO) is a powerful framework for sequential prediction, portraying the natural uncertainty inherent in data-streams as though the data were generated by an almost omniscient adversary. However, this view, which is often too pessimistic for real-world data, comes with a price. The complexity of solving many important online tasks in this adversarial framework becomes much worse than that of their offline and even stochastic counterparts. In this work we consider a natural random-order version of the OCO model, in which the adversary can choose the set of loss functions, but does not get to choose the order in which they are supplied to the learner; instead, they are observed in uniformly random order. Focusing on two important families of online tasks, one in which the cumulative loss function is strongly convex (though individual loss functions may not even be convex), and the other being online $k$-PCA, we show that under standard well-conditioned-data assumptions, standard online gradient descent (OGD) methods become much more efficient in the random-order model. In particular, for the first group of tasks OGD guarantees poly-logarithmic regret. In the case of online $k$-PCA, OGD guarantees sublinear regret using only a rank-$k$ SVD on each iteration and memory linear in the size of the solution.}
}
@InProceedings{pmlr-v119-garg20a,
title = {Symbolic Network: Generalized Neural Policies for Relational {MDPs}},
author = {Garg, Sankalp and Bajpai, Aniket and {Mausam}},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3397--3407},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/garg20a/garg20a.pdf},
url = {http://proceedings.mlr.press/v119/garg20a.html},
abstract = {A Relational Markov Decision Process (RMDP) is a first-order representation to express all instances of a single probabilistic planning domain with possibly unbounded number of objects. Early work in RMDPs outputs generalized (instance-independent) first-order policies or value functions as a means to solve all instances of a domain at once. Unfortunately, this line of work met with limited success due to inherent limitations of the representation space used in such policies or value functions. Can neural models provide the missing link by easily representing more complex generalized policies, thus making them effective on all instances of a given domain? We present SymNet, the first neural approach for solving RMDPs that are expressed in the probabilistic planning language of RDDL. SymNet trains a set of shared parameters for an RDDL domain using training instances from that domain. For each instance, SymNet first converts it to an instance graph and then uses relational neural models to compute node embeddings. It then scores each ground action as a function over the first-order action symbols and node embeddings related to the action. Given a new test instance from the same domain, SymNet architecture with pre-trained parameters scores each ground action and chooses the best action. This can be accomplished in a single forward pass without any retraining on the test instance, thus implicitly representing a neural generalized policy for the whole domain. Our experiments on nine RDDL domains from IPPC demonstrate that SymNet policies are significantly better than random and sometimes even more effective than training a state-of-the-art deep reactive policy from scratch.}
}
@InProceedings{pmlr-v119-garg20b,
title = {Predicting deliberative outcomes},
author = {Garg, Vikas and Jaakkola, Tommi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3408--3418},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/garg20b/garg20b.pdf},
url = {http://proceedings.mlr.press/v119/garg20b.html},
abstract = {We extend structured prediction to deliberative outcomes. Specifically, we learn parameterized games that can map any inputs to equilibria as the outcomes. Standard structured prediction models rely heavily on global scoring functions and are therefore unable to model individual player preferences or how they respond to others asymmetrically. Our games take as input, e.g., UN resolution to be voted on, and map such contexts to initial strategies, player utilities, and interactions. Players are then thought to repeatedly update their strategies in response to weighted aggregates of other players’ choices towards maximizing their individual utilities. The output from the game is a sample from the resulting (near) equilibrium mixed strategy profile. We characterize conditions under which players’ strategies converge to an equilibrium in such games and when the game parameters can be provably recovered from observations. Empirically, we demonstrate on two real voting datasets that our games can recover interpretable strategic interactions, and predict strategies for players in new settings.}
}
@InProceedings{pmlr-v119-garg20c,
title = {Generalization and Representational Limits of Graph Neural Networks},
author = {Garg, Vikas and Jegelka, Stefanie and Jaakkola, Tommi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3419--3430},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/garg20c/garg20c.pdf},
url = {http://proceedings.mlr.press/v119/garg20c.html},
abstract = {We address two fundamental questions about graph neural networks (GNNs). First, we prove that several important graph properties, e.g., shortest/longest cycle, diameter, or certain motifs, cannot be computed by GNNs that rely entirely on local information. Such GNNs include the standard message passing models, and more powerful spatial variants that exploit local graph structure (e.g., via relative orientation of messages, or local port ordering) to distinguish neighbors of each node. Our treatment includes a novel graph-theoretic formalism. Second, we provide the first data dependent generalization bounds for message passing GNNs. This analysis explicitly accounts for the local permutation invariance of GNNs. Our bounds are much tighter than existing VC-dimension based guarantees for GNNs, and are comparable to Rademacher bounds for recurrent neural networks.}
}
@InProceedings{pmlr-v119-geng20a,
title = {Deep {PQR}: Solving Inverse Reinforcement Learning using Anchor Actions},
author = {Geng, Sinong and Nassif, Houssam and Manzanares, Carlos and Reppen, Max and Sircar, Ronnie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3431--3441},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/geng20a/geng20a.pdf},
url = {http://proceedings.mlr.press/v119/geng20a.html},
abstract = {We propose a reward function estimation framework for inverse reinforcement learning with deep energy-based policies. We name our method PQR, as it sequentially estimates the Policy, the Q-function, and the Reward function by deep learning. PQR does not assume that the reward solely depends on the state, instead it allows for a dependency on the choice of action. Moreover, PQR allows for stochastic state transitions. To accomplish this, we assume the existence of one anchor action whose reward is known, typically the action of doing nothing, yielding no reward. We present both estimators and algorithms for the PQR method. When the environment transition is known, we prove that the PQR reward estimator uniquely recovers the true reward. With unknown transitions, we bound the estimation error of PQR. Finally, the performance of PQR is demonstrated by synthetic and real-world datasets.}
}
@InProceedings{pmlr-v119-georgopoulos20a,
title = {Multilinear Latent Conditioning for Generating Unseen Attribute Combinations},
author = {Georgopoulos, Markos and Chrysos, Grigorios and Pantic, Maja and Panagakis, Yannis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3442--3451},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/georgopoulos20a/georgopoulos20a.pdf},
url = {http://proceedings.mlr.press/v119/georgopoulos20a.html},
abstract = {Deep generative models rely on their inductive bias to facilitate generalization, especially for problems with high dimensional data, like images. However, empirical studies have shown that variational autoencoders (VAE) and generative adversarial networks (GAN) lack the generalization ability that occurs naturally in human perception. For example, humans can visualize a woman smiling after only seeing a smiling man. On the contrary, the standard conditional VAE (cVAE) is unable to generate unseen attribute combinations. To this end, we extend cVAE by introducing a multilinear latent conditioning framework that captures the multiplicative interactions between the attributes. We implement two variants of our model and demonstrate their efficacy on MNIST, Fashion-MNIST and CelebA. Altogether, we design a novel conditioning framework that can be used with any architecture to synthesize unseen attribute combinations.}
}
@InProceedings{pmlr-v119-gerace20a,
title = {Generalisation error in learning with random features and the hidden manifold model},
author = {Gerace, Federica and Loureiro, Bruno and Krzakala, Florent and Mézard, Marc and Zdeborová, Lenka},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3452--3462},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gerace20a/gerace20a.pdf},
url = {http://proceedings.mlr.press/v119/gerace20a.html},
abstract = {We study generalised linear regression and classification for a synthetically generated dataset encompassing different problems of interest, such as learning with random features, neural networks in the lazy training regime, and the hidden manifold model. We consider the high-dimensional regime and using the replica method from statistical physics, we provide a closed-form expression for the asymptotic generalisation performance in these problems, valid in both the under- and over-parametrised regimes and for a broad choice of generalised linear model loss functions. In particular, we show how to obtain analytically the so-called double descent behaviour for logistic regression with a peak at the interpolation threshold, we illustrate the superiority of orthogonal against random Gaussian projections in learning with random features, and discuss the role played by correlations in the data generated by the hidden manifold model. Beyond the interest in these particular problems, the theoretical formalism introduced in this manuscript provides a path to further extensions to more complex tasks.}
}
@InProceedings{pmlr-v119-gergatsouli20a,
title = {Black-Box Methods for Restoring Monotonicity},
author = {Gergatsouli, Evangelia and Lucier, Brendan and Tzamos, Christos},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3463--3473},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gergatsouli20a/gergatsouli20a.pdf},
url = {http://proceedings.mlr.press/v119/gergatsouli20a.html},
abstract = {In many practical applications, heuristic or approximation algorithms are used to efficiently solve the task at hand. However their solutions frequently do not satisfy natural monotonicity properties expected to hold in the optimum. In this work we develop algorithms that are able to restore monotonicity in the parameters of interest. Specifically, given oracle access to a possibly non monotone function, we provide an algorithm that restores monotonicity while degrading the expected value of the function by at most $\epsilon$. The number of queries required is at most logarithmic in $1/\epsilon$ and exponential in the number of parameters. We also give a lower bound showing that this exponential dependence is necessary. Finally, we obtain improved query complexity bounds for restoring the weaker property of $k$-marginal monotonicity. Under this property, every $k$-dimensional projection of the function is required to be monotone. The query complexity we obtain only scales exponentially with $k$ and is polynomial in the number of parameters.}
}
@InProceedings{pmlr-v119-ghari20a,
title = {Online Multi-Kernel Learning with Graph-Structured Feedback},
author = {Ghari, Pouya M. and Shen, Yanning},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3474--3483},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghari20a/ghari20a.pdf},
url = {http://proceedings.mlr.press/v119/ghari20a.html},
abstract = {Multi-kernel learning (MKL) exhibits reliable performance in nonlinear function approximation tasks. Instead of using one kernel, it learns the optimal kernel from a pre-selected dictionary of kernels. The selection of the dictionary has crucial impact on both the performance and complexity of MKL. Specifically, inclusion of a large number of irrelevant kernels may impair the accuracy, and increase the complexity of MKL algorithms. To enhance the accuracy, and alleviate the computational burden, the present paper develops a novel scheme which actively chooses relevant kernels. The proposed framework models the pruned kernel combination as feedback collected from a graph, that is refined ‘on the fly.’ Leveraging the random feature approximation, we propose an online scalable multi-kernel learning approach with graph feedback, and prove that the proposed algorithm enjoys sublinear regret. Numerical tests on real datasets demonstrate the effectiveness of the novel approach.}
}
@InProceedings{pmlr-v119-ghasemi20a,
title = {Task-Oriented Active Perception and Planning in Environments with Partially Known Semantics},
author = {Ghasemi, Mahsa and Bulgur, Erdem and Topcu, Ufuk},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3484--3493},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghasemi20a/ghasemi20a.pdf},
url = {http://proceedings.mlr.press/v119/ghasemi20a.html},
abstract = {We consider an agent that is assigned with a temporal logic task in an environment whose semantic representation is only partially known. We represent the semantics of the environment with a set of state properties, called \emph{atomic propositions} over which, the agent holds a probabilistic belief and updates it as new sensory measurements arrive. The goal is to design a joint perception and planning strategy for the agent that realizes the task with high probability. We develop a planning strategy that takes the semantic uncertainties into account and by doing so provides probabilistic guarantees on the task success. Furthermore, as new data arrive, the belief over the atomic propositions evolves and, subsequently, the planning strategy adapts accordingly. We evaluate the proposed method on various finite-horizon tasks in planar navigation settings where the empirical results show that the proposed method provides reliable task performance that also improves as the knowledge about the environment enhances.}
}
@InProceedings{pmlr-v119-ghassami20a,
title = {Characterizing Distribution Equivalence and Structure Learning for Cyclic and Acyclic Directed Graphs},
author = {Ghassami, Amiremad and Yang, Alan and Kiyavash, Negar and Zhang, Kun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3494--3504},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghassami20a/ghassami20a.pdf},
url = {http://proceedings.mlr.press/v119/ghassami20a.html},
abstract = {The main approach to defining equivalence among acyclic directed causal graphical models is based on the conditional independence relationships in the distributions that the causal models can generate, in terms of the Markov equivalence. However, it is known that when cycles are allowed in the causal structure, conditional independence may not be a suitable notion for equivalence of two structures, as it does not reflect all the information in the distribution that is useful for identification of the underlying structure. In this paper, we present a general, unified notion of equivalence for linear Gaussian causal directed graphical models, whether they are cyclic or acyclic. In our proposed definition of equivalence, two structures are equivalent if they can generate the same set of data distributions. We also propose a weaker notion of equivalence called quasi-equivalence, which we show is the extent of identifiability from observational data. We propose analytic as well as graphical methods for characterizing the equivalence of two structures. Additionally, we propose a score-based method for learning the structure from observational data, which successfully deals with both acyclic and cyclic structures.}
}
@InProceedings{pmlr-v119-ghazi20a,
title = {Private Counting from Anonymous Messages: Near-Optimal Accuracy with Vanishing Communication Overhead},
author = {Ghazi, Badih and Kumar, Ravi and Manurangsi, Pasin and Pagh, Rasmus},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3505--3514},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghazi20a/ghazi20a.pdf},
url = {http://proceedings.mlr.press/v119/ghazi20a.html},
abstract = {Differential privacy (DP) is a formal notion for quantifying the privacy loss of algorithms. Algorithms in the central model of DP achieve high accuracy but make the strongest trust assumptions whereas those in the local DP model make the weakest trust assumptions but incur substantial accuracy loss. The shuffled DP model [Bittau et al 2017, Erlingsson et al 2019, Cheu et al 19] has recently emerged as a feasible middle ground between the central and local models, providing stronger trust assumptions than the former while promising higher accuracies than the latter. In this paper, we obtain practical communication-efficient algorithms in the shuffled DP model for two basic aggregation primitives used in machine learning: 1) binary summation, and 2) histograms over a moderate number of buckets. Our algorithms achieve accuracy that is arbitrarily close to that of central DP algorithms with an expected communication per user essentially matching what is needed without any privacy constraints! We demonstrate the practicality of our algorithms by experimentally evaluating them and comparing their performance to several widely-used protocols such as Randomized Response [Warner 1965] and RAPPOR [Erlingsson et al. 2014].}
}
@InProceedings{pmlr-v119-ghazvininejad20a,
title = {Aligned Cross Entropy for Non-Autoregressive Machine Translation},
author = {Ghazvininejad, Marjan and Karpukhin, Vladimir and Zettlemoyer, Luke and Levy, Omer},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3515--3523},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghazvininejad20a/ghazvininejad20a.pdf},
url = {http://proceedings.mlr.press/v119/ghazvininejad20a.html},
abstract = {Non-autoregressive machine translation models significantly speed up decoding by allowing for parallel prediction of the entire target sequence. However, modeling word order is more challenging due to the lack of autoregressive factors in the model. This difficulty is compounded during training with cross entropy loss, which can highly penalize small shifts in word order. In this paper, we propose aligned cross entropy (AXE) as an alternative loss function for training of non-autoregressive models. AXE uses a differentiable dynamic program to assign loss based on the best possible monotonic alignment between target tokens and model predictions. AXE-based training of conditional masked language models (CMLMs) substantially improves performance on major WMT benchmarks, while setting a new state of the art for non-autoregressive models.}
}
@InProceedings{pmlr-v119-ghiassian20a,
title = {Gradient Temporal-Difference Learning with Regularized Corrections},
author = {Ghiassian, Sina and Patterson, Andrew and Garg, Shivam and Gupta, Dhawal and White, Adam and White, Martha},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3524--3534},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghiassian20a/ghiassian20a.pdf},
url = {http://proceedings.mlr.press/v119/ghiassian20a.html},
abstract = {It is still common to use Q-learning and temporal difference (TD) learning---even though they have divergence issues and sound Gradient TD alternatives exist---because divergence seems rare and they typically perform well. However, recent work with large neural network learning systems reveals that instability is more common than previously thought. Practitioners face a difficult dilemma: choose an easy to use and performant TD method, or a more complex algorithm that is more sound but harder to tune and all but unexplored with non-linear function approximation or control. In this paper, we introduce a new method called TD with Regularized Corrections (TDRC), that attempts to balance ease of use, soundness, and performance. It behaves as well as TD, when TD performs well, but is sound in cases where TD diverges. We empirically investigate TDRC across a range of problems, for both prediction and control, and for both linear and non-linear function approximation, and show, potentially for the first time, that Gradient TD methods could be a better alternative to TD and Q-learning.}
}
@InProceedings{pmlr-v119-ghorbani20a,
title = {A Distributional Framework For Data Valuation},
author = {Ghorbani, Amirata and Kim, Michael and Zou, James},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3535--3544},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghorbani20a/ghorbani20a.pdf},
url = {http://proceedings.mlr.press/v119/ghorbani20a.html},
abstract = {Shapley value is a classic notion from game theory, historically used to quantify the contributions of individuals within groups, and more recently applied to assign values to data points when training machine learning models. Despite its foundational role, a key limitation of the data Shapley framework is that it only provides valuations for points within a fixed data set. It does not account for statistical aspects of the data and does not give a way to reason about points outside the data set. To address these limitations, we propose a novel framework -- distributional Shapley -- where the value of a point is defined in the context of an underlying data distribution. We prove that distributional Shapley has several desirable statistical properties; for example, the values are stable under perturbations to the data points themselves and to the underlying data distribution. We leverage these properties to develop a new algorithm for estimating values from data, which comes with formal guarantees and runs two orders of magnitude faster than state-of-the-art algorithms for computing the (non distributional) data Shapley values. We apply distributional Shapley to diverse data sets and demonstrate its utility in a data market setting.}
}
@InProceedings{pmlr-v119-ghosh20a,
title = {Fractal {Gaussian} Networks: A sparse random graph model based on {Gaussian} Multiplicative Chaos},
author = {Ghosh, Subhroshekhar and Balasubramanian, Krishna and Yang, Xiaochuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3545--3555},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghosh20a/ghosh20a.pdf},
url = {http://proceedings.mlr.press/v119/ghosh20a.html},
abstract = {We propose a novel stochastic network model, called Fractal Gaussian Network (FGN), that embodies well-defined and analytically tractable fractal structures. Such fractal structures have been empirically observed in diverse applications. FGNs interpolate continuously between the popular purely random geometric graphs (a.k.a. the Poisson Boolean network), and random graphs with increasingly fractal behavior. In fact, they form a parametric family of sparse random geometric graphs that are parametrised by a fractality parameter $\nu$ which governs the strength of the fractal structure. FGNs are driven by the latent spatial geometry of Gaussian Multiplicative Chaos (GMC), a canonical model of fractality in its own right. We explore the natural question of detecting the presence of fractality and the problem of parameter estimation based on observed network data. Finally, we explore fractality in community structures by unveiling a natural stochastic block model in the setting of FGNs.}
}
@InProceedings{pmlr-v119-ghosh20b,
title = {Representations for Stable Off-Policy Reinforcement Learning},
author = {Ghosh, Dibya and Bellemare, Marc G.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3556--3565},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ghosh20b/ghosh20b.pdf},
url = {http://proceedings.mlr.press/v119/ghosh20b.html},
abstract = {Reinforcement learning with function approximation can be unstable and even divergent, especially when combined with off-policy learning and Bellman updates. In deep reinforcement learning, these issues have been dealt with empirically by adapting and regularizing the representation, in particular with auxiliary tasks. This suggests that representation learning may provide a means to guarantee stability. In this paper, we formally show that there are indeed nontrivial state representations under which the canonical SARSA algorithm is stable, even when learning off-policy. We analyze representation learning schemes that are based on the transition matrix of a policy, such as proto-value functions, along three axes: approximation error, stability, and ease of estimation. In the most general case of a defective transition matrix, we show that a Schur basis provides convergence guarantees, but is difficult to estimate from samples. For a fixed reward function, we find that an orthogonal basis of the corresponding Krylov subspace is an even better choice. We conclude by empirically demonstrating that these stable representations can be learned using stochastic gradient descent, opening the door to improved techniques for representation learning with deep networks.}
}
@InProceedings{pmlr-v119-gittens20a,
title = {Adaptive Sketching for Fast and Convergent Canonical Polyadic Decomposition},
author = {Gittens, Alex and Aggour, Kareem and Yener, B{\"u}lent},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3566--3575},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gittens20a/gittens20a.pdf},
url = {http://proceedings.mlr.press/v119/gittens20a.html},
abstract = {This work considers the canonical polyadic decomposition (CPD) of tensors using proximally regularized sketched alternating least squares algorithms. First, it establishes a sublinear rate of convergence for proximally regularized sketched CPD algorithms under two natural conditions that are known to be satisfied by many popular forms of sketching. Second, it demonstrates that the iterative nature of CPD algorithms can be exploited algorithmically to choose more performant sketching rates. This is accomplished by introducing CPD-MWU, a proximally-regularized sketched alternating least squares algorithm that adaptively selects the sketching rate at each iteration. On both synthetic and real data we observe that for noisy tensors CPD-MWU produces decompositions of comparable accuracy to the standard CPD decomposition in less time, often half the time; for ill-conditioned tensors, given the same time budget, CPD-MWU produces decompositions with an order-of-magnitude lower relative error. For a representative real-world dataset CPD-MWU produces residual errors on average 20\% lower than CPRAND-MIX and 44\% lower than SPALS, two recent sketched CPD algorithms.}
}
@InProceedings{pmlr-v119-gnanasambandam20a,
title = {One Size Fits All: Can We Train One Denoiser for All Noise Levels?},
author = {Gnanasambandam, Abhiram and Chan, Stanley},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3576--3586},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gnanasambandam20a/gnanasambandam20a.pdf},
url = {http://proceedings.mlr.press/v119/gnanasambandam20a.html},
abstract = {When training an estimator such as a neural network for tasks like image denoising, it is often preferred to train one estimator and apply it to all noise levels. The de facto training protocol to achieve this goal is to train the estimator with noisy samples whose noise levels are uniformly distributed across the range of interest. However, why should we allocate the samples uniformly? Can we have more training samples that are less noisy, and fewer samples that are more noisy? What is the optimal distribution? How do we obtain such a distribution? The goal of this paper is to address this training sample distribution problem from a minimax risk optimization perspective. We derive a dual ascent algorithm to determine the optimal sampling distribution of which the convergence is guaranteed as long as the set of admissible estimators is closed and convex. For estimators with non-convex admissible sets such as deep neural networks, our dual formulation converges to a solution of the convex relaxation. We discuss how the algorithm can be implemented in practice. We evaluate the algorithm on linear estimators and deep networks.}
}
@InProceedings{pmlr-v119-goel20a,
title = {Superpolynomial Lower Bounds for Learning One-Layer Neural Networks using Gradient Descent},
author = {Goel, Surbhi and Gollakota, Aravind and Jin, Zhihan and Karmalkar, Sushrut and Klivans, Adam},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3587--3596},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/goel20a/goel20a.pdf},
url = {http://proceedings.mlr.press/v119/goel20a.html},
abstract = {We give the first superpolynomial lower bounds for learning one-layer neural networks with respect to the Gaussian distribution for a broad class of algorithms. In the regression setting, we prove that gradient descent run on any classifier with respect to square loss will fail to achieve small test error in polynomial time. Prior work held only for gradient descent run with small batch sizes and sufficiently smooth classifiers. For classification, we give a stronger result, namely that any statistical query (SQ) algorithm will fail to achieve small test error in polynomial time. Our lower bounds hold for commonly used activations such as ReLU and sigmoid. The core of our result relies on a novel construction of a simple family of neural networks that are exactly orthogonal with respect to all spherically symmetric distributions.}
}
@InProceedings{pmlr-v119-golany20a,
title = {{SimGANs}: Simulator-Based Generative Adversarial Networks for {ECG} Synthesis to Improve Deep {ECG} Classification},
author = {Golany, Tomer and Radinsky, Kira and Freedman, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3597--3606},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/golany20a/golany20a.pdf},
url = {http://proceedings.mlr.press/v119/golany20a.html},
abstract = {Generating training examples for supervised tasks is a long sought after goal in AI. We study the problem of heart signal electrocardiogram (ECG) synthesis for improved heartbeat classification. ECG synthesis is challenging: the generation of training examples for such biological-physiological systems is not straightforward, due to their dynamic nature in which the various parts of the system interact in complex ways. However, an understanding of these dynamics has been developed for years in the form of mathematical process simulators. We study how to incorporate this knowledge into the generative process by leveraging a biological simulator for the task of ECG classification. Specifically, we use a system of ordinary differential equations representing heart dynamics, and incorporate this ODE system into the optimization process of a generative adversarial network to create biologically plausible ECG training examples. We perform empirical evaluation and show that heart simulation knowledge during the generation process improves ECG classification.}
}
@InProceedings{pmlr-v119-goldblum20a,
title = {Unraveling Meta-Learning: Understanding Feature Representations for Few-Shot Tasks},
author = {Goldblum, Micah and Reich, Steven and Fowl, Liam and Ni, Renkun and Cherepanova, Valeriia and Goldstein, Tom},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3607--3616},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/goldblum20a/goldblum20a.pdf},
url = {http://proceedings.mlr.press/v119/goldblum20a.html},
abstract = {Meta-learning algorithms produce feature extractors which achieve state-of-the-art performance on few-shot classification. While the literature is rich with meta-learning methods, little is known about why the resulting feature extractors perform so well. We develop a better understanding of the underlying mechanics of meta-learning and the difference between models trained using meta-learning and models which are trained classically. In doing so, we introduce and verify several hypotheses for why meta-learned models perform better. Furthermore, we develop a regularizer which boosts the performance of standard training routines for few-shot classification. In many cases, our routine outperforms meta-learning while simultaneously running an order of magnitude faster.}
}
@InProceedings{pmlr-v119-golikov20a,
title = {Towards a General Theory of Infinite-Width Limits of Neural Classifiers},
author = {Golikov, Eugene},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3617--3626},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/golikov20a/golikov20a.pdf},
url = {http://proceedings.mlr.press/v119/golikov20a.html},
abstract = {Obtaining theoretical guarantees for neural networks training appears to be a hard problem in a general case. Recent research has been focused on studying this problem in the limit of infinite width and two different theories have been developed: a mean-field (MF) and a constant kernel (NTK) limit theories. We propose a general framework that provides a link between these seemingly distinct theories. Our framework out of the box gives rise to a discrete-time MF limit which was not previously explored in the literature. We prove a convergence theorem for it, and show that it provides a more reasonable approximation for finite-width nets compared to the NTK limit if learning rates are not very small. Also, our framework suggests a limit model that coincides neither with the MF limit nor with the NTK one. We show that for networks with more than two hidden layers RMSProp training has a non-trivial discrete-time MF limit but GD training does not have one. Overall, our framework demonstrates that both MF and NTK limits have considerable limitations in approximating finite-sized neural nets, indicating the need for designing more accurate infinite-width approximations for them.}
}
@InProceedings{pmlr-v119-gopi20a,
title = {Differentially Private Set Union},
author = {Gopi, Sivakanth and Gulhane, Pankaj and Kulkarni, Janardhan and Shen, Judy Hanwen and Shokouhi, Milad and Yekhanin, Sergey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3627--3636},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gopi20a/gopi20a.pdf},
url = {http://proceedings.mlr.press/v119/gopi20a.html},
abstract = {We study the basic operation of set union in the global model of differential privacy. In this problem, we are given a universe $U$ of items, possibly of infinite size, and a database $D$ of users. Each user $i$ contributes a subset $W_i \subseteq U$ of items. We want an ($\epsilon$,$\delta$)-differentially private Algorithm which outputs a subset $S \subset \cup_i W_i$ such that the size of $S$ is as large as possible. The problem arises in countless real world applications, and is particularly ubiquitous in natural language processing (NLP) applications. For example, discovering words, sentences, $n$-grams etc., from private text data belonging to users is an instance of the set union problem. In this paper we design new algorithms for this problem that significantly outperform the best known algorithms.}
}
@InProceedings{pmlr-v119-gordon-rodriguez20a,
title = {The continuous categorical: a novel simplex-valued exponential family},
author = {Gordon-Rodriguez, Elliott and Loaiza-Ganem, Gabriel and Cunningham, John},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3637--3647},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gordon-rodriguez20a/gordon-rodriguez20a.pdf},
url = {http://proceedings.mlr.press/v119/gordon-rodriguez20a.html},
abstract = {Simplex-valued data appear throughout statistics and machine learning, for example in the context of transfer learning and compression of deep networks. Existing models for this class of data rely on the Dirichlet distribution or other related loss functions; here we show these standard choices suffer systematically from a number of limitations, including bias and numerical issues that frustrate the use of flexible network models upstream of these distributions. We resolve these limitations by introducing a novel exponential family of distributions for modeling simplex-valued data -- the continuous categorical, which arises as a nontrivial multivariate generalization of the recently discovered continuous Bernoulli. Unlike the Dirichlet and other typical choices, the continuous categorical results in a well-behaved probabilistic loss function that produces unbiased estimators, while preserving the mathematical simplicity of the Dirichlet. As well as exploring its theoretical properties, we introduce sampling methods for this distribution that are amenable to the reparameterization trick, and evaluate their performance. Lastly, we demonstrate that the continuous categorical outperforms standard choices empirically, across a simulation study, an applied example on multi-party elections, and a neural network compression task.}
}
@InProceedings{pmlr-v119-gorinova20a,
title = {Automatic Reparameterisation of Probabilistic Programs},
author = {Gorinova, Maria and Moore, Dave and Hoffman, Matthew},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3648--3657},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gorinova20a/gorinova20a.pdf},
url = {http://proceedings.mlr.press/v119/gorinova20a.html},
abstract = {Probabilistic programming has emerged as a powerful paradigm in statistics, applied science, and machine learning: by decoupling modelling from inference, it promises to allow modellers to directly reason about the processes generating data. However, the performance of inference algorithms can be dramatically affected by the parameterisation used to express a model, requiring users to transform their programs in non-intuitive ways. We argue for automating these transformations, and demonstrate that mechanisms available in recent modelling frameworks can implement non-centring and related reparameterisations. This enables new inference algorithms, and we propose two: a simple approach using interleaved sampling and a novel variational formulation that searches over a continuous space of parameterisations. We show that these approaches enable robust inference across a range of models, and can yield more efficient samplers than the best fixed parameterisation.}
}
@InProceedings{pmlr-v119-gottesman20a,
title = {Interpretable Off-Policy Evaluation in Reinforcement Learning by Highlighting Influential Transitions},
author = {Gottesman, Omer and Futoma, Joseph and Liu, Yao and Parbhoo, Sonali and Celi, Leo and Brunskill, Emma and Doshi-Velez, Finale},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3658--3667},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gottesman20a/gottesman20a.pdf},
url = {http://proceedings.mlr.press/v119/gottesman20a.html},
abstract = {Off-policy evaluation in reinforcement learning offers the chance of using observational data to improve future outcomes in domains such as healthcare and education, but safe deployment in high stakes settings requires ways of assessing its validity. Traditional measures such as confidence intervals may be insufficient due to noise, limited data and confounding. In this paper we develop a method that could serve as a hybrid human-AI system, to enable human experts to analyze the validity of policy evaluation estimates. This is accomplished by highlighting observations in the data whose removal will have a large effect on the OPE estimate, and formulating a set of rules for choosing which ones to present to domain experts for validation. We develop methods to compute exactly the influence functions for fitted Q-evaluation with two different function classes: kernel-based and linear least squares, as well as importance sampling methods. Experiments on medical simulations and real-world intensive care unit data demonstrate that our method can be used to identify limitations in the evaluation process and make evaluation more robust.}
}
@InProceedings{pmlr-v119-gottipati20a,
title = {Learning to Navigate The Synthetically Accessible Chemical Space Using Reinforcement Learning},
author = {Gottipati, Sai Krishna and Sattarov, Boris and Niu, Sufeng and Pathak, Yashaswi and Wei, Haoran and Liu, Shengchao and Blackburn, Simon and Thomas, Karam and Coley, Connor and Tang, Jian and Chandar, Sarath and Bengio, Yoshua},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3668--3679},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gottipati20a/gottipati20a.pdf},
url = {http://proceedings.mlr.press/v119/gottipati20a.html},
abstract = {Over the last decade, there has been significant progress in the field of machine learning for de novo drug design, particularly in generative modeling of novel chemical structures. However, current generative approaches exhibit a significant challenge: they do not ensure that the proposed molecular structures can be feasibly synthesized nor do they provide the synthesis routes of the proposed small molecules, thereby seriously limiting their practical applicability. In this work, we propose a novel reinforcement learning (RL) setup for de novo drug design: Policy Gradient for Forward Synthesis (PGFS), that addresses this challenge by embedding the concept of synthetic accessibility directly into the de novo drug design system. In this setup, the agent learns to navigate through the immense synthetically accessible chemical space by subjecting initial commercially available molecules to valid chemical reactions at every time step of the iterative virtual synthesis process. The proposed environment for drug discovery provides a highly challenging test-bed for RL algorithms owing to the large state space and high-dimensional continuous action space with hierarchical actions. PGFS achieves state-of-the-art performance in generating structures with high QED and clogP. Moreover, we validate PGFS in an in-silico proof-of-concept associated with three HIV targets. Finally, we describe how the end-to-end training conceptualized in this study represents an important paradigm in radically expanding the synthesizable chemical space and automating the drug discovery process.}
}
@InProceedings{pmlr-v119-gouvert20a,
title = {Ordinal Non-negative Matrix Factorization for Recommendation},
author = {Gouvert, Olivier and Oberlin, Thomas and F{\'e}votte, C{\'e}dric},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3680--3689},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gouvert20a/gouvert20a.pdf},
url = {http://proceedings.mlr.press/v119/gouvert20a.html},
abstract = {We introduce a new non-negative matrix factorization (NMF) method for ordinal data, called OrdNMF. Ordinal data are categorical data which exhibit a natural ordering between the categories. In particular, they can be found in recommender systems, either with explicit data (such as ratings) or implicit data (such as quantized play counts). OrdNMF is a probabilistic latent factor model that generalizes Bernoulli-Poisson factorization (BePoF) and Poisson factorization (PF) applied to binarized data. Contrary to these methods, OrdNMF circumvents binarization and can exploit a more informative representation of the data. We design an efficient variational algorithm based on a suitable model augmentation and related to variational PF. In particular, our algorithm preserves the scalability of PF and can be applied to huge sparse datasets. We report recommendation experiments on explicit and implicit datasets, and show that OrdNMF outperforms BePoF and PF applied to binarized data.}
}
@InProceedings{pmlr-v119-goyal20a,
title = {{PoWER-BERT}: Accelerating {BERT} Inference via Progressive Word-vector Elimination},
author = {Goyal, Saurabh and Choudhury, Anamitra Roy and Raje, Saurabh and Chakaravarthy, Venkatesan and Sabharwal, Yogish and Verma, Ashish},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3690--3699},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/goyal20a/goyal20a.pdf},
url = {http://proceedings.mlr.press/v119/goyal20a.html},
abstract = {We develop a novel method, called PoWER-BERT, for improving the inference time of the popular BERT model, while maintaining the accuracy. It works by: a) exploiting redundancy pertaining to word-vectors (intermediate transformer block outputs) and eliminating the redundant vectors. b) determining which word-vectors to eliminate by developing a strategy for measuring their significance, based on the self-attention mechanism. c) learning how many word-vectors to eliminate by augmenting the BERT model and the loss function. Experiments on the standard GLUE benchmark shows that PoWER-BERT achieves up to 4.5x reduction in inference time over BERT with < 1\% loss in accuracy. We show that PoWER-BERT offers significantly better trade-off between accuracy and inference time compared to prior methods. We demonstrate that our method attains up to 6.8x reduction in inference time with < 1\% loss in accuracy when applied over ALBERT, a highly compressed version of BERT. The code for PoWER-BERT is publicly available at https://github.com/IBM/PoWER-BERT.}
}
@InProceedings{pmlr-v119-goyal20b,
title = {{PackIt}: A Virtual Environment for Geometric Planning},
author = {Goyal, Ankit and Deng, Jia},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3700--3710},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/goyal20b/goyal20b.pdf},
url = {http://proceedings.mlr.press/v119/goyal20b.html},
abstract = {The ability to jointly understand the geometry of objects and plan actions for manipulating them is crucial for intelligent agents. We refer to this ability as geometric planning. Recently, many interactive environments have been proposed to evaluate intelligent agents on various skills, however, none of them cater to the needs of geometric planning. We present PackIt, a virtual environment to evaluate and potentially learn the ability to do geometric planning, where an agent needs to take a sequence of actions to pack a set of objects into a box with limited space. We also construct a set of challenging packing tasks using an evolutionary algorithm. Further, we study various baselines for the task that include model-free learning-based and heuristic-based methods, as well as search-based optimization methods that assume access to the model of the environment.}
}
@InProceedings{pmlr-v119-goyal20c,
title = {{DROCC}: Deep Robust One-Class Classification},
author = {Goyal, Sachin and Raghunathan, Aditi and Jain, Moksh and Simhadri, Harsha Vardhan and Jain, Prateek},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3711--3721},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/goyal20c/goyal20c.pdf},
url = {http://proceedings.mlr.press/v119/goyal20c.html},
abstract = {Classical approaches for one-class problems such as one-class SVM and isolation forest require careful feature engineering when applied to structured domains like images. State-of-the-art methods aim to leverage deep learning to learn appropriate features via two main approaches. The first approach based on predicting transformations (Golan \& El-Yaniv, 2018; Hendrycks et al., 2019a) while successful in some domains, crucially depends on an appropriate domain-specific set of transformations that are hard to obtain in general. The second approach of minimizing a classical one-class loss on the learned final layer representations, e.g., DeepSVDD (Ruff et al., 2018) suffers from the fundamental drawback of representation collapse. In this work, we propose Deep Robust One Class Classification (DROCC) that is both applicable to most standard domains without requiring any side-information and robust to representation collapse. DROCC is based on the assumption that the points from the class of interest lie on a well-sampled, locally linear low dimensional manifold. Empirical evaluation demonstrates that DROCC is highly effective in two different one-class problem settings and on a range of real-world datasets across different domains: tabular data, images (CIFAR and ImageNet), audio, and time-series, offering up to 20\% increase in accuracy over the state-of-the-art in anomaly detection. Code is available at https://github.com/microsoft/EdgeML}
}
@InProceedings{pmlr-v119-grasshoff20a,
title = {Scalable {Gaussian} Process Separation for Kernels with a Non-Stationary Phase},
author = {Gra{\ss}hoff, Jan and Jankowski, Alexandra and Rostalski, Philipp},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3722--3731},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/grasshoff20a/grasshoff20a.pdf},
url = {http://proceedings.mlr.press/v119/grasshoff20a.html},
abstract = {The application of Gaussian processes (GPs) to large data sets is limited due to heavy memory and computational requirements. A variety of methods has been proposed to enable scalability, one of which is to exploit structure in the kernel matrix. Previous methods, however, cannot easily deal with mixtures of non-stationary processes. This paper investigates an efficient GP framework, that extends structured kernel interpolation methods to GPs with a non-stationary phase. We particularly treat the separation of nonstationary sources, which is a problem that commonly arises e.g. in spatio-temporal biomedical datasets. Our approach employs multiple sets of non-equidistant inducing points to account for the non-stationarity and retrieve Toeplitz and Kronecker structure in the kernel matrix allowing for efficient inference and kernel learning. Our approach is demonstrated on numerical examples and large spatio-temporal biomedical problems.}
}
@InProceedings{pmlr-v119-grathwohl20a,
title = {Learning the {Stein} Discrepancy for Training and Evaluating Energy-Based Models without Sampling},
author = {Grathwohl, Will and Wang, Kuan-Chieh and Jacobsen, Joern-Henrik and Duvenaud, David and Zemel, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3732--3747},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/grathwohl20a/grathwohl20a.pdf},
url = {http://proceedings.mlr.press/v119/grathwohl20a.html},
abstract = {We present a new method for evaluating and training unnormalized density models. Our approach only requires access to the gradient of the unnormalized model's log-density. We estimate the Stein discrepancy between the data density p(x) and the model density q(x) based on a vector function of the data. We parameterize this function with a neural network and fit its parameters to maximize this discrepancy. This yields a novel goodness-of-fit test which outperforms existing methods on high dimensional data. Furthermore, optimizing q(x) to minimize this discrepancy produces a novel method for training unnormalized models. This training method can fit large unnormalized models faster than existing approaches. The ability to both learn and compare models is a unique feature of the proposed method.}
}
@InProceedings{pmlr-v119-grazzi20a,
title = {On the Iteration Complexity of Hypergradient Computation},
author = {Grazzi, Riccardo and Franceschi, Luca and Pontil, Massimiliano and Salzo, Saverio},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3748--3758},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/grazzi20a/grazzi20a.pdf},
url = {http://proceedings.mlr.press/v119/grazzi20a.html},
abstract = {We study a general class of bilevel problems, consisting in the minimization of an upper-level objective which depends on the solution to a parametric fixed-point equation. Important instances arising in machine learning include hyperparameter optimization, meta-learning, and certain graph and recurrent neural networks. Typically the gradient of the upper-level objective (hypergradient) is hard or even impossible to compute exactly, which has raised the interest in approximation methods. We investigate some popular approaches to compute the hypergradient, based on reverse mode iterative differentiation and approximate implicit differentiation. Under the hypothesis that the fixed point equation is defined by a contraction mapping, we present a unified analysis which allows for the first time to quantitatively compare these methods, providing explicit bounds for their iteration complexity. This analysis suggests a hierarchy in terms of computational efficiency among the above methods, with approximate implicit differentiation based on conjugate gradient performing best. We present an extensive experimental comparison among the methods which confirm the theoretical findings.}
}
@InProceedings{pmlr-v119-greenfeld20a,
title = {Robust Learning with the {Hilbert-Schmidt} Independence Criterion},
author = {Greenfeld, Daniel and Shalit, Uri},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3759--3768},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/greenfeld20a/greenfeld20a.pdf},
url = {http://proceedings.mlr.press/v119/greenfeld20a.html},
abstract = {We investigate the use of a non-parametric independence measure, the Hilbert-Schmidt Independence Criterion (HSIC), as a loss-function for learning robust regression and classification models. This loss-function encourages learning models where the distribution of the residuals between the label and the model prediction is statistically independent of the distribution of the instances themselves. This loss-function was first proposed by Mooij et al. (2009) in the context of learning causal graphs. We adapt it to the task of learning for unsupervised covariate shift: learning on a source domain without access to any instances or labels from the unknown target domain, but with the assumption that $p(y|x)$ (the conditional probability of labels given instances) remains the same in the target domain. We show that the proposed loss is expected to give rise to models that generalize well on a class of target domains characterised by the complexity of their description within a reproducing kernel Hilbert space. Experiments on unsupervised covariate shift tasks demonstrate that models learned with the proposed loss-function outperform models learned with standard loss functions, achieving state-of-the-art results on a challenging cell-microscopy unsupervised covariate shift task.}
}
@InProceedings{pmlr-v119-grill20a,
title = {{Monte-Carlo} Tree Search as Regularized Policy Optimization},
author = {Grill, Jean-Bastien and Altch{\'e}, Florent and Tang, Yunhao and Hubert, Thomas and Valko, Michal and Antonoglou, Ioannis and Munos, Remi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3769--3778},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/grill20a/grill20a.pdf},
url = {http://proceedings.mlr.press/v119/grill20a.html},
abstract = {The combination of Monte-Carlo tree search (MCTS) with deep reinforcement learning has led to groundbreaking results in artificial intelligence. However, AlphaZero, the current state-of-the-art MCTS algorithm still relies on handcrafted heuristics that are only partially understood. In this paper, we show that AlphaZero's search heuristic, along with other common ones, can be interpreted as an approximation to the solution of a specific regularized policy optimization problem. With this insight, we propose a variant of AlphaZero which uses the exact solution to this policy optimization problem, and show experimentally that it reliably outperforms the original algorithm in multiple domains.}
}
@InProceedings{pmlr-v119-gronlund20a,
title = {Near-Tight Margin-Based Generalization Bounds for Support Vector Machines},
author = {Gr{\o}nlund, Allan and Kamma, Lior and Larsen, Kasper Green},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3779--3788},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gronlund20a/gronlund20a.pdf},
url = {http://proceedings.mlr.press/v119/gronlund20a.html},
abstract = {Support Vector Machines (SVMs) are among the most fundamental tools for binary classification. In its simplest formulation, an SVM produces a hyperplane separating two classes of data using the largest possible margin to the data. The focus on maximizing the margin has been well motivated through numerous generalization bounds. In this paper, we revisit and improve the classic generalization bounds in terms of margins. Furthermore, we complement our new generalization bound by a nearly matching lower bound, thus almost settling the generalization performance of SVMs in terms of margins.}
}
@InProceedings{pmlr-v119-gropp20a,
title = {Implicit Geometric Regularization for Learning Shapes},
author = {Gropp, Amos and Yariv, Lior and Haim, Niv and Atzmon, Matan and Lipman, Yaron},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3789--3799},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gropp20a/gropp20a.pdf},
url = {http://proceedings.mlr.press/v119/gropp20a.html},
abstract = {Representing shapes as level-sets of neural networks has been recently proved to be useful for different shape analysis and reconstruction tasks. So far, such representations were computed using either: (i) pre-computed implicit shape representations; or (ii) loss functions explicitly defined over the neural level-sets. In this paper we offer a new paradigm for computing high fidelity implicit neural representations directly from raw data (i.e., point clouds, with or without normal information). We observe that a rather simple loss function, encouraging the neural network to vanish on the input point cloud and to have a unit norm gradient, possesses an implicit geometric regularization property that favors smooth and natural zero level-set surfaces, avoiding bad zero-loss solutions. We provide a theoretical analysis of this property for the linear case, and show that, in practice, our method leads to state-of-the-art implicit neural representations with higher level-of-details and fidelity compared to previous methods.}
}
@InProceedings{pmlr-v119-gu20a,
title = {Improving the Gating Mechanism of Recurrent Neural Networks},
author = {Gu, Albert and Gulcehre, Caglar and Paine, Thomas and Hoffman, Matt and Pascanu, Razvan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3800--3809},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gu20a/gu20a.pdf},
url = {http://proceedings.mlr.press/v119/gu20a.html},
abstract = {Gating mechanisms are widely used in neural network models, where they allow gradients to backpropagate easily through depth or time. However, their saturation property introduces problems of its own. For example, in recurrent models these gates need to have outputs near 1 to propagate information over long time-delays, which requires them to operate in their saturation regime and hinders gradient-based learning of the gate mechanism. We address this problem by deriving two synergistic modifications to the standard gating mechanism that are easy to implement, introduce no additional hyperparameters, and improve learnability of the gates when they are close to saturation. We show how these changes are related to and improve on alternative recently proposed gating mechanisms such as chrono-initialization and Ordered Neurons. Empirically, our simple gating mechanisms robustly improve the performance of recurrent models on a range of applications, including synthetic memorization tasks, sequential image classification, language modeling, and reinforcement learning, particularly when long-term dependencies are involved.}
}
@InProceedings{pmlr-v119-guo20a,
title = {Recurrent Hierarchical Topic-Guided {RNN} for Language Generation},
author = {Guo, Dandan and Chen, Bo and Lu, Ruiying and Zhou, Mingyuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3810--3821},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20a/guo20a.pdf},
url = {http://proceedings.mlr.press/v119/guo20a.html},
abstract = {To simultaneously capture syntax and global semantics from a text corpus, we propose a new larger-context recurrent neural network (RNN) based language model, which extracts recurrent hierarchical semantic structure via a dynamic deep topic model to guide natural language generation. Moving beyond a conventional RNN-based language model that ignores long-range word dependencies and sentence order, the proposed model captures not only intra-sentence word dependencies, but also temporal transitions between sentences and inter-sentence topic dependencies. For inference, we develop a hybrid of stochastic-gradient Markov chain Monte Carlo and recurrent autoencoding variational Bayes. Experimental results on a variety of real-world text corpora demonstrate that the proposed model not only outperforms larger-context RNN-based language models, but also learns interpretable recurrent multilayer topics and generates diverse sentences and paragraphs that are syntactically correct and semantically coherent.}
}
@InProceedings{pmlr-v119-guo20b,
title = {Breaking the Curse of Space Explosion: Towards Efficient {NAS} with Curriculum Search},
author = {Guo, Yong and Chen, Yaofo and Zheng, Yin and Zhao, Peilin and Chen, Jian and Huang, Junzhou and Tan, Mingkui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3822--3831},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20b/guo20b.pdf},
url = {http://proceedings.mlr.press/v119/guo20b.html},
abstract = {Neural architecture search (NAS) has become an important approach to automatically find effective architectures. To cover all possible good architectures, we need to search in an extremely large search space with billions of candidate architectures. More critically, given a large search space, we may face a very challenging issue of space explosion. However, due to the limitation of computational resources, we can only sample a very small proportion of the architectures, which provides insufficient information for the training. As a result, existing methods may often produce sub-optimal architectures. To alleviate this issue, we propose a curriculum search method that starts from a small search space and gradually incorporates the learned knowledge to guide the search in a large space. With the proposed search strategy, our Curriculum Neural Architecture Search (CNAS) method significantly improves the search efficiency and finds better architectures than existing NAS methods. Extensive experiments on CIFAR-10 and ImageNet demonstrate the effectiveness of the proposed method.}
}
@InProceedings{pmlr-v119-guo20c,
title = {Certified Data Removal from Machine Learning Models},
author = {Guo, Chuan and Goldstein, Tom and Hannun, Awni and van der Maaten, Laurens},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3832--3842},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20c/guo20c.pdf},
url = {http://proceedings.mlr.press/v119/guo20c.html},
abstract = {Good data stewardship requires removal of data at the request of the data's owner. This raises the question if and how a trained machine-learning model, which implicitly stores information about its training data, should be affected by such a removal request. Is it possible to ``remove'' data from a machine-learning model? We study this problem by defining certified removal: a very strong theoretical guarantee that a model from which data is removed cannot be distinguished from a model that never observed the data to begin with. We develop a certified-removal mechanism for linear classifiers and empirically study learning settings in which this mechanism is practical.}
}
@InProceedings{pmlr-v119-guo20d,
title = {{LTF}: A Label Transformation Framework for Correcting Label Shift},
author = {Guo, Jiaxian and Gong, Mingming and Liu, Tongliang and Zhang, Kun and Tao, Dacheng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3843--3853},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20d/guo20d.pdf},
url = {http://proceedings.mlr.press/v119/guo20d.html},
abstract = {Distribution shift is a major obstacle to the deployment of current deep learning models on real-world problems. Let $Y$ be the class label and $X$ the features. We focus on one type of distribution shift, \emph{label shift}, where the label marginal distribution $P_Y$ changes but the conditional distribution $P_{X|Y}$ does not. Most existing methods estimate the density ratio between the source- and target-domain label distributions by density matching. However, these methods are either computationally infeasible for large-scale data or restricted to shift correction for discrete labels. In this paper, we propose an end-to-end Label Transformation Framework (LTF) for correcting label shift, which implicitly models the shift of $P_Y$ and the conditional distribution $P_{X|Y}$ using neural networks. Thanks to the flexibility of deep networks, our framework can handle continuous, discrete, and even multi-dimensional labels in a unified way and is scalable to large data. Moreover, for high dimensional $X$, such as images, we find that the redundant information in $X$ severely degrades the estimation accuracy. To remedy this issue, we propose to match the distribution implied by our generative model and the target-domain distribution in a low-dimensional feature space that discards information irrelevant to $Y$. Both theoretical and empirical studies demonstrate the superiority of our method over previous approaches.}
}
@InProceedings{pmlr-v119-guo20e,
title = {Learning to Branch for Multi-Task Learning},
author = {Guo, Pengsheng and Lee, Chen-Yu and Ulbricht, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3854--3863},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20e/guo20e.pdf},
url = {http://proceedings.mlr.press/v119/guo20e.html},
abstract = {Training multiple tasks jointly in one deep network yields reduced latency during inference and better performance over the single-task counterpart by sharing certain layers of a network. However, over-sharing a network could erroneously enforce over-generalization, causing negative knowledge transfer across tasks. Prior works rely on human intuition or pre-computed task relatedness scores for ad hoc branching structures. They provide sub-optimal end results and often require huge efforts for the trial-and-error process. In this work, we present an automated multi-task learning algorithm that learns where to share or branch within a network, designing an effective network topology that is directly optimized for multiple objectives across tasks. Specifically, we propose a novel tree-structured design space that casts a tree branching operation as a gumbel-softmax sampling procedure. This enables differentiable network splitting that is end-to-end trainable. We validate the proposed method on controlled synthetic data, CelebA, and Taskonomy.}
}
@InProceedings{pmlr-v119-guo20f,
title = {Communication-Efficient Distributed Stochastic {AUC} Maximization with Deep Neural Networks},
author = {Guo, Zhishuai and Liu, Mingrui and Yuan, Zhuoning and Shen, Li and Liu, Wei and Yang, Tianbao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3864--3874},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20f/guo20f.pdf},
url = {http://proceedings.mlr.press/v119/guo20f.html},
abstract = {In this paper, we study distributed algorithms for large-scale AUC maximization with a deep neural network as a predictive model. Although distributed learning techniques have been investigated extensively in deep learning, they are not directly applicable to stochastic AUC maximization with deep neural networks due to its striking differences from standard loss minimization problems (e.g., cross-entropy). Towards addressing this challenge, we propose and analyze a communication-efficient distributed optimization algorithm based on a \emph{non-convex concave} reformulation of the AUC maximization, in which the communication of both the primal variable and the dual variable between each worker and the parameter server only occurs after multiple steps of gradient-based updates in each worker. Compared with the naive parallel version of an existing algorithm that computes stochastic gradients at individual machines and averages them for updating the model parameter, our algorithm requires a much less number of communication rounds and still achieves linear speedup in theory. To the best of our knowledge, this is the \textbf{first} work that solves the \emph{non-convex concave min-max} problem for AUC maximization with deep neural networks in a communication-efficient distributed manner while still maintaining the linear speedup property in theory. Our experiments on several benchmark datasets show the effectiveness of our algorithm and also confirm our theory.}
}
@InProceedings{pmlr-v119-guo20g,
title = {Bootstrap Latent-Predictive Representations for Multitask Reinforcement Learning},
author = {Guo, Zhaohan Daniel and Pires, Bernardo Avila and Piot, Bilal and Grill, Jean-Bastien and Altch{\'e}, Florent and Munos, Remi and Azar, Mohammad Gheshlaghi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3875--3886},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20g/guo20g.pdf},
url = {http://proceedings.mlr.press/v119/guo20g.html},
abstract = {Learning a good representation is an essential component for deep reinforcement learning (RL). Representation learning is especially important in multitask and partially observable settings where building a representation of the unknown environment is crucial to solve the tasks. Here we introduce Predictions of Bootstrapped Latents (PBL), a simple and flexible self-supervised representation learning algorithm for multitask deep RL. PBL builds on multistep predictive representations of future observations, and focuses on capturing structured information about environment dynamics. Specifically, PBL trains its representation by predicting latent embeddings of future observations. These latent embeddings are themselves trained to be predictive of the aforementioned representations. These predictions form a bootstrapping effect, allowing the agent to learn more about the key aspects of the environment dynamics. In addition, by defining prediction tasks completely in latent space, PBL provides the flexibility of using multimodal observations involving pixel images, language instructions, rewards and more. We show in our experiments that PBL delivers across-the-board improved performance over state of the art deep RL agents in the DMLab-30 multitask setting.}
}
@InProceedings{pmlr-v119-guo20h,
title = {Accelerating Large-Scale Inference with Anisotropic Vector Quantization},
author = {Guo, Ruiqi and Sun, Philip and Lindgren, Erik and Geng, Quan and Simcha, David and Chern, Felix and Kumar, Sanjiv},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3887--3896},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20h/guo20h.pdf},
url = {http://proceedings.mlr.press/v119/guo20h.html},
abstract = {Quantization based techniques are the current state-of-the-art for scaling maximum inner product search to massive databases. Traditional approaches to quantization aim to minimize the reconstruction error of the database points. Based on the observation that for a given query, the database points that have the largest inner products are more relevant, we develop a family of anisotropic quantization loss functions. Under natural statistical assumptions, we show that quantization with these loss functions leads to a new variant of vector quantization that more greatly penalizes the parallel component of a datapoint's residual relative to its orthogonal component. The proposed approach, whose implementation is open-source, achieves state-of-the-art results on the public benchmarks available at ann-benchmarks.com.}
}
@InProceedings{pmlr-v119-guo20i,
title = {Safe Deep Semi-Supervised Learning for Unseen-Class Unlabeled Data},
author = {Guo, Lan-Zhe and Zhang, Zhen-Yu and Jiang, Yuan and Li, Yu-Feng and Zhou, Zhi-Hua},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3897--3906},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guo20i/guo20i.pdf},
url = {http://proceedings.mlr.press/v119/guo20i.html},
abstract = {Deep semi-supervised learning (SSL) has been recently shown very effectively. However, its performance is seriously decreased when the class distribution is mismatched, among which a common situation is that unlabeled data contains some classes not seen in the labeled data. Efforts on this issue remain to be limited. This paper proposes a simple and effective safe deep SSL method to alleviate the harm caused by it. In theory, the result learned from the new method is never worse than learning from merely labeled data, and it is theoretically guaranteed that its generalization approaches the optimal in the order $O(\sqrt{d\ln(n)/n})$, even faster than the convergence rate in supervised learning associated with massive parameters. In the experiment of benchmark data, unlike the existing deep SSL methods which are no longer as good as supervised learning in 40\% of unseen-class unlabeled data, the new method can still achieve performance gain in more than 60\% of unseen-class unlabeled data. Moreover, the proposal is suitable for many deep SSL algorithms and can be easily extended to handle other cases of class distribution mismatch.}
}
@InProceedings{pmlr-v119-gupta20a,
title = {Neural Topic Modeling with Continual Lifelong Learning},
author = {Gupta, Pankaj and Chaudhary, Yatin and Runkler, Thomas and Schuetze, Hinrich},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3907--3917},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gupta20a/gupta20a.pdf},
url = {http://proceedings.mlr.press/v119/gupta20a.html},
abstract = {Lifelong learning has recently attracted attention in building machine learning systems that continually accumulate and transfer knowledge to help future learning. Unsupervised topic modeling has been popularly used to discover topics from document collections. However, the application of topic modeling is challenging due to data sparsity, e.g., in a small collection of (short) documents and thus, generate incoherent topics and sub-optimal document representations. To address the problem, we propose a lifelong learning framework for neural topic modeling that can continuously process streams of document collections, accumulate topics and guide future topic modeling tasks by knowledge transfer from several sources to better deal with the sparse data. In the lifelong process, we particularly investigate jointly: (1) sharing generative homologies (latent topics) over lifetime to transfer prior knowledge, and (2) minimizing catastrophic forgetting to retain the past learning via novel selective data augmentation, co-training and topic regularization approaches. Given a stream of document collections, we apply the proposed Lifelong Neural Topic Modeling (LNTM) framework in modeling three sparse document collections as future tasks and demonstrate improved performance quantified by perplexity, topic coherence and information retrieval task. Code: https://github.com/pgcool/Lifelong-Neural-Topic-Modeling}
}
@InProceedings{pmlr-v119-gupta20b,
title = {Multidimensional Shape Constraints},
author = {Gupta, Maya and Louidor, Erez and Mangylov, Oleksandr and Morioka, Nobu and Narayan, Taman and Zhao, Sen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3918--3928},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/gupta20b/gupta20b.pdf},
url = {http://proceedings.mlr.press/v119/gupta20b.html},
abstract = {We propose new multi-input shape constraints across four intuitive categories: complements, diminishers, dominance, and unimodality constraints. We show these shape constraints can be checked and even enforced when training machine-learned models for linear models, generalized additive models, and the nonlinear function class of multi-layer lattice models. Real-world experiments illustrate how the different shape constraints can be used to increase explainability and improve regularization, especially for non-IID train-test distribution shift.}
}
@InProceedings{pmlr-v119-guu20a,
title = {Retrieval Augmented Language Model Pre-Training},
author = {Guu, Kelvin and Lee, Kenton and Tung, Zora and Pasupat, Panupong and Chang, Mingwei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3929--3938},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/guu20a/guu20a.pdf},
url = {http://proceedings.mlr.press/v119/guu20a.html},
abstract = {Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous methods by a significant margin (4--16\% absolute accuracy), while also providing qualitative benefits such as interpretability and modularity.}
}
@InProceedings{pmlr-v119-haba20a,
title = {Streaming Submodular Maximization under a {$k$}-Set System Constraint},
author = {Haba, Ran and Kazemi, Ehsan and Feldman, Moran and Karbasi, Amin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3939--3949},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/haba20a/haba20a.pdf},
url = {http://proceedings.mlr.press/v119/haba20a.html},
abstract = {In this paper, we propose a novel framework that converts streaming algorithms for monotone submodular maximization into streaming algorithms for non-monotone submodular maximization. This reduction readily leads to the currently tightest deterministic approximation ratio for submodular maximization subject to a $k$-matchoid constraint. Moreover, we propose the first streaming algorithm for monotone submodular maximization subject to $k$-extendible and $k$-set system constraints. Together with our proposed reduction, we obtain $O(k\log k)$ and $O(k^2\log k)$ approximation ratio for submodular maximization subject to the above constraints, respectively. We extensively evaluate the empirical performance of our algorithm against the existing work in a series of experiments including finding the maximum independent set in randomly generated graphs, maximizing linear functions over social networks, movie recommendation, Yelp location summarization, and Twitter data summarization.}
}
@InProceedings{pmlr-v119-hacohen20a,
title = {Let's Agree to Agree: Neural Networks Share Classification Order on Real Datasets},
author = {Hacohen, Guy and Choshen, Leshem and Weinshall, Daphna},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3950--3960},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hacohen20a/hacohen20a.pdf},
url = {http://proceedings.mlr.press/v119/hacohen20a.html},
abstract = {We report a series of robust empirical observations, demonstrating that deep Neural Networks learn the examples in both the training and test sets in a similar order. This phenomenon is observed in all the commonly used benchmarks we evaluated, including many image classification benchmarks, and one text classification benchmark. While this phenomenon is strongest for models of the same architecture, it also crosses architectural boundaries -- models of different architectures start by learning the same examples, after which the more powerful model may continue to learn additional examples. We further show that this pattern of results reflects the interplay between the way neural networks learn benchmark datasets. Specifically, when fixing the architecture, we describe synthetic datasets for which this pattern is no longer observed. When fixing the dataset, we show that other learning paradigms may learn the data in a different order. We hypothesize that our results reflect how neural networks discover structure in natural datasets.}
}
@InProceedings{pmlr-v119-halabi20a,
title = {Optimal approximation for unconstrained non-submodular minimization},
author = {El Halabi, Marwa and Jegelka, Stefanie},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3961--3972},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/halabi20a/halabi20a.pdf},
url = {http://proceedings.mlr.press/v119/halabi20a.html},
abstract = {Submodular function minimization is well studied, and existing algorithms solve it exactly or up to arbitrary accuracy. However, in many applications, such as structured sparse learning or batch Bayesian optimization, the objective function is not exactly submodular, but close. In this case, no theoretical guarantees exist. Indeed, submodular minimization algorithms rely on intricate connections between submodularity and convexity. We show how these relations can be extended to obtain approximation guarantees for minimizing non-submodular functions, characterized by how close the function is to submodular. We also extend this result to noisy function evaluations. Our approximation results are the first for minimizing non-submodular functions, and are optimal, as established by our matching lower bound.}
}
@InProceedings{pmlr-v119-hamer20a,
title = {{FedBoost}: A Communication-Efficient Algorithm for Federated Learning},
author = {Hamer, Jenny and Mohri, Mehryar and Suresh, Ananda Theertha},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3973--3983},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hamer20a/hamer20a.pdf},
url = {http://proceedings.mlr.press/v119/hamer20a.html},
abstract = {Communication cost is often a bottleneck in federated learning and other client-based distributed learning scenarios. To overcome this, several gradient compression and model compression algorithms have been proposed. In this work, we propose an alternative approach whereby an ensemble of pre-trained base predictors is trained via federated learning. This method allows for training a model which may otherwise surpass the communication bandwidth and storage capacity of the clients to be learned with on-device data through federated learning. Motivated by language modeling, we prove the optimality of ensemble methods for density estimation for standard empirical risk minimization and agnostic risk minimization. We provide communication-efficient ensemble algorithms for federated learning, where per-round communication cost is independent of the size of the ensemble. Furthermore, unlike works on gradient compression, our proposed approach reduces the communication cost of both server-to-client and client-to-server communication.}
}
@InProceedings{pmlr-v119-han20a,
title = {Polynomial Tensor Sketch for Element-wise Function of Low-Rank Matrix},
author = {Han, Insu and Avron, Haim and Shin, Jinwoo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3984--3993},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/han20a/han20a.pdf},
url = {http://proceedings.mlr.press/v119/han20a.html},
abstract = {This paper studies how to sketch element-wise functions of low-rank matrices. Formally, given low-rank matrix $A = [A_{ij}]$ and scalar non-linear function $f$, we aim for finding an approximated low-rank representation of the (possibly high-rank) matrix $[f(A_{ij})]$. To this end, we propose an efficient sketching-based algorithm whose complexity is significantly lower than the number of entries of $A$, i.e., it runs without accessing all entries of $[f(A_{ij})]$ explicitly. The main idea underlying our method is to combine a polynomial approximation of $f$ with the existing tensor sketch scheme for approximating monomials of entries of $A$. To balance the errors of the two approximation components in an optimal manner, we propose a novel regression formula to find polynomial coefficients given $A$ and $f$. In particular, we utilize a coreset-based regression with a rigorous approximation guarantee. Finally, we demonstrate the applicability and superiority of the proposed scheme under various machine learning tasks.}
}
@InProceedings{pmlr-v119-han20b,
title = {{DRWR}: A Differentiable Renderer without Rendering for Unsupervised {3D} Structure Learning from Silhouette Images},
author = {Han, Zhizhong and Chen, Chao and Liu, Yu-Shen and Zwicker, Matthias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {3994--4005},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/han20b/han20b.pdf},
url = {http://proceedings.mlr.press/v119/han20b.html},
abstract = {Differentiable renderers have been used successfully for unsupervised 3D structure learning from 2D images because they can bridge the gap between 3D and 2D. To optimize 3D shape parameters, current renderers rely on pixel-wise losses between rendered images of 3D reconstructions and ground truth images from corresponding viewpoints. Hence they require interpolation of the recovered 3D structure at each pixel, visibility handling, and optionally evaluating a shading model. In contrast, here we propose a Differentiable Renderer Without Rendering (DRWR) that omits these steps. DRWR only relies on a simple but effective loss that evaluates how well the projections of reconstructed 3D point clouds cover the ground truth object silhouette. Specifically, DRWR employs a smooth silhouette loss to pull the projection of each individual 3D point inside the object silhouette, and a structure-aware repulsion loss to push each pair of projections that fall inside the silhouette far away from each other. Although we omit surface interpolation, visibility handling, and shading, our results demonstrate that DRWR achieves state-of-the-art accuracies under widely used benchmarks, outperforming previous methods both qualitatively and quantitatively. In addition, our training times are significantly lower due to the simplicity of DRWR.}
}
@InProceedings{pmlr-v119-han20c,
title = {{SIGUA}: Forgetting May Make Learning with Noisy Labels More Robust},
author = {Han, Bo and Niu, Gang and Yu, Xingrui and Yao, Quanming and Xu, Miao and Tsang, Ivor and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4006--4016},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/han20c/han20c.pdf},
url = {http://proceedings.mlr.press/v119/han20c.html},
abstract = {Given data with noisy labels, over-parameterized deep networks can gradually memorize the data, and fit everything in the end. Although equipped with corrections for noisy labels, many learning methods in this area still suffer overfitting due to undesired memorization. In this paper, to relieve this issue, we propose stochastic integrated gradient underweighted ascent (SIGUA): in a mini-batch, we adopt gradient descent on good data as usual, and learning-rate-reduced gradient ascent on bad data; the proposal is a versatile approach where data goodness or badness is w.r.t. desired or undesired memorization given a base learning method. Technically, SIGUA pulls optimization back for generalization when their goals conflict with each other; philosophically, SIGUA shows forgetting undesired memorization can reinforce desired memorization. Experiments demonstrate that SIGUA successfully robustifies two typical base learning methods, so that their performance is often significantly improved.}
}
@InProceedings{pmlr-v119-han20d,
title = {Training Binary Neural Networks through Learning with Noisy Supervision},
author = {Han, Kai and Wang, Yunhe and Xu, Yixing and Xu, Chunjing and Wu, Enhua and Xu, Chang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4017--4026},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/han20d/han20d.pdf},
url = {http://proceedings.mlr.press/v119/han20d.html},
abstract = {This paper formalizes the binarization operations over neural networks from a learning perspective. In contrast to classical hand crafted rules (e.g., hard thresholding) to binarize full-precision neurons, we propose to learn a mapping from full-precision neurons to the target binary ones. Each individual weight entry will not be binarized independently. Instead, they are taken as a whole to accomplish the binarization, just as they work together in generating convolution features. To help the training of the binarization mapping, the full-precision neurons after taking sign operations is regarded as some auxiliary supervision signal, which is noisy but still has valuable guidance. An unbiased estimator is therefore introduced to mitigate the influence of the supervision noise. Experimental results on benchmark datasets indicate that the proposed binarization technique attains consistent improvements over baselines.}
}
@InProceedings{pmlr-v119-hanzely20a,
title = {Stochastic Subspace Cubic {Newton} Method},
author = {Hanzely, Filip and Doikov, Nikita and Nesterov, Yurii and Richtarik, Peter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4027--4038},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hanzely20a/hanzely20a.pdf},
url = {http://proceedings.mlr.press/v119/hanzely20a.html},
abstract = {In this paper, we propose a new randomized second-order optimization algorithm---Stochastic Subspace Cubic Newton (SSCN)---for minimizing a high dimensional convex function $f$. Our method can be seen both as a \emph{stochastic} extension of the cubically-regularized Newton method of Nesterov and Polyak (2006), and a \emph{second-order} enhancement of stochastic subspace descent of Kozak et al. (2019). We prove that as we vary the minibatch size, the global convergence rate of SSCN interpolates between the rate of stochastic coordinate descent (CD) and the rate of cubic regularized Newton, thus giving new insights into the connection between first and second-order methods. Remarkably, the local convergence rate of SSCN matches the rate of stochastic subspace descent applied to the problem of minimizing the quadratic function $\frac12 (x-x^*)^\top \nabla^2f(x^*)(x-x^*)$, where $x^*$ is the minimizer of $f$, and hence depends on the properties of $f$ at the optimum only. Our numerical experiments show that SSCN outperforms non-accelerated first-order CD algorithms while being competitive to their accelerated variants.}
}
@InProceedings{pmlr-v119-hanzely20b,
title = {Variance Reduced Coordinate Descent with Acceleration: New Method With a Surprising Application to Finite-Sum Problems},
author = {Hanzely, Filip and Kovalev, Dmitry and Richtarik, Peter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4039--4048},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hanzely20b/hanzely20b.pdf},
url = {http://proceedings.mlr.press/v119/hanzely20b.html},
abstract = {We propose an accelerated version of stochastic variance reduced coordinate descent -- ASVRCD. As other variance reduced coordinate descent methods such as SEGA or SVRCD, our method can deal with problems that include a non-separable and non-smooth regularizer, while accessing a random block of partial derivatives in each iteration only. However, ASVRCD incorporates Nesterov's momentum, which offers favorable iteration complexity guarantees over both SEGA and SVRCD. As a by-product of our theory, we show that a variant of Katyusha (Allen-Zhu, 2017) is a specific case of ASVRCD, recovering the optimal oracle complexity for the finite sum objective.}
}
@InProceedings{pmlr-v119-hao20a,
title = {Data Amplification: Instance-Optimal Property Estimation},
author = {Hao, Yi and Orlitsky, Alon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4049--4059},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hao20a/hao20a.pdf},
url = {http://proceedings.mlr.press/v119/hao20a.html},
abstract = {The best-known and most commonly used technique for distribution-property estimation uses a plug-in estimator, with empirical frequency replacing the underlying distribution. We present novel linear-time-computable estimators that significantly ``amplify'' the effective amount of data available. For a large variety of distribution properties including four of the most popular ones and for every underlying distribution, they achieve the accuracy that the empirical-frequency plug-in estimators would attain using a logarithmic-factor more samples. Specifically, for Shannon entropy and a broad class of Lipschitz properties including the $L_1$ distance to a fixed distribution, the new estimators use $n$ samples to achieve the accuracy attained by the empirical estimators with $n\log n$ samples. For support-size and coverage, the new estimators use $n$ samples to achieve the performance of empirical frequency with sample size $n$ times the logarithm of the property value. Significantly strengthening the traditional min-max formulation, these results hold not only for the worst distributions, but for each and every underlying distribution. Furthermore, the logarithmic amplification factors are optimal. Experiments on a wide variety of distributions show that the new estimators outperform the previous state-of-the-art estimators designed for each specific property.}
}
@InProceedings{pmlr-v119-hao20b,
title = {Dynamic Knapsack Optimization Towards Efficient Multi-Channel Sequential Advertising},
author = {Hao, Xiaotian and Peng, Zhaoqing and Ma, Yi and Wang, Guan and Jin, Junqi and Hao, Jianye and Chen, Shan and Bai, Rongquan and Xie, Mingzhou and Xu, Miao and Zheng, Zhenzhe and Yu, Chuan and Li, Han and Xu, Jian and Gai, Kun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4060--4070},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hao20b/hao20b.pdf},
url = {http://proceedings.mlr.press/v119/hao20b.html},
abstract = {In E-commerce, advertising is essential for merchants to reach their target users. The typical objective is to maximize the advertiser's cumulative revenue over a period of time under a budget constraint. In real applications, an advertisement (ad) usually needs to be exposed to the same user multiple times until the user finally contributes revenue (e.g., places an order). However, existing advertising systems mainly focus on the immediate revenue with single ad exposures, ignoring the contribution of each exposure to the final conversion, thus usually falls into suboptimal solutions. In this paper, we formulate the sequential advertising strategy optimization as a dynamic knapsack problem. We propose a theoretically guaranteed bilevel optimization framework, which significantly reduces the solution space of the original optimization space while ensuring the solution quality. To improve the exploration efficiency of reinforcement learning, we also devise an effective action space reduction approach. Extensive offline and online experiments show the superior performance of our approaches over state-of-the-art baselines in terms of cumulative revenue.}
}
@InProceedings{pmlr-v119-harutyunyan20a,
title = {Improving generalization by controlling label-noise information in neural network weights},
author = {Harutyunyan, Hrayr and Reing, Kyle and Ver Steeg, Greg and Galstyan, Aram},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4071--4081},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/harutyunyan20a/harutyunyan20a.pdf},
url = {http://proceedings.mlr.press/v119/harutyunyan20a.html},
abstract = {In the presence of noisy or incorrect labels, neural networks have the undesirable tendency to memorize information about the noise. Standard regularization techniques such as dropout, weight decay or data augmentation sometimes help, but do not prevent this behavior. If one considers neural network weights as random variables that depend on the data and stochasticity of training, the amount of memorized information can be quantified with the Shannon mutual information between weights and the vector of all training labels given inputs, $I(w; \mathbf{y} \mid \mathbf{x})$. We show that for any training algorithm, low values of this term correspond to reduction in memorization of label-noise and better generalization bounds. To obtain these low values, we propose training algorithms that employ an auxiliary network that predicts gradients in the final layers of a classifier without accessing labels. We illustrate the effectiveness of our approach on versions of MNIST, CIFAR-10, and CIFAR-100 corrupted with various noise models, and on a large-scale dataset Clothing1M that has noisy labels.}
}
@InProceedings{pmlr-v119-hasani20a,
title = {A Natural Lottery Ticket Winner: Reinforcement Learning with Ordinary Neural Circuits},
author = {Hasani, Ramin and Lechner, Mathias and Amini, Alexander and Rus, Daniela and Grosu, Radu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4082--4093},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hasani20a/hasani20a.pdf},
url = {http://proceedings.mlr.press/v119/hasani20a.html},
abstract = {We propose a neural information processing system obtained by re-purposing the function of a biological neural circuit model to govern simulated and real-world control tasks. Inspired by the structure of the nervous system of the soil-worm, C. elegans, we introduce ordinary neural circuits (ONCs), defined as the model of biological neural circuits reparameterized for the control of alternative tasks. We first demonstrate that ONCs realize networks with higher maximum flow compared to arbitrary wired networks. We then learn instances of ONCs to control a series of robotic tasks, including the autonomous parking of a real-world rover robot. For reconfiguration of the purpose of the neural circuit, we adopt a search-based optimization algorithm. Ordinary neural circuits perform on par and, in some cases, significantly surpass the performance of contemporary deep learning models. ONC networks are compact, 77\% sparser than their counterpart neural controllers, and their neural dynamics are fully interpretable at the cell-level.}
}
@InProceedings{pmlr-v119-hasanzadeh20a,
title = {{Bayesian} Graph Neural Networks with Adaptive Connection Sampling},
author = {Hasanzadeh, Arman and Hajiramezanali, Ehsan and Boluki, Shahin and Zhou, Mingyuan and Duffield, Nick and Narayanan, Krishna and Qian, Xiaoning},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4094--4104},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hasanzadeh20a/hasanzadeh20a.pdf},
url = {http://proceedings.mlr.press/v119/hasanzadeh20a.html},
abstract = {We propose a unified framework for adaptive connection sampling in graph neural networks (GNNs) that generalizes existing stochastic regularization methods for training GNNs. The proposed framework not only alleviates over-smoothing and over-fitting tendencies of deep GNNs, but also enables learning with uncertainty in graph analytic tasks with GNNs. Instead of using fixed sampling rates or hand-tuning them as model hyperparameters in existing stochastic regularization methods, our adaptive connection sampling can be trained jointly with GNN model parameters in both global and local fashions. GNN training with adaptive connection sampling is shown to be mathematically equivalent to an efficient approximation of training Bayesian GNNs. Experimental results with ablation studies on benchmark datasets validate that adaptively learning the sampling rate given graph training data is the key to boost the performance of GNNs in semi-supervised node classification, less prone to over-smoothing and over-fitting with more robust prediction.}
}
@InProceedings{pmlr-v119-hasenclever20a,
title = {{CoMic}: Complementary Task Learning \& Mimicry for Reusable Skills},
author = {Hasenclever, Leonard and Pardo, Fabio and Hadsell, Raia and Heess, Nicolas and Merel, Josh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4105--4115},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hasenclever20a/hasenclever20a.pdf},
url = {http://proceedings.mlr.press/v119/hasenclever20a.html},
abstract = {Learning to control complex bodies and reuse learned behaviors is a longstanding challenge in continuous control. We study the problem of learning reusable humanoid skills by imitating motion capture data and joint training with complementary tasks. We show that it is possible to learn reusable skills through reinforcement learning on 50 times more motion capture data than prior work. We systematically compare a variety of different network architectures across different data regimes both in terms of imitation performance as well as transfer to challenging locomotion tasks. Finally we show that it is possible to interleave the motion capture tracking with training on complementary tasks, enriching the resulting skill space, and enabling the reuse of skills not well covered by the motion capture data such as getting up from the ground or catching a ball.}
}
@InProceedings{pmlr-v119-hassani20a,
title = {Contrastive Multi-View Representation Learning on Graphs},
author = {Hassani, Kaveh and Khasahmadi, Amir Hosein},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4116--4126},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hassani20a/hassani20a.pdf},
url = {http://proceedings.mlr.press/v119/hassani20a.html},
abstract = {We introduce a self-supervised approach for learning node and graph level representations by contrasting structural views of graphs. We show that unlike visual representation learning, increasing the number of views to more than two or contrasting multi-scale encodings do not improve performance, and the best performance is achieved by contrasting encodings from first-order neighbors and a graph diffusion. We achieve new state-of-the-art results in self-supervised learning on 8 out of 8 node and graph classification benchmarks under the linear evaluation protocol. For example, on Cora (node) and Reddit-Binary (graph) classification benchmarks, we achieve 86.8\% and 84.5\% accuracy, which are 5.5\% and 2.4\% relative improvements over previous state-of-the-art. When compared to supervised baselines, our approach outperforms them in 4 out of 8 benchmarks.}
}
@InProceedings{pmlr-v119-hata20a,
title = {Nested Subspace Arrangement for Representation of Relational Data},
author = {Hata, Nozomi and Kaji, Shizuo and Yoshida, Akihiro and Fujisawa, Katsuki},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4127--4137},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hata20a/hata20a.pdf},
url = {http://proceedings.mlr.press/v119/hata20a.html},
abstract = {Studies of acquiring appropriate continuous representations of a discrete objects such as graph and knowledge based data have been conducted by many researches in the field of machine learning. In this paper, we introduce Nested SubSpace arrangement (NSS arrangement), a comprehensive framework for representation learning. We show that existing embedding techniques can be regarded as a member of NSS arrangement. Based on the concept of the NSS arrangement, we implemented Disk-ANChor ARrangement (DANCAR), a representation learning method specializing to reproduce general graphs. Numerical experiments have shown that DANCAR has successfully embedded WordNet in ${\mathbb R}^{20}$ with the F1 score of 0.993 in the reconstruction task. DANCAR is also suitable for visualization to understand the characteristics of graph.}
}
@InProceedings{pmlr-v119-hazimeh20a,
title = {The Tree Ensemble Layer: Differentiability meets Conditional Computation},
author = {Hazimeh, Hussein and Ponomareva, Natalia and Mol, Petros and Tan, Zhenyu and Mazumder, Rahul},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4138--4148},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hazimeh20a/hazimeh20a.pdf},
url = {http://proceedings.mlr.press/v119/hazimeh20a.html},
abstract = {Neural networks and tree ensembles are state-of-the-art learners, each with its unique statistical and computational advantages. We aim to combine these advantages by introducing a new layer for neural networks, composed of an ensemble of differentiable decision trees (a.k.a. soft trees). While differentiable trees demonstrate promising results in the literature, they are typically slow in training and inference as they do not support conditional computation. We mitigate this issue by introducing a new sparse activation function for sample routing, and implement true conditional computation by developing specialized forward and backward propagation algorithms that exploit sparsity. Our efficient algorithms pave the way for jointly training over deep and wide tree ensembles using first-order methods (e.g., SGD). Experiments on 23 classification datasets indicate over 10x speed-ups compared to the differentiable trees used in the literature and over 20x reduction in the number of parameters compared to gradient boosted trees, while maintaining competitive performance. Moreover, experiments on CIFAR, MNIST, and Fashion MNIST indicate that replacing dense layers in CNNs with our tree layer reduces the test loss by 7--53\% and the number of parameters by 8x. We provide an open-source TensorFlow implementation with a Keras API.}
}
@InProceedings{pmlr-v119-heckel20a,
title = {Compressive sensing with un-trained neural networks: Gradient descent finds a smooth approximation},
author = {Heckel, Reinhard and Soltanolkotabi, Mahdi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4149--4158},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/heckel20a/heckel20a.pdf},
url = {http://proceedings.mlr.press/v119/heckel20a.html},
abstract = {Un-trained convolutional neural networks have emerged as highly successful tools for image recovery and restoration. They are capable of solving standard inverse problems such as denoising and compressive sensing with excellent results by simply fitting a neural network model to measurements from a single image or signal without the need for any additional training data. For some applications, this critically requires additional regularization in the form of early stopping the optimization. For signal recovery from a few measurements, however, un-trained convolutional networks have an intriguing self-regularizing property: Even though the network can perfectly fit any image, the network recovers a natural image from few measurements when trained with gradient descent until convergence. In this paper, we provide numerical evidence for this property and study it theoretically. We show that---without any further regularization---an un-trained convolutional neural network can approximately reconstruct signals and images that are sufficiently structured, from a near minimal number of random measurements.}
}
@InProceedings{pmlr-v119-hejna20a,
title = {Hierarchically Decoupled Imitation For Morphological Transfer},
author = {Hejna, Donald and Pinto, Lerrel and Abbeel, Pieter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4159--4171},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hejna20a/hejna20a.pdf},
url = {http://proceedings.mlr.press/v119/hejna20a.html},
abstract = {Learning long-range behaviors on complex high-dimensional agents is a fundamental problem in robot learning. For such tasks, we argue that transferring learned information from a morphologically simpler agent can massively improve the sample efficiency of a more complex one. To this end, we propose a hierarchical decoupling of policies into two parts: an independently learned low-level policy and a transferable high-level policy. To remedy poor transfer performance due to mismatch in morphologies, we contribute two key ideas. First, we show that incentivizing a complex agent's low-level to imitate a simpler agent's low-level significantly improves zero-shot high-level transfer. Second, we show that KL-regularized training of the high level stabilizes learning and prevents mode-collapse. Finally, on a suite of publicly released navigation and manipulation environments, we demonstrate the applicability of hierarchical transfer on long-range tasks across morphologies. Our code and videos can be found at https://sites.google.com/berkeley.edu/morphology-transfer.}
}
@InProceedings{pmlr-v119-heliou20a,
title = {Gradient-free Online Learning in Continuous Games with Delayed Rewards},
author = {H{\'e}liou, Am{\'e}lie and Mertikopoulos, Panayotis and Zhou, Zhengyuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4172--4181},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/heliou20a/heliou20a.pdf},
url = {http://proceedings.mlr.press/v119/heliou20a.html},
abstract = {Motivated by applications to online advertising and recommender systems, we consider a game-theoretic model with delayed rewards and asynchronous, payoff-based feedback. In contrast to previous work on delayed multi-armed bandits, we focus on games with continuous action spaces, and we examine the long-run behavior of strategic agents that follow a no-regret learning policy (but are otherwise oblivious to the game being played, the objectives of their opponents, etc.). To account for the lack of a consistent stream of information (for instance, rewards can arrive out of order and with an a priori unbounded delay), we introduce a gradient-free learning policy where payoff information is placed in a priority queue as it arrives. Somewhat surprisingly, we find that under a standard diagonal concavity assumption, the induced sequence of play converges to Nash Equilibrium (NE) with probability 1, even if the delay between choosing an action and receiving the corresponding reward is unbounded.}
}
@InProceedings{pmlr-v119-henaff20a,
title = {Data-Efficient Image Recognition with Contrastive Predictive Coding},
author = {Henaff, Olivier},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4182--4192},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/henaff20a/henaff20a.pdf},
url = {http://proceedings.mlr.press/v119/henaff20a.html},
abstract = {Human observers can learn to recognize new categories of images from a handful of examples, yet doing so with artificial ones remains an open challenge. We hypothesize that data-efficient recognition is enabled by representations which make the variability in natural signals more predictable. We therefore revisit and improve Contrastive Predictive Coding, an unsupervised objective for learning such representations. This new implementation produces features which support state-of-the-art linear classification accuracy on the ImageNet dataset. When used as input for non-linear classification with deep neural networks, this representation allows us to use 2-5x less labels than classifiers trained directly on image pixels. Finally, this unsupervised representation substantially improves transfer learning to object detection on the PASCAL VOC dataset, surpassing fully supervised pre-trained ImageNet classifiers.}
}
@InProceedings{pmlr-v119-hendrickx20a,
title = {Minimax Rate for Learning From Pairwise Comparisons in the {BTL} Model},
author = {Hendrickx, Julien and Olshevsky, Alex and Saligrama, Venkatesh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4193--4202},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hendrickx20a/hendrickx20a.pdf},
url = {http://proceedings.mlr.press/v119/hendrickx20a.html},
abstract = {We consider the problem of learning the qualities $w_1, \ldots, w_n$ of a collection of items by performing noisy comparisons among them. We assume there is a fixed ``comparison graph'' and every neighboring pair of items is compared $k$ times. We will study the popular Bradley-Terry-Luce model, where the probability that item $i$ wins a comparison against $j$ equals $w_i/(w_i + w_j)$. We are interested in how the expected error in estimating the vector $w = (w_1, \ldots, w_n)$ behaves in the regime when the number of comparisons $k$ is large. Our contribution is the determination of the minimax rate up to a constant factor. We show that this rate is achieved by a simple algorithm based on weighted least squares, with weights determined from the empirical outcomes of the comparisons. This algorithm can be implemented in nearly linear time in the total number of comparisons.}
}
@InProceedings{pmlr-v119-hendrikx20a,
title = {Statistically Preconditioned Accelerated Gradient Method for Distributed Optimization},
author = {Hendrikx, Hadrien and Xiao, Lin and Bubeck, Sebastien and Bach, Francis and Massoulie, Laurent},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4203--4227},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hendrikx20a/hendrikx20a.pdf},
url = {http://proceedings.mlr.press/v119/hendrikx20a.html},
abstract = {We consider the setting of distributed empirical risk minimization where multiple machines compute the gradients in parallel and a centralized server updates the model parameters. In order to reduce the number of communications required to reach a given accuracy, we propose a preconditioned accelerated gradient method where the preconditioning is done by solving a local optimization problem over a subsampled dataset at the server. The convergence rate of the method depends on the square root of the relative condition number between the global and local loss functions. We estimate the relative condition number for linear prediction models by studying uniform concentration of the Hessians over a bounded domain, which allows us to derive improved convergence rates for existing preconditioned gradient methods and our accelerated method. Experiments on real-world datasets illustrate the benefits of acceleration in the ill-conditioned regime.}
}
@InProceedings{pmlr-v119-heo20a,
title = {Cost-Effective Interactive Attention Learning with Neural Attention Processes},
author = {Heo, Jay and Park, Junhyeon and Jeong, Hyewon and Kim, Kwang Joon and Lee, Juho and Yang, Eunho and Hwang, Sung Ju},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4228--4238},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/heo20a/heo20a.pdf},
url = {http://proceedings.mlr.press/v119/heo20a.html},
abstract = {We propose a novel interactive learning framework which we refer to as Interactive Attention Learning (IAL), in which the human supervisors interactively manipulate the allocated attentions, to correct the model's behaviour by updating the attention-generating network. However, such a model is prone to overfitting due to scarcity of human annotations, and requires costly retraining. Moreover, it is almost infeasible for the human annotators to examine attentions on tons of instances and features. We tackle these challenges by proposing a sample-efficient attention mechanism and a cost-effective reranking algorithm for instances and features. First, we propose Neural Attention Processes (NAP), which is an attention generator that can update its behaviour by incorporating new attention-level supervisions without any retraining. Secondly, we propose an algorithm which prioritizes the instances and the features by their negative impacts, such that the model can yield large improvements with minimal human feedback. We validate IAL on various time-series datasets from multiple domains (healthcare, real-estate, and computer vision) on which it significantly outperforms baselines with conventional attention mechanisms, or without cost-effective reranking, with substantially less retraining and human-model interaction cost.}
}
@InProceedings{pmlr-v119-hermans20a,
title = {Likelihood-free {MCMC} with Amortized Approximate Ratio Estimators},
author = {Hermans, Joeri and Begy, Volodimir and Louppe, Gilles},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4239--4248},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hermans20a/hermans20a.pdf},
url = {http://proceedings.mlr.press/v119/hermans20a.html},
abstract = {Posterior inference with an intractable likelihood is becoming an increasingly common task in scientific domains which rely on sophisticated computer simulations. Typically, these forward models do not admit tractable densities forcing practitioners to rely on approximations. This work introduces a novel approach to address the intractability of the likelihood and the marginal model. We achieve this by learning a flexible amortized estimator which approximates the likelihood-to-evidence ratio. We demonstrate that the learned ratio estimator can be embedded in MCMC samplers to approximate likelihood-ratios between consecutive states in the Markov chain, allowing us to draw samples from the intractable posterior. Techniques are presented to improve the numerical stability and to measure the quality of an approximation. The accuracy of our approach is demonstrated on a variety of benchmarks against well-established techniques. Scientific applications in physics show its applicability.}
}
@InProceedings{pmlr-v119-hinder20a,
title = {Towards Non-Parametric Drift Detection via Dynamic Adapting Window Independence Drift Detection ({DAWIDD})},
author = {Hinder, Fabian and Artelt, Andr{\'e} and Hammer, Barbara},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4249--4259},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hinder20a/hinder20a.pdf},
url = {http://proceedings.mlr.press/v119/hinder20a.html},
abstract = {The notion of concept drift refers to the phenomenon that the distribution, which is underlying the observed data, changes over time; as a consequence machine learning models may become inaccurate and need adjustment. Many online learning schemes include drift detection to actively detect and react to observed changes. Yet, reliable drift detection constitutes a challenging problem in particular in the context of high dimensional data, varying drift characteristics, and the absence of a parametric model such as a classification scheme which reflects the drift. In this paper we present a novel concept drift detection method, Dynamic Adapting Window Independence Drift Detection (DAWIDD), which aims for non-parametric drift detection of diverse drift characteristics. For this purpose, we establish a mathematical equivalence of the presence of drift to the dependency of specific random variables in an according drift process. This allows us to rely on independence tests rather than parametric models or the classification loss, resulting in a fairly robust scheme to universally detect different types of drift, as it is also confirmed in experiments.}
}
@InProceedings{pmlr-v119-hiranandani20a,
title = {Optimization and Analysis of the {pAp@k} Metric for Recommender Systems},
author = {Hiranandani, Gaurush and Vijitbenjaronk, Warut and Koyejo, Sanmi and Jain, Prateek},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4260--4270},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hiranandani20a/hiranandani20a.pdf},
url = {http://proceedings.mlr.press/v119/hiranandani20a.html},
abstract = {Modern recommendation and notification systems must be robust to data imbalance, limitations on the number of recommendations/notifications, and heterogeneous engagement profiles across users. The pAp@k metric, which combines the partial-AUC and the precision@k metrics, was recently proposed to evaluate such recommendation systems and has been used in real-world deployments. Conceptually, pAp@k measures the probability of correctly ranking a top-ranked positive instance over top-ranked negative instances. Due to the combinatorial aspect surfaced by top-ranked points, little is known about the characteristics and optimization methods of pAp@k. In this paper, we analyze the learning-theoretic properties of pAp@k, particularly its benefits in evaluating modern recommender systems, and propose novel surrogates that are consistent under certain data regularity conditions. We then provide gradient descent based algorithms to optimize the surrogates directly. Our analysis and experimental evaluation suggest that pAp@k indeed exhibits a certain dual behavior with respect to partial-AUC and precision@k. Moreover, the proposed methods outperform all the baselines in various applications. Taken together, our results motivate the use of pAp@k for large-scale recommender systems with heterogeneous user-engagement.}
}
@InProceedings{pmlr-v119-hoang20a,
title = {Optimizing Dynamic Structures with {Bayesian} Generative Search},
author = {Hoang, Minh and Kingsford, Carleton},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4271--4281},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hoang20a/hoang20a.pdf},
url = {http://proceedings.mlr.press/v119/hoang20a.html},
abstract = {Kernel selection for kernel-based methods is prohibitively expensive due to the NP-hard nature of discrete optimization. Since gradient-based optimizers are not applicable due to the lack of a differentiable objective function, many state-of-the-art solutions resort to heuristic search or gradient-free optimization. These approaches, however, require imposing restrictive assumptions on the explorable space of structures such as limiting the active candidate pool, thus depending heavily on the intuition of domain experts. This paper instead proposes DTERGENS, a novel generative search framework that constructs and optimizes a high-performance composite kernel expressions generator. DTERGENS does not restrict the space of candidate kernels and is capable of obtaining flexible length expressions by jointly optimizing a generative termination criterion. We demonstrate that our framework explores more diverse kernels and obtains better performance than state-of-the-art approaches on many real-world predictive tasks.}
}
@InProceedings{pmlr-v119-hoang20b,
title = {Learning Task-Agnostic Embedding of Multiple Black-Box Experts for Multi-Task Model Fusion},
author = {Hoang, Nghia and Lam, Thanh and Low, Bryan Kian Hsiang and Jaillet, Patrick},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4282--4292},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hoang20b/hoang20b.pdf},
url = {http://proceedings.mlr.press/v119/hoang20b.html},
abstract = {Model fusion is an emerging study in collective learning where heterogeneous experts with private data and learning architectures need to combine their black-box knowledge for better performance. Existing literature achieves this via a local knowledge distillation scheme that transfuses the predictive patterns of each pre-trained expert onto a white-box imitator model, which can be incorporated efficiently into a global model. This scheme however does not extend to multi-task scenarios where different experts were trained to solve different tasks and only part of their distilled knowledge is relevant to a new task. To address this multi-task challenge, we develop a new fusion paradigm that represents each expert as a distribution over a spectrum of predictive prototypes, which are isolated from task-specific information encoded within the prototype distribution. The task-agnostic prototypes can then be reintegrated to generate a new model that solves a new task encoded with a different prototype distribution. The fusion and adaptation performance of the proposed framework is demonstrated empirically on several real-world benchmark datasets.}
}
@InProceedings{pmlr-v119-hoang20c,
title = {Parameterized Rate-Distortion Stochastic Encoder},
author = {Hoang, Quan and Le, Trung and Phung, Dinh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4293--4303},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hoang20c/hoang20c.pdf},
url = {http://proceedings.mlr.press/v119/hoang20c.html},
abstract = {We propose a novel gradient-based tractable approach for the Blahut-Arimoto (BA) algorithm to compute the rate-distortion function where the BA algorithm is fully parameterized. This results in a rich and flexible framework to learn a new class of stochastic encoders, termed PArameterized RAte-DIstortion Stochastic Encoder (PARADISE). The framework can be applied to a wide range of settings from semi-supervised, multi-task to supervised and robust learning. We show that the training objective of PARADISE can be seen as a form of regularization that helps improve generalization. With an emphasis on robust learning we further develop a novel posterior matching objective to encourage smoothness on the loss function and show that PARADISE can significantly improve interpretability as well as robustness to adversarial attacks on the CIFAR-10 and ImageNet datasets. In particular, on the CIFAR-10 dataset, our model reduces standard and adversarial error rates in comparison to the state-of-the-art by 50\% and 41\%, respectively without the expensive computational cost of adversarial training.}
}
@InProceedings{pmlr-v119-hofer20a,
title = {Topologically Densified Distributions},
author = {Hofer, Christoph and Graf, Florian and Niethammer, Marc and Kwitt, Roland},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4304--4313},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hofer20a/hofer20a.pdf},
url = {http://proceedings.mlr.press/v119/hofer20a.html},
abstract = {We study regularization in the context of small sample-size learning with over-parametrized neural networks. Specifically, we shift focus from architectural properties, such as norms on the network weights, to properties of the internal representations before a linear classifier. Specifically, we impose a topological constraint on samples drawn from the probability measure induced in that space. This provably leads to mass concentration effects around the representations of training instances, i.e., a property beneficial for generalization. By leveraging previous work to impose topological constraints in a neural network setting, we provide empirical evidence (across various vision benchmarks) to support our claim for better generalization.}
}
@InProceedings{pmlr-v119-hofer20b,
title = {Graph Filtration Learning},
author = {Hofer, Christoph and Graf, Florian and Rieck, Bastian and Niethammer, Marc and Kwitt, Roland},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4314--4323},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hofer20b/hofer20b.pdf},
url = {http://proceedings.mlr.press/v119/hofer20b.html},
abstract = {We propose an approach to learning with graph-structured data in the problem domain of graph classification. In particular, we present a novel type of readout operation to aggregate node features into a graph-level representation. To this end, we leverage persistent homology computed via a real-valued, learnable, filter function. We establish the theoretical foundation for differentiating through the persistent homology computation. Empirically, we show that this type of readout operation compares favorably to previous techniques, especially when the graph connectivity structure is informative for the learning problem.}
}
@InProceedings{pmlr-v119-hoffman20a,
title = {Black-Box Variational Inference as a Parametric Approximation to {Langevin} Dynamics},
author = {Hoffman, Matthew and Ma, Yian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4324--4341},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hoffman20a/hoffman20a.pdf},
url = {http://proceedings.mlr.press/v119/hoffman20a.html},
abstract = {Variational inference (VI) and Markov chain Monte Carlo (MCMC) are approximate posterior inference algorithms that are often said to have complementary strengths, with VI being fast but biased and MCMC being slower but asymptotically unbiased. In this paper, we analyze gradient-based MCMC and VI procedures and find theoretical and empirical evidence that these procedures are not as different as one might think. In particular, a close examination of the Fokker-Planck equation that governs the Langevin dynamics (LD) MCMC procedure reveals that LD implicitly follows a gradient flow that corresponds to a variational inference procedure based on optimizing a nonparametric normalizing flow. This result suggests that the transient bias of LD (due to the Markov chain not having burned in) may track that of VI (due to the optimizer not having converged), up to differences due to VI's asymptotic bias and parameterization. Empirically, we find that the transient biases of these algorithms (and their momentum-accelerated counterparts) do evolve similarly. This suggests that practitioners with a limited time budget may get more accurate results by running an MCMC procedure (even if it's far from burned in) than a VI procedure, as long as the variance of the MCMC estimator can be dealt with (e.g., by running many parallel chains).}
}
@InProceedings{pmlr-v119-hoffmann20a,
title = {Learning Mixtures of Graphs from Epidemic Cascades},
author = {Hoffmann, Jessica and Basu, Soumya and Goel, Surbhi and Caramanis, Constantine},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4342--4352},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hoffmann20a/hoffmann20a.pdf},
url = {http://proceedings.mlr.press/v119/hoffmann20a.html},
abstract = {We consider the problem of learning the weighted edges of a balanced mixture of two undirected graphs from epidemic cascades. While mixture models are popular modeling tools, algorithmic development with rigorous guarantees has lagged. Graph mixtures are apparently no exception: until now, very little is known about whether this problem is solvable. To the best of our knowledge, we establish the first necessary and sufficient conditions for this problem to be solvable in polynomial time on edge-separated graphs. When the conditions are met, i.e., when the graphs are connected with at least three edges, we give an efficient algorithm for learning the weights of both graphs with optimal sample complexity (up to log factors). We give complementary results and provide sample-optimal (up to log factors) algorithms for mixtures of directed graphs of out-degree at least three, and for mixture of undirected graphs of unbalanced and/or unknown priors.}
}
@InProceedings{pmlr-v119-horn20a,
title = {Set Functions for Time Series},
author = {Horn, Max and Moor, Michael and Bock, Christian and Rieck, Bastian and Borgwardt, Karsten},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4353--4363},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/horn20a/horn20a.pdf},
url = {http://proceedings.mlr.press/v119/horn20a.html},
abstract = {Despite the eminent successes of deep neural networks, many architectures are often hard to transfer to irregularly-sampled and asynchronous time series that commonly occur in real-world datasets, especially in healthcare applications. This paper proposes a novel approach for classifying irregularly-sampled time series with unaligned measurements, focusing on high scalability and data efficiency. Our method SeFT (Set Functions for Time Series) is based on recent advances in differentiable set function learning, extremely parallelizable with a beneficial memory footprint, thus scaling well to large datasets of long time series and online monitoring scenarios. Furthermore, our approach permits quantifying per-observation contributions to the classification outcome. We extensively compare our method with existing algorithms on multiple healthcare time series datasets and demonstrate that it performs competitively whilst significantly reducing runtime.}
}
@InProceedings{pmlr-v119-hornakova20a,
title = {Lifted Disjoint Paths with Application in Multiple Object Tracking},
author = {Hornakova, Andrea and Henschel, Roberto and Rosenhahn, Bodo and Swoboda, Paul},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4364--4375},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hornakova20a/hornakova20a.pdf},
url = {http://proceedings.mlr.press/v119/hornakova20a.html},
abstract = {We present an extension to the disjoint paths problem in which additional lifted edges are introduced to provide path connectivity priors. We call the resulting optimization problem the lifted disjoint paths problem. We show that this problem is NP-hard by reduction from integer multicommodity flow and 3-SAT. To enable practical global optimization, we propose several classes of linear inequalities that produce a high-quality LP-relaxation. Additionally, we propose efficient cutting plane algorithms for separating the proposed linear inequalities. The lifted disjoint path problem is a natural model for multiple object tracking and allows an elegant mathematical formulation for long range temporal interactions. Lifted edges help to prevent id switches and to re-identify persons. Our lifted disjoint paths tracker achieves nearly optimal assignments with respect to input detections. As a consequence, it leads on all three main benchmarks of the MOT challenge, improving significantly over state-of-the-art.}
}
@InProceedings{pmlr-v119-hron20a,
title = {Infinite attention: {NNGP} and {NTK} for deep attention networks},
author = {Hron, Jiri and Bahri, Yasaman and Sohl-Dickstein, Jascha and Novak, Roman},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4376--4386},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hron20a/hron20a.pdf},
url = {http://proceedings.mlr.press/v119/hron20a.html},
abstract = {There is a growing amount of literature on the relationship between wide neural networks (NNs) and Gaussian processes (GPs), identifying an equivalence between the two for a variety of NN architectures. This equivalence enables, for instance, accurate approximation of the behaviour of wide Bayesian NNs without MCMC or variational approximations, or characterisation of the distribution of randomly initialised wide NNs optimised by gradient descent without ever running an optimiser. We provide a rigorous extension of these results to NNs involving attention layers, showing that unlike single-head attention, which induces non-Gaussian behaviour, multi-head attention architectures behave as GPs as the number of heads tends to infinity. We further discuss the effects of positional encodings and layer normalisation, and propose modifications of the attention mechanism which lead to improved results for both finite and infinitely wide NNs. We evaluate attention kernels empirically, leading to a moderate improvement upon the previous state-of-the-art on CIFAR-10 for GPs without trainable kernels and advanced data preprocessing. Finally, we introduce new features to the Neural Tangents library (Novak et al., 2020) allowing applications of NNGP/NTK models, with and without attention, to variable-length sequences, with an example on the IMDb reviews dataset.}
}
@InProceedings{pmlr-v119-hsieh20a,
title = {The Non-{IID} Data Quagmire of Decentralized Machine Learning},
author = {Hsieh, Kevin and Phanishayee, Amar and Mutlu, Onur and Gibbons, Phillip},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4387--4398},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hsieh20a/hsieh20a.pdf},
url = {http://proceedings.mlr.press/v119/hsieh20a.html},
abstract = {Many large-scale machine learning (ML) applications need to perform decentralized learning over datasets generated at different devices and locations. Such datasets pose a significant challenge to decentralized learning because their different contexts result in significant data distribution skew across devices/locations. In this paper, we take a step toward better understanding this challenge by presenting a detailed experimental study of decentralized DNN training on a common type of data skew: skewed distribution of data labels across devices/locations. Our study shows that: (i) skewed data labels are a fundamental and pervasive problem for decentralized learning, causing significant accuracy loss across many ML applications, DNN models, training datasets, and decentralized learning algorithms; (ii) the problem is particularly challenging for DNN models with batch normalization; and (iii) the degree of data skew is a key determinant of the difficulty of the problem. Based on these findings, we present SkewScout, a system-level approach that adapts the communication frequency of decentralized learning algorithms to the (skew-induced) accuracy loss between data partitions. We also show that group normalization can recover much of the accuracy loss of batch normalization.}
}
@InProceedings{pmlr-v119-hu20a,
title = {``{Other}-Play'' for Zero-Shot Coordination},
author = {Hu, Hengyuan and Lerer, Adam and Peysakhovich, Alex and Foerster, Jakob},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4399--4410},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hu20a/hu20a.pdf},
url = {http://proceedings.mlr.press/v119/hu20a.html},
abstract = {We consider the problem of zero-shot coordination - constructing AI agents that can coordinate with novel partners they have not seen before (e.g., humans). Standard Multi-Agent Reinforcement Learning (MARL) methods typically focus on the self-play (SP) setting where agents construct strategies by playing the game with themselves repeatedly. Unfortunately, applying SP naively to the zero-shot coordination problem can produce agents that establish highly specialized conventions that do not carry over to novel partners they have not been trained with. We introduce a novel learning algorithm called other-play (OP), that enhances self-play by looking for more robust strategies. We characterize OP theoretically as well as experimentally. We study the cooperative card game Hanabi and show that OP agents achieve higher scores when paired with independently trained agents as well as with human players than SP agents.}
}
@InProceedings{pmlr-v119-hu20b,
title = {{XTREME}: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation},
author = {Hu, Junjie and Ruder, Sebastian and Siddhant, Aditya and Neubig, Graham and Firat, Orhan and Johnson, Melvin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4411--4421},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/hu20b/hu20b.pdf},
url = {http://proceedings.mlr.press/v119/hu20b.html},
abstract = {Much recent progress in applications of machine learning models to NLP has been driven by benchmarks that evaluate models across a wide variety of tasks. However, these broad-coverage benchmarks have been mostly limited to English, and despite an increasing interest in multilingual models, a benchmark that enables the comprehensive evaluation of such methods on a diverse range of languages and tasks is still missing. To this end, we introduce the Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME) benchmark, a multi-task benchmark for evaluating the cross-lingual generalization capabilities of multilingual representations across 40 languages and 9 tasks. We demonstrate that while models tested on English reach human performance on many tasks, there is still a sizable gap in the performance of cross-lingually transferred models, particularly on syntactic and sentence retrieval tasks. There is also a wide spread of results across languages. We will release the benchmark to encourage research on cross-lingual learning methods that transfer linguistic knowledge across a diverse and representative set of languages and tasks.}
}
@InProceedings{pmlr-v119-huang20a,
title = {Momentum-Based Policy Gradient Methods},
author = {Huang, Feihu and Gao, Shangqian and Pei, Jian and Huang, Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4422--4433},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20a/huang20a.pdf},
url = {http://proceedings.mlr.press/v119/huang20a.html},
abstract = {In the paper, we propose a class of efficient momentum-based policy gradient methods for the model-free reinforcement learning, which use adaptive learning rates and do not require any large batches. Specifically, we propose a fast important-sampling momentum-based policy gradient (IS-MBPG) method based on a new momentum-based variance reduced technique and the importance sampling technique. We also propose a fast Hessian-aided momentum-based policy gradient (HA-MBPG) method based on the momentum-based variance reduced technique and the Hessian-aided technique. Moreover, we prove that both the IS-MBPG and HA-MBPG methods reach the best known sample complexity of $O(\epsilon^{-3})$ for finding an $\epsilon$-stationary point of the nonconcave performance function, which only require one trajectory at each iteration. In particular, we present a non-adaptive version of IS-MBPG method, i.e., IS-MBPG*, which also reaches the best known sample complexity of $O(\epsilon^{-3})$ without any large batches. In the experiments, we apply four benchmark tasks to demonstrate the effectiveness of our algorithms.}
}
@InProceedings{pmlr-v119-huang20b,
title = {From Importance Sampling to Doubly Robust Policy Gradient},
author = {Huang, Jiawei and Jiang, Nan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4434--4443},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20b/huang20b.pdf},
url = {http://proceedings.mlr.press/v119/huang20b.html},
abstract = {We show that on-policy policy gradient (PG) and its variance reduction variants can be derived by taking finite-difference of function evaluations supplied by estimators from the importance sampling (IS) family for off-policy evaluation (OPE). Starting from the doubly robust (DR) estimator (Jiang \& Li, 2016), we provide a simple derivation of a very general and flexible form of PG, which subsumes the state-of-the-art variance reduction technique (Cheng et al., 2019) as its special case and immediately hints at further variance reduction opportunities overlooked by existing literature. We analyze the variance of the new DR-PG estimator, compare it to existing methods as well as the Cramer-Rao lower bound of policy gradient, and empirically show its effectiveness.}
}
@InProceedings{pmlr-v119-huang20c,
title = {Evaluating Lossy Compression Rates of Deep Generative Models},
author = {Huang, Sicong and Makhzani, Alireza and Cao, Yanshuai and Grosse, Roger},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4444--4454},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20c/huang20c.pdf},
url = {http://proceedings.mlr.press/v119/huang20c.html},
abstract = {The field of deep generative modeling has succeeded in producing astonishingly realistic-seeming images and audio, but quantitative evaluation remains a challenge. Log-likelihood is an appealing metric due to its grounding in statistics and information theory, but it can be challenging to estimate for implicit generative models, and scalar-valued metrics give an incomplete picture of a model’s quality. In this work, we propose to use rate distortion (RD) curves to evaluate and compare deep generative models. While estimating RD curves is seemingly even more computationally demanding than log-likelihood estimation, we show that we can approximate the entire RD curve using nearly the same computations as were previously used to achieve a single log-likelihood estimate. We evaluate lossy compression rates of VAEs, GANs, and adversarial autoencoders (AAEs) on the MNIST and CIFAR10 datasets. Measuring the entire RD curve gives a more complete picture than scalar-valued metrics, and we arrive at a number of insights not obtainable from log-likelihoods alone.}
}
@InProceedings{pmlr-v119-huang20d,
title = {One Policy to Control Them All: Shared Modular Policies for Agent-Agnostic Control},
author = {Huang, Wenlong and Mordatch, Igor and Pathak, Deepak},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4455--4464},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20d/huang20d.pdf},
url = {http://proceedings.mlr.press/v119/huang20d.html},
abstract = {Reinforcement learning is typically concerned with learning control policies tailored to a particular agent. We investigate whether there exists a single global policy that can generalize to control a wide variety of agent morphologies – ones in which even dimensionality of state and action spaces changes. We propose to express this global policy as a collection of identical modular neural networks, dubbed as Shared Modular Policies (SMP), that correspond to each of the agent’s actuators. Every module is only responsible for controlling its corresponding actuator and receives information from only its local sensors. In addition, messages are passed between modules, propagating information between distant modules. We show that a single modular policy can successfully generate locomotion behaviors for several planar agents with different skeletal structures such as monopod hoppers, quadrupeds, bipeds, and generalize to variants not seen during training – a process that would normally require training and manual hyperparameter tuning for each morphology. We observe that a wide variety of drastically diverse locomotion styles across morphologies as well as centralized coordination emerges via message passing between decentralized modules purely from the reinforcement learning objective. Videos and code at https://huangwl18.github.io/modular-rl/}
}
@InProceedings{pmlr-v119-huang20e,
title = {Communication-Efficient Distributed {PCA} by {Riemannian} Optimization},
author = {Huang, Long-Kai and Pan, Sinno},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4465--4474},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20e/huang20e.pdf},
url = {http://proceedings.mlr.press/v119/huang20e.html},
abstract = {In this paper, we study the leading eigenvector problem in a statistically distributed setting and propose a communication-efficient algorithm based on Riemannian optimization, which trades local computation for global communication. Theoretical analysis shows that the proposed algorithm linearly converges to the centralized empirical risk minimization solution regarding the number of communication rounds. When the number of data points in local machines is sufficiently large, the proposed algorithm achieves a significant reduction of communication cost over existing distributed PCA algorithms. Superior performance in terms of communication cost of the proposed algorithm is verified on real-world and synthetic datasets.}
}
@InProceedings{pmlr-v119-huang20f,
title = {Improving Transformer Optimization Through Better Initialization},
author = {Huang, Xiao Shi and Perez, Felipe and Ba, Jimmy and Volkovs, Maksims},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4475--4483},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20f/huang20f.pdf},
url = {http://proceedings.mlr.press/v119/huang20f.html},
abstract = {The Transformer architecture has achieved considerable success recently; the key component of the Transformer is the attention layer that enables the model to focus on important regions within an input sequence. Gradient optimization with attention layers can be notoriously difficult requiring tricks such as learning rate warmup to prevent divergence. As Transformer models are becoming larger and more expensive to train, recent research has focused on understanding and improving optimization in these architectures. In this work our contributions are two-fold: we first investigate and empirically validate the source of optimization problems in the encoder-decoder Transformer architecture; we then propose a new weight initialization scheme with theoretical justification, that enables training without warmup or layer normalization. Empirical results on public machine translation benchmarks show that our approach achieves leading accuracy, allowing to train deep Transformer models with 200 layers in both encoder and decoder (over 1000 attention/MLP blocks) without difficulty. Code for this work is available here: \url{https://github.com/layer6ai-labs/T-Fixup}.}
}
@InProceedings{pmlr-v119-huang20g,
title = {More Information Supervised Probabilistic Deep Face Embedding Learning},
author = {Huang, Ying and Qiu, Shangfeng and Zhang, Wenwei and Luo, Xianghui and Wang, Jinzhuo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4484--4494},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20g/huang20g.pdf},
url = {http://proceedings.mlr.press/v119/huang20g.html},
abstract = {Researches using margin based comparison loss demonstrate the effectiveness of penalizing the distance between face feature and their corresponding class centers. Despite their popularity and excellent performance, they do not explicitly encourage the generic embedding learning for an open set recognition problem. In this paper, we analyse margin based softmax loss in probability view. With this perspective, we propose two general principles: 1) monotonically decreasing and 2) margin probability penalty, for designing new margin loss functions. Unlike methods optimized with single comparison metric, we provide a new perspective to treat open set face recognition as a problem of information transmission. And the generalization capability for face embedding is gained with more clean information. An auto-encoder architecture called Linear-Auto-TS-Encoder (LATSE) is proposed to corroborate this finding. Extensive experiments on several benchmarks demonstrate that LATSE help face embedding to gain more generalization capability and it boost the single model performance with open training dataset to more than 99\% on MegaFace test.}
}
@InProceedings{pmlr-v119-huang20h,
title = {Generating Programmatic Referring Expressions via Program Synthesis},
author = {Huang, Jiani and Smith, Calvin and Bastani, Osbert and Singh, Rishabh and Albarghouthi, Aws and Naik, Mayur},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4495--4506},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20h/huang20h.pdf},
url = {http://proceedings.mlr.press/v119/huang20h.html},
abstract = {Incorporating symbolic reasoning into machine learning algorithms is a promising approach to improve performance on learning tasks that require logical reasoning. We study the problem of generating a programmatic variant of referring expressions that we call referring relational programs. In particular, given a symbolic representation of an image and a target object in that image, the goal is to generate a relational program that uniquely identifies the target object in terms of its attributes and its relations to other objects in the image. We propose a neurosymbolic program synthesis algorithm that combines a policy neural network with enumerative search to generate such relational programs. The policy neural network employs a program interpreter that provides immediate feedback on the consequences of the decisions made by the policy, and also takes into account the uncertainty in the symbolic representation of the image. We evaluate our algorithm on challenging benchmarks based on the CLEVR dataset, and demonstrate that our approach significantly outperforms several baselines.}
}
@InProceedings{pmlr-v119-huang20i,
title = {{InstaHide}: Instance-hiding Schemes for Private Distributed Learning},
author = {Huang, Yangsibo and Song, Zhao and Li, Kai and Arora, Sanjeev},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4507--4518},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20i/huang20i.pdf},
url = {http://proceedings.mlr.press/v119/huang20i.html},
abstract = {How can multiple distributed entities train a shared deep net on their private data while protecting data privacy? This paper introduces InstaHide, a simple encryption of training images. Encrypted images can be used in standard deep learning pipelines (PyTorch, Federated Learning etc.) with no additional setup or infrastructure. The encryption has a minor effect on test accuracy (unlike differential privacy). Encryption consists of mixing the image with a set of other images (in the sense of Mixup data augmentation technique (Zhang et al., 2018)) followed by applying a random pixel-wise mask on the mixed image. Other contributions of this paper are: (a) Use of large public dataset of images (e.g. ImageNet) for mixing during encryption; this improves security. (b) Experiments demonstrating effectiveness in protecting privacy against known attacks while preserving model accuracy. (c) Theoretical analysis showing that successfully attacking privacy requires attackers to solve a difficult computational problem. (d) Demonstration that Mixup alone is insecure as (contrary to recent proposals), by showing some efficient attacks. (e) Release of a challenge dataset to allow design of new attacks.}
}
@InProceedings{pmlr-v119-huang20j,
title = {Accelerated Stochastic Gradient-free and Projection-free Methods},
author = {Huang, Feihu and Tao, Lue and Chen, Songcan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4519--4530},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20j/huang20j.pdf},
url = {http://proceedings.mlr.press/v119/huang20j.html},
abstract = {In the paper, we propose a class of accelerated stochastic gradient-free and projection-free (a.k.a., zeroth-order Frank-Wolfe) methods to solve the constrained stochastic and finite-sum nonconvex optimization. Specifically, we propose an accelerated stochastic zeroth-order Frank-Wolfe (Acc-SZOFW) method based on the variance reduced technique of SPIDER/SpiderBoost and a novel momentum accelerated technique. Moreover, under some mild conditions, we prove that the Acc-SZOFW has the function query complexity of $O(d\sqrt{n}\epsilon^{-2})$ for finding an $\epsilon$-stationary point in the finite-sum problem, which improves the existing best result by a factor of $O(\sqrt{n}\epsilon^{-2})$, and has the function query complexity of $O(d\epsilon^{-3})$ in the stochastic problem, which improves the existing best result by a factor of $O(\epsilon^{-1})$. To relax the large batches required in the Acc-SZOFW, we further propose a novel accelerated stochastic zeroth-order Frank-Wolfe (Acc-SZOFW*) based on a new variance reduced technique of STORM, which still reaches the function query complexity of $O(d\epsilon^{-3})$ in the stochastic problem without relying on any large batches. In particular, we present an accelerated framework of the Frank-Wolfe methods based on the proposed momentum accelerated technique. The extensive experimental results on black-box adversarial attack and robust black-box classification demonstrate the efficiency of our algorithms.}
}
@InProceedings{pmlr-v119-huang20k,
title = {Deep Graph Random Process for Relational-Thinking-Based Speech Recognition},
author = {Huang, Hengguan and Xue, Fuzhao and Wang, Hao and Wang, Ye},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4531--4541},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20k/huang20k.pdf},
url = {http://proceedings.mlr.press/v119/huang20k.html},
abstract = {Lying at the core of human intelligence, relational thinking is characterized by initially relying on innumerable unconscious percepts pertaining to relations between new sensory signals and prior knowledge, consequently becoming a recognizable concept or object through coupling and transformation of these percepts. Such mental processes are difficult to model in real-world problems such as in conversational automatic speech recognition (ASR), as the percepts (if they are modelled as graphs indicating relationships among utterances) are supposed to be innumerable and not directly observable. In this paper, we present a Bayesian nonparametric deep learning method called deep graph random process (DGP) that can generate an infinite number of probabilistic graphs representing percepts. We further provide a closed-form solution for coupling and transformation of these percept graphs for acoustic modeling. Our approach is able to successfully infer relations among utterances without using any relational data during training. Experimental evaluations on ASR tasks including CHiME-2 and CHiME-5 demonstrate the effectiveness and benefits of our method.}
}
@InProceedings{pmlr-v119-huang20l,
title = {Dynamics of Deep Neural Networks and Neural Tangent Hierarchy},
author = {Huang, Jiaoyang and Yau, Horng-Tzer},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4542--4551},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huang20l/huang20l.pdf},
url = {http://proceedings.mlr.press/v119/huang20l.html},
abstract = {The evolution of a deep neural network trained by the gradient descent in the overparametrization regime can be described by its neural tangent kernel (NTK) (Jacot et al., 2018; Du et al., 2018; Arora et al., 2019). It was observed (Arora et al., 2019) that there is a performance gap between the kernel regression using the limiting NTK and the deep neural networks. We study the dynamic of neural networks of finite width and derive an infinite hierarchy of differential equations, the neural tangent hierarchy (NTH). We prove that the NTH hierarchy truncated at the level $p\geq 2$ approximates the dynamic of the NTK up to arbitrary precision under certain conditions on the neural network width and the data set dimension. The assumptions needed for these approximations become weaker as $p$ increases. Finally, NTH can be viewed as higher order extensions of NTK. In particular, the NTH truncated at $p=2$ recovers the NTK dynamics.}
}
@InProceedings{pmlr-v119-huh20a,
title = {Curvature-corrected learning dynamics in deep neural networks},
author = {Huh, Dongsung},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4552--4560},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huh20a/huh20a.pdf},
url = {http://proceedings.mlr.press/v119/huh20a.html},
abstract = {Deep neural networks exhibit complex learning dynamics due to their non-convex loss landscapes. Second-order optimization methods facilitate learning dynamics by compensating for ill-conditioned curvature. In this work, we investigate how curvature correction modifies the learning dynamics in deep linear neural networks and provide analytical solutions. We derive a generalized conservation law that preserves the path of parameter dynamics from curvature correction, which shows that curvature correction only modifies the temporal profiles of dynamics along the path. We show that while curvature correction accelerates the convergence dynamics of the input-output map, it can also negatively affect the generalization performance. Our analysis also reveals an undesirable effect of curvature correction that compromises stability of parameters dynamics during learning, especially with block-diagonal approximation of natural gradient descent. We introduce fractional curvature correction that resolves this problem while retaining most of the acceleration benefits of full curvature correction.}
}
@InProceedings{pmlr-v119-huynh20a,
title = {Multigrid Neural Memory},
author = {Huynh, Tri and Maire, Michael and Walter, Matthew},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4561--4571},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/huynh20a/huynh20a.pdf},
url = {http://proceedings.mlr.press/v119/huynh20a.html},
abstract = {We introduce a novel approach to endowing neural networks with emergent, long-term, large-scale memory. Distinct from strategies that connect neural networks to external memory banks via intricately crafted controllers and hand-designed attentional mechanisms, our memory is internal, distributed, co-located alongside computation, and implicitly addressed, while being drastically simpler than prior efforts. Architecting networks with multigrid structure and connectivity, while distributing memory cells alongside computation throughout this topology, we observe the emergence of coherent memory subsystems. Our hierarchical spatial organization, parameterized convolutionally, permits efficient instantiation of large-capacity memories, while multigrid topology provides short internal routing pathways, allowing convolutional networks to efficiently approximate the behavior of fully connected networks. Such networks have an implicit capacity for internal attention; augmented with memory, they learn to read and write specific memory locations in a dynamic data-dependent manner. We demonstrate these capabilities on exploration and mapping tasks, where our network is able to self-organize and retain long-term memory for trajectories of thousands of time steps. On tasks decoupled from any notion of spatial geometry: sorting, associative recall, and question answering, our design functions as a truly generic memory and yields excellent results.}
}
@InProceedings{pmlr-v119-iakovleva20a,
title = {Meta-Learning with Shared Amortized Variational Inference},
author = {Iakovleva, Ekaterina and Verbeek, Jakob and Alahari, Karteek},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4572--4582},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/iakovleva20a/iakovleva20a.pdf},
url = {http://proceedings.mlr.press/v119/iakovleva20a.html},
abstract = {We propose a novel amortized variational inference scheme for an empirical Bayes meta-learning model, where model parameters are treated as latent variables. We learn the prior distribution over model parameters conditioned on limited training data using a variational autoencoder approach. Our framework proposes sharing the same amortized inference network between the conditional prior and variational posterior distributions over the model parameters. While the posterior leverages both the labeled support and query data, the conditional prior is based only on the labeled support data. We show that in earlier work, relying on Monte-Carlo approximation, the conditional prior collapses to a Dirac delta function. In contrast, our variational approach prevents this collapse and preserves uncertainty over the model parameters. We evaluate our approach on the miniImageNet, CIFAR-FS and FC100 datasets, and present results demonstrating its advantages over previous work.}
}
@InProceedings{pmlr-v119-ibrahim20a,
title = {Linear Lower Bounds and Conditioning of Differentiable Games},
author = {Ibrahim, Adam and Azizian, Wa{\"\i}ss and Gidel, Gauthier and Mitliagkas, Ioannis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4583--4593},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ibrahim20a/ibrahim20a.pdf},
url = {http://proceedings.mlr.press/v119/ibrahim20a.html},
abstract = {Recent successes of game-theoretic formulations in ML have caused a resurgence of research interest in differentiable games. Overwhelmingly, that research focuses on methods and upper bounds on their speed of convergence. In this work, we approach the question of fundamental iteration complexity by providing lower bounds to complement the linear (i.e. geometric) upper bounds observed in the literature on a wide class of problems. We cast saddle-point and min-max problems as 2-player games. We leverage tools from single-objective convex optimisation to propose new linear lower bounds for convex-concave games. Notably, we give a linear lower bound for $n$-player differentiable games, by using the spectral properties of the update operator. We then propose a new definition of the condition number arising from our lower bound analysis. Unlike past definitions, our condition number captures the fact that linear rates are possible in games, even in the absence of strong convexity or strong concavity in the variables.}
}
@InProceedings{pmlr-v119-ida20a,
title = {Fast Deterministic {CUR} Matrix Decomposition with Accuracy Assurance},
author = {Ida, Yasutoshi and Kanai, Sekitoshi and Fujiwara, Yasuhiro and Iwata, Tomoharu and Takeuchi, Koh and Kashima, Hisashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4594--4603},
year = {2020},
editor = {Daumé III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ida20a/ida20a.pdf},
url = {http://proceedings.mlr.press/v119/ida20a.html},
abstract = {The deterministic CUR matrix decomposition is a low-rank approximation method to analyze a data matrix. It has attracted considerable attention due to its high interpretability, which results from the fact that the decomposed matrices consist of subsets of the original columns and rows of the data matrix. The subset is obtained by optimizing an objective function with sparsity-inducing norms via coordinate descent. However, the existing algorithms for optimization incur high computation costs. This is because coordinate descent iteratively updates all the parameters in the objective until convergence. This paper proposes a fast deterministic CUR matrix decomposition. Our algorithm safely skips unnecessary updates by efficiently evaluating the optimality conditions for the parameters to be zeros. In addition, we preferentially update the parameters that must be nonzeros. Theoretically, our approach guarantees the same result as the original approach. Experiments demonstrate that our algorithm speeds up the deterministic CUR while achieving the same accuracy.}
}
@InProceedings{pmlr-v119-ishida20a,
title = {Do We Need Zero Training Loss After Achieving Zero Training Error?},
author = {Ishida, Takashi and Yamane, Ikko and Sakai, Tomoya and Niu, Gang and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4604--4614},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ishida20a/ishida20a.pdf},
url = {http://proceedings.mlr.press/v119/ishida20a.html},
abstract = {Overparameterized deep networks have the capacity to memorize training data with zero \emph{training error}. Even after memorization, the \emph{training loss} continues to approach zero, making the model overconfident and the test performance degraded. Since existing regularizers do not directly aim to avoid zero training loss, it is hard to tune their hyperparameters in order to maintain a fixed/preset level of training loss. We propose a direct solution called \emph{flooding} that intentionally prevents further reduction of the training loss when it reaches a reasonably small value, which we call the \emph{flood level}. Our approach makes the loss float around the flood level by doing mini-batched gradient descent as usual but gradient ascent if the training loss is below the flood level. This can be implemented with one line of code and is compatible with any stochastic optimizer and other regularizers. With flooding, the model will continue to “random walk” with the same non-zero training loss, and we expect it to drift into an area with a flat loss landscape that leads to better generalization. We experimentally show that flooding improves performance and, as a byproduct, induces a double descent curve of the test loss.}
}
@InProceedings{pmlr-v119-izmailov20a,
title = {Semi-Supervised Learning with Normalizing Flows},
author = {Izmailov, Pavel and Kirichenko, Polina and Finzi, Marc and Wilson, Andrew Gordon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4615--4630},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/izmailov20a/izmailov20a.pdf},
url = {http://proceedings.mlr.press/v119/izmailov20a.html},
abstract = {Normalizing flows transform a latent distribution through an invertible neural network for a flexible and pleasingly simple approach to generative modelling, while preserving an exact likelihood. We propose FlowGMM, an end-to-end approach to generative semi supervised learning with normalizing flows, using a latent Gaussian mixture model. FlowGMM is distinct in its simplicity, unified treatment of labelled and unlabelled data with an exact likelihood, interpretability, and broad applicability beyond image data. We show promising results on a wide range of applications, including AG-News and Yahoo Answers text data, tabular data, and semi-supervised image classification. We also show that FlowGMM can discover interpretable structure, provide real-time optimization-free feature visualizations, and specify well calibrated predictive distributions.}
}
@InProceedings{pmlr-v119-jacot20a,
title = {Implicit Regularization of Random Feature Models},
author = {Jacot, Arthur and Simsek, Berfin and Spadaro, Francesco and Hongler, Clement and Gabriel, Franck},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4631--4640},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jacot20a/jacot20a.pdf},
url = {http://proceedings.mlr.press/v119/jacot20a.html},
abstract = {Random Features (RF) models are used as efficient parametric approximations of kernel methods. We investigate, by means of random matrix theory, the connection between Gaussian RF models and Kernel Ridge Regression (KRR). For a Gaussian RF model with $P$ features, $N$ data points, and a ridge $\lambda$, we show that the average (i.e. expected) RF predictor is close to a KRR predictor with an \emph{effective ridge} $\tilde{\lambda}$. We show that $\tilde{\lambda} > \lambda$ and $\tilde{\lambda} \searrow \lambda$ monotonically as $P$ grows, thus revealing the \emph{implicit regularization effect} of finite RF sampling. We then compare the risk (i.e. test error) of the $\tilde{\lambda}$-KRR predictor with the average risk of the $\lambda$-RF predictor and obtain a precise and explicit bound on their difference. Finally, we empirically find an extremely good agreement between the test errors of the average $\lambda$-RF predictor and $\tilde{\lambda}$-KRR predictor.}
}
@InProceedings{pmlr-v119-jafarov20a,
title = {Correlation Clustering with Asymmetric Classification Errors},
author = {Jafarov, Jafar and Kalhan, Sanchit and Makarychev, Konstantin and Makarychev, Yury},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4641--4650},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jafarov20a/jafarov20a.pdf},
url = {http://proceedings.mlr.press/v119/jafarov20a.html},
abstract = {In the Correlation Clustering problem, we are given a weighted graph $G$ with its edges labelled as ``similar'' or ``dissimilar'' by a binary classifier. The goal is to produce a clustering that minimizes the weight of ``disagreements'': the sum of the weights of ``similar'' edges across clusters and ``dissimilar'' edges within clusters. We study the correlation clustering problem under the following assumption: Every ``similar'' edge $e$ has weight $w_e \in [ \alpha w, w ]$ and every ``dissimilar'' edge $e$ has weight $w_e \geq \alpha w$ (where $\alpha \leq 1$ and $w > 0$ is a scaling parameter). We give a $(3 + 2 \log_e (1/\alpha))$ approximation algorithm for this problem. This assumption captures well the scenario when classification errors are asymmetric. Additionally, we show an asymptotically matching Linear Programming integrality gap of $\Omega(\log 1/\alpha)$.}
}
@InProceedings{pmlr-v119-jain20a,
title = {Optimal Robust Learning of Discrete Distributions from Batches},
author = {Jain, Ayush and Orlitsky, Alon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4651--4660},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jain20a/jain20a.pdf},
url = {http://proceedings.mlr.press/v119/jain20a.html},
abstract = {Many applications, including natural language processing, sensor networks, collaborative filtering, and federated learning, call for estimating discrete distributions from data collected in batches, some of which may be untrustworthy, erroneous, faulty, or even adversarial. Previous estimators for this setting ran in exponential time, and for some regimes required a suboptimal number of batches. We provide the first polynomial-time estimator that is optimal in the number of batches and achieves essentially the best possible estimation accuracy.}
}
@InProceedings{pmlr-v119-jain20b,
title = {Generalization to New Actions in Reinforcement Learning},
author = {Jain, Ayush and Szot, Andrew and Lim, Joseph},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4661--4672},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jain20b/jain20b.pdf},
url = {http://proceedings.mlr.press/v119/jain20b.html},
abstract = {A fundamental trait of intelligence is the ability to achieve goals in the face of novel circumstances, such as making decisions from new action choices. However, standard reinforcement learning assumes a fixed set of actions and requires expensive retraining when given a new action set. To make learning agents more adaptable, we introduce the problem of zero-shot generalization to new actions. We propose a two-stage framework where the agent first infers action representations from action information acquired separately from the task. A policy flexible to varying action sets is then trained with generalization objectives. We benchmark generalization on sequential tasks, such as selecting from an unseen tool-set to solve physical reasoning puzzles and stacking towers with novel 3D shapes. Videos and code are available at https://sites.google.com/view/action-generalization.}
}
@InProceedings{pmlr-v119-jaini20a,
title = {Tails of {L}ipschitz Triangular Flows},
author = {Jaini, Priyank and Kobyzev, Ivan and Yu, Yaoliang and Brubaker, Marcus},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4673--4681},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jaini20a/jaini20a.pdf},
url = {http://proceedings.mlr.press/v119/jaini20a.html},
abstract = {We investigate the ability of popular flow models to capture tail-properties of a target density by studying the increasing triangular maps used in these flow methods acting on a tractable source density. We show that the density quantile functions of the source and target density provide a precise characterization of the slope of transformation required to capture tails in a target density. We further show that any Lipschitz-continuous transport map acting on a source density will result in a density with similar tail properties as the source, highlighting the trade-off between the importance of choosing a complex source density and a sufficiently expressive transformation to capture desirable properties of a target density. Subsequently, we illustrate that flow models like Real-NVP, MAF, and Glow as implemented lack the ability to capture a distribution with non-Gaussian tails. We circumvent this problem by proposing tail-adaptive flows consisting of a source distribution that can be learned simultaneously with the triangular map to capture tail-properties of a target density. We perform several synthetic and real-world experiments to complement our theoretical findings.}
}
@InProceedings{pmlr-v119-james20a,
title = {Learning Portable Representations for High-Level Planning},
author = {James, Steven and Rosman, Benjamin and Konidaris, George},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4682--4691},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/james20a/james20a.pdf},
url = {http://proceedings.mlr.press/v119/james20a.html},
abstract = {We present a framework for autonomously learning a portable representation that describes a collection of low-level continuous environments. We show that these abstract representations can be learned in a task-independent egocentric space specific to the agent that, when grounded with problem-specific information, are provably sufficient for planning. We demonstrate transfer in two different domains, where an agent learns a portable, task-independent symbolic vocabulary, as well as operators expressed in that vocabulary, and then learns to instantiate those operators on a per-task basis. This reduces the number of samples required to learn a representation of a new task.}
}
@InProceedings{pmlr-v119-janati20a,
title = {Debiased {S}inkhorn barycenters},
author = {Janati, Hicham and Cuturi, Marco and Gramfort, Alexandre},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4692--4701},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/janati20a/janati20a.pdf},
url = {http://proceedings.mlr.press/v119/janati20a.html},
abstract = {Entropy regularization in optimal transport (OT) has been the driver of many recent interests for Wasserstein metrics and barycenters in machine learning. It allows to keep the appealing geometrical properties of the unregularized Wasserstein distance while having a significantly lower complexity thanks to Sinkhorn’s algorithm. However, entropy brings some inherent smoothing bias, resulting for example in blurred barycenters. This side effect has prompted an increasing temptation in the community to settle for a slower algorithm such as log-domain stabilized Sinkhorn which breaks the parallel structure that can be leveraged on GPUs, or even go back to unregularized OT. Here we show how this bias is tightly linked to the reference measure that defines the entropy regularizer and propose debiased Sinkhorn barycenters that preserve the best of both worlds: fast Sinkhorn-like iterations without entropy smoothing. Theoretically, we prove that this debiasing is perfect for Gaussian distributions with equal variance. Empirically, we illustrate the reduced blurring and the computational advantage.}
}
@InProceedings{pmlr-v119-jankowiak20a,
title = {Parametric {G}aussian Process Regressors},
author = {Jankowiak, Martin and Pleiss, Geoff and Gardner, Jacob},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4702--4712},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jankowiak20a/jankowiak20a.pdf},
url = {http://proceedings.mlr.press/v119/jankowiak20a.html},
abstract = {The combination of inducing point methods with stochastic variational inference has enabled approximate Gaussian Process (GP) inference on large datasets. Unfortunately, the resulting predictive distributions often exhibit substantially underestimated uncertainties. Notably, in the regression case the predictive variance is typically dominated by observation noise, yielding uncertainty estimates that make little use of the input-dependent function uncertainty that makes GP priors attractive. In this work we propose two simple methods for scalable GP regression that address this issue and thus yield substantially improved predictive uncertainties. The first applies variational inference to FITC (Fully Independent Training Conditional; Snelson et al., 2006). The second bypasses posterior approximations and instead directly targets the posterior predictive distribution. In an extensive empirical comparison with a number of alternative methods for scalable GP regression, we find that the resulting predictive distributions exhibit significantly better calibrated uncertainties and higher log likelihoods—often by as much as half a nat per datapoint.}
}
@InProceedings{pmlr-v119-jarrett20a,
title = {Inverse Active Sensing: Modeling and Understanding Timely Decision-Making},
author = {Jarrett, Daniel and Van Der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4713--4723},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jarrett20a/jarrett20a.pdf},
url = {http://proceedings.mlr.press/v119/jarrett20a.html},
abstract = {Evidence-based decision-making entails collecting (costly) observations about an underlying phenomenon of interest, and subsequently committing to an (informed) decision on the basis of accumulated evidence. In this setting, \emph{active sensing} is the goal-oriented problem of efficiently selecting which acquisitions to make, and when and what decision to settle on. As its complement, \emph{inverse active sensing} seeks to uncover an agent’s preferences and strategy given their observable decision-making behavior. In this paper, we develop an expressive, unified framework for the general setting of evidence-based decision-making under endogenous, context-dependent time pressure—which requires negotiating (subjective) tradeoffs between accuracy, speediness, and cost of information. Using this language, we demonstrate how it enables \emph{modeling} intuitive notions of surprise, suspense, and optimality in decision strategies (the forward problem). Finally, we illustrate how this formulation enables \emph{understanding} decision-making behavior by quantifying preferences implicit in observed decision strategies (the inverse problem).}
}
@InProceedings{pmlr-v119-jayaram20a,
title = {Source Separation with Deep Generative Priors},
author = {Jayaram, Vivek and Thickstun, John},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4724--4735},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jayaram20a/jayaram20a.pdf},
url = {http://proceedings.mlr.press/v119/jayaram20a.html},
abstract = {Despite substantial progress in signal source separation, results for richly structured data continue to contain perceptible artifacts. In contrast, recent deep generative models can produce authentic samples in a variety of domains that are indistinguishable from samples of the data distribution. This paper introduces a Bayesian approach to source separation that uses deep generative models as priors over the components of a mixture of sources, and noise-annealed Langevin dynamics to sample from the posterior distribution of sources given a mixture. This decouples the source separation problem from generative modeling, enabling us to directly use cutting-edge generative models as priors. The method achieves state-of-the-art performance for MNIST digit separation. We introduce new methodology for evaluating separation quality on richer datasets, providing quantitative evaluation and qualitative discussion of results for CIFAR-10 image separation. We also provide qualitative results on LSUN.}
}
@InProceedings{pmlr-v119-jelassi20a,
title = {Extra-gradient with player sampling for faster convergence in n-player games},
author = {Jelassi, Samy and Domingo-Enrich, Carles and Scieur, Damien and Mensch, Arthur and Bruna, Joan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4736--4745},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jelassi20a/jelassi20a.pdf},
url = {http://proceedings.mlr.press/v119/jelassi20a.html},
abstract = {Data-driven modeling increasingly requires to find a Nash equilibrium in multi-player games, e.g. when training GANs. In this paper, we analyse a new extra-gradient method for Nash equilibrium finding, that performs gradient extrapolations and updates on a random subset of players at each iteration. This approach provably exhibits a better rate of convergence than full extra-gradient for non-smooth convex games with noisy gradient oracle. We propose an additional variance reduction mechanism to obtain speed-ups in smooth convex games. Our approach makes extrapolation amenable to massive multiplayer settings, and brings empirical speed-ups, in particular when using a heuristic cyclic sampling scheme. Most importantly, it allows to train faster and better GANs and mixtures of GANs.}
}
@InProceedings{pmlr-v119-jeon20a,
title = {T-{GD}: Transferable {GAN}-generated Images Detection Framework},
author = {Jeon, Hyeonseong and Bang, Young Oh and Kim, Junyaup and Woo, Simon},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4746--4761},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jeon20a/jeon20a.pdf},
url = {http://proceedings.mlr.press/v119/jeon20a.html},
abstract = {Recent advancements in Generative Adversarial Networks (GANs) enable the generation of highly realistic images, raising concerns about their misuse for malicious purposes. Detecting these GAN-generated images (GAN-images) becomes increasingly challenging due to the significant reduction of underlying artifacts and specific patterns. The absence of such traces can hinder detection algorithms from identifying GAN-images and transferring knowledge to identify other types of GAN-images as well. In this work, we present the Transferable GAN-images Detection framework T-GD, a robust transferable framework for an effective detection of GAN-images. T-GD is composed of a teacher and a student model that can iteratively teach and evaluate each other to improve the detection performance. First, we train the teacher model on the source dataset and use it as a starting point for learning the target dataset. To train the student model, we inject noise by mixing up the source and target datasets, while constraining the weight variation to preserve the starting point. Our approach is a self-training method, but distinguishes itself from prior approaches by focusing on improving the transferability of GAN-image detection. T-GD achieves high performance on the source dataset by overcoming catastrophic forgetting and effectively detecting state-of-the-art GAN-images with only a small volume of data without any metadata information.}
}
@InProceedings{pmlr-v119-ji20a,
title = {History-Gradient Aided Batch Size Adaptation for Variance Reduced Algorithms},
author = {Ji, Kaiyi and Wang, Zhe and Weng, Bowen and Zhou, Yi and Zhang, Wei and Liang, Yingbin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4762--4772},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/ji20a/ji20a.pdf},
url = {http://proceedings.mlr.press/v119/ji20a.html},
abstract = {Variance-reduced algorithms, although achieve great theoretical performance, can run slowly in practice due to the periodic gradient estimation with a large batch of data. Batch-size adaptation thus arises as a promising approach to accelerate such algorithms. However, existing schemes either apply prescribed batch-size adaption rule or exploit the information along optimization path via additional backtracking and condition verification steps. In this paper, we propose a novel scheme, which eliminates backtracking line search but still exploits the information along optimization path by adapting the batch size via history stochastic gradients. We further theoretically show that such a scheme substantially reduces the overall complexity for popular variance-reduced algorithms SVRG and SARAH/SPIDER for both conventional nonconvex optimization and reinforcement learning problems. To this end, we develop a new convergence analysis framework to handle the dependence of the batch size on history stochastic gradients. Extensive experiments validate the effectiveness of the proposed batch-size adaptation scheme.}
}
@InProceedings{pmlr-v119-jia20a,
title = {Information-Theoretic Local Minima Characterization and Regularization},
author = {Jia, Zhiwei and Su, Hao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4773--4783},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jia20a/jia20a.pdf},
url = {http://proceedings.mlr.press/v119/jia20a.html},
abstract = {Recent advances in deep learning theory have evoked the study of generalizability across different local minima of deep neural networks (DNNs). While current work focused on either discovering properties of good local minima or developing regularization techniques to induce good local minima, no approach exists that can tackle both problems. We achieve these two goals successfully in a unified manner. Specifically, based on the observed Fisher information we propose a metric both strongly indicative of generalizability of local minima and effectively applied as a practical regularizer. We provide theoretical analysis including a generalization bound and empirically demonstrate the success of our approach in both capturing and improving the generalizability of DNNs. Experiments are performed on CIFAR-10, CIFAR-100 and ImageNet for various network architectures.}
}
@InProceedings{pmlr-v119-jiang20a,
title = {Optimizing Black-box Metrics with Adaptive Surrogates},
author = {Jiang, Qijia and Adigun, Olaoluwa and Narasimhan, Harikrishna and Fard, Mahdi Milani and Gupta, Maya},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4784--4793},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jiang20a/jiang20a.pdf},
url = {http://proceedings.mlr.press/v119/jiang20a.html},
abstract = {We address the problem of training models with black-box and hard-to-optimize metrics by expressing the metric as a monotonic function of a small number of easy-to-optimize surrogates. We pose the training problem as an optimization over a relaxed surrogate space, which we solve by estimating local gradients for the metric and performing inexact convex projections. We analyze gradient estimates based on finite differences and local linear interpolations, and show convergence of our approach under smoothness assumptions with respect to the surrogates. Experimental results on classification and ranking problems verify the proposal performs on par with methods that know the mathematical formulation, and adds notable value when the form of the metric is unknown.}
}
@InProceedings{pmlr-v119-jiang20b,
title = {{BINOCULARS} for efficient, nonmyopic sequential experimental design},
author = {Jiang, Shali and Chai, Henry and Gonzalez, Javier and Garnett, Roman},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4794--4803},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jiang20b/jiang20b.pdf},
url = {http://proceedings.mlr.press/v119/jiang20b.html},
abstract = {Finite-horizon sequential experimental design (SED) arises naturally in many contexts, including hyperparameter tuning in machine learning among more traditional settings. Computing the optimal policy for such problems requires solving Bellman equations, which are generally intractable. Most existing work resorts to severely myopic approximations by limiting the decision horizon to only a single time-step, which can underweight exploration in favor of exploitation. We present BINOCULARS: Batch-Informed NOnmyopic Choices, Using Long-horizons for Adaptive, Rapid SED, a general framework for deriving efficient, nonmyopic approximations to the optimal experimental policy. Our key idea is simple and surprisingly effective: we first compute a one-step optimal batch of experiments, then select a single point from this batch to evaluate. We realize BINOCULARS for Bayesian optimization and Bayesian quadrature – two notable example problems with radically different objectives – and demonstrate that BINOCULARS significantly outperforms myopic alternatives in real-world scenarios.}
}
@InProceedings{pmlr-v119-jiang20c,
title = {Beyond Synthetic Noise: Deep Learning on Controlled Noisy Labels},
author = {Jiang, Lu and Huang, Di and Liu, Mason and Yang, Weilong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4804--4815},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jiang20c/jiang20c.pdf},
url = {http://proceedings.mlr.press/v119/jiang20c.html},
abstract = {Performing controlled experiments on noisy data is essential in understanding deep learning across noise levels. Due to the lack of suitable datasets, previous research has only examined deep learning on controlled synthetic label noise, and real-world label noise has never been studied in a controlled setting. This paper makes three contributions. First, we establish the first benchmark of controlled real-world label noise from the web. This new benchmark enables us to study the web label noise in a controlled setting for the first time. The second contribution is a simple but effective method to overcome both synthetic and real noisy labels. We show that our method achieves the best result on our dataset as well as on two public benchmarks (CIFAR and WebVision). Third, we conduct the largest study by far into understanding deep neural networks trained on noisy labels across different noise levels, noise types, network architectures, and training settings.}
}
@InProceedings{pmlr-v119-jiang20d,
title = {Implicit Class-Conditioned Domain Alignment for Unsupervised Domain Adaptation},
author = {Jiang, Xiang and Lao, Qicheng and Matwin, Stan and Havaei, Mohammad},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4816--4827},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jiang20d/jiang20d.pdf},
url = {http://proceedings.mlr.press/v119/jiang20d.html},
abstract = {We present an approach for unsupervised domain adaptation{—}with a strong focus on practical considerations of within-domain class imbalance and between-domain class distribution shift{—}from a class-conditioned domain alignment perspective. Current methods for class-conditioned domain alignment aim to explicitly minimize a loss function based on pseudo-label estimations of the target domain. However, these methods suffer from pseudo-label bias in the form of error accumulation. We propose a method that removes the need for explicit optimization of model parameters from pseudo-labels. Instead, we present a sampling-based implicit alignment approach, where the sample selection is implicitly guided by the pseudo-labels. Theoretical analysis reveals the existence of a domain-discriminator shortcut in misaligned classes, which is addressed by the proposed approach to facilitate domain-adversarial learning. Empirical results and ablation studies confirm the effectiveness of the proposed approach, especially in the presence of within-domain class imbalance and between-domain class distribution shift.}
}
@InProceedings{pmlr-v119-jiang20e,
title = {Associative Memory in Iterated Overparameterized Sigmoid Autoencoders},
author = {Jiang, Yibo and Pehlevan, Cengiz},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4828--4838},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jiang20e/jiang20e.pdf},
url = {http://proceedings.mlr.press/v119/jiang20e.html},
abstract = {Recent work showed that overparameterized autoencoders can be trained to implement associative memory via iterative maps, when the trained input-output Jacobian of the network has all of its eigenvalue norms strictly below one. Here, we theoretically analyze this phenomenon for sigmoid networks by leveraging recent developments in deep learning theory, especially the correspondence between training neural networks in the infinite-width limit and performing kernel regression with the Neural Tangent Kernel (NTK). We find that overparameterized sigmoid autoencoders can have attractors in the NTK limit for both training with a single example and multiple examples under certain conditions. In particular, for multiple training examples, we find that the norm of the largest Jacobian eigenvalue drops below one with increasing input norm, leading to associative memory.}
}
@InProceedings{pmlr-v119-jin20a,
title = {Hierarchical Generation of Molecular Graphs using Structural Motifs},
author = {Jin, Wengong and Barzilay, Regina and Jaakkola, Tommi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4839--4848},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20a/jin20a.pdf},
url = {http://proceedings.mlr.press/v119/jin20a.html},
abstract = {Graph generation techniques are increasingly being adopted for drug discovery. Previous graph generation approaches have utilized relatively small molecular building blocks such as atoms or simple cycles, limiting their effectiveness to smaller molecules. Indeed, as we demonstrate, their performance degrades significantly for larger molecules. In this paper, we propose a new hierarchical graph encoder-decoder that employs significantly larger and more flexible graph motifs as basic building blocks. Our encoder produces a multi-resolution representation for each molecule in a fine-to-coarse fashion, from atoms to connected motifs. Each level integrates the encoding of constituents below with the graph at that level. Our autoregressive coarse-to-fine decoder adds one motif at a time, interleaving the decision of selecting a new motif with the process of resolving its attachments to the emerging molecule. We evaluate our model on multiple molecule generation tasks, including polymers, and show that our model significantly outperforms previous state-of-the-art baselines.}
}
@InProceedings{pmlr-v119-jin20b,
title = {Multi-Objective Molecule Generation using Interpretable Substructures},
author = {Jin, Wengong and Barzilay, Regina and Jaakkola, Tommi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4849--4859},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20b/jin20b.pdf},
url = {http://proceedings.mlr.press/v119/jin20b.html},
abstract = {Drug discovery aims to find novel compounds with specified chemical property profiles. In terms of generative modeling, the goal is to learn to sample molecules in the intersection of multiple property constraints. This task becomes increasingly challenging when there are many property constraints. We propose to offset this complexity by composing molecules from a vocabulary of substructures that we call molecular rationales. These rationales are identified from molecules as substructures that are likely responsible for each property of interest. We then learn to expand rationales into a full molecule using graph generative models. Our final generative model composes molecules as mixtures of multiple rationale completions, and this mixture is fine-tuned to preserve the properties of interest. We evaluate our model on various drug design tasks and demonstrate significant improvements over state-of-the-art baselines in terms of accuracy, diversity, and novelty of generated compounds.}
}
@InProceedings{pmlr-v119-jin20c,
title = {Learning Adversarial {Markov} Decision Processes with Bandit Feedback and Unknown Transition},
author = {Jin, Chi and Jin, Tiancheng and Luo, Haipeng and Sra, Suvrit and Yu, Tiancheng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4860--4869},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20c/jin20c.pdf},
url = {http://proceedings.mlr.press/v119/jin20c.html},
abstract = {We consider the task of learning in episodic finite-horizon Markov decision processes with an unknown transition function, bandit feedback, and adversarial losses. We propose an efficient algorithm that achieves $\mathcal{\tilde{O}}(L|X|\sqrt{|A|T})$ regret with high probability, where $L$ is the horizon, $|X|$ the number of states, $|A|$ the number of actions, and $T$ the number of episodes. To our knowledge, our algorithm is the first to ensure $\mathcal{\tilde{O}}(\sqrt{T})$ regret in this challenging setting; in fact, it achieves the same regret as (Rosenberg \& Mansour, 2019a) who consider the easier setting with full-information. Our key contributions are two-fold: a tighter confidence set for the transition function; and an optimistic loss estimator that is inversely weighted by an "upper occupancy bound".}
}
@InProceedings{pmlr-v119-jin20d,
title = {Reward-Free Exploration for Reinforcement Learning},
author = {Jin, Chi and Krishnamurthy, Akshay and Simchowitz, Max and Yu, Tiancheng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4870--4879},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20d/jin20d.pdf},
url = {http://proceedings.mlr.press/v119/jin20d.html},
abstract = {Exploration is widely regarded as one of the most challenging aspects of reinforcement learning (RL), with many naive approaches succumbing to exponential sample complexity. To isolate the challenges of exploration, we propose the following “reward-free RL” framework. In the exploration phase, the agent first collects trajectories from an MDP $\mathcal{M}$ without a pre-specified reward function. After exploration, it is tasked with computing near-optimal policies under the transitions of $\mathcal{M}$ for a collection of given reward functions. This framework is particularly suitable where there are many reward functions of interest, or where the reward function is shaped by an external agent to elicit desired behavior. We give an efficient algorithm that conducts $\widetilde{O}(S^2A\mathrm{poly}(H)/\epsilon^2)$ episodes of exploration, and returns $\epsilon$-suboptimal policies for an arbitrary number of reward functions. We achieve this by finding exploratory policies that jointly visit each “significant” state with probability proportional to its maximum visitation probability under any possible policy. Moreover, our planning procedure can be instantiated by any black-box approximate planner, such as value iteration or natural policy gradient. Finally, we give a nearly-matching $\Omega(S^2AH^2/\epsilon^2)$ lower bound, demonstrating the near-optimality of our algorithm in this setting.}
}
@InProceedings{pmlr-v119-jin20e,
title = {What is Local Optimality in Nonconvex-Nonconcave Minimax Optimization?},
author = {Jin, Chi and Netrapalli, Praneeth and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4880--4889},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20e/jin20e.pdf},
url = {http://proceedings.mlr.press/v119/jin20e.html},
abstract = {Minimax optimization has found extensive applications in modern machine learning, in settings such as generative adversarial networks (GANs), adversarial training and multi-agent reinforcement learning. As most of these applications involve continuous nonconvex-nonconcave formulations, a very basic question arises—“what is a proper definition of local optima?” Most previous work answers this question using classical notions of equilibria from simultaneous games, where the min-player and the max-player act simultaneously. In contrast, most applications in machine learning, including GANs and adversarial training, correspond to sequential games, where the order of which player acts first is crucial (since minimax is in general not equal to maximin due to the nonconvex-nonconcave nature of the problems). The main contribution of this paper is to propose a proper mathematical definition of local optimality for this sequential setting—local minimax, as well as to present its properties and existence results. Finally, we establish a strong connection to a basic local search algorithm—gradient descent ascent (GDA): under mild conditions, all stable limit points of GDA are exactly local minimax points up to some degenerate points.}
}
@InProceedings{pmlr-v119-jin20f,
title = {Efficiently Solving {MDPs} with Stochastic Mirror Descent},
author = {Jin, Yujia and Sidford, Aaron},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4890--4900},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20f/jin20f.pdf},
url = {http://proceedings.mlr.press/v119/jin20f.html},
abstract = {We present a unified framework based on primal-dual stochastic mirror descent for approximately solving infinite-horizon Markov decision processes (MDPs) given a generative model. When applied to an average-reward MDP with $A_{tot}$ total actions and mixing time bound $t_{mix}$ our method computes an $\epsilon$-optimal policy with an expected $\widetilde{O}(t_{mix}^2 A_{tot} \epsilon^{-2})$ samples from the state-transition matrix, removing the ergodicity dependence of prior art. When applied to a $\gamma$-discounted MDP with $A_{tot}$ total actions our method computes an $\epsilon$-optimal policy with an expected $\widetilde{O}((1-\gamma)^{-4} A_{tot} \epsilon^{-2})$ samples, improving over the best-known primal-dual methods while matching the state-of-the-art up to a $(1-\gamma)^{-1}$ factor. Both methods are model-free, update state values and policies simultaneously, and run in time linear in the number of samples taken. We achieve these results through a more general stochastic mirror descent framework for solving bilinear saddle-point problems with simplex and box domains and we demonstrate the flexibility of this framework by providing further applications to constrained MDPs.}
}
@InProceedings{pmlr-v119-jin20g,
title = {Computational and Statistical Tradeoffs in Inferring Combinatorial Structures of Ising Model},
author = {Jin, Ying and Wang, Zhaoran and Lu, Junwei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4901--4910},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jin20g/jin20g.pdf},
url = {http://proceedings.mlr.press/v119/jin20g.html},
abstract = {We study the computational and statistical tradeoffs in inferring combinatorial structures of high dimensional simple zero-field ferromagnetic Ising model. Under the framework of oracle computational model where an algorithm interacts with an oracle that discourses a randomized version of truth, we characterize the computational lower bounds of learning combinatorial structures in polynomial time, under which no algorithms within polynomial-time can distinguish between graphs with and without certain structures. This hardness of learning with limited computational budget is shown to be characterized by a novel quantity called vertex overlap ratio. Such quantity is universally valid for many specific graph structures including cliques and nearest neighbors. On the other side, we attain the optimal rates for testing these structures against empty graph by proposing the quadratic testing statistics to match the lower bounds. We also investigate the relationship between computational bounds and information-theoretic bounds for such problems, and found gaps between the two boundaries in inferring some particular structures, especially for those with dense edges.}
}
@InProceedings{pmlr-v119-johnson20a,
title = {{AdaScale} {SGD}: A User-Friendly Algorithm for Distributed Training},
author = {Johnson, Tyler and Agrawal, Pulkit and Gu, Haijie and Guestrin, Carlos},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4911--4920},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/johnson20a/johnson20a.pdf},
url = {http://proceedings.mlr.press/v119/johnson20a.html},
abstract = {When using large-batch training to speed up stochastic gradient descent, learning rates must adapt to new batch sizes in order to maximize speed-ups and preserve model quality. Re-tuning learning rates is resource intensive, while fixed scaling rules often degrade model quality. We propose AdaScale SGD, an algorithm that reliably adapts learning rates to large-batch training. By continually adapting to the gradient’s variance, AdaScale automatically achieves speed-ups for a wide range of batch sizes. We formally describe this quality with AdaScale’s convergence bound, which maintains final objective values, even as batch sizes grow large and the number of iterations decreases. In empirical comparisons, AdaScale trains well beyond the batch size limits of popular “linear learning rate scaling” rules. This includes large-batch training with no model degradation for machine translation, image classification, object detection, and speech recognition tasks. AdaScale’s qualitative behavior is similar to that of "warm-up" heuristics, but unlike warm-up, this behavior emerges naturally from a principled mechanism. The algorithm introduces negligible computational overhead and no new hyperparameters, making AdaScale an attractive choice for large-scale training in practice.}
}
@InProceedings{pmlr-v119-johnson20b,
title = {Guided Learning of Nonconvex Models through Successive Functional Gradient Optimization},
author = {Johnson, Rie and Zhang, Tong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4921--4930},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/johnson20b/johnson20b.pdf},
url = {http://proceedings.mlr.press/v119/johnson20b.html},
abstract = {This paper presents a framework of successive functional gradient optimization for training nonconvex models such as neural networks, where training is driven by mirror descent in a function space. We provide a theoretical analysis and empirical study of the training method derived from this framework. It is shown that the method leads to better performance than that of standard training techniques.}
}
@InProceedings{pmlr-v119-jolicoeur-martineau20a,
title = {On Relativistic f-Divergences},
author = {Jolicoeur-Martineau, Alexia},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4931--4939},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jolicoeur-martineau20a/jolicoeur-martineau20a.pdf},
url = {http://proceedings.mlr.press/v119/jolicoeur-martineau20a.html},
abstract = {We take a more rigorous look at Relativistic Generative Adversarial Networks (RGANs) and prove that the objective function of the discriminator is a statistical divergence for any concave function f with minimal properties. We devise additional variants of relativistic f-divergences. We show that the Wasserstein distance is weaker than f-divergences which are weaker than relativistic f-divergences. Given the good performance of RGANs, this suggests that Wasserstein GAN does not perform well primarily because of the weak metric, but rather because of regularization and the use of a relativistic discriminator. We introduce the minimum-variance unbiased estimator (MVUE) for Relativistic paired GANs (RpGANs; originally called RGANs which could bring confusion) and show that it does not perform better. We show that the estimator of Relativistic average GANs (RaGANs) is asymptotically unbiased and that the finite-sample bias is small; removing this bias does not improve performance.}
}
@InProceedings{pmlr-v119-jones20a,
title = {Fair k-Centers via Maximum Matching},
author = {Jones, Matthew and Nguyen, Huy and Nguyen, Thy},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4940--4949},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jones20a/jones20a.pdf},
url = {http://proceedings.mlr.press/v119/jones20a.html},
abstract = {The field of algorithms has seen a push for fairness, or the removal of inherent bias, in recent history. In data summarization, where a much smaller subset of a data set is chosen to represent the whole of the data, fairness can be introduced by guaranteeing each "demographic group" a specific portion of the representative subset. Specifically, this paper examines this fair variant of the k-centers problem, where a subset of the data with cardinality k is chosen to minimize distance to the rest of the data. Previous papers working on this problem presented both a 3-approximation algorithm with a super-linear runtime and a linear-time algorithm whose approximation factor is exponential in the number of demographic groups. This paper combines the best of each algorithm by presenting a linear-time algorithm with a guaranteed 3-approximation factor and provides empirical evidence of both the algorithm’s runtime and effectiveness.}
}
@InProceedings{pmlr-v119-joo20a,
title = {Being {Bayesian} about Categorical Probability},
author = {Joo, Taejong and Chung, Uijung and Seo, Min-Gwan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4950--4961},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/joo20a/joo20a.pdf},
url = {http://proceedings.mlr.press/v119/joo20a.html},
abstract = {Neural networks utilize the softmax as a building block in classification tasks, which contains an overconfidence problem and lacks an uncertainty representation ability. As a Bayesian alternative to the softmax, we consider a random variable of a categorical probability over class labels. In this framework, the prior distribution explicitly models the presumed noise inherent in the observed label, which provides consistent gains in generalization performance in multiple challenging tasks. The proposed method inherits advantages of Bayesian approaches that achieve better uncertainty estimation and model calibration. Our method can be implemented as a plug-and-play loss function with negligible computational overhead compared to the softmax with the cross-entropy loss function.}
}
@InProceedings{pmlr-v119-jordan20a,
title = {Evaluating the Performance of Reinforcement Learning Algorithms},
author = {Jordan, Scott and Chandak, Yash and Cohen, Daniel and Zhang, Mengxue and Thomas, Philip},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4962--4973},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jordan20a/jordan20a.pdf},
url = {http://proceedings.mlr.press/v119/jordan20a.html},
abstract = {Performance evaluations are critical for quantifying algorithmic advances in reinforcement learning. Recent reproducibility analyses have shown that reported performance results are often inconsistent and difficult to replicate. In this work, we argue that the inconsistency of performance stems from the use of flawed evaluation metrics. Taking a step towards ensuring that reported results are consistent, we propose a new comprehensive evaluation methodology for reinforcement learning algorithms that produces reliable measurements of performance both on a single environment and when aggregated across environments. We demonstrate this method by evaluating a broad class of reinforcement learning algorithms on standard benchmark tasks.}
}
@InProceedings{pmlr-v119-jorgensen20a,
title = {Stochastic Differential Equations with Variational Wishart Diffusions},
author = {J{\o}rgensen, Martin and Deisenroth, Marc and Salimbeni, Hugh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4974--4983},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jorgensen20a/jorgensen20a.pdf},
url = {http://proceedings.mlr.press/v119/jorgensen20a.html},
abstract = {We present a Bayesian non-parametric way of inferring stochastic differential equations for both regression tasks and continuous-time dynamical modelling. The work has high emphasis on the stochastic part of the differential equation, also known as the diffusion, and modelling it by means of Wishart processes. Further, we present a semiparametric approach that allows the framework to scale to high dimensions. This successfully leads us onto how to model both latent and autoregressive temporal systems with conditional heteroskedastic noise. We provide experimental evidence that modelling diffusion often improves performance and that this randomness in the differential equation can be essential to avoid overfitting.}
}
@InProceedings{pmlr-v119-joulani20a,
title = {A Simpler Approach to Accelerated Optimization: Iterative Averaging Meets Optimism},
author = {Joulani, Pooria and Raj, Anant and Gy{\"o}rgy, Andr{\'a}s and Szepesv{\'a}ri, Csaba},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4984--4993},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/joulani20a/joulani20a.pdf},
url = {http://proceedings.mlr.press/v119/joulani20a.html},
abstract = {Recently there have been several attempts to extend Nesterov’s accelerated algorithm to smooth stochastic and variance-reduced optimization. In this paper, we show that there is a simpler approach to acceleration: applying optimistic online learning algorithms and querying the gradient oracle at the online average of the intermediate optimization iterates. In particular, we tighten a recent result of Cutkosky (2019) to demonstrate theoretically that online iterate averaging results in a reduced optimization gap, independently of the algorithm involved. We show that carefully combining this technique with existing generic optimistic online learning algorithms yields the optimal accelerated rates for optimizing strongly-convex and non-strongly-convex, possibly composite objectives, with deterministic as well as stochastic first-order oracles. We further extend this idea to variance-reduced optimization. Finally, we also provide “universal” algorithms that achieve the optimal rate for smooth and non-smooth composite objectives simultaneously without further tuning, generalizing the results of Kavis et al. (2019) and solving a number of their open problems.}
}
@InProceedings{pmlr-v119-jubran20a,
title = {Sets Clustering},
author = {Jubran, Ibrahim and Tukan, Murad and Maalouf, Alaa and Feldman, Dan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {4994--5005},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jubran20a/jubran20a.pdf},
url = {http://proceedings.mlr.press/v119/jubran20a.html},
abstract = {The input to the \emph{sets-$k$-means} problem is an integer $k\geq 1$ and a set $\mathcal{P}=\{P_1,\cdots,P_n\}$ of fixed sized sets in $\mathbb{R}^d$. The goal is to compute a set $C$ of $k$ centers (points) in $\mathbb{R}^d$ that minimizes the sum $\sum_{P\in \mathcal{P}} \min_{p\in P, c\in C}\left\|{p}-c \right\|^2$ of squared distances to these sets. An \emph{$\varepsilon$-core-set} for this problem is a weighted subset of $\mathcal{P}$ that approximates this sum up to $1\pm\varepsilon$ factor, for \emph{every} set $C$ of $k$ centers in $\mathbb{R}^d$. We prove that such a core-set of $O(\log^2{n})$ sets always exists, and can be computed in $O(n\log{n})$ time, for every input $\mathcal{P}$ and every fixed $d,k\geq 1$ and $\varepsilon \in (0,1)$. The result easily generalized for any metric space, distances to the power of $z>0$, and M-estimators that handle outliers. Applying an inefficient but optimal algorithm on this coreset allows us to obtain the first PTAS ($1+\varepsilon$ approximation) for the sets-$k$-means problem that takes time near linear in $n$. This is the first result even for sets-mean on the plane ($k=1$, $d=2$). Open source code and experimental results for document classification and facility locations are also provided.}
}
@InProceedings{pmlr-v119-jun20a,
title = {Distribution Augmentation for Generative Modeling},
author = {Jun, Heewoo and Child, Rewon and Chen, Mark and Schulman, John and Ramesh, Aditya and Radford, Alec and Sutskever, Ilya},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5006--5019},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jun20a/jun20a.pdf},
url = {http://proceedings.mlr.press/v119/jun20a.html},
abstract = {We present distribution augmentation (DistAug), a simple and powerful method of regularizing generative models. Our approach applies augmentation functions to data and, importantly, conditions the generative model on the specific function used. Unlike typical data augmentation, DistAug allows usage of functions which modify the target density, enabling aggressive augmentations more commonly seen in supervised and self-supervised learning. We demonstrate this is a more effective regularizer than standard methods, and use it to train a 152M parameter autoregressive model on CIFAR-10 to 2.56 bits per dim (relative to the state-of-the-art 2.80). Samples from this model attain FID 12.75 and IS 8.40, outperforming the majority of GANs. We further demonstrate the technique is broadly applicable across model architectures and problem domains.}
}
@InProceedings{pmlr-v119-jurgenson20a,
title = {Sub-Goal Trees -- a Framework for Goal-Based Reinforcement Learning},
author = {Jurgenson, Tom and Avner, Or and Groshev, Edward and Tamar, Aviv},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5020--5030},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/jurgenson20a/jurgenson20a.pdf},
url = {http://proceedings.mlr.press/v119/jurgenson20a.html},
abstract = {Many AI problems, in robotics and other domains, are goal-directed, essentially seeking a trajectory leading to some goal state. Reinforcement learning (RL), building on Bellman’s optimality equation, naturally optimizes for a single goal, yet can be made goal-directed by augmenting the state with the goal. Instead, we propose a new RL framework, derived from a dynamic programming equation for the all pairs shortest path (APSP) problem, which naturally solves goal-directed queries. We show that this approach has computational benefits for both standard and approximate dynamic programming. Interestingly, our formulation prescribes a novel protocol for computing a trajectory: instead of predicting the next state given its predecessor, as in standard RL, a goal-conditioned trajectory is constructed by first predicting an intermediate state between start and goal, partitioning the trajectory into two. Then, recursively, predicting intermediate points on each sub-segment, until a complete trajectory is obtained. We call this trajectory structure a sub-goal tree. Building on it, we additionally extend the policy gradient methodology to recursively predict sub-goals, resulting in novel goal-based algorithms. Finally, we apply our method to neural motion planning, where we demonstrate significant improvements compared to standard RL on navigating a 7-DoF robot arm between obstacles.}
}
@InProceedings{pmlr-v119-kadri20a,
title = {Partial Trace Regression and Low-Rank {Kraus} Decomposition},
author = {Kadri, Hachem and Ayache, Stephane and Huusari, Riikka and Rakotomamonjy, Alain and Ralaivola, Liva},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5031--5041},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kadri20a/kadri20a.pdf},
url = {http://proceedings.mlr.press/v119/kadri20a.html},
abstract = {The trace regression model, a direct extension of the well-studied linear regression model, allows one to map matrices to real-valued outputs. We here introduce an even more general model, namely the partial-trace regression model, a family of linear mappings from matrix-valued inputs to matrix-valued outputs; this model subsumes the trace regression model and thus the linear regression model. Borrowing tools from quantum information theory, where partial trace operators have been extensively studied, we propose a framework for learning partial trace regression models from data by taking advantage of the so-called low-rank Kraus representation of completely positive maps. We show the relevance of our framework with synthetic and real-world experiments conducted for both i) matrix-to-matrix regression and ii) positive semidefinite matrix completion, two tasks which can be formulated as partial trace regression problems.}
}
@InProceedings{pmlr-v119-kahng20a,
title = {Strategyproof Mean Estimation from Multiple-Choice Questions},
author = {Kahng, Anson and Kehne, Gregory and Procaccia, Ariel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5042--5052},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kahng20a/kahng20a.pdf},
url = {http://proceedings.mlr.press/v119/kahng20a.html},
abstract = {Given n values possessed by n agents, we study the problem of estimating the mean by truthfully eliciting agents' answers to multiple-choice questions about their values. We consider two natural candidates for estimation error: mean squared error (MSE) and mean absolute error (MAE). We design a randomized estimator which is asymptotically optimal for both measures in the worst case. In the case where prior distributions over the agents' values are known, we give an optimal, polynomial-time algorithm for MSE, and show that the task of computing an optimal estimate for MAE is #P-hard. Finally, we demonstrate empirically that knowledge of prior distributions gives a significant edge.}
}
@InProceedings{pmlr-v119-kalatzis20a,
title = {Variational Autoencoders with {Riemannian} {Brownian} Motion Priors},
author = {Kalatzis, Dimitrios and Eklund, David and Arvanitidis, Georgios and Hauberg, S{\o}ren},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5053--5066},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kalatzis20a/kalatzis20a.pdf},
url = {http://proceedings.mlr.press/v119/kalatzis20a.html},
abstract = {Variational Autoencoders (VAEs) represent the given data in a low-dimensional latent space, which is generally assumed to be Euclidean. This assumption naturally leads to the common choice of a standard Gaussian prior over continuous latent variables. Recent work has, however, shown that this prior has a detrimental effect on model capacity, leading to subpar performance. We propose that the Euclidean assumption lies at the heart of this failure mode. To counter this, we assume a Riemannian structure over the latent space, which constitutes a more principled geometric view of the latent codes, and replace the standard Gaussian prior with a Riemannian Brownian motion prior. We propose an efficient inference scheme that does not rely on the unknown normalizing factor of this prior. Finally, we demonstrate that this prior significantly increases model capacity using only one additional scalar parameter.}
}
@InProceedings{pmlr-v119-kallus20a,
title = {{DeepMatch}: Balancing Deep Covariate Representations for Causal Inference Using Adversarial Training},
author = {Kallus, Nathan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5067--5077},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kallus20a/kallus20a.pdf},
url = {http://proceedings.mlr.press/v119/kallus20a.html},
abstract = {We study optimal covariate balance for causal inferences from observational data when rich covariates and complex relationships necessitate flexible modeling with neural networks. Standard approaches such as propensity weighting and matching/balancing fail in such settings due to miscalibrated propensity nets and inappropriate covariate representations, respectively. We propose a new method based on adversarial training of a weighting and a discriminator network that effectively addresses this methodological gap. This is demonstrated through new theoretical characterizations and empirical results on both synthetic and clinical data showing how causal analyses can be salvaged in such challenging settings.}
}
@InProceedings{pmlr-v119-kallus20b,
title = {Double Reinforcement Learning for Efficient and Robust Off-Policy Evaluation},
author = {Kallus, Nathan and Uehara, Masatoshi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5078--5088},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kallus20b/kallus20b.pdf},
url = {http://proceedings.mlr.press/v119/kallus20b.html},
abstract = {Off-policy evaluation (OPE) in reinforcement learning allows one to evaluate novel decision policies without needing to conduct exploration, which is often costly or otherwise infeasible. We consider for the first time the semiparametric efficiency limits of OPE in Markov decision processes (MDPs), where actions, rewards, and states are memoryless. We show existing OPE estimators may fail to be efficient in this setting. We develop a new estimator based on cross-fold estimation of $q$-functions and marginalized density ratios, which we term double reinforcement learning (DRL). We show that DRL is efficient when both components are estimated at fourth-root rates and is also doubly robust when only one component is consistent. We investigate these properties empirically and demonstrate the performance benefits due to harnessing memorylessness.}
}
@InProceedings{pmlr-v119-kallus20c,
title = {Statistically Efficient Off-Policy Policy Gradients},
author = {Kallus, Nathan and Uehara, Masatoshi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5089--5100},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kallus20c/kallus20c.pdf},
url = {http://proceedings.mlr.press/v119/kallus20c.html},
abstract = {Policy gradient methods in reinforcement learning update policy parameters by taking steps in the direction of an estimated gradient of policy value. In this paper, we consider the efficient estimation of policy gradients from off-policy data, where the estimation is particularly non-trivial. We derive the asymptotic lower bound on the feasible mean-squared error in both Markov and non-Markov decision processes and show that existing estimators fail to achieve it in general settings. We propose a meta-algorithm that achieves the lower bound without any parametric assumptions and exhibits a unique 4-way double robustness property. We discuss how to estimate nuisances that the algorithm relies on. Finally, we establish guarantees at the rate at which we approach a stationary point when we take steps in the direction of our new estimated policy gradient.}
}
@InProceedings{pmlr-v119-kamath20a,
title = {On the Power of Compressed Sensing with Generative Models},
author = {Kamath, Akshay and Price, Eric and Karmalkar, Sushrut},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5101--5109},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kamath20a/kamath20a.pdf},
url = {http://proceedings.mlr.press/v119/kamath20a.html},
abstract = {The goal of compressed sensing is to learn a structured signal $x$ from a limited number of noisy linear measurements $y \approx Ax$. In traditional compressed sensing, ``structure'' is represented by sparsity in some known basis. Inspired by the success of deep learning in modeling images, recent work starting with Bora-Jalal-Price-Dimakis'17 has instead considered structure to come from a generative model $G: \mathbb{R}^k \to \mathbb{R}^n$. We present two results establishing the difficulty and strength of this latter task, showing that existing bounds are tight: First, we provide a lower bound matching the Bora et.al upper bound for compressed sensing with $L$-Lipschitz generative models $G$ which holds even for the more relaxed goal of \emph{non-uniform} recovery. Second, we show that generative models generalize sparsity as a representation of structure by constructing a ReLU-based neural network with $2$ hidden layers and $O(n)$ activations per layer whose range is precisely the set of all $k$-sparse vectors.}
}
@InProceedings{pmlr-v119-kanade20a,
title = {Learning and Evaluating Contextual Embedding of Source Code},
author = {Kanade, Aditya and Maniatis, Petros and Balakrishnan, Gogul and Shi, Kensen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5110--5121},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kanade20a/kanade20a.pdf},
url = {http://proceedings.mlr.press/v119/kanade20a.html},
abstract = {Recent research has achieved impressive results on understanding and improving source code by building up on machine-learning techniques developed for natural languages. A significant advancement in natural-language understanding has come with the development of pre-trained contextual embeddings, such as BERT, which can be fine-tuned for downstream tasks with less labeled data and training budget, while achieving better accuracies. However, there is no attempt yet to obtain a high-quality contextual embedding of source code, and to evaluate it on multiple program-understanding tasks simultaneously; that is the gap that this paper aims to mitigate. Specifically, first, we curate a massive, deduplicated corpus of 7.4M Python files from GitHub, which we use to pre-train CuBERT, an open-sourced code-understanding BERT model; and, second, we create an open-sourced benchmark that comprises five classification tasks and one program-repair task, akin to code-understanding tasks proposed in the literature before. We fine-tune CuBERT on our benchmark tasks, and compare the resulting models to different variants of Word2Vec token embeddings, BiLSTM and Transformer models, as well as published state-of-the-art models, showing that CuBERT outperforms them all, even with shorter training, and with fewer labeled examples. Future work on source-code embedding can benefit from reusing our benchmark, and from comparing against CuBERT models as a strong baseline.}
}
@InProceedings{pmlr-v119-kang20a,
title = {Operation-Aware Soft Channel Pruning using Differentiable Masks},
author = {Kang, Minsoo and Han, Bohyung},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5122--5131},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kang20a/kang20a.pdf},
url = {http://proceedings.mlr.press/v119/kang20a.html},
abstract = {We propose a simple but effective data-driven channel pruning algorithm, which compresses deep neural networks in a differentiable way by exploiting the characteristics of operations. The proposed approach makes a joint consideration of batch normalization (BN) and rectified linear unit (ReLU) for channel pruning; it estimates how likely the two successive operations deactivate each feature map and prunes the channels with high probabilities. To this end, we learn differentiable masks for individual channels and make soft decisions throughout the optimization procedure, which facilitates to explore larger search space and train more stable networks. The proposed framework enables us to identify compressed models via a joint learning of model parameters and channel pruning without an extra procedure of fine-tuning. We perform extensive experiments and achieve outstanding performance in terms of the accuracy of output networks given the same amount of resources when compared with the state-of-the-art methods.}
}
@InProceedings{pmlr-v119-karimireddy20a,
title = {{SCAFFOLD}: Stochastic Controlled Averaging for Federated Learning},
author = {Karimireddy, Sai Praneeth and Kale, Satyen and Mohri, Mehryar and Reddi, Sashank and Stich, Sebastian and Suresh, Ananda Theertha},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5132--5143},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/karimireddy20a/karimireddy20a.pdf},
url = {http://proceedings.mlr.press/v119/karimireddy20a.html},
abstract = {Federated learning is a key scenario in modern large-scale machine learning where the data remains distributed over a large number of clients and the task is to learn a centralized model without transmitting the client data. The standard optimization algorithm used in this setting is Federated Averaging (FedAvg) due to its low communication cost. We obtain a tight characterization of the convergence of FedAvg and prove that heterogeneity (non-iid-ness) in the client's data results in a `drift' in the local updates resulting in poor performance. As a solution, we propose a new algorithm (SCAFFOLD) which uses control variates (variance reduction) to correct for the `client drift'. We prove that SCAFFOLD requires significantly fewer communication rounds and is not affected by data heterogeneity or client sampling. Further, we show that (for quadratics) SCAFFOLD can take advantage of similarity in the client's data yielding even faster convergence. The latter is the first result to quantify the usefulness of local-steps in distributed optimization.}
}
@InProceedings{pmlr-v119-kasai20a,
title = {Non-autoregressive Machine Translation with Disentangled Context Transformer},
author = {Kasai, Jungo and Cross, James and Ghazvininejad, Marjan and Gu, Jiatao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5144--5155},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kasai20a/kasai20a.pdf},
url = {http://proceedings.mlr.press/v119/kasai20a.html},
abstract = {State-of-the-art neural machine translation models generate a translation from left to right and every step is conditioned on the previously generated tokens. The sequential nature of this generation process causes fundamental latency in inference since we cannot generate multiple tokens in each sentence in parallel. We propose an attention-masking based model, called Disentangled Context (DisCo) transformer, that simultaneously generates all tokens given different contexts. The DisCo transformer is trained to predict every output token given an arbitrary subset of the other reference tokens. We also develop the parallel easy-first inference algorithm, which iteratively refines every token in parallel and reduces the number of required iterations. Our extensive experiments on 7 translation directions with varying data sizes demonstrate that our model achieves competitive, if not better, performance compared to the state of the art in non-autoregressive machine translation while significantly reducing decoding time on average.}
}
@InProceedings{pmlr-v119-katharopoulos20a,
title = {Transformers are {RNNs}: Fast Autoregressive Transformers with Linear Attention},
author = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, Fran{\c{c}}ois},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5156--5165},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/katharopoulos20a/katharopoulos20a.pdf},
url = {http://proceedings.mlr.press/v119/katharopoulos20a.html},
abstract = {Transformers achieve remarkable performance in several tasks but due to their quadratic complexity, with respect to the input's length, they are prohibitively slow for very long sequences. To address this limitation, we express the self-attention as a linear dot-product of kernel feature maps and make use of the associativity property of matrix products to reduce the complexity from $\mathcal{O}\left(N^2\right)$ to $\mathcal{O}\left(N\right)$, where $N$ is the sequence length. We show that this formulation permits an iterative implementation that dramatically accelerates autoregressive transformers and reveals their relationship to recurrent neural networks. Our \emph{Linear Transformers} achieve similar performance to vanilla Transformers and they are up to 4000x faster on autoregressive prediction of very long sequences.}
}
@InProceedings{pmlr-v119-kato20a,
title = {Rate-distortion optimization guided autoencoder for isometric embedding in {Euclidean} latent space},
author = {Kato, Keizo and Zhou, Jing and Sasaki, Tomotake and Nakagawa, Akira},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5166--5176},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kato20a/kato20a.pdf},
url = {http://proceedings.mlr.press/v119/kato20a.html},
abstract = {To analyze high-dimensional and complex data in the real world, deep generative models, such as variational autoencoder (VAE) embed data in a low-dimensional space (latent space) and learn a probabilistic model in the latent space. However, they struggle to accurately reproduce the probability distribution function (PDF) in the input space from that in the latent space. If the embedding were isometric, this issue can be solved, because the relation of PDFs can become tractable. To achieve isometric property, we propose Rate-Distortion Optimization guided autoencoder inspired by orthonormal transform coding. We show our method has the following properties: (i) the Jacobian matrix between the input space and a Euclidean latent space forms a constantly-scaled orthonormal system and enables isometric data embedding; (ii) the relation of PDFs in both spaces can become tractable one such as proportional relation. Furthermore, our method outperforms state-of-the-art methods in unsupervised anomaly detection with four public datasets.}
}
@InProceedings{pmlr-v119-keeley20a,
title = {Efficient Non-conjugate {Gaussian} Process Factor Models for Spike Count Data using Polynomial Approximations},
author = {Keeley, Stephen and Zoltowski, David and Yu, Yiyi and Smith, Spencer and Pillow, Jonathan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5177--5186},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/keeley20a/keeley20a.pdf},
url = {http://proceedings.mlr.press/v119/keeley20a.html},
abstract = {Gaussian Process Factor Analysis (GPFA) has been broadly applied to the problem of identifying smooth, low-dimensional temporal structure underlying large-scale neural recordings. However, spike trains are non-Gaussian, which motivates combining GPFA with discrete observation models for binned spike count data. The drawback to this approach is that GPFA priors are not conjugate to count model likelihoods, which makes inference challenging. Here we address this obstacle by introducing a fast, approximate inference method for non-conjugate GPFA models. Our approach uses orthogonal second-order polynomials to approximate the nonlinear terms in the non-conjugate log-likelihood, resulting in a method we refer to as polynomial approximate log-likelihood (PAL) estimators. This approximation allows for accurate closed-form evaluation of marginal likelihoods and fast numerical optimization for parameters and hyperparameters. We derive PAL estimators for GPFA models with binomial, Poisson, and negative binomial observations and find the PAL estimation is highly accurate, and achieves faster convergence times compared to existing state-of-the-art inference methods. We also find that PAL hyperparameters can provide sensible initialization for black box variational inference (BBVI), which improves BBVI accuracy. We demonstrate that PAL estimators achieve fast and accurate extraction of latent structure from multi-neuron spike train data.}
}
@InProceedings{pmlr-v119-kerenidis20a,
title = {Quantum Expectation-Maximization for {Gaussian} mixture models},
author = {Kerenidis, Iordanis and Luongo, Alessandro and Prakash, Anupam},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5187--5197},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kerenidis20a/kerenidis20a.pdf},
url = {http://proceedings.mlr.press/v119/kerenidis20a.html},
abstract = {We define a quantum version of Expectation-Maximization (QEM), a fundamental tool in unsupervised machine learning, often used to solve Maximum Likelihood (ML) and Maximum A Posteriori (MAP) estimation problems. We use QEM to fit a Gaussian Mixture Model, and show how to generalize it to fit mixture models with base distributions in the exponential family. Given quantum access to a dataset, our algorithm has convergence and precision guarantees similar to the classical algorithm, while the runtime is polylogarithmic in the number of elements in the training set and polynomial in other parameters, such as the dimension of the feature space and the number of components in the mixture. We discuss the performance of the algorithm on a dataset that is expected to be classified successfully by classical EM and provide guarantees for its runtime.}
}
@InProceedings{pmlr-v119-kersting20a,
title = {Differentiable Likelihoods for Fast Inversion of `{Likelihood}-Free' Dynamical Systems},
author = {Kersting, Hans and Kr{\"a}mer, Nicholas and Schiegg, Martin and Daniel, Christian and Tiemann, Michael and Hennig, Philipp},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5198--5208},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kersting20a/kersting20a.pdf},
url = {http://proceedings.mlr.press/v119/kersting20a.html},
abstract = {Likelihood-free (a.k.a. simulation-based) inference problems are inverse problems with expensive, or intractable, forward models. ODE inverse problems are commonly treated as likelihood-free, as their forward map has to be numerically approximated by an ODE solver. This, however, is not a fundamental constraint but just a lack of functionality in classic ODE solvers, which do not return a likelihood but a point estimate. To address this shortcoming, we employ Gaussian ODE filtering (a probabilistic numerical method for ODEs) to construct a local Gaussian approximation to the likelihood. This approximation yields tractable estimators for the gradient and Hessian of the (log-)likelihood. Insertion of these estimators into existing gradient-based optimization and sampling methods engenders new solvers for ODE inverse problems. We demonstrate that these methods outperform standard likelihood-free approaches on three benchmark-systems.}
}
@InProceedings{pmlr-v119-khani20a,
title = {Feature Noise Induces Loss Discrepancy Across Groups},
author = {Khani, Fereshte and Liang, Percy},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5209--5219},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/khani20a/khani20a.pdf},
url = {http://proceedings.mlr.press/v119/khani20a.html},
abstract = {The performance of standard learning procedures has been observed to differ widely across groups. Recent studies usually attribute this loss discrepancy to an information deficiency for one group (e.g., one group has less data). In this work, we point to a more subtle source of loss discrepancy---feature noise. Our main result is that even when there is no information deficiency specific to one group (e.g., both groups have infinite data), adding the same amount of feature noise to all individuals leads to loss discrepancy. For linear regression, we thoroughly characterize the effect of feature noise on loss discrepancy in terms of the amount of noise, the difference between moments of the two groups, and whether group information is used or not. We then show this loss discrepancy does not vanish immediately if a shift in distribution causes the groups to have similar moments. On three real-world datasets, we show feature noise increases the loss discrepancy if groups have different distributions, while it does not affect the loss discrepancy on datasets where groups have similar distributions.}
}
@InProceedings{pmlr-v119-kharitonov20a,
title = {Entropy Minimization In Emergent Languages},
author = {Kharitonov, Eugene and Chaabouni, Rahma and Bouchacourt, Diane and Baroni, Marco},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5220--5230},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kharitonov20a/kharitonov20a.pdf},
url = {http://proceedings.mlr.press/v119/kharitonov20a.html},
abstract = {There is growing interest in studying the languages that emerge when neural agents are jointly trained to solve tasks requiring communication through a discrete channel. We investigate here the information-theoretic complexity of such languages, focusing on the basic two-agent, one-exchange setup. We find that, under common training procedures, the emergent languages are subject to an entropy minimization pressure that has also been detected in human language, whereby the mutual information between the communicating agent's inputs and the messages is minimized, within the range afforded by the need for successful communication. That is, emergent languages are (nearly) as simple as the task they are developed for allow them to be. This pressure is amplified as we increase communication channel discreteness. Further, we observe that stronger discrete-channel-driven entropy minimization leads to representations with increased robustness to overfitting and adversarial attacks. We conclude by discussing the implications of our findings for the study of natural and artificial communication systems.}
}
@InProceedings{pmlr-v119-kharkovskii20a,
title = {Private Outsourced {Bayesian} Optimization},
author = {Kharkovskii, Dmitrii and Dai, Zhongxiang and Low, Bryan Kian Hsiang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5231--5242},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kharkovskii20a/kharkovskii20a.pdf},
url = {http://proceedings.mlr.press/v119/kharkovskii20a.html},
abstract = {This paper presents the private-outsourced-Gaussian process-upper confidence bound (PO-GP-UCB) algorithm, which is the first algorithm for privacy-preserving Bayesian optimization (BO) in the outsourced setting with a provable performance guarantee. We consider the outsourced setting where the entity holding the dataset and the entity performing BO are represented by different parties, and the dataset cannot be released non-privately. For example, a hospital holds a dataset of sensitive medical records and outsources the BO task on this dataset to an industrial AI company. The key idea of our approach is to make the BO performance of our algorithm similar to that of non-private GP-UCB run using the original dataset, which is achieved by using a random projection-based transformation that preserves both privacy and the pairwise distances between inputs. Our main theoretical contribution is to show that a regret bound similar to that of the standard GP-UCB algorithm can be established for our PO-GP-UCB algorithm. We empirically evaluate the performance of our PO-GP-UCB algorithm with synthetic and real-world datasets.}
}
@InProceedings{pmlr-v119-khetarpal20a,
title = {What can {I} do here? {A} Theory of Affordances in Reinforcement Learning},
author = {Khetarpal, Khimya and Ahmed, Zafarali and Comanici, Gheorghe and Abel, David and Precup, Doina},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5243--5253},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/khetarpal20a/khetarpal20a.pdf},
url = {http://proceedings.mlr.press/v119/khetarpal20a.html},
abstract = {Reinforcement learning algorithms usually assume that all actions are always available to an agent. However, both people and animals understand the general link between the features of their environment and the actions that are feasible. Gibson (1977) coined the term ``affordances'' to describe the fact that certain states enable an agent to do certain actions, in the context of embodied agents. In this paper, we develop a theory of affordances for agents who learn and plan in Markov Decision Processes. Affordances play a dual role in this case. On one hand, they allow faster planning, by reducing the number of actions available in any given situation. On the other hand, they facilitate more efficient and precise learning of transition models from data, especially when such models require function approximation. We establish these properties through theoretical results as well as illustrative examples. We also propose an approach to learn affordances and use it to estimate transition models that are simpler and generalize better.}
}
@InProceedings{pmlr-v119-khim20a,
title = {Uniform Convergence of Rank-weighted Learning},
author = {Khim, Justin and Leqi, Liu and Prasad, Adarsh and Ravikumar, Pradeep},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5254--5263},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/khim20a/khim20a.pdf},
url = {http://proceedings.mlr.press/v119/khim20a.html},
abstract = {The decision-theoretic foundations of classical machine learning models have largely focused on estimating model parameters that minimize the expectation of a given loss function. However, as machine learning models are deployed in varied contexts, such as in high-stakes decision-making and societal settings, it is clear that these models are not just evaluated by their average performances. In this work, we study a novel notion of L-Risk based on the classical idea of rank-weighted learning. These L-Risks, induced by rank-dependent weighting functions with bounded variation, is a unification of popular risk measures such as conditional value-at-risk and those defined by cumulative prospect theory. We give uniform convergence bounds of this broad class of risk measures and study their consequences on a logistic regression example.}
}
@InProceedings{pmlr-v119-kim20a,
title = {{FACT}: A Diagnostic for Group Fairness Trade-offs},
author = {Kim, Joon Sik and Chen, Jiahao and Talwalkar, Ameet},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5264--5274},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kim20a/kim20a.pdf},
url = {http://proceedings.mlr.press/v119/kim20a.html},
abstract = {Group fairness, a class of fairness notions that measure how different groups of individuals are treated differently according to their protected attributes, has been shown to conflict with one another, often with a necessary cost in loss of model's predictive performance. We propose a general diagnostic that enables systematic characterization of these trade-offs in group fairness. We observe that the majority of group fairness notions can be expressed via the fairness-confusion tensor, which is the confusion matrix split according to the protected attribute values. We frame several optimization problems that directly optimize both accuracy and fairness objectives over the elements of this tensor, which yield a general perspective for understanding multiple trade-offs including group fairness incompatibilities. It also suggests an alternate post-processing method for designing fair classifiers. On synthetic and real datasets, we demonstrate the use cases of our diagnostic, particularly on understanding the trade-off landscape between accuracy and fairness.}
}
@InProceedings{pmlr-v119-kim20b,
title = {{Puzzle Mix}: Exploiting Saliency and Local Statistics for Optimal Mixup},
author = {Kim, Jang-Hyun and Choo, Wonho and Song, Hyun Oh},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5275--5285},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kim20b/kim20b.pdf},
url = {http://proceedings.mlr.press/v119/kim20b.html},
abstract = {While deep neural networks achieve great performance on fitting the training distribution, the learned networks are prone to overfitting and are susceptible to adversarial attacks. In this regard, a number of mixup based augmentation methods have been recently proposed. However, these approaches mainly focus on creating previously unseen virtual examples and can sometimes provide misleading supervisory signal to the network. To this end, we propose Puzzle Mix, a mixup method for explicitly utilizing the saliency information and the underlying statistics of the natural examples. This leads to an interesting optimization problem alternating between the multi-label objective for optimal mixing mask and saliency discounted optimal transport objective. Our experiments show Puzzle Mix achieves the state of the art generalization and the adversarial robustness results compared to other mixup methods on CIFAR-100, Tiny-ImageNet, and ImageNet datasets, and the source code is available at https://github.com/snu-mllab/PuzzleMix.}
}
@InProceedings{pmlr-v119-kim20c,
title = {Domain Adaptive Imitation Learning},
author = {Kim, Kuno and Gu, Yihong and Song, Jiaming and Zhao, Shengjia and Ermon, Stefano},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5286--5295},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kim20c/kim20c.pdf},
url = {http://proceedings.mlr.press/v119/kim20c.html},
abstract = {We study the question of how to imitate tasks across domains with discrepancies such as embodiment, viewpoint, and dynamics mismatch. Many prior works require paired, aligned demonstrations and an additional RL step that requires environment interactions. However, paired, aligned demonstrations are seldom obtainable and RL procedures are expensive. In this work, we formalize the Domain Adaptive Imitation Learning (DAIL) problem - a unified framework for imitation learning in the presence of viewpoint, embodiment, and/or dynamics mismatch. Informally, DAIL is the process of learning how to perform a task optimally, given demonstrations of the task in a distinct domain. We propose a two step approach to DAIL: alignment followed by adaptation. In the alignment step we execute a novel unsupervised MDP alignment algorithm, Generative Adversarial MDP Alignment (GAMA), to learn state and action correspondences from \emph{unpaired, unaligned} demonstrations. In the adaptation step we leverage the correspondences to zero-shot imitate tasks across domains. To describe when DAIL is feasible via alignment and adaptation, we introduce a theory of MDP alignability. We experimentally evaluate GAMA against baselines in embodiment, viewpoint, and dynamics mismatch scenarios where aligned demonstrations don't exist and show the effectiveness of our approach.}
}
@InProceedings{pmlr-v119-kim20d,
title = {Variational Inference for Sequential Data with Future Likelihood Estimates},
author = {Kim, Geon-Hyeong and Jang, Youngsoo and Yang, Hongseok and Kim, Kee-Eung},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5296--5305},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kim20d/kim20d.pdf},
url = {http://proceedings.mlr.press/v119/kim20d.html},
abstract = {The recent development of flexible and scalable variational inference algorithms has popularized the use of deep probabilistic models in a wide range of applications. However, learning and reasoning about high-dimensional models with nondifferentiable densities are still a challenge. For such a model, inference algorithms struggle to estimate the gradients of variational objectives accurately, due to high variance in their estimates. To tackle this challenge, we present a novel variational inference algorithm for sequential data, which performs well even when the density from the model is not differentiable, for instance, due to the use of discrete random variables. The key feature of our algorithm is that it estimates future likelihoods at all time steps. The estimated future likelihoods form the core of our new low-variance gradient estimator. We formally analyze our gradient estimator from the perspective of variational objective, and show the effectiveness of our algorithm with synthetic and real datasets.}
}
@InProceedings{pmlr-v119-kim20e,
title = {Active World Model Learning with Progress Curiosity},
author = {Kim, Kuno and Sano, Megumi and De Freitas, Julian and Haber, Nick and Yamins, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5306--5315},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kim20e/kim20e.pdf},
url = {http://proceedings.mlr.press/v119/kim20e.html},
abstract = {World models are self-supervised predictive models of how the world evolves. Humans learn world models by curiously exploring their environment, in the process acquiring compact abstractions of high bandwidth sensory inputs, the ability to plan across long temporal horizons, and an understanding of the behavioral patterns of other agents. In this work, we study how to design such a curiosity-driven Active World Model Learning (AWML) system. To do so, we construct a curious agent building world models while visually exploring a 3D physical environment rich with distillations of representative real-world agents. We propose an AWML system driven by $\gamma$-Progress: a scalable and effective learning progress-based curiosity signal and show that $\gamma$-Progress naturally gives rise to an exploration policy that directs attention to complex but learnable dynamics in a balanced manner, as a result overcoming the ``white noise problem''. As a result, our $\gamma$-Progress-driven controller achieves significantly higher AWML performance than baseline controllers equipped with state-of-the-art exploration strategies such as Random Network Distillation and Model Disagreement.}
}
@InProceedings{pmlr-v119-kleinegesse20a,
title = {{Bayesian} Experimental Design for Implicit Models by Mutual Information Neural Estimation},
author = {Kleinegesse, Steven and Gutmann, Michael U.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5316--5326},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kleinegesse20a/kleinegesse20a.pdf},
url = {http://proceedings.mlr.press/v119/kleinegesse20a.html},
abstract = {Implicit stochastic models, where the data-generation distribution is intractable but sampling is possible, are ubiquitous in the natural sciences. The models typically have free parameters that need to be inferred from data collected in scientific experiments. A fundamental question is how to design the experiments so that the collected data are most useful. The field of Bayesian experimental design advocates that, ideally, we should choose designs that maximise the mutual information (MI) between the data and the parameters. For implicit models, however, this approach is severely hampered by the high computational cost of computing posteriors and maximising MI, in particular when we have more than a handful of design variables to optimise. In this paper, we propose a new approach to Bayesian experimental design for implicit models that leverages recent advances in neural MI estimation to deal with these issues. We show that training a neural network to maximise a lower bound on MI allows us to jointly determine the optimal design and the posterior. Simulation studies illustrate that this gracefully extends Bayesian experimental design for implicit models to higher design dimensions.}
}
@InProceedings{pmlr-v119-knoblauch20a,
title = {Optimal Continual Learning has Perfect Memory and is {NP}-hard},
author = {Knoblauch, Jeremias and Husain, Hisham and Diethe, Tom},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5327--5337},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/knoblauch20a/knoblauch20a.pdf},
url = {http://proceedings.mlr.press/v119/knoblauch20a.html},
abstract = {Continual Learning (CL) algorithms incrementally learn a predictor or representation across multiple sequentially observed tasks. Designing CL algorithms that perform reliably and avoid so-called catastrophic forgetting has proven a persistent challenge. The current paper develops a theoretical approach that explains why. In particular, we derive the computational properties which CL algorithms would have to possess in order to avoid catastrophic forgetting. Our main finding is that such optimal CL algorithms generally solve an NP-hard problem and will require perfect memory to do so. The findings are of theoretical interest, but also explain the excellent performance of CL algorithms using experience replay, episodic memory and core sets relative to regularization-based approaches.}
}
@InProceedings{pmlr-v119-koh20a,
title = {Concept Bottleneck Models},
author = {Koh, Pang Wei and Nguyen, Thao and Tang, Yew Siang and Mussmann, Stephen and Pierson, Emma and Kim, Been and Liang, Percy},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5338--5348},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/koh20a/koh20a.pdf},
url = {http://proceedings.mlr.press/v119/koh20a.html},
abstract = {We seek to learn models that we can interact with using high-level concepts: if the model did not think there was a bone spur in the x-ray, would it still predict severe arthritis? State-of-the-art models today do not typically support the manipulation of concepts like "the existence of bone spurs", as they are trained end-to-end to go directly from raw input (e.g., pixels) to output (e.g., arthritis severity). We revisit the classic idea of first predicting concepts that are provided at training time, and then using these concepts to predict the label. By construction, we can intervene on these concept bottleneck models by editing their predicted concept values and propagating these changes to the final prediction. On x-ray grading and bird identification, concept bottleneck models achieve competitive accuracy with standard end-to-end models, while enabling interpretation in terms of high-level clinical concepts ("bone spurs") or bird attributes ("wing color"). These models also allow for richer human-model interaction: accuracy improves significantly if we can correct model mistakes on concepts at test time.}
}
@InProceedings{pmlr-v119-kohl20a,
title = {Learning Similarity Metrics for Numerical Simulations},
author = {Kohl, Georg and Um, Kiwon and Thuerey, Nils},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5349--5360},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kohl20a/kohl20a.pdf},
url = {http://proceedings.mlr.press/v119/kohl20a.html},
abstract = {We propose a neural network-based approach that computes a stable and generalizing metric (LSiM) to compare data from a variety of numerical simulation sources. We focus on scalar time-dependent 2D data that commonly arises from motion and transport-based partial differential equations (PDEs). Our method employs a Siamese network architecture that is motivated by the mathematical properties of a metric. We leverage a controllable data generation setup with PDE solvers to create increasingly different outputs from a reference simulation in a controlled environment. A central component of our learned metric is a specialized loss function that introduces knowledge about the correlation between single data samples into the training process. To demonstrate that the proposed approach outperforms existing metrics for vector spaces and other learned, image-based metrics, we evaluate the different methods on a large range of test data. Additionally, we analyze generalization benefits of an adjustable training data difficulty and demonstrate the robustness of LSiM via an evaluation on three real-world data sets.}
}
@InProceedings{pmlr-v119-kohler20a,
title = {Equivariant Flows: Exact Likelihood Generative Learning for Symmetric Densities},
author = {K{\"o}hler, Jonas and Klein, Leon and Noe, Frank},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5361--5370},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kohler20a/kohler20a.pdf},
url = {http://proceedings.mlr.press/v119/kohler20a.html},
abstract = {Normalizing flows are exact-likelihood generative neural networks which approximately transform samples from a simple prior distribution to samples of the probability distribution of interest. Recent work showed that such generative models can be utilized in statistical mechanics to sample equilibrium states of many-body systems in physics and chemistry. To scale and generalize these results, it is essential that the natural symmetries in the probability density -- in physics defined by the invariances of the target potential -- are built into the flow. We provide a theoretical sufficient criterion showing that the distribution generated by equivariant normalizing flows is invariant with respect to these symmetries by design. Furthermore, we propose building blocks for flows which preserve symmetries which are usually found in physical/chemical many-body particle systems. Using benchmark systems motivated from molecular physics, we demonstrate that those symmetry preserving flows can provide better generalization capabilities and sampling efficiency.}
}
@InProceedings{pmlr-v119-kolobov20a,
title = {Online Learning for Active Cache Synchronization},
author = {Kolobov, Andrey and Bubeck, Sebastien and Zimmert, Julian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5371--5380},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kolobov20a/kolobov20a.pdf},
url = {http://proceedings.mlr.press/v119/kolobov20a.html},
abstract = {Existing multi-armed bandit (MAB) models make two implicit assumptions: an arm generates a payoff only when it is played, and the agent observes every payoff that is generated. This paper introduces synchronization bandits, a MAB variant where all arms generate costs at all times, but the agent observes an arm's instantaneous cost only when the arm is played. Synchronization MABs are inspired by online caching scenarios such as Web crawling, where an arm corresponds to a cached item and playing the arm means downloading its fresh copy from a server. We present MirrorSync, an online learning algorithm for synchronization bandits, establish an adversarial regret of $O(T^{2/3})$ for it, and show how to make it practical.}
}
@InProceedings{pmlr-v119-koloskova20a,
title = {A Unified Theory of Decentralized {SGD} with Changing Topology and Local Updates},
author = {Koloskova, Anastasia and Loizou, Nicolas and Boreiri, Sadra and Jaggi, Martin and Stich, Sebastian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5381--5393},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/koloskova20a/koloskova20a.pdf},
url = {http://proceedings.mlr.press/v119/koloskova20a.html},
abstract = {Decentralized stochastic optimization methods have gained a lot of attention recently, mainly because of their cheap per iteration cost, data locality, and their communication-efficiency. In this paper we introduce a unified convergence analysis that covers a large variety of decentralized SGD methods which so far have required different intuitions, have different applications, and which have been developed separately in various communities. Our algorithmic framework covers local SGD updates and synchronous and pairwise gossip updates on adaptive network topology. We derive universal convergence rates for smooth (convex and non-convex) problems and the rates interpolate between the heterogeneous (non-identically distributed data) and iid-data settings, recovering linear convergence rates in many special cases, for instance for over-parametrized models. Our proofs rely on weak assumptions (typically improving over prior work in several aspects) and recover (and improve) the best known complexity results for a host of important scenarios, such as for instance cooperative SGD and federated averaging (local SGD).}
}
@InProceedings{pmlr-v119-kong20a,
title = {Meta-learning for Mixed Linear Regression},
author = {Kong, Weihao and Somani, Raghav and Song, Zhao and Kakade, Sham and Oh, Sewoong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5394--5404},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kong20a/kong20a.pdf},
url = {http://proceedings.mlr.press/v119/kong20a.html},
abstract = {In modern supervised learning, there are a large number of tasks, but many of them are associated with only a small amount of labelled data. These include data from medical image processing and robotic interaction. Even though each individual task cannot be meaningfully trained in isolation, one seeks to meta-learn across the tasks from past experiences by exploiting some similarities. We study a fundamental question of interest: When can abundant tasks with small data compensate for lack of tasks with big data? We focus on a canonical scenario where each task is drawn from a mixture of $k$ linear regressions, and identify sufficient conditions for such a graceful exchange to hold; there is little loss in sample complexity even when we only have access to small data tasks. To this end, we introduce a novel spectral approach and show that we can efficiently utilize small data tasks with the help of $\tilde\Omega(k^{3/2})$ medium data tasks each with $\tilde\Omega(k^{1/2})$ examples.}
}
@InProceedings{pmlr-v119-kong20b,
title = {{SDE}-Net: Equipping Deep Neural Networks with Uncertainty Estimates},
author = {Kong, Lingkai and Sun, Jimeng and Zhang, Chao},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5405--5415},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kong20b/kong20b.pdf},
url = {http://proceedings.mlr.press/v119/kong20b.html},
abstract = {Uncertainty quantification is a fundamental yet unsolved problem for deep learning. The Bayesian framework provides a principled way of uncertainty estimation but is often not scalable to modern deep neural nets (DNNs) that have a large number of parameters. Non-Bayesian methods are simple to implement but often conflate different sources of uncertainties and require huge computing resources. We propose a new method for quantifying uncertainties of DNNs from a dynamical system perspective. The core of our method is to view DNN transformations as state evolution of a stochastic dynamical system and introduce a Brownian motion term for capturing epistemic uncertainty. Based on this perspective, we propose a neural stochastic differential equation model (SDE-Net) which consists of (1) a drift net that controls the system to fit the predictive function; and (2) a diffusion net that captures epistemic uncertainty. We theoretically analyze the existence and uniqueness of the solution to SDE-Net. Our experiments demonstrate that the SDE-Net model can outperform existing uncertainty estimation methods across a series of tasks where uncertainty plays a fundamental role.}
}
@InProceedings{pmlr-v119-konstantinov20a,
title = {On the Sample Complexity of Adversarial Multi-Source {PAC} Learning},
author = {Konstantinov, Nikola and Frantar, Elias and Alistarh, Dan and Lampert, Christoph},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5416--5425},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/konstantinov20a/konstantinov20a.pdf},
url = {http://proceedings.mlr.press/v119/konstantinov20a.html},
abstract = {We study the problem of learning from multiple untrusted data sources, a scenario of increasing practical relevance given the recent emergence of crowdsourcing and collaborative learning paradigms. Specifically, we analyze the situation in which a learning system obtains datasets from multiple sources, some of which might be biased or even adversarially perturbed. It is known that in the single-source case, an adversary with the power to corrupt a fixed fraction of the training data can prevent PAC-learnability, that is, even in the limit of infinitely much training data, no learning system can approach the optimal test error. In this work we show that, surprisingly, the same is not true in the multi-source setting, where the adversary can arbitrarily corrupt a fixed fraction of the data sources. Our main results are a generalization bound that provides finite-sample guarantees for this learning setting, as well as corresponding lower bounds. Besides establishing PAC-learnability our results also show that in a cooperative learning setting sharing data with other parties has provable benefits, even if some participants are malicious.}
}
@InProceedings{pmlr-v119-kostas20a,
title = {Asynchronous Coagent Networks},
author = {Kostas, James and Nota, Chris and Thomas, Philip},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5426--5435},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kostas20a/kostas20a.pdf},
url = {http://proceedings.mlr.press/v119/kostas20a.html},
abstract = {Coagent policy gradient algorithms (CPGAs) are reinforcement learning algorithms for training a class of stochastic neural networks called coagent networks. In this work, we prove that CPGAs converge to locally optimal policies. Additionally, we extend prior theory to encompass asynchronous and recurrent coagent networks. These extensions facilitate the straightforward design and analysis of hierarchical reinforcement learning algorithms like the option-critic, and eliminate the need for complex derivations of customized learning rules for these algorithms.}
}
@InProceedings{pmlr-v119-kristiadi20a,
title = {Being {Bayesian}, Even Just a Bit, Fixes Overconfidence in {ReLU} Networks},
author = {Kristiadi, Agustinus and Hein, Matthias and Hennig, Philipp},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5436--5446},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kristiadi20a/kristiadi20a.pdf},
url = {http://proceedings.mlr.press/v119/kristiadi20a.html},
abstract = {The point estimates of ReLU classification networks---arguably the most widely used neural network architecture---have been shown to yield arbitrarily high confidence far away from the training data. This architecture, in conjunction with a maximum a posteriori estimation scheme, is thus not calibrated nor robust. Approximate Bayesian inference has been empirically demonstrated to improve predictive uncertainty in neural networks, although the theoretical analysis of such Bayesian approximations is limited. We theoretically analyze approximate Gaussian distributions on the weights of ReLU networks and show that they fix the overconfidence problem. Furthermore, we show that even a simplistic, thus cheap, Bayesian approximation, also fixes these issues. This indicates that a sufficient condition for a calibrated uncertainty on a ReLU network is ``to be a bit Bayesian''. These theoretical results validate the usage of last-layer Bayesian approximation and motivate a range of a fidelity-cost trade-off. We further validate these findings empirically via various standard experiments using common deep ReLU networks and Laplace approximations.}
}
@InProceedings{pmlr-v119-kumar20a,
title = {A Sequential Self Teaching Approach for Improving Generalization in Sound Event Recognition},
author = {Kumar, Anurag and Ithapu, Vamsi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5447--5457},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumar20a/kumar20a.pdf},
url = {http://proceedings.mlr.press/v119/kumar20a.html},
abstract = {An important problem in machine auditory perception is to recognize and detect sound events. In this paper, we propose a sequential self-teaching approach to learning sounds. Our main proposition is that it is harder to learn sounds in adverse situations such as from weakly labeled and/or noisy labeled data, and in these situations a single stage of learning is not sufficient. Our proposal is a sequential stage-wise learning process that improves generalization capabilities of a given modeling system. We justify this method via technical results and on Audioset, the largest sound events dataset, our sequential learning approach can lead to up to 9\% improvement in performance. A comprehensive evaluation also shows that the method leads to improved transferability of knowledge from previously trained models, thereby leading to improved generalization capabilities on transfer learning tasks.}
}
@InProceedings{pmlr-v119-kumar20b,
title = {Curse of Dimensionality on Randomized Smoothing for Certifiable Robustness},
author = {Kumar, Aounon and Levine, Alexander and Goldstein, Tom and Feizi, Soheil},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5458--5467},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumar20b/kumar20b.pdf},
url = {http://proceedings.mlr.press/v119/kumar20b.html},
abstract = {Randomized smoothing, using just a simple isotropic Gaussian distribution, has been shown to produce good robustness guarantees against $\ell_2$-norm bounded adversaries. In this work, we show that extending the smoothing technique to defend against other attack models can be challenging, especially in the high-dimensional regime. In particular, for a vast class of i.i.d. smoothing distributions, we prove that the largest $\ell_p$-radius that can be certified decreases as $O(1/d^{\frac{1}{2} - \frac{1}{p}})$ with dimension $d$ for $p > 2$. Notably, for $p \geq 2$, this dependence on $d$ is no better than that of the $\ell_p$-radius that can be certified using isotropic Gaussian smoothing, essentially putting a matching lower bound on the robustness radius. When restricted to \emph{generalized} Gaussian smoothing, these two bounds can be shown to be within a constant factor of each other in an asymptotic sense, establishing that Gaussian smoothing provides the best possible results, up to a constant factor, when $p \geq 2$. We present experimental results on CIFAR to validate our theory. For other smoothing distributions, such as, a uniform distribution within an $\ell_1$ or an $\ell_\infty$-norm ball, we show upper bounds of the form $O(1 / d)$ and $O(1 / d^{1 - \frac{1}{p}})$ respectively, which have an even worse dependence on $d$.}
}
@InProceedings{pmlr-v119-kumar20c,
title = {Understanding Self-Training for Gradual Domain Adaptation},
author = {Kumar, Ananya and Ma, Tengyu and Liang, Percy},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5468--5479},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumar20c/kumar20c.pdf},
url = {http://proceedings.mlr.press/v119/kumar20c.html},
abstract = {Machine learning systems must adapt to data distributions that evolve over time, in applications ranging from sensor networks and self-driving car perception modules to brain-machine interfaces. Traditional domain adaptation is only guaranteed to work when the distribution shift is small; empirical methods combine several heuristics for larger shifts but can be dataset specific. To adapt to larger shifts we consider gradual domain adaptation, where the goal is to adapt an initial classifier trained on a source domain given only unlabeled data that shifts gradually in distribution towards a target domain. We prove the first non-vacuous upper bound on the error of self-training with gradual shifts, under settings where directly adapting to the target domain can result in unbounded error. The theoretical analysis leads to algorithmic insights, highlighting that regularization and label sharpening are essential even when we have infinite data. Leveraging the gradual shift structure leads to higher accuracies on a rotating MNIST dataset, a forest Cover Type dataset, and a realistic Portraits dataset.}
}
@InProceedings{pmlr-v119-kumar20d,
title = {On Implicit Regularization in {$\beta$}-{VAE}s},
author = {Kumar, Abhishek and Poole, Ben},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5480--5490},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumar20d/kumar20d.pdf},
url = {http://proceedings.mlr.press/v119/kumar20d.html},
abstract = {While the impact of variational inference (VI) on posterior inference in a fixed generative model is well-characterized, its role in regularizing a learned generative model when used in variational autoencoders (VAEs) is poorly understood. We study the regularizing effects of variational distributions on learning in generative models from two perspectives. First, we analyze the role that the choice of variational family plays in imparting uniqueness to the learned model by restricting the set of optimal generative models. Second, we study the regularization effect of the variational family on the local geometry of the decoding model. This analysis uncovers the regularizer implicit in the $\beta$-VAE objective, and leads to an approximation consisting of a deterministic autoencoding objective plus analytic regularizers that depend on the Hessian or Jacobian of the decoding model, unifying VAEs with recent heuristics proposed for training regularized autoencoders. We empirically verify these findings, observing that the proposed deterministic objective exhibits similar behavior to the $\beta$-VAE in terms of objective value and sample quality.}
}
@InProceedings{pmlr-v119-kumar20e,
title = {Problems with {Shapley}-value-based explanations as feature importance measures},
author = {Kumar, I. Elizabeth and Venkatasubramanian, Suresh and Scheidegger, Carlos and Friedler, Sorelle},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5491--5500},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumar20e/kumar20e.pdf},
url = {http://proceedings.mlr.press/v119/kumar20e.html},
abstract = {Game-theoretic formulations of feature importance have become popular as a way to ``explain'' machine learning models. These methods define a cooperative game between the features of a model and distribute influence among these input elements using some form of the game’s unique Shapley values. Justification for these methods rests on two pillars: their desirable mathematical properties, and their applicability to specific motivations for explanations. We show that mathematical problems arise when Shapley values are used for feature importance and that the solutions to mitigate these necessarily induce further complexity, such as the need for causal reasoning. We also draw on additional literature to argue that Shapley values do not provide explanations which suit human-centric goals of explainability.}
}
@InProceedings{pmlr-v119-kumor20a,
title = {Efficient Identification in Linear Structural Causal Models with Auxiliary Cutsets},
author = {Kumor, Daniel and Cinelli, Carlos and Bareinboim, Elias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5501--5510},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kumor20a/kumor20a.pdf},
url = {http://proceedings.mlr.press/v119/kumor20a.html},
abstract = {We develop a polynomial-time algorithm for identification of structural coefficients in linear causal models that subsumes previous efficient state-of-the-art methods, unifying several disparate approaches to identification in this setting. Building on these results, we develop a procedure for identifying total causal effects in linear systems.}
}
@InProceedings{pmlr-v119-kunin20a,
title = {Two Routes to Scalable Credit Assignment without Weight Symmetry},
author = {Kunin, Daniel and Nayebi, Aran and Sagastuy-Brena, Javier and Ganguli, Surya and Bloom, Jonathan and Yamins, Daniel},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5511--5521},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kunin20a/kunin20a.pdf},
url = {http://proceedings.mlr.press/v119/kunin20a.html},
abstract = {The neural plausibility of backpropagation has long been disputed, primarily for its use of non-local weight transport — the biologically dubious requirement that one neuron instantaneously measure the synaptic weights of another. Until recently, attempts to create local learning rules that avoid weight transport have typically failed in the large-scale learning scenarios where backpropagation shines, e.g. ImageNet categorization with deep convolutional networks. Here, we investigate a recently proposed local learning rule that yields competitive performance with backpropagation and find that it is highly sensitive to metaparameter choices, requiring laborious tuning that does not transfer across network architecture. Our analysis indicates the underlying mathematical reason for this instability, allowing us to identify a more robust local learning rule that better transfers without metaparameter tuning. Nonetheless, we find a performance and stability gap between this local rule and backpropagation that widens with increasing model depth. We then investigate several non-local learning rules that relax the need for instantaneous weight transport into a more biologically-plausible ``weight estimation'' process, showing that these rules match state-of-the-art performance on deep networks and operate effectively in the presence of noisy updates. Taken together, our results suggest two routes towards the discovery of neural implementations for credit assignment without weight symmetry: further improvement of local rules so that they perform consistently across architectures and the identification of biological implementations for non-local learning mechanisms.}
}
@InProceedings{pmlr-v119-kuroki20a,
title = {Online Dense Subgraph Discovery via Blurred-Graph Feedback},
author = {Kuroki, Yuko and Miyauchi, Atsushi and Honda, Junya and Sugiyama, Masashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5522--5532},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kuroki20a/kuroki20a.pdf},
url = {http://proceedings.mlr.press/v119/kuroki20a.html},
abstract = {Dense subgraph discovery aims to find a dense component in edge-weighted graphs. This is a fundamental graph-mining task with a variety of applications and thus has received much attention recently. Although most existing methods assume that each individual edge weight is easily obtained, such an assumption is not necessarily valid in practice. In this paper, we introduce a novel learning problem for dense subgraph discovery in which a learner queries edge subsets rather than only single edges and observes a noisy sum of edge weights in a queried subset. For this problem, we first propose a polynomial-time algorithm that obtains a nearly-optimal solution with high probability. Moreover, to deal with large-sized graphs, we design a more scalable algorithm with a theoretical guarantee. Computational experiments using real-world graphs demonstrate the effectiveness of our algorithms.}
}
@InProceedings{pmlr-v119-kurtz20a,
title = {Inducing and Exploiting Activation Sparsity for Fast Inference on Deep Neural Networks},
author = {Kurtz, Mark and Kopinsky, Justin and Gelashvili, Rati and Matveev, Alexander and Carr, John and Goin, Michael and Leiserson, William and Moore, Sage and Shavit, Nir and Alistarh, Dan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5533--5543},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kurtz20a/kurtz20a.pdf},
url = {http://proceedings.mlr.press/v119/kurtz20a.html},
abstract = {Optimizing convolutional neural networks for fast inference has recently become an extremely active area of research. One of the go-to solutions in this context is weight pruning, which aims to reduce computational and memory footprint by removing large subsets of the connections in a neural network. Surprisingly, much less attention has been given to exploiting sparsity in the activation maps, which tend to be naturally sparse in many settings thanks to the structure of rectified linear (ReLU) activation functions. In this paper, we present an in-depth analysis of methods for maximizing the sparsity of the activations in a trained neural network, and show that, when coupled with an efficient sparse-input convolution algorithm, we can leverage this sparsity for significant performance gains. To induce highly sparse activation maps without accuracy loss, we introduce a new regularization technique, coupled with a new threshold-based sparsification method based on a parameterized activation function called Forced-Activation-Threshold Rectified Linear Unit (FATReLU). We examine the impact of our methods on popular image classification models, showing that most architectures can adapt to significantly sparser activation maps without any accuracy loss. Our second contribution is showing that these compression gains can be translated into inference speedups: we provide a new algorithm to enable fast convolution operations over networks with sparse activations, and show that it can enable significant speedups for end-to-end inference on a range of popular models on the large-scale ImageNet image classification task on modern Intel CPUs, with little or no retraining cost.}
}
@InProceedings{pmlr-v119-kusupati20a,
title = {Soft Threshold Weight Reparameterization for Learnable Sparsity},
author = {Kusupati, Aditya and Ramanujan, Vivek and Somani, Raghav and Wortsman, Mitchell and Jain, Prateek and Kakade, Sham and Farhadi, Ali},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5544--5555},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kusupati20a/kusupati20a.pdf},
url = {http://proceedings.mlr.press/v119/kusupati20a.html},
abstract = {Sparsity in Deep Neural Networks (DNNs) is studied extensively with the focus of maximizing prediction accuracy given an overall parameter budget. Existing methods rely on uniform or heuristic non-uniform sparsity budgets which have sub-optimal layer-wise parameter allocation resulting in a) lower prediction accuracy or b) higher inference cost (FLOPs). This work proposes Soft Threshold Reparameterization (STR), a novel use of the soft-threshold operator on DNN weights. STR smoothly induces sparsity while learning pruning thresholds thereby obtaining a non-uniform sparsity budget. Our method achieves state-of-the-art accuracy for unstructured sparsity in CNNs (ResNet50 and MobileNetV1 on ImageNet-1K), and, additionally, learns non-uniform budgets that empirically reduce the FLOPs by up to 50\%. Notably, STR boosts the accuracy over existing results by up to 10\% in the ultra sparse (99\%) regime and can also be used to induce low-rank (structured sparsity) in RNNs. In short, STR is a simple mechanism which learns effective sparsity budgets that contrast with popular heuristics. Code, pretrained models and sparsity budgets are at https://github.com/RAIVNLab/STR.}
}
@InProceedings{pmlr-v119-kuznetsov20a,
title = {Controlling Overestimation Bias with Truncated Mixture of Continuous Distributional Quantile Critics},
author = {Kuznetsov, Arsenii and Shvechikov, Pavel and Grishin, Alexander and Vetrov, Dmitry},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5556--5566},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kuznetsov20a/kuznetsov20a.pdf},
url = {http://proceedings.mlr.press/v119/kuznetsov20a.html},
abstract = {The overestimation bias is one of the major impediments to accurate off-policy learning. This paper investigates a novel way to alleviate the overestimation bias in a continuous control setting. Our method—Truncated Quantile Critics, TQC,—blends three ideas: distributional representation of a critic, truncation of critics prediction, and ensembling of multiple critics. Distributional representation and truncation allow for arbitrary granular overestimation control, while ensembling provides additional score improvements. TQC outperforms the current state of the art on all environments from the continuous control benchmark suite, demonstrating 25\% improvement on the most challenging Humanoid environment.}
}
@InProceedings{pmlr-v119-kwon20a,
title = {Principled learning method for {Wasserstein} distributionally robust optimization with local perturbations},
author = {Kwon, Yongchan and Kim, Wonyoung and Won, Joong-Ho and Paik, Myunghee Cho},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5567--5576},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/kwon20a/kwon20a.pdf},
url = {http://proceedings.mlr.press/v119/kwon20a.html},
abstract = {Wasserstein distributionally robust optimization (WDRO) attempts to learn a model that minimizes the local worst-case risk in the vicinity of the empirical data distribution defined by Wasserstein ball. While WDRO has received attention as a promising tool for inference since its introduction, its theoretical understanding has not been fully matured. Gao et al. (2017) proposed a minimizer based on a tractable approximation of the local worst-case risk, but without showing risk consistency. In this paper, we propose a minimizer based on a novel approximation theorem and provide the corresponding risk consistency results. Furthermore, we develop WDRO inference for locally perturbed data that include the Mixup (Zhang et al., 2017) as a special case. We show that our approximation and risk consistency results naturally extend to the cases when data are locally perturbed. Numerical experiments demonstrate robustness of the proposed method using image classification datasets. Our results show that the proposed method achieves significantly higher accuracy than baseline models on noisy datasets.}
}
@InProceedings{pmlr-v119-l-a-20a,
title = {Concentration bounds for {CVaR} estimation: The cases of light-tailed and heavy-tailed distributions},
author = {L.A., Prashanth and Jagannathan, Krishna and Kolla, Ravi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5577--5586},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/l-a-20a/l-a-20a.pdf},
url = {http://proceedings.mlr.press/v119/l-a-20a.html},
abstract = {Conditional Value-at-Risk (CVaR) is a widely used risk metric in applications such as finance. We derive concentration bounds for CVaR estimates, considering separately the cases of sub-Gaussian, light-tailed and heavy-tailed distributions. For the sub-Gaussian and light-tailed cases, we use a classical CVaR estimator based on the empirical distribution constructed from the samples. For heavy-tailed random variables, we assume a mild ‘bounded moment’ condition, and derive a concentration bound for a truncation-based estimator. Our concentration bounds exhibit exponential decay in the sample size, and are tighter than those available in the literature for the above distribution classes. To demonstrate the applicability of our concentration results, we consider the CVaR optimization problem in a multi-armed bandit setting. Specifically, we address the best CVaR-arm identification problem under a fixed budget. Using our CVaR concentration results, we derive an upper-bound on the probability of incorrect arm identification.}
}
@InProceedings{pmlr-v119-lacotte20a,
title = {Optimal Randomized First-Order Methods for Least-Squares Problems},
author = {Lacotte, Jonathan and Pilanci, Mert},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5587--5597},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lacotte20a/lacotte20a.pdf},
url = {http://proceedings.mlr.press/v119/lacotte20a.html},
abstract = {We provide an exact analysis of a class of randomized algorithms for solving overdetermined least-squares problems. We consider first-order methods, where the gradients are pre-conditioned by an approximation of the Hessian, based on a subspace embedding of the data matrix. This class of algorithms encompasses several randomized methods among the fastest solvers for least-squares problems. We focus on two classical embeddings, namely, Gaussian projections and subsampled randomized Hadamard transforms (SRHT). Our key technical innovation is the derivation of the limiting spectral density of SRHT embeddings. Leveraging this novel result, we derive the family of normalized orthogonal polynomials of the SRHT density and we find the optimal pre-conditioned first-order method along with its rate of convergence. Our analysis of Gaussian embeddings proceeds similarly, and leverages classical random matrix theory results. In particular, we show that for a given sketch size, SRHT embeddings exhibit a faster rate of convergence than Gaussian embeddings. Then, we propose a new algorithm by optimizing the computational complexity over the choice of the sketching dimension. To our knowledge, our resulting algorithm yields the best known complexity for solving least-squares problems with no condition number dependence.}
}
@InProceedings{pmlr-v119-laforgue20a,
title = {Duality in {RKHS}s with Infinite Dimensional Outputs: Application to Robust Losses},
author = {Laforgue, Pierre and Lambert, Alex and Brogat-Motte, Luc and D'Alch{\'e}-Buc, Florence},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5598--5607},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/laforgue20a/laforgue20a.pdf},
url = {http://proceedings.mlr.press/v119/laforgue20a.html},
abstract = {Operator-Valued Kernels (OVKs) and associated vector-valued Reproducing Kernel Hilbert Spaces provide an elegant way to extend scalar kernel methods when the output space is a Hilbert space. Although primarily used in finite dimension for problems like multi-task regression, the ability of this framework to deal with infinite dimensional output spaces unlocks many more applications, such as functional regression, structured output prediction, and structured data representation. However, these sophisticated schemes crucially rely on the kernel trick in the output space, so that most of previous works have focused on the square norm loss function, completely neglecting robustness issues that may arise in such surrogate problems. To overcome this limitation, this paper develops a duality approach that allows to solve OVK machines for a wide range of loss functions. The infinite dimensional Lagrange multipliers are handled through a Double Representer Theorem, and algorithms for $\epsilon$-insensitive losses and the Huber loss are thoroughly detailed. Robustness benefits are emphasized by a theoretical stability analysis, as well as empirical improvements on structured data applications.}
}
@InProceedings{pmlr-v119-lai20a,
title = {{Recht}-{R{\'e}} Noncommutative Arithmetic-Geometric Mean Conjecture is False},
author = {Lai, Zehua and Lim, Lek-Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5608--5617},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lai20a/lai20a.pdf},
url = {http://proceedings.mlr.press/v119/lai20a.html},
abstract = {Stochastic optimization algorithms have become indispensable in modern machine learning. An unresolved foundational question in this area is the difference between with-replacement sampling and without-replacement sampling — does the latter have superior convergence rate compared to the former? A groundbreaking result of Recht and Ré reduces the problem to a noncommutative analogue of the arithmetic-geometric mean inequality where $n$ positive numbers are replaced by $n$ positive definite matrices. If this inequality holds for all $n$, then without-replacement sampling (also known as random reshuffling) indeed outperforms with-replacement sampling in some important optimization problems. The conjectured Recht–Ré inequality has so far only been established for $n = 2$ and a special case of $n = 3$. We will show that the Recht–Ré conjecture is false for general $n$. Our approach relies on the noncommutative Positivstellensatz, which allows us to reduce the conjectured inequality to a semidefinite program and the validity of the conjecture to certain bounds for the optimum values, which we show are false as soon as $n = 5$.}
}
@InProceedings{pmlr-v119-lai20b,
title = {Bidirectional Model-based Policy Optimization},
author = {Lai, Hang and Shen, Jian and Zhang, Weinan and Yu, Yong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5618--5627},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lai20b/lai20b.pdf},
url = {http://proceedings.mlr.press/v119/lai20b.html},
abstract = {Model-based reinforcement learning approaches leverage a forward dynamics model to support planning and decision making, which, however, may fail catastrophically if the model is inaccurate. Although there are several existing methods dedicated to combating the model error, the potential of the single forward model is still limited. In this paper, we propose to additionally construct a backward dynamics model to reduce the reliance on accuracy in forward model predictions. We develop a novel method, called Bidirectional Model-based Policy Optimization (BMPO) to utilize both the forward model and backward model to generate short branched rollouts for policy optimization. Furthermore, we theoretically derive a tighter bound of return discrepancy, which shows the superiority of BMPO against the one using merely the forward model. Extensive experiments demonstrate that BMPO outperforms state-of-the-art model-based methods in terms of sample efficiency and asymptotic performance.}
}
@InProceedings{pmlr-v119-lakkaraju20a,
title = {Robust and Stable Black Box Explanations},
author = {Lakkaraju, Himabindu and Arsov, Nino and Bastani, Osbert},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5628--5638},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lakkaraju20a/lakkaraju20a.pdf},
url = {http://proceedings.mlr.press/v119/lakkaraju20a.html},
abstract = {As machine learning black boxes are increasingly being deployed in real-world applications, there has been a growing interest in developing post hoc explanations that summarize the behaviors of these black boxes. However, existing algorithms for generating such explanations have been shown to lack stability and robustness to distribution shifts. We propose a novel framework for generating robust and stable explanations of black box models based on adversarial training. Our framework optimizes a minimax objective that aims to construct the highest fidelity explanation with respect to the worst-case over a set of adversarial perturbations. We instantiate this algorithm for explanations in the form of linear models and decision sets by devising the required optimization procedures. To the best of our knowledge, this work makes the first attempt at generating post hoc explanations that are robust to a general class of adversarial perturbations that are of practical interest. Experimental evaluation with real-world and synthetic datasets demonstrates that our approach substantially improves robustness of explanations without sacrificing their fidelity on the original data distribution.}
}
@InProceedings{pmlr-v119-laskin20a,
title = {{CURL}: Contrastive Unsupervised Representations for Reinforcement Learning},
author = {Laskin, Michael and Srinivas, Aravind and Abbeel, Pieter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5639--5650},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/laskin20a/laskin20a.pdf},
url = {http://proceedings.mlr.press/v119/laskin20a.html},
abstract = {We present CURL: Contrastive Unsupervised Representations for Reinforcement Learning. CURL extracts high-level features from raw pixels using contrastive learning and performs off-policy control on top of the extracted features. CURL outperforms prior pixel-based methods, both model-based and model-free, on complex tasks in the DeepMind Control Suite and Atari Games showing 1.9x and 1.2x performance gains at the 100K environment and interaction steps benchmarks respectively. On the DeepMind Control Suite, CURL is the first image-based algorithm to nearly match the sample-efficiency of methods that use state-based features. Our code is open-sourced and available at https://www.github.com/MishaLaskin/curl.}
}
@InProceedings{pmlr-v119-latorre20a,
title = {Efficient Proximal Mapping of the 1-path-norm of Shallow Networks},
author = {Latorre, Fabian and Rolland, Paul and Hallak, Nadav and Cevher, Volkan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5651--5661},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/latorre20a/latorre20a.pdf},
url = {http://proceedings.mlr.press/v119/latorre20a.html},
abstract = {We demonstrate two new important properties of the 1-path-norm of shallow neural networks. First, despite its non-smoothness and non-convexity it allows a closed form proximal operator which can be efficiently computed, allowing the use of stochastic proximal-gradient-type methods for regularized empirical risk minimization. Second, when the activation function is differentiable, it provides an upper bound on the Lipschitz constant of the network. Such bound is tighter than the trivial layer-wise product of Lipschitz constants, motivating its use for training networks robust to adversarial perturbations. In practical experiments we illustrate the advantages of using the proximal mapping and we compare the robustness-accuracy trade-off induced by the 1-path-norm, L1-norm and layer-wise constraints on the Lipschitz constant (Parseval networks).}
}
@InProceedings{pmlr-v119-lattimore20a,
title = {Learning with Good Feature Representations in Bandits and in {RL} with a Generative Model},
author = {Lattimore, Tor and Szepesvari, Csaba and Weisz, Gellert},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5662--5670},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lattimore20a/lattimore20a.pdf},
url = {http://proceedings.mlr.press/v119/lattimore20a.html},
abstract = {The construction in the recent paper by Du et al. [2019] implies that searching for a near-optimal action in a bandit sometimes requires examining essentially all the actions, even if the learner is given linear features in $\mathbb{R}^d$ that approximate the rewards with a small uniform error. We use the Kiefer-Wolfowitz theorem to prove a positive result that by checking only a few actions, a learner can always find an action that is suboptimal with an error of at most $O(\epsilon\sqrt{d})$ where $\epsilon$ is the approximation error of the features. Thus, features are useful when the approximation error is small relative to the dimensionality of the features. The idea is applied to stochastic bandits and reinforcement learning with a generative model where the learner has access to $d$-dimensional linear features that approximate the action-value functions for all policies to an accuracy of $\epsilon$. For linear bandits, we prove a bound on the regret of order $d\sqrt{n \log(k)} + \epsilon n \sqrt{d} \log(n)$ with $k$ the number of actions and $n$ the horizon. For RL we show that approximate policy iteration can learn a policy that is optimal up to an additive error of order $\epsilon\sqrt{d}/(1 - \gamma)^2$ and using about $d/(\epsilon^2(1 - \gamma)^4)$ samples from the generative model. These bounds are independent of the finer details of the features. We also investigate how the structure of the feature set impacts the tradeoff between sample complexity and estimation error.}
}
@InProceedings{pmlr-v119-le20a,
title = {Inertial Block Proximal Methods for Non-Convex Non-Smooth Optimization},
author = {Le, Hien and Gillis, Nicolas and Patrinos, Panagiotis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5671--5681},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/le20a/le20a.pdf},
url = {http://proceedings.mlr.press/v119/le20a.html},
abstract = {We propose inertial versions of block coordinate descent methods for solving non-convex non-smooth composite optimization problems. Our methods possess three main advantages compared to current state-of-the-art accelerated first-order methods: (1) they allow using two different extrapolation points to evaluate the gradients and to add the inertial force (we will empirically show that it is more efficient than using a single extrapolation point), (2) they allow to randomly select the block of variables to update, and (3) they do not require a restarting step. We prove the subsequential convergence of the generated sequence under mild assumptions, prove the global convergence under some additional assumptions, and provide convergence rates. We deploy the proposed methods to solve non-negative matrix factorization (NMF) and show that they compete favorably with the state-of-the-art NMF algorithms. Additional experiments on non-negative approximate canonical polyadic decomposition, also known as nonnegative tensor factorization, are also provided.}
}
@InProceedings{pmlr-v119-le20b,
title = {Self-Attentive Associative Memory},
author = {Le, Hung and Tran, Truyen and Venkatesh, Svetha},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5682--5691},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/le20b/le20b.pdf},
url = {http://proceedings.mlr.press/v119/le20b.html},
abstract = {Heretofore, neural networks with external memory are restricted to single memory with lossy representations of memory interactions. A rich representation of relationships between memory pieces urges a high-order and segregated relational memory. In this paper, we propose to separate the storage of individual experiences (item memory) and their occurring relationships (relational memory). The idea is implemented through a novel Self-attentive Associative Memory (SAM) operator. Found upon outer product, SAM forms a set of associative memories that represent the hypothetical high-order relationships between arbitrary pairs of memory elements, through which a relational memory is constructed from an item memory. The two memories are wired into a single sequential model capable of both memorization and relational reasoning. We achieve competitive results with our proposed two-memory model in a diversity of machine learning tasks, from challenging synthetic problems to practical testbeds such as geometry, graph, reinforcement learning, and question answering.}
}
@InProceedings{pmlr-v119-lee20a,
title = {Causal Effect Identifiability under Partial-Observability},
author = {Lee, Sanghack and Bareinboim, Elias},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5692--5701},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20a/lee20a.pdf},
url = {http://proceedings.mlr.press/v119/lee20a.html},
abstract = {Causal effect identifiability is concerned with establishing the effect of intervening on a set of variables on another set of variables from observational or interventional distributions under causal assumptions that are usually encoded in the form of a causal graph. Most of the results of this literature implicitly assume that every variable modeled in the graph is measured in the available distributions. In practice, however, the data collections of the different studies considered do not measure the same variables, consistently. In this paper, we study the causal effect identifiability problem when the available distributions encompass different sets of variables, which we refer to as identification under partial-observability. We study a number of properties of the factors that comprise a causal effect under various levels of abstraction, and then characterize the relationship between them with respect to their status relative to the identification of a targeted intervention. We establish a sufficient graphical criterion for determining whether the effects are identifiable from partially-observed distributions. Finally, building on these graphical properties, we develop an algorithm that returns a formula for a causal effect in terms of the available distributions.}
}
@InProceedings{pmlr-v119-lee20b,
title = {Estimating Model Uncertainty of Neural Networks in Sparse Information Form},
author = {Lee, Jongseok and Humt, Matthias and Feng, Jianxiang and Triebel, Rudolph},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5702--5713},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20b/lee20b.pdf},
url = {http://proceedings.mlr.press/v119/lee20b.html},
abstract = {We present a sparse representation of model uncertainty for Deep Neural Networks (DNNs) where the parameter posterior is approximated with an inverse formulation of the Multivariate Normal Distribution (MND), also known as the information form. The key insight of our work is that the information matrix, i.e. the inverse of the covariance matrix tends to be sparse in its spectrum. Therefore, dimensionality reduction techniques such as low rank approximations (LRA) can be effectively exploited. To achieve this, we develop a novel sparsification algorithm and derive a cost-effective analytical sampler. As a result, we show that the information form can be scalably applied to represent model uncertainty in DNNs. Our exhaustive theoretical analysis and empirical evaluations on various benchmarks show the competitiveness of our approach over the current methods.}
}
@InProceedings{pmlr-v119-lee20c,
title = {Self-supervised Label Augmentation via Input Transformations},
author = {Lee, Hankook and Hwang, Sung Ju and Shin, Jinwoo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5714--5724},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20c/lee20c.pdf},
url = {http://proceedings.mlr.press/v119/lee20c.html},
abstract = {Self-supervised learning, which learns by constructing artificial labels given only the input signals, has recently gained considerable attention for learning representations with unlabeled datasets, i.e., learning without any human-annotated supervision. In this paper, we show that such a technique can be used to significantly improve the model accuracy even under fully-labeled datasets. Our scheme trains the model to learn both original and self-supervised tasks, but is different from conventional multi-task learning frameworks that optimize the summation of their corresponding losses. Our main idea is to learn a single unified task with respect to the joint distribution of the original and self-supervised labels, i.e., we augment original labels via self-supervision. This simple, yet effective approach allows to train models easier by relaxing a certain invariant constraint during learning the original and self-supervised tasks simultaneously. It also enables an aggregated inference which combines the predictions from different augmentations to improve the prediction accuracy. Furthermore, we propose a novel knowledge transfer technique, which we refer to as self-distillation, that has the effect of the aggregated inference in a single (faster) inference. We demonstrate the large accuracy improvement and wide applicability of our framework on various fully-supervised settings, e.g., the few-shot and imbalanced classification scenarios.}
}
@InProceedings{pmlr-v119-lee20d,
title = {Batch Reinforcement Learning with Hyperparameter Gradients},
author = {Lee, Byungjun and Lee, Jongmin and Vrancx, Peter and Kim, Dongho and Kim, Kee-Eung},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5725--5735},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20d/lee20d.pdf},
url = {http://proceedings.mlr.press/v119/lee20d.html},
abstract = {We consider the batch reinforcement learning problem where the agent needs to learn only from a fixed batch of data, without further interaction with the environment. In such a scenario, we want to prevent the optimized policy from deviating too much from the data collection policy since the estimation becomes highly unstable otherwise due to the off-policy nature of the problem. However, imposing this requirement too strongly will result in a policy that merely follows the data collection policy. Unlike prior work where this trade-off is controlled by hand-tuned hyperparameters, we propose a novel batch reinforcement learning approach, batch optimization of policy and hyperparameter (BOPAH), that uses a gradient-based optimization of the hyperparameter using held-out data. We show that BOPAH outperforms other batch reinforcement learning algorithms in tabular and continuous control tasks, by finding a good balance to the trade-off between adhering to the data collection policy and pursuing the possible policy improvement.}
}
@InProceedings{pmlr-v119-lee20e,
title = {Accelerated Message Passing for Entropy-Regularized {MAP} Inference},
author = {Lee, Jonathan and Pacchiano, Aldo and Bartlett, Peter and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5736--5746},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20e/lee20e.pdf},
url = {http://proceedings.mlr.press/v119/lee20e.html},
abstract = {Maximum a posteriori (MAP) inference in discrete-valued Markov random fields is a fundamental problem in machine learning that involves identifying the most likely configuration of random variables given a distribution. Due to the difficulty of this combinatorial problem, linear programming (LP) relaxations are commonly used to derive specialized message passing algorithms that are often interpreted as coordinate descent on the dual LP. To achieve more desirable computational properties, a number of methods regularize the LP with an entropy term, leading to a class of smooth message passing algorithms with convergence guarantees. In this paper, we present randomized methods for accelerating these algorithms by leveraging techniques that underlie classical accelerated gradient methods. The proposed algorithms incorporate the familiar steps of standard smooth message passing algorithms, which can be viewed as coordinate minimization steps. We show that these accelerated variants achieve faster rates for finding $\epsilon$-optimal points of the unregularized problem, and, when the LP is tight, we prove that the proposed algorithms recover the true MAP solution in fewer iterations than standard message passing algorithms.}
}
@InProceedings{pmlr-v119-lee20f,
title = {Learning Compound Tasks without Task-specific Knowledge via Imitation and Self-supervised Learning},
author = {Lee, Sang-Hyun and Seo, Seung-Woo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5747--5756},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20f/lee20f.pdf},
url = {http://proceedings.mlr.press/v119/lee20f.html},
abstract = {Most real-world tasks are compound tasks that consist of multiple simpler sub-tasks. The main challenge of learning compound tasks is that we have no explicit supervision to learn the hierarchical structure of compound tasks. To address this challenge, previous imitation learning methods exploit task-specific knowledge, e.g., labeling demonstrations manually or specifying termination conditions for each sub-task. However, the need for task-specific knowledge makes it difficult to scale imitation learning to real-world tasks. In this paper, we propose an imitation learning method that can learn compound tasks without task-specific knowledge. The key idea behind our method is to leverage a self-supervised learning framework to learn the hierarchical structure of compound tasks. Our work also proposes a task-agnostic regularization technique to prevent unstable switching between sub-tasks, which has been a common degenerate case in previous works. We evaluate our method against several baselines on compound tasks. The results show that our method achieves state-of-the-art performance on compound tasks, outperforming prior imitation learning methods.}
}
@InProceedings{pmlr-v119-lee20g,
title = {Context-aware Dynamics Model for Generalization in Model-Based Reinforcement Learning},
author = {Lee, Kimin and Seo, Younggyo and Lee, Seunghyun and Lee, Honglak and Shin, Jinwoo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5757--5766},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20g/lee20g.pdf},
url = {http://proceedings.mlr.press/v119/lee20g.html},
abstract = {Model-based reinforcement learning (RL) enjoys several benefits, such as data-efficiency and planning, by learning a model of the environment’s dynamics. However, learning a global model that can generalize across different dynamics remains a challenge. To tackle this problem, we decompose the task of learning a global dynamics model into two stages: (a) learning a context latent vector that captures the local dynamics, then (b) predicting the next state conditioned on it. In order to encode dynamics-specific information into the context latent vector, we introduce a novel loss function that encourages the context latent vector to be useful for predicting both forward and backward dynamics. The proposed method achieves superior generalization ability across various simulated robotics and control tasks, compared to existing RL schemes.}
}
@InProceedings{pmlr-v119-lee20h,
title = {Temporal Phenotyping using Deep Predictive Clustering of Disease Progression},
author = {Lee, Changhee and van der Schaar, Mihaela},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5767--5777},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20h/lee20h.pdf},
url = {http://proceedings.mlr.press/v119/lee20h.html},
abstract = {Due to the wider availability of modern electronic health records, patient care data is often being stored in the form of time-series. Clustering such time-series data is crucial for patient phenotyping, anticipating patients’ prognoses by identifying “similar” patients, and designing treatment guidelines that are tailored to homogeneous patient subgroups. In this paper, we develop a deep learning approach for clustering time-series data, where each cluster comprises patients who share similar future outcomes of interest (e.g., adverse events, the onset of comorbidities). To encourage each cluster to have homogeneous future outcomes, the clustering is carried out by learning discrete representations that best describe the future outcome distribution based on novel loss functions. Experiments on two real-world datasets show that our model achieves superior clustering performance over state-of-the-art benchmarks and identifies meaningful clusters that can be translated into actionable information for clinical decision-making.}
}
@InProceedings{pmlr-v119-lee20i,
title = {Tensor denoising and completion based on ordinal observations},
author = {Lee, Chanwoo and Wang, Miaoyan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5778--5788},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20i/lee20i.pdf},
url = {http://proceedings.mlr.press/v119/lee20i.html},
abstract = {Higher-order tensors arise frequently in applications such as neuroimaging, recommendation system, and social network analysis. We consider the problem of low-rank tensor estimation from possibly incomplete, ordinal-valued observations. Two related problems are studied, one on tensor denoising and another on tensor completion. We propose a multi-linear cumulative link model, develop a rank-constrained M-estimator, and obtain theoretical accuracy guarantees. Our mean squared error bound enjoys a faster convergence rate than previous results, and we show that the proposed estimator is minimax optimal under the class of low-rank models. Furthermore, the procedure developed serves as an efficient completion method which guarantees consistent recovery of an order-K (d,...,d)-dimensional low-rank tensor using only O(Kd) noisy, quantized observations. We demonstrate the outperformance of our approach over previous methods on the tasks of clustering and collaborative filtering.}
}
@InProceedings{pmlr-v119-lei20a,
title = {Analytic Marching: An Analytic Meshing Solution from Deep Implicit Surface Networks},
author = {Lei, Jiabao and Jia, Kui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5789--5798},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lei20a/lei20a.pdf},
url = {http://proceedings.mlr.press/v119/lei20a.html},
abstract = {This paper studies a problem of learning surface mesh via implicit functions in an emerging field of deep learning surface reconstruction, where implicit functions are popularly implemented as multi-layer perceptrons (MLPs) with rectified linear units (ReLU). To achieve meshing from the learned implicit functions, existing methods adopt the de-facto standard algorithm of marching cubes; while promising, they suffer from loss of precision learned in the MLPs, due to the discretization nature of marching cubes. Motivated by the knowledge that a ReLU based MLP partitions its input space into a number of linear regions, we identify from these regions analytic cells and faces that are associated with zero-level isosurface of the implicit function, and characterize the conditions under which the identified faces are guaranteed to connect and form a closed, piecewise planar surface. We propose a naturally parallelizable algorithm of analytic marching to exactly recover the mesh captured by a learned MLP. Experiments on deep learning mesh reconstruction verify the advantages of our algorithm over existing ones.}
}
@InProceedings{pmlr-v119-lei20b,
title = {{SGD} Learns One-Layer Networks in {WGAN}s},
author = {Lei, Qi and Lee, Jason and Dimakis, Alex and Daskalakis, Constantinos},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5799--5808},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lei20b/lei20b.pdf},
url = {http://proceedings.mlr.press/v119/lei20b.html},
abstract = {Generative adversarial networks (GANs) are a widely used framework for learning generative models. Wasserstein GANs (WGANs), one of the most successful variants of GANs, require solving a minmax optimization problem to global optimality, but are in practice successfully trained using stochastic gradient descent-ascent. In this paper, we show that, when the generator is a one-layer network, stochastic gradient descent-ascent converges to a global solution with polynomial time and sample complexity.}
}
@InProceedings{pmlr-v119-lei20c,
title = {Fine-Grained Analysis of Stability and Generalization for Stochastic Gradient Descent},
author = {Lei, Yunwen and Ying, Yiming},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5809--5819},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lei20c/lei20c.pdf},
url = {http://proceedings.mlr.press/v119/lei20c.html},
abstract = {Recently there are a considerable amount of work devoted to the study of the algorithmic stability and generalization for stochastic gradient descent (SGD). However, the existing stability analysis requires to impose restrictive assumptions on the boundedness of gradients, smoothness and convexity of loss functions. In this paper, we provide a fine-grained analysis of stability and generalization for SGD by substantially relaxing these assumptions. Firstly, we establish stability and generalization for SGD by removing the existing bounded gradient assumptions. The key idea is the introduction of a new stability measure called on-average model stability, for which we develop novel bounds controlled by the risks of SGD iterates. This yields generalization bounds depending on the behavior of the best model, and leads to the first-ever-known fast bounds in the low-noise setting using stability approach. Secondly, the smoothness assumption is relaxed by considering loss functions with Holder continuous (sub)gradients for which we show that optimal bounds are still achieved by balancing computation and stability. To our best knowledge, this gives the first-ever-known stability and generalization bounds for SGD with non-smooth loss functions (e.g., hinge loss). Finally, we study learning problems with (strongly) convex objectives but non-convex loss functions.}
}
@InProceedings{pmlr-v119-leng20a,
title = {Learning Quadratic Games on Networks},
author = {Leng, Yan and Dong, Xiaowen and Wu, Junfeng and Pentland, Alex},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5820--5830},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/leng20a/leng20a.pdf},
url = {http://proceedings.mlr.press/v119/leng20a.html},
abstract = {Individuals, or organizations, cooperate with or compete against one another in a wide range of practical situations. Such strategic interactions are often modeled as games played on networks, where an individual’s payoff depends not only on her action but also on that of her neighbors. The current literature has largely focused on analyzing the characteristics of network games in the scenario where the structure of the network, which is represented by a graph, is known beforehand. It is often the case, however, that the actions of the players are readily observable while the underlying interaction network remains hidden. In this paper, we propose two novel frameworks for learning, from the observations on individual actions, network games with linear-quadratic payoffs, and in particular, the structure of the interaction network. Our frameworks are based on the Nash equilibrium of such games and involve solving a joint optimization problem for the graph structure and the individual marginal benefits. Both synthetic and real-world experiments demonstrate the effectiveness of the proposed frameworks, which have theoretical as well as practical implications for understanding strategic interactions in a network environment.}
}
@InProceedings{pmlr-v119-li20a,
title = {{ACFlow}: Flow Models for Arbitrary Conditional Likelihoods},
author = {Li, Yang and Akbar, Shoaib and Oliva, Junier},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5831--5841},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20a/li20a.pdf},
url = {http://proceedings.mlr.press/v119/li20a.html},
abstract = {Understanding the dependencies among features of a dataset is at the core of most unsupervised learning tasks. However, a majority of generative modeling approaches are focused solely on the joint distribution $p(x)$ and utilize models where it is intractable to obtain the conditional distribution of some arbitrary subset of features $x_u$ given the rest of the observed covariates $x_o$: $p(x_u \mid x_o)$. Traditional conditional approaches provide a model for a \emph{fixed} set of covariates conditioned on another \emph{fixed} set of observed covariates. Instead, in this work we develop a model that is capable of yielding \emph{all} conditional distributions $p(x_u \mid x_o)$ (for arbitrary $x_u$) via tractable conditional likelihoods. We propose a novel extension of (change of variables based) flow generative models, arbitrary conditioning flow models (ACFlow). ACFlow can be conditioned on arbitrary subsets of observed covariates, which was previously infeasible. We further extend ACFlow to model the joint distributions $p(x)$ and arbitrary marginal distributions $p(x_u)$. We also apply ACFlow to the imputation of features, and develop a unified platform for both multiple and single imputation by introducing an auxiliary objective that provides a principled single “best guess” for flow models. Extensive empirical evaluations show that our model achieves state-of-the-art performance in modeling arbitrary conditional likelihoods in addition to both single and multiple imputation in synthetic and real-world datasets.}
}
@InProceedings{pmlr-v119-li20b,
title = {Manifold Identification for Ultimately Communication-Efficient Distributed Optimization},
author = {Li, Yu-Sheng and Chiang, Wei-Lin and Lee, Ching-Pei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5842--5852},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20b/li20b.pdf},
url = {http://proceedings.mlr.press/v119/li20b.html},
abstract = {This work proposes a progressive manifold identification approach for distributed optimization with sound theoretical justifications to greatly reduce both the rounds of communication and the bytes communicated per round for partly-smooth regularized problems such as the $\ell_1$- and group-LASSO-regularized ones. Our two-stage method first uses an inexact proximal quasi-Newton method to iteratively identify a sequence of low-dimensional manifolds in which the final solution would lie, and restricts the model update within the current manifold to gradually lower the order of the per-round communication cost from the problem dimension to the dimension of the manifold that contains a solution and makes the problem within it smooth. After identifying this manifold, we take superlinear-convergent truncated semismooth Newton steps computed by preconditioned conjugate gradient to largely reduce the communication rounds by improving the convergence rate from the existing linear or sublinear ones to a superlinear rate. Experiments show that our method can be orders of magnitudes lower in the communication cost and an order of magnitude faster in the running time than the state of the art.}
}
@InProceedings{pmlr-v119-li20c,
title = {Neural Architecture Search in A Proxy Validation Loss Landscape},
author = {Li, Yanxi and Dong, Minjing and Wang, Yunhe and Xu, Chang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5853--5862},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20c/li20c.pdf},
url = {http://proceedings.mlr.press/v119/li20c.html},
abstract = {This paper searches for the optimal neural architecture by minimizing a proxy of validation loss. Existing neural architecture search (NAS) methods used to discover the optimal neural architecture that best fits the validation examples given the up-to-date network weights. However, back propagation with a number of validation examples could be time consuming, especially when it needs to be repeated many times in NAS. Though these intermediate validation results are invaluable, they would be wasted if we cannot use them to predict the future from the past. In this paper, we propose to approximate the validation loss landscape by learning a mapping from neural architectures to their corresponding validate losses. The optimal neural architecture thus can be easily identified as the minimum of this proxy validation loss landscape. A novel sampling strategy is further developed for an efficient approximation of the loss landscape. Theoretical analysis indicates that the validation loss estimator learnt with our sampling strategy can reach a lower error rate and a lower label complexity compared with a uniform sampling. Experimental results on benchmarks demonstrate that the architecture searched by the proposed algorithm can achieve a satisfactory accuracy with less time cost.}
}
@InProceedings{pmlr-v119-li20d,
title = {{PENNI}: Pruned Kernel Sharing for Efficient {CNN} Inference},
author = {Li, Shiyu and Hanson, Edward and Li, Hai and Chen, Yiran},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5863--5873},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20d/li20d.pdf},
url = {http://proceedings.mlr.press/v119/li20d.html},
abstract = {Although state-of-the-art (SOTA) CNNs achieve outstanding performance on various tasks, their high computation demand and massive number of parameters make it difficult to deploy these SOTA CNNs onto resource-constrained devices. Previous works on CNN acceleration utilize low-rank approximation of the original convolution layers to reduce computation cost. However, these methods are very difficult to conduct upon sparse models, which limits execution speedup since redundancies within the CNN model are not fully exploited. We argue that kernel granularity decomposition can be conducted with low-rank assumption while exploiting the redundancy within the remaining compact coefficients. Based on this observation, we propose PENNI, a CNN model compression framework that is able to achieve model compactness and hardware efficiency simultaneously by (1) implementing kernel sharing in convolution layers via a small number of basis kernels and (2) alternately adjusting bases and coefficients with sparse constraints. Experiments show that we can prune 97\% parameters and 92\% FLOPs on ResNet18 CIFAR10 with no accuracy loss, and achieve a 44\% reduction in run-time memory consumption and a 53\% reduction in inference latency.}
}
@InProceedings{pmlr-v119-li20e,
title = {Implicit {Euler} Skip Connections: Enhancing Adversarial Robustness via Numerical Stability},
author = {Li, Mingjie and He, Lingshen and Lin, Zhouchen},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5874--5883},
year = {2020},
editor = {{Daumé III}, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20e/li20e.pdf},
url = {http://proceedings.mlr.press/v119/li20e.html},
abstract = {Deep neural networks have achieved great success in various areas, but recent works have found that neural networks are vulnerable to adversarial attacks, which leads to a hot topic nowadays. Although many approaches have been proposed to enhance the robustness of neural networks, few of them explored robust architectures for neural networks. On this account, we try to address such an issue from the perspective of dynamic system in this work. By viewing ResNet as an explicit Euler discretization of an ordinary differential equation (ODE), for the first time, we find that the adversarial robustness of ResNet is connected to the numerical stability of the corresponding dynamic system, i.e., more stable numerical schemes may correspond to more robust deep networks. Furthermore, inspired by the implicit Euler method for solving numerical ODE problems, we propose Implicit Euler skip connections (IE-Skips) by modifying the original skip connection in ResNet or its variants. Then we theoretically prove its advantages under the adversarial attack and the experimental results show that our ResNet with IE-Skips can largely improve the robustness and the generalization ability under adversarial attacks when compared with the vanilla ResNet of the same parameter size.}
}
@InProceedings{pmlr-v119-li20f,
title = {Closed Loop Neural-Symbolic Learning via Integrating Neural Perception, Grammar Parsing, and Symbolic Reasoning},
author = {Li, Qing and Huang, Siyuan and Hong, Yining and Chen, Yixin and Wu, Ying Nian and Zhu, Song-Chun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5884--5894},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20f/li20f.pdf},
url = {http://proceedings.mlr.press/v119/li20f.html},
abstract = {The goal of neural-symbolic computation is to integrate the connectionist and symbolist paradigms. Prior methods learn the neural-symbolic models using reinforcement learning (RL) approaches, which ignore the error propagation in the symbolic reasoning module and thus converge slowly with sparse rewards. In this paper, we address these issues and close the loop of neural-symbolic learning by (1) introducing the grammar model as a symbolic prior to bridge neural perception and symbolic reasoning, and (2) proposing a novel back-search algorithm which mimics the top-down human-like learning procedure to propagate the error through the symbolic reasoning module efficiently. We further interpret the proposed learning framework as maximum likelihood estimation using Markov chain Monte Carlo sampling and the back-search algorithm as a Metropolis-Hastings sampler. The experiments are conducted on two weakly-supervised neural-symbolic tasks: (1) handwritten formula recognition on the newly introduced HWF dataset; (2) visual question answering on the CLEVR dataset. The results show that our approach significantly outperforms the RL methods in terms of performance, converging speed, and data efficiency. Our code and data are released at https://liqing-ustc.github.io/NGS.}
}
@InProceedings{pmlr-v119-li20g,
title = {Acceleration for Compressed Gradient Descent in Distributed and Federated Optimization},
author = {Li, Zhize and Kovalev, Dmitry and Qian, Xun and Richt{\'a}rik, Peter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5895--5904},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20g/li20g.pdf},
url = {http://proceedings.mlr.press/v119/li20g.html},
abstract = {Due to the high communication cost in distributed and federated learning problems, methods relying on compression of communicated messages are becoming increasingly popular. While in other contexts the best performing gradient-type methods invariably rely on some form of acceleration/momentum to reduce the number of iterations, there are no methods which combine the benefits of both gradient compression and acceleration. In this paper, we remedy this situation and propose the first \emph{accelerated compressed gradient descent (ACGD)} methods. In the single machine regime, we prove that ACGD enjoys the rate $O\Big((1+\omega)\sqrt{\frac{L}{\mu}}\log \frac{1}{\epsilon}\Big)$ for $\mu$-strongly convex problems and $O\Big((1+\omega)\sqrt{\frac{L}{\epsilon}}\Big)$ for convex problems, respectively, where $\omega$ is the compression parameter. Our results improve upon the existing non-accelerated rates $O\Big((1+\omega)\frac{L}{\mu}\log \frac{1}{\epsilon}\Big)$ and $O\Big((1+\omega)\frac{L}{\epsilon}\Big)$, respectively, and recover the optimal rates of accelerated gradient descent as a special case when no compression ($\omega=0$) is applied. We further propose a distributed variant of ACGD (called ADIANA) and prove the convergence rate $\widetilde{O}\Big(\omega+\sqrt{\frac{L}{\mu}}+\sqrt{\big(\frac{\omega}{n}+\sqrt{\frac{\omega}{n}}\big)\frac{\omega L}{\mu}}\Big)$, where $n$ is the number of devices/workers and $\widetilde{O}$ hides the logarithmic factor $\log \frac{1}{\epsilon}$. This improves upon the previous best result $\widetilde{O}\Big(\omega + \frac{L}{\mu}+\frac{\omega L}{n\mu} \Big)$ achieved by the DIANA method of Mishchenko et al. (2019). Finally, we conduct several experiments on real-world datasets which corroborate our theoretical results and confirm the practical superiority of our accelerated methods.}
}
@InProceedings{pmlr-v119-li20h,
title = {On the Relation between Quality-Diversity Evaluation and Distribution-Fitting Goal in Text Generation},
author = {Li, Jianing and Lan, Yanyan and Guo, Jiafeng and Cheng, Xueqi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5905--5915},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20h/li20h.pdf},
url = {http://proceedings.mlr.press/v119/li20h.html},
abstract = {The goal of text generation models is to fit the underlying real probability distribution of text. For performance evaluation, quality and diversity metrics are usually applied. However, it is still not clear to what extent can the quality-diversity evaluation reflect the distribution-fitting goal. In this paper, we try to reveal such relation in a theoretical approach. We prove that under certain conditions, a linear combination of quality and diversity constitutes a divergence metric between the generated distribution and the real distribution. We also show that the commonly used BLEU/Self-BLEU metric pair fails to match any divergence metric, thus propose CR/NRR as a substitute for quality/diversity metric pair.}
}
@InProceedings{pmlr-v119-li20i,
title = {Latent Space Factorisation and Manipulation via Matrix Subspace Projection},
author = {Li, Xiao and Lin, Chenghua and Li, Ruizhe and Wang, Chaozheng and Guerin, Frank},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5916--5926},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20i/li20i.pdf},
url = {http://proceedings.mlr.press/v119/li20i.html},
abstract = {We tackle the problem of disentangling the latent space of an autoencoder in order to separate labelled attribute information from other characteristic information. This then allows us to change selected attributes while preserving other information. Our method, matrix subspace projection, is much simpler than previous approaches to latent space factorisation, for example not requiring multiple discriminators or a careful weighting among their loss functions. Furthermore, our new model can be applied to autoencoders as a plugin, and works across diverse domains such as images or text. We demonstrate the utility of our method for attribute manipulation in autoencoders trained across varied domains, using both human evaluation and automated methods. The quality of generation of our new model (e.g. reconstruction, conditional generation) is highly competitive to a number of strong baselines.}
}
@InProceedings{pmlr-v119-li20j,
title = {Visual Grounding of Learned Physical Models},
author = {Li, Yunzhu and Lin, Toru and Yi, Kexin and Bear, Daniel and Yamins, Daniel and Wu, Jiajun and Tenenbaum, Joshua and Torralba, Antonio},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5927--5936},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20j/li20j.pdf},
url = {http://proceedings.mlr.press/v119/li20j.html},
abstract = {Humans intuitively recognize objects’ physical properties and predict their motion, even when the objects are engaged in complicated interactions. The abilities to perform physical reasoning and to adapt to new environments, while intrinsic to humans, remain challenging to state-of-the-art computational models. In this work, we present a neural model that simultaneously reasons about physics and makes future predictions based on visual and dynamics priors. The visual prior predicts a particle-based representation of the system from visual observations. An inference module operates on those particles, predicting and refining estimates of particle locations, object states, and physical parameters, subject to the constraints imposed by the dynamics prior, which we refer to as visual grounding. We demonstrate the effectiveness of our method in environments involving rigid objects, deformable materials, and fluids. Experiments show that our model can infer the physical properties within a few observations, which allows the model to quickly adapt to unseen scenarios and make accurate predictions into the future.}
}
@InProceedings{pmlr-v119-li20k,
title = {Learning from Irregularly-Sampled Time Series: A Missing Data Perspective},
author = {Li, Steven Cheng-Xian and Marlin, Benjamin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5937--5946},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20k/li20k.pdf},
url = {http://proceedings.mlr.press/v119/li20k.html},
abstract = {Irregularly-sampled time series occur in many domains including healthcare. They can be challenging to model because they do not naturally yield a fixed-dimensional representation as required by many standard machine learning models. In this paper, we consider irregular sampling from the perspective of missing data. We model observed irregularly-sampled time series data as a sequence of index-value pairs sampled from a continuous but unobserved function. We introduce an encoder-decoder framework for learning from such generic indexed sequences. We propose learning methods for this framework based on variational autoencoders and generative adversarial networks. For continuous irregularly-sampled time series, we introduce continuous convolutional layers that can efficiently interface with existing neural network architectures. Experiments show that our models are able to achieve competitive or better classification results on irregularly-sampled multivariate time series compared to recent RNN models while offering significantly faster training times.}
}
@InProceedings{pmlr-v119-li20l,
title = {Evolutionary Topology Search for Tensor Network Decomposition},
author = {Li, Chao and Sun, Zhun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5947--5957},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20l/li20l.pdf},
url = {http://proceedings.mlr.press/v119/li20l.html},
abstract = {Tensor network (TN) decomposition is a promising framework to represent extremely high-dimensional problems with few parameters. However, it is challenging to search the (near-)optimal topological structures for TN decomposition, since the number of candidate solutions exponentially grows with increasing the order of a tensor. In this paper, we claim that the issue can be practically tackled by evolutionary algorithms in an affordable manner. We encode the complex topological structures into binary strings, and develop a simple genetic meta-algorithm to search the optimal topology on Hamming space. The experimental results by both synthetic and real-world data demonstrate that our method can effectively discover the ground-truth topology or even better structures with a small number of generations, and significantly boost the representational power of TN decomposition compared with well-known tensor-train (TT) or tensor-ring (TR) models.}
}
@InProceedings{pmlr-v119-li20m,
title = {Train Big, Then Compress: Rethinking Model Size for Efficient Training and Inference of Transformers},
author = {Li, Zhuohan and Wallace, Eric and Shen, Sheng and Lin, Kevin and Keutzer, Kurt and Klein, Dan and Gonzalez, Joey},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5958--5968},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20m/li20m.pdf},
url = {http://proceedings.mlr.press/v119/li20m.html},
abstract = {Since hardware resources are limited, the objective of training deep learning models is typically to maximize accuracy subject to the time and memory constraints of training and inference. We study the impact of model size in this setting, focusing on Transformer models for NLP tasks that are limited by compute: self-supervised pretraining and high-resource machine translation. We first show that even though smaller Transformer models execute faster per iteration, wider and deeper models converge in significantly fewer steps. Moreover, this acceleration in convergence typically outpaces the additional computational overhead of using larger models. Therefore, the most compute-efficient training strategy is to counterintuitively train extremely large models but stop after a small number of iterations. This leads to an apparent trade-off between the training efficiency of large Transformer models and the inference efficiency of small Transformer models. However, we show that large models are more robust to compression techniques such as quantization and pruning than small models. Consequently, one can get the best of both worlds: heavily compressed, large models achieve higher accuracy than lightly compressed, small models.}
}
@InProceedings{pmlr-v119-li20n,
title = {Almost Tune-Free Variance Reduction},
author = {Li, Bingcong and Wang, Lingda and Giannakis, Georgios B.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5969--5978},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20n/li20n.pdf},
url = {http://proceedings.mlr.press/v119/li20n.html},
abstract = {The variance reduction class of algorithms including the representative ones, SVRG and SARAH, have well documented merits for empirical risk minimization problems. However, they require grid search to tune parameters (step size and the number of iterations per inner loop) for optimal performance. This work introduces ‘almost tune-free’ SVRG and SARAH schemes equipped with i) Barzilai-Borwein (BB) step sizes; ii) averaging; and, iii) the inner loop length adjusted to the BB step sizes. In particular, SVRG, SARAH, and their BB variants are first reexamined through an ‘estimate sequence’ lens to enable new averaging methods that tighten their convergence rates theoretically, and improve their performance empirically when the step size or the inner loop length is chosen large. Then a simple yet effective means to adjust the number of iterations per inner loop is developed to enhance the merits of the proposed averaging schemes and BB step sizes. Numerical tests corroborate the proposed methods.}
}
@InProceedings{pmlr-v119-li20o,
title = {Nearly Linear Row Sampling Algorithm for Quantile Regression},
author = {Li, Yi and Wang, Ruosong and Yang, Lin and Zhang, Hanrui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5979--5989},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20o/li20o.pdf},
url = {http://proceedings.mlr.press/v119/li20o.html},
abstract = {We give a row sampling algorithm for the quantile loss function with sample complexity nearly linear in the dimensionality of the data, improving upon the previous best algorithm whose sampling complexity has at least cubic dependence on the dimensionality. Based upon our row sampling algorithm, we give the fastest known algorithm for quantile regression and a graph sparsification algorithm for balanced directed graphs. Our main technical contribution is to show that Lewis weights sampling, which has been used in row sampling algorithms for $\ell_p$ norms, can also be applied in row sampling algorithms for a variety of loss functions. We complement our theoretical results by experiments to demonstrate the practicality of our approach.}
}
@InProceedings{pmlr-v119-li20p,
title = {Temporal Logic Point Processes},
author = {Li, Shuang and Wang, Lu and Zhang, Ruizhi and Chang, Xiaofu and Liu, Xuqin and Xie, Yao and Qi, Yuan and Song, Le},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5990--6000},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20p/li20p.pdf},
url = {http://proceedings.mlr.press/v119/li20p.html},
abstract = {We propose a modeling framework for event data and aim to answer questions such as \emph{when} and \emph{why} the next event would happen. Our proposed model excels in small data regime with the ability to incorporate domain knowledge in terms of logic rules. We model the dynamics of the event starts and ends via intensity function with the structures informed by a set of first-order temporal logic rules. Using the softened representation of temporal relations, and a weighted combination of logic rules, our probabilistic model can deal with uncertainty in events. Furthermore, many well-known point processes (e.g., Hawkes process, self-correcting point process) can be interpreted as special cases of our model given simple temporal logic rules. Our model, therefore, enriches the family of point processes. We derive a maximum likelihood estimation procedure for our model and show that it can lead to accurate predictions when data are sparse and domain knowledge is critical.}
}
@InProceedings{pmlr-v119-li20q,
title = {Input-Sparsity Low Rank Approximation in {S}chatten Norm},
author = {Li, Yi and Woodruff, David},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6001--6009},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20q/li20q.pdf},
url = {http://proceedings.mlr.press/v119/li20q.html},
abstract = {We give the first input-sparsity time algorithms for the rank-$k$ low rank approximation problem in every Schatten norm. Specifically, for a given $n\times n$ matrix $A$, our algorithm computes $Y,Z\in \mathbb{R}^{n\times k}$, which, with high probability, satisfy $\|A-YZ^T\|_p \leq (1+\epsilon)\|A-A_k\|_p$, where $\|M\|_p = \left (\sum_{i=1}^n \sigma_i(M)^p \right )^{1/p}$ is the Schatten $p$-norm of a matrix $M$ with singular values $\sigma_1(M), \ldots, \sigma_n(M)$, and where $A_k$ is the best rank-$k$ approximation to $A$. Our algorithm runs in time $\tilde{O}(\mathrm{nnz}(A) + n^{\alpha_p}\mathrm{poly}(k/\epsilon))$, where $\alpha_p = 1$ for $p\in [1,2)$ and $\alpha_p = 1 + (\omega-1)(1-2/p)$ for $p>2$ and $\omega \approx 2.374$ is the exponent of matrix multiplication. For the important case of $p = 1$, which corresponds to the more “robust” nuclear norm, we obtain $\tilde{O}(\mathrm{nnz}(A) + n \cdot \mathrm{poly}(k/\epsilon))$ time, which was previously only known for the Frobenius norm $(p = 2)$. Moreover, since $\alpha_p < \omega$ for every $p$, our algorithm has a better dependence on $n$ than that in the singular value decomposition for every $p$. Crucial to our analysis is the use of dimensionality reduction for Ky-Fan $p$-norms.}
}
@InProceedings{pmlr-v119-li20r,
title = {{RIFLE}: Backpropagation in Depth for Deep Transfer Learning through Re-Initializing the Fully-connected {L}ay{E}r},
author = {Li, Xingjian and Xiong, Haoyi and An, Haozhe and Xu, Cheng-Zhong and Dou, Dejing},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6010--6019},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20r/li20r.pdf},
url = {http://proceedings.mlr.press/v119/li20r.html},
abstract = {Fine-tuning the deep convolution neural network (CNN) using a pre-trained model helps transfer knowledge learned from larger datasets to the target task. While the accuracy could be largely improved even when the training dataset is small, the transfer learning outcome is similar with the pre-trained one with closed CNN weights[17], as the backpropagation here brings less updates to deeper CNN layers. In this work, we propose RIFLE - a simple yet effective strategy that deepens backpropagation in transfer learning settings, through periodically ReInitializing the Fully-connected LayEr with random scratch during the fine-tuning procedure. RIFLE brings significant perturbation to the backpropagation process and leads to deep CNN weights update, while the effects of perturbation can be easily converged throughout the overall learning procedure. The experiments show that the use of RIFLE significantly improves deep transfer learning accuracy on a wide range of datasets, outperforming known tricks for the similar purpose, such as dropout, dropconnect, stochastic depth, and cyclic learning rate, under the same settings with 0.5\%-2\% higher testing accuracy. Empirical cases and ablation studies further indicate RIFLE brings meaningful updates to deep CNN layers with accuracy improved.}
}
@InProceedings{pmlr-v119-li20s,
title = {On a projective ensemble approach to two sample test for equality of distributions},
author = {Li, Zhimei and Zhang, Yaowu},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6020--6027},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/li20s/li20s.pdf},
url = {http://proceedings.mlr.press/v119/li20s.html},
abstract = {In this work, we propose a robust test for the multivariate two-sample problem through projective ensemble, which is a generalization of the Cramer-von Mises statistic. The proposed test statistic has a simple closed-form expression without any tuning parameters involved, it is easy to implement and can be computed in quadratic time. Moreover, our test is insensitive to the dimension and consistent against all fixed alternatives, it does not require the moment assumption and is robust to the presence of outliers. We study the asymptotic behaviors of the test statistic under the null and two kinds of alternative hypotheses. We also suggest a permutation procedure to approximate critical values and employ its consistency. We demonstrate the effectiveness of our test through extensive simulation studies and a real data application.}
}
@InProceedings{pmlr-v119-liang20a,
title = {Do We Really Need to Access the Source Data? {S}ource Hypothesis Transfer for Unsupervised Domain Adaptation},
author = {Liang, Jian and Hu, Dapeng and Feng, Jiashi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6028--6039},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liang20a/liang20a.pdf},
url = {http://proceedings.mlr.press/v119/liang20a.html},
abstract = {Unsupervised domain adaptation (UDA) aims to leverage the knowledge learned from a labeled source dataset to solve similar tasks in a new unlabeled domain. Prior UDA methods typically require to access the source data when learning to adapt the model, making them risky and inefficient for decentralized private data. This work tackles a practical setting where only a trained source model is available and investigates how we can effectively utilize such a model without source data to solve UDA problems. We propose a simple yet generic representation learning framework, named \emph{Source HypOthesis Transfer} (SHOT). SHOT freezes the classifier module (hypothesis) of the source model and learns the target-specific feature extraction module by exploiting both information maximization and self-supervised pseudo-labeling to implicitly align representations from the target domains to the source hypothesis. To verify its versatility, we evaluate SHOT in a variety of adaptation cases including closed-set, partial-set, and open-set domain adaptation. Experiments indicate that SHOT yields state-of-the-art results among multiple domain adaptation benchmarks.}
}
@InProceedings{pmlr-v119-liang20b,
title = {Variable Skipping for Autoregressive Range Density Estimation},
author = {Liang, Eric and Yang, Zongheng and Stoica, Ion and Abbeel, Pieter and Duan, Yan and Chen, Peter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6040--6049},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liang20b/liang20b.pdf},
url = {http://proceedings.mlr.press/v119/liang20b.html},
abstract = {Deep autoregressive models compute point likelihood estimates of individual data points. However, many applications (i.e., database cardinality estimation), require estimating range densities, a capability that is under-explored by current neural density estimation literature. In these applications, fast and accurate range density estimates over high-dimensional data directly impact user-perceived performance. In this paper, we explore a technique for accelerating range density estimation over deep autoregressive models. This technique, called variable skipping, exploits the sparse structure of range density queries to avoid sampling unnecessary variables during approximate inference. We show that variable skipping provides 10-100x efficiency improvements when targeting challenging high-quantile error metrics, enables complex applications such as text pattern matching, and can be realized via a simple data augmentation procedure without changing the usual maximum likelihood objective.}
}
@InProceedings{pmlr-v119-liang20c,
title = {Adaptive Droplet Routing in Digital Microfluidic Biochips Using Deep Reinforcement Learning},
author = {Liang, Tung-Che and Zhong, Zhanwei and Bigdeli, Yaas and Ho, Tsung-Yi and Chakrabarty, Krishnendu and Fair, Richard},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6050--6060},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liang20c/liang20c.pdf},
url = {http://proceedings.mlr.press/v119/liang20c.html},
abstract = {We present and investigate a novel application domain for deep reinforcement learning (RL): droplet routing on digital microfluidic biochips (DMFBs). A DMFB, composed of a two-dimensional electrode array, manipulates discrete fluid droplets to automatically execute biochemical protocols such as point-of-care clinical diagnosis. However, a major concern associated with the use of DMFBs is that electrodes in a biochip can degrade over time. Droplet-transportation operations associated with the degraded electrodes can fail, thereby compromising the integrity of the bioassay outcome. We show that casting droplet transportation as an RL problem enables the training of deep network policies to capture the underlying health conditions of electrodes and to provide reliable fluidic operations. We propose a new RL-based droplet-routing flow that can be used for various sizes of DMFBs, and demonstrate reliable execution of an epigenetic bioassay with the RL droplet router on a fabricated DMFB. To facilitate further research, we also present a simulation environment based on the OpenAI Gym Interface for RL-guided droplet-routing problems on DMFBs.}
}
@InProceedings{pmlr-v119-lim20a,
title = {{AR}-{DAE}: Towards Unbiased Neural Entropy Gradient Estimation},
author = {Lim, Jae Hyun and Courville, Aaron and Pal, Christopher and Huang, Chin-Wei},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6061--6071},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lim20a/lim20a.pdf},
url = {http://proceedings.mlr.press/v119/lim20a.html},
abstract = {Entropy is ubiquitous in machine learning, but it is in general intractable to compute the entropy of the distribution of an arbitrary continuous random variable. In this paper, we propose the amortized residual denoising autoencoder (AR-DAE) to approximate the gradient of the log density function, which can be used to estimate the gradient of entropy. Amortization allows us to significantly reduce the error of the gradient approximator by approaching asymptotic optimality of a regular DAE, in which case the estimation is in theory unbiased. We conduct theoretical and experimental analyses on the approximation error of the proposed method, as well as extensive studies on heuristics to ensure its robustness. Finally, using the proposed gradient approximator to estimate the gradient of entropy, we demonstrate state-of-the-art performance on density estimation with variational autoencoders and continuous control with soft actor-critic.}
}
@InProceedings{pmlr-v119-lim20b,
title = {Hierarchical Verification for Adversarial Robustness},
author = {Lim, Cong Han and Urtasun, Raquel and Yumer, Ersin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6072--6082},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lim20b/lim20b.pdf},
url = {http://proceedings.mlr.press/v119/lim20b.html},
abstract = {We introduce a new framework for the exact point-wise $\ell_p$ robustness verification problem that exploits the layer-wise geometric structure of deep feed-forward networks with rectified linear activations (ReLU networks). The activation regions of the network partition the input space, and one can verify the $\ell_p$ robustness around a point by checking all the activation regions within the desired radius. The GeoCert algorithm (Jordan et al., NeurIPS 2019) treats this partition as a generic polyhedral complex in order to detect which region to check next. In contrast, our LayerCert framework considers the nested hyperplane arrangement structure induced by the layers of the ReLU network and explores regions in a hierarchical manner. We show that, under certain conditions on the algorithm parameters, LayerCert provably reduces the number and size of the convex programs that one needs to solve compared to GeoCert. Furthermore, our LayerCert framework allows the incorporation of lower bounding routines based on convex relaxations to further improve performance. Experimental results demonstrate that LayerCert can significantly reduce both the number of convex programs solved and the running time over the state-of-the-art.}
}
@InProceedings{pmlr-v119-lin20a,
title = {On Gradient Descent Ascent for Nonconvex-Concave Minimax Problems},
author = {Lin, Tianyi and Jin, Chi and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6083--6093},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20a/lin20a.pdf},
url = {http://proceedings.mlr.press/v119/lin20a.html},
abstract = {We consider nonconvex-concave minimax problems, $\min_{\mathbf{x}} \max_{\mathbf{y} \in \mathcal{Y}} f(\mathbf{x}, \mathbf{y})$, where $f$ is nonconvex in $\mathbf{x}$ but concave in $\mathbf{y}$ and $\mathcal{Y}$ is a convex and bounded set. One of the most popular algorithms for solving this problem is the celebrated gradient descent ascent (GDA) algorithm, which has been widely used in machine learning, control theory and economics. Despite the extensive convergence results for the convex-concave setting, GDA with equal stepsize can converge to limit cycles or even diverge in a general setting. In this paper, we present the complexity results on two-time-scale GDA for solving nonconvex-concave minimax problems, showing that the algorithm can find a stationary point of the function $\Phi(\cdot) := \max_{\mathbf{y} \in \mathcal{Y}} f(\cdot, \mathbf{y})$ efficiently. To the best of our knowledge, this is the first nonasymptotic analysis for two-time-scale GDA in this setting, shedding light on its superior practical performance in training generative adversarial networks (GANs) and other real applications.}
}
@InProceedings{pmlr-v119-lin20b,
title = {Extrapolation for Large-batch Training in Deep Learning},
author = {Lin, Tao and Kong, Lingjing and Stich, Sebastian and Jaggi, Martin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6094--6104},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20b/lin20b.pdf},
url = {http://proceedings.mlr.press/v119/lin20b.html},
abstract = {Deep learning networks are typically trained by Stochastic Gradient Descent (SGD) methods that iteratively improve the model parameters by estimating a gradient on a very small fraction of the training data. A major roadblock faced when increasing the batch size to a substantial fraction of the training data for reducing training time is the persistent degradation in performance (generalization gap). To address this issue, recent work proposes to add small perturbations to the model parameters when computing the stochastic gradients and report improved generalization performance due to smoothing effects. However, this approach is poorly understood; it requires often model-specific noise and fine-tuning. To alleviate these drawbacks, we propose to use instead computationally efficient extrapolation (extragradient) to stabilize the optimization trajectory while still benefiting from smoothing to avoid sharp minima. This principled approach is well grounded from an optimization perspective and we show that a host of variations can be covered in a unified framework that we propose. We prove the convergence of this novel scheme and rigorously evaluate its empirical performance on ResNet, LSTM, and Transformer. We demonstrate that in a variety of experiments the scheme allows scaling to much larger batch sizes than before whilst reaching or surpassing SOTA accuracy.}
}
@InProceedings{pmlr-v119-lin20c,
title = {On the Theoretical Properties of the Network Jackknife},
author = {Lin, Qiaohui and Lunde, Robert and Sarkar, Purnamrita},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6105--6115},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20c/lin20c.pdf},
url = {http://proceedings.mlr.press/v119/lin20c.html},
abstract = {We study the properties of a leave-node-out jackknife procedure for network data. Under the sparse graphon model, we prove an Efron-Stein-type inequality, showing that the network jackknife leads to conservative estimates of the variance (in expectation) for any network functional that is invariant to node permutation. For a general class of count functionals, we also establish consistency of the network jackknife. We complement our theoretical analysis with a range of simulated and real-data examples and show that the network jackknife offers competitive performance in cases where other resampling methods are known to be valid. In fact, for several network statistics, we see that the jackknife provides more accurate inferences compared to related methods such as subsampling.}
}
@InProceedings{pmlr-v119-lin20d,
title = {Handling the Positive-Definite Constraint in the {Bayesian} Learning Rule},
author = {Lin, Wu and Schmidt, Mark and Khan, Mohammad Emtiyaz},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6116--6126},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20d/lin20d.pdf},
url = {http://proceedings.mlr.press/v119/lin20d.html},
abstract = {The Bayesian learning rule is a natural-gradient variational inference method, which not only contains many existing learning algorithms as special cases but also enables the design of new algorithms. Unfortunately, when variational parameters lie in an open constraint set, the rule may not satisfy the constraint and requires line-searches which could slow down the algorithm. In this work, we address this issue for positive-definite constraints by proposing an improved rule that naturally handles the constraints. Our modification is obtained by using Riemannian gradient methods, and is valid when the approximation attains a block-coordinate natural parameterization (e.g., Gaussian distributions and their mixtures). Our method outperforms existing methods without any significant increase in computation. Our work makes it easier to apply the rule in the presence of positive-definite constraints in parameter spaces.}
}
@InProceedings{pmlr-v119-lin20e,
title = {{InfoGAN-CR} and {ModelCentrality}: Self-supervised Model Training and Selection for Disentangling {GANs}},
author = {Lin, Zinan and Thekumparampil, Kiran and Fanti, Giulia and Oh, Sewoong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6127--6139},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20e/lin20e.pdf},
url = {http://proceedings.mlr.press/v119/lin20e.html},
abstract = {Disentangled generative models map a latent code vector to a target space, while enforcing that a subset of the learned latent codes are interpretable and associated with distinct properties of the target distribution. Recent advances have been dominated by Variational AutoEncoder (VAE)-based methods, while training disentangled generative adversarial networks (GANs) remains challenging. In this work, we show that the dominant challenges facing disentangled GANs can be mitigated through the use of self-supervision. We make two main contributions: first, we design a novel approach for training disentangled GANs with self-supervision. We propose contrastive regularizer, which is inspired by a natural notion of disentanglement: latent traversal. This achieves higher disentanglement scores than state-of-the-art VAE- and GAN-based approaches. Second, we propose an unsupervised model selection scheme called ModelCentrality, which uses generated synthetic samples to compute the medoid (multi-dimensional generalization of median) of a collection of models. The current common practice of hyper-parameter tuning requires using ground-truths samples, each labelled with known perfect disentangled latent codes. As real datasets are not equipped with such labels, we propose an unsupervised model selection scheme and show that it finds a model close to the best one, for both VAEs and GANs. Combining contrastive regularization with ModelCentrality, we improve upon the state-of-the-art disentanglement scores significantly, without accessing the supervised data.}
}
@InProceedings{pmlr-v119-lin20f,
title = {Improving Generative Imagination in Object-Centric World Models},
author = {Lin, Zhixuan and Wu, Yi-Fu and Peri, Skand and Fu, Bofeng and Jiang, Jindong and Ahn, Sungjin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6140--6149},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20f/lin20f.pdf},
url = {http://proceedings.mlr.press/v119/lin20f.html},
abstract = {The remarkable recent advances in object-centric generative world models raise a few questions. First, while many of the recent achievements are indispensable for making a general and versatile world model, it is quite unclear how these ingredients can be integrated into a unified framework. Second, despite using generative objectives, abilities for object detection and tracking are mainly investigated, leaving the crucial ability of temporal imagination largely under question. Third, a few key abilities for more faithful temporal imagination such as multimodal uncertainty and situation-awareness are missing. In this paper, we introduce Generative Structured World Models (G-SWM). The G-SWM achieves the versatile world modeling not only by unifying the key properties of previous models in a principled framework but also by achieving two crucial new abilities, multimodal uncertainty and situation-awareness. Our thorough investigation on the temporal generation ability in comparison to the previous models demonstrates that G-SWM achieves the versatility with the best or comparable performance for all experiment settings including a few complex settings that have not been tested before. https://sites.google.com/view/gswm}
}
@InProceedings{pmlr-v119-lin20g,
title = {Generalized and Scalable Optimal Sparse Decision Trees},
author = {Lin, Jimmy and Zhong, Chudi and Hu, Diane and Rudin, Cynthia and Seltzer, Margo},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6150--6160},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20g/lin20g.pdf},
url = {http://proceedings.mlr.press/v119/lin20g.html},
abstract = {Decision tree optimization is notoriously difficult from a computational perspective but essential for the field of interpretable machine learning. Despite efforts over the past 40 years, only recently have optimization breakthroughs been made that have allowed practical algorithms to find optimal decision trees. These new techniques have the potential to trigger a paradigm shift, where it is possible to construct sparse decision trees to efficiently optimize a variety of objective functions, without relying on greedy splitting and pruning heuristics that often lead to suboptimal solutions. The contribution in this work is to provide a general framework for decision tree optimization that addresses the two significant open problems in the area: treatment of imbalanced data and fully optimizing over continuous variables. We present techniques that produce optimal decision trees over a variety of objectives including F-score, AUC, and partial area under the ROC convex hull. We also introduce a scalable algorithm that produces provably optimal results in the presence of continuous variables and speeds up decision tree construction by several orders of magnitude relative to the state-of-the-art.}
}
@InProceedings{pmlr-v119-lin20h,
title = {Finite-Time Last-Iterate Convergence for Multi-Agent Learning in Games},
author = {Lin, Tianyi and Zhou, Zhengyuan and Mertikopoulos, Panayotis and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6161--6171},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lin20h/lin20h.pdf},
url = {http://proceedings.mlr.press/v119/lin20h.html},
abstract = {In this paper, we consider multi-agent learning via online gradient descent in a class of games called $\lambda$-cocoercive games, a fairly broad class of games that admits many Nash equilibria and that properly includes unconstrained strongly monotone games. We characterize the finite-time last-iterate convergence rate for joint OGD learning on $\lambda$-cocoercive games; further, building on this result, we develop a fully adaptive OGD learning algorithm that does not require any knowledge of problem parameter (e.g. cocoercive constant $\lambda$) and show, via a novel double-stopping time technique, that this adaptive algorithm achieves same finite-time last-iterate convergence rate as non-adaptive counterpart. Subsequently, we extend OGD learning to the noisy gradient feedback case and establish last-iterate convergence results---first qualitative almost sure convergence, then quantitative finite-time convergence rates---all under non-decreasing step-sizes. To our knowledge, we provide the first set of results that fill in several gaps of the existing multi-agent online learning literature, where three aspects---finite-time convergence rates, non-decreasing step-sizes, and fully adaptive algorithms---have been unexplored before.}
}
@InProceedings{pmlr-v119-lioutas20a,
title = {Time-aware Large Kernel Convolutions},
author = {Lioutas, Vasileios and Guo, Yuhong},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6172--6183},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lioutas20a/lioutas20a.pdf},
url = {http://proceedings.mlr.press/v119/lioutas20a.html},
abstract = {To date, most state-of-the-art sequence modeling architectures use attention to build generative models for language based tasks. Some of these models use all the available sequence tokens to generate an attention distribution which results in time complexity of $O(n^2)$. Alternatively, they utilize depthwise convolutions with softmax normalized kernels of size $k$ acting as a limited-window self-attention, resulting in time complexity of $O(k{\cdot}n)$. In this paper, we introduce Time-aware Large Kernel (TaLK) Convolutions, a novel adaptive convolution operation that learns to predict the size of a summation kernel instead of using a fixed-sized kernel matrix. This method yields a time complexity of $O(n)$, effectively making the sequence encoding process linear to the number of tokens. We evaluate the proposed method on large-scale standard machine translation, abstractive summarization and language modeling datasets and show that TaLK Convolutions constitute an efficient improvement over other attention/convolution based approaches.}
}
@InProceedings{pmlr-v119-liu20a,
title = {Understanding the Curse of Horizon in Off-Policy Evaluation via Conditional Importance Sampling},
author = {Liu, Yao and Bacon, Pierre-Luc and Brunskill, Emma},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6184--6193},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20a/liu20a.pdf},
url = {http://proceedings.mlr.press/v119/liu20a.html},
abstract = {Off-policy policy estimators that use importance sampling (IS) can suffer from high variance in long-horizon domains, and there has been particular excitement over new IS methods that leverage the structure of Markov decision processes. We analyze the variance of the most popular approaches through the viewpoint of conditional Monte Carlo. Surprisingly, we find that in finite horizon MDPs there is no strict variance reduction of per-decision importance sampling or marginalized importance sampling, comparing with vanilla importance sampling. We then provide sufficient conditions under which the per-decision or marginalized estimators will provably reduce the variance over importance sampling with finite horizons. For the asymptotic (in terms of horizon $T$) case, we develop upper and lower bounds on the variance of those estimators which yields sufficient conditions under which there exists an exponential vs. polynomial gap between the variance of importance sampling and that of the per-decision or stationary/marginalized estimators. These results help advance our understanding of if and when new types of IS estimators will improve the accuracy of off-policy estimation.}
}
@InProceedings{pmlr-v119-liu20b,
title = {Sparse Shrunk Additive Models},
author = {Liu, Guodong and Chen, Hong and Huang, Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6194--6204},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20b/liu20b.pdf},
url = {http://proceedings.mlr.press/v119/liu20b.html},
abstract = {Most existing feature selection methods in literature are linear models, so that the nonlinear relations between features and response variables are not considered. Meanwhile, in these feature selection models, the interactions between features are often ignored or just discussed under prior structure information. To address these challenging issues, we consider the problem of sparse additive models for high-dimensional nonparametric regression with the allowance of the flexible interactions between features. A new method, called as sparse shrunk additive models (SSAM), is proposed to explore the structure information among features. This method bridges sparse kernel regression and sparse feature selection. Theoretical results on the convergence rate and sparsity characteristics of SSAM are established by the novel analysis techniques with integral operator and concentration estimate. In particular, our algorithm and theoretical analysis only require the component functions to be continuous and bounded, which are not necessary to be in reproducing kernel Hilbert spaces. Experiments on both synthetic and real-world data demonstrate the effectiveness of the proposed approach.}
}
@InProceedings{pmlr-v119-liu20c,
title = {Boosting Deep Neural Network Efficiency with Dual-Module Inference},
author = {Liu, Liu and Deng, Lei and Chen, Zhaodong and Wang, Yuke and Li, Shuangchen and Zhang, Jingwei and Yang, Yihua and Gu, Zhenyu and Ding, Yufei and Xie, Yuan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6205--6215},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20c/liu20c.pdf},
url = {http://proceedings.mlr.press/v119/liu20c.html},
abstract = {Using deep neural networks (DNNs) in machine learning tasks is promising in delivering high-quality results but challenging to meet stringent latency requirements and energy constraints because of the memory-bound and the compute-bound execution pattern of DNNs. We propose a big-little dual-module inference to dynamically skip unnecessary memory accesses and computations to accelerate DNN inference. Leveraging the noise-resilient feature of nonlinear activation functions, we propose to use a lightweight little module that approximates the original DNN layer, termed as the big module, to compute activations of the insensitive region that are more noise-resilient. Hence, the expensive memory accesses and computations of the big module can be reduced as the results are only calculated in the sensitive region. For memory-bound models such as recurrent neural networks (RNNs), our method can reduce the overall memory accesses by 40\% on average and achieve 1.54x to 1.75x speedup on a commodity CPU-based server platform with a negligible impact on model quality. In addition, our method can reduce the operations of the compute-bound models such as convolutional neural networks (CNNs) by 3.02x, with only a 0.5\% accuracy drop.}
}
@InProceedings{pmlr-v119-liu20d,
title = {Sample Complexity Bounds for 1-bit Compressive Sensing and Binary Stable Embeddings with Generative Priors},
author = {Liu, Zhaoqiang and Gomes, Selwyn and Tiwari, Avtansh and Scarlett, Jonathan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6216--6225},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20d/liu20d.pdf},
url = {http://proceedings.mlr.press/v119/liu20d.html},
abstract = {The goal of standard 1-bit compressive sensing is to accurately recover an unknown sparse vector from binary-valued measurements, each indicating the sign of a linear function of the vector. Motivated by recent advances in compressive sensing with generative models, where a generative modeling assumption replaces the usual sparsity assumption, we study the problem of 1-bit compressive sensing with generative models. We first consider noiseless 1-bit measurements, and provide sample complexity bounds for approximate recovery under i.i.d. Gaussian measurements and a Lipschitz continuous generative prior, as well as a near-matching algorithm-independent lower bound. Moreover, we demonstrate that the Binary $\epsilon$-Stable Embedding property, which characterizes the robustness of the reconstruction to measurement errors and noise, also holds for 1-bit compressive sensing with Lipschitz continuous generative models with sufficiently many Gaussian measurements. In addition, we apply our results to neural network generative models, and provide a proof-of-concept numerical experiment demonstrating significant improvements over sparsity-based approaches.}
}
@InProceedings{pmlr-v119-liu20e,
title = {Peer Loss Functions: Learning from Noisy Labels without Knowing Noise Rates},
author = {Liu, Yang and Guo, Hongyi},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6226--6236},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20e/liu20e.pdf},
url = {http://proceedings.mlr.press/v119/liu20e.html},
abstract = {Learning with noisy labels is a common challenge in supervised learning. Existing approaches often require practitioners to specify noise rates, i.e., a set of parameters controlling the severity of label noises in the problem, and the specifications are either assumed to be given or estimated using additional steps. In this work, we introduce a new family of loss functions that we name as peer loss functions, which enables learning from noisy labels and does not require a priori specification of the noise rates. Peer loss functions work within the standard empirical risk minimization (ERM) framework. We show that, under mild conditions, performing ERM with peer loss functions on the noisy data leads to the optimal or a near-optimal classifier as if performing ERM over the clean training data, which we do not have access to. We pair our results with an extensive set of experiments. Peer loss provides a way to simplify model development when facing potentially noisy training labels, and can be promoted as a robust candidate loss function in such situations.}
}
@InProceedings{pmlr-v119-liu20f,
title = {An Imitation Learning Approach for Cache Replacement},
author = {Liu, Evan and Hashemi, Milad and Swersky, Kevin and Ranganathan, Parthasarathy and Ahn, Junwhan},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6237--6247},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20f/liu20f.pdf},
url = {http://proceedings.mlr.press/v119/liu20f.html},
abstract = {Program execution speed critically depends on increasing cache hits, as cache hits are orders of magnitude faster than misses. To increase cache hits, we focus on the problem of cache replacement: choosing which cache line to evict upon inserting a new line. This is challenging because it requires planning far ahead and currently there is no known practical solution. As a result, current replacement policies typically resort to heuristics designed for specific common access patterns, which fail on more diverse and complex access patterns. In contrast, we propose an imitation learning approach to automatically learn cache access patterns by leveraging Belady's, an oracle policy that computes the optimal eviction decision given the future cache accesses. While directly applying Belady's is infeasible since the future is unknown, we train a policy conditioned only on past accesses that accurately approximates Belady's even on diverse and complex access patterns, and call this approach Parrot. When evaluated on 13 of the most memory-intensive SPEC applications, Parrot increases cache miss rates by 20\% over the current state of the art. In addition, on a large-scale web search benchmark, Parrot increases cache hit rates by 61\% over a conventional LRU policy. We release a Gym environment to facilitate research in this area, as data is plentiful, and further advancements can have significant real-world impact.}
}
@InProceedings{pmlr-v119-liu20g,
title = {Exploration Through Reward Biasing: Reward-Biased Maximum Likelihood Estimation for Stochastic Multi-Armed Bandits},
author = {Liu, Xi and Hsieh, Ping-Chun and Hung, Yu Heng and Bhattacharya, Anirban and Kumar, P.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6248--6258},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20g/liu20g.pdf},
url = {http://proceedings.mlr.press/v119/liu20g.html},
abstract = {Inspired by the Reward-Biased Maximum Likelihood Estimate method of adaptive control, we propose RBMLE -- a novel family of learning algorithms for stochastic multi-armed bandits (SMABs). For a broad range of SMABs including both the parametric Exponential Family as well as the non-parametric sub-Gaussian/Exponential family, we show that RBMLE yields an index policy. To choose the bias-growth rate $\alpha(t)$ in RBMLE, we reveal the nontrivial interplay between $\alpha(t)$ and the regret bound that generally applies in both the Exponential Family as well as the sub-Gaussian/Exponential family bandits. To quantify the finite-time performance, we prove that RBMLE attains order-optimality by adaptively estimating the unknown constants in the expression of $\alpha(t)$ for Gaussian and sub-Gaussian bandits. Extensive experiments demonstrate that the proposed RBMLE achieves empirical regret performance competitive with the state-of-the-art methods, while being more computationally efficient and scalable in comparison to the best-performing ones among them.}
}
@InProceedings{pmlr-v119-liu20h,
title = {Hallucinative Topological Memory for Zero-Shot Visual Planning},
author = {Liu, Kara and Kurutach, Thanard and Tung, Christine and Abbeel, Pieter and Tamar, Aviv},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6259--6270},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20h/liu20h.pdf},
url = {http://proceedings.mlr.press/v119/liu20h.html},
abstract = {In visual planning (VP), an agent learns to plan goal-directed behavior from observations of a dynamical system obtained offline, e.g., images obtained from self-supervised robot interaction. Most previous works on VP approached the problem by planning in a learned latent space, resulting in low-quality visual plans, and difficult training algorithms. Here, instead, we propose a simple VP method that plans directly in image space and displays competitive performance. We build on the semi-parametric topological memory (SPTM) method: image samples are treated as nodes in a graph, the graph connectivity is learned from image sequence data, and planning can be performed using conventional graph search methods. We propose two modifications on SPTM. First, we train an energy-based graph connectivity function using contrastive predictive coding that admits stable training. Second, to allow zero-shot planning in new domains, we learn a conditional VAE model that generates images given a context describing the domain, and use these hallucinated samples for building the connectivity graph and planning. We show that this simple approach significantly outperforms the SOTA VP methods, in terms of both plan interpretability and success rate when using the plan to guide a trajectory-following controller. Interestingly, our method can pick up non-trivial visual properties of objects, such as their geometry, and account for it in the plans.}
}
@InProceedings{pmlr-v119-liu20i,
title = {A Chance-Constrained Generative Framework for Sequence Optimization},
author = {Liu, Xianggen and Liu, Qiang and Song, Sen and Peng, Jian},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6271--6281},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20i/liu20i.pdf},
url = {http://proceedings.mlr.press/v119/liu20i.html},
abstract = {Deep generative modeling has achieved many successes for continuous data generation, such as producing realistic images and controlling their properties (e.g., styles). However, the development of generative modeling techniques for optimizing discrete data, such as sequences or strings, still lags behind largely due to the challenges in modeling complex and long-range constraints, including both syntax and semantics, in discrete structures. In this paper, we formulate the sequence optimization task as a chance-constrained optimization problem. The key idea is to enforce a high probability of generating valid sequences and also optimize the property of interest. We propose a novel minimax algorithm to simultaneously tighten a bound of the valid chance and optimize the expected property. Extensive experimental results in three domains demonstrate the superiority of our approach over the existing sequence optimization methods.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% Abstract: "the later" corrected to "the latter" (paired with "the former").
@InProceedings{pmlr-v119-liu20j,
title = {Min-Max Optimization without Gradients: Convergence and Applications to Black-Box Evasion and Poisoning Attacks},
author = {Liu, Sijia and Lu, Songtao and Chen, Xiangyi and Feng, Yao and Xu, Kaidi and Al-Dujaili, Abdullah and Hong, Mingyi and O'Reilly, Una-May},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6282--6293},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20j/liu20j.pdf},
url = {http://proceedings.mlr.press/v119/liu20j.html},
abstract = {In this paper, we study the problem of constrained min-max optimization in a black-box setting, where the desired optimizer cannot access the gradients of the objective function but may query its values. We present a principled optimization framework, integrating a zeroth-order (ZO) gradient estimator with an alternating projected stochastic gradient descent-ascent method, where the former only requires a small number of function queries and the latter needs just one-step descent/ascent update. We show that the proposed framework, referred to as ZO-Min-Max, has a sublinear convergence rate under mild conditions and scales gracefully with problem size. We also explore a promising connection between black-box min-max optimization and black-box evasion and poisoning attacks in adversarial machine learning (ML). Our empirical evaluations on these use cases demonstrate the effectiveness of our approach and its scalability to dimensions that prohibit using recent black-box solvers.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-liu20k,
title = {Median Matrix Completion: from Embarrassment to Optimality},
author = {Liu, Weidong and Mao, Xiaojun and Wong, Raymond K. W.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6294--6304},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20k/liu20k.pdf},
url = {http://proceedings.mlr.press/v119/liu20k.html},
abstract = {In this paper, we consider matrix completion with absolute deviation loss and obtain an estimator of the median matrix. Despite several appealing properties of median, the non-smooth absolute deviation loss leads to computational challenge for large-scale data sets which are increasingly common among matrix completion problems. A simple solution to large-scale problems is parallel computing. However, embarrassingly parallel fashion often leads to inefficient estimators. Based on the idea of pseudo data, we propose a novel refinement step, which turns such inefficient estimators into a rate (near-)optimal matrix completion procedure. The refined estimator is an approximation of a regularized least median estimator, and therefore not an ordinary regularized empirical risk estimator. This leads to a non-standard analysis of asymptotic behaviors. Empirical results are also provided to confirm the effectiveness of the proposed method.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-liu20l,
title = {A Generic First-Order Algorithmic Framework for Bi-Level Programming Beyond Lower-Level Singleton},
author = {Liu, Risheng and Mu, Pan and Yuan, Xiaoming and Zeng, Shangzhi and Zhang, Jin},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6305--6315},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20l/liu20l.pdf},
url = {http://proceedings.mlr.press/v119/liu20l.html},
abstract = {In recent years, a variety of gradient-based bi-level optimization methods have been developed for learning tasks. However, theoretical guarantees of these existing approaches often heavily rely on the simplification that for each fixed upper-level variable, the lower-level solution must be a singleton (a.k.a., Lower-Level Singleton, LLS). In this work, by formulating bi-level models from the optimistic viewpoint and aggregating hierarchical objective information, we establish Bi-level Descent Aggregation (BDA), a flexible and modularized algorithmic framework for bi-level programming. Theoretically, we derive a new methodology to prove the convergence of BDA without the LLS condition. Furthermore, we improve the convergence properties of conventional first-order bi-level schemes (under the LLS simplification) based on our proof recipe. Extensive experiments justify our theoretical results and demonstrate the superiority of the proposed BDA for different tasks, including hyper-parameter optimization and meta learning.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-liu20m,
title = {Learning Deep Kernels for Non-Parametric Two-Sample Tests},
author = {Liu, Feng and Xu, Wenkai and Lu, Jie and Zhang, Guangquan and Gretton, Arthur and Sutherland, Danica J.},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6316--6326},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20m/liu20m.pdf},
url = {http://proceedings.mlr.press/v119/liu20m.html},
abstract = {We propose a class of kernel-based two-sample tests, which aim to determine whether two sets of samples are drawn from the same distribution. Our tests are constructed from kernels parameterized by deep neural nets, trained to maximize test power. These tests adapt to variations in distribution smoothness and shape over space, and are especially suited to high dimensions and complex data. By contrast, the simpler kernels used in prior kernel testing work are spatially homogeneous, and adaptive only in lengthscale. We explain how this scheme includes popular classifier-based two-sample tests as a special case, but improves on them in general. We provide the first proof of consistency for the proposed adaptation method, which applies both to kernels on deep features and to simpler radial basis kernels or multiple kernel learning. In experiments, we establish the superior performance of our deep kernels in hypothesis testing on benchmark and real-world data. The code of our deep-kernel-based two-sample tests is available at github.com/fengliu90/DK-for-TST.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-liu20n,
title = {Learning to Encode Position for Transformer with Continuous Dynamical Model},
author = {Liu, Xuanqing and Yu, Hsiang-Fu and Dhillon, Inderjit and Hsieh, Cho-Jui},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6327--6335},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20n/liu20n.pdf},
url = {http://proceedings.mlr.press/v119/liu20n.html},
abstract = {We introduce a new way of learning to encode position information for non-recurrent models, such as Transformer models. Unlike RNN and LSTM, which contain inductive bias by loading the input tokens sequentially, non-recurrent models are less sensitive to position. The main reason is that position information among input units is not encoded inherently, i.e., they are permutation equivalent, this problem justifies why all of the existing models are accompanied by position encoding/embedding layer at the input. However, this solution has clear limitations: the sinusoidal position encoding is not flexible enough as it is manually designed and does not contain any learnable parameters, whereas the position embedding restricts the maximum length of input sequences. It is thus desirable to design a new position layer that contains learnable parameters to adjust to different datasets and different architectures. At the same time, we would also like it to extrapolate in accordance with the variable length of inputs. In our proposed solution, we borrow from the recent Neural ODE approach, which may be viewed as a versatile continuous version of a ResNet. This model is capable of modeling many kinds of dynamical systems. We model the evolution of encoded results along position index by such a dynamical system, thereby overcoming the above limitations of existing methods. We evaluate our new position layers on a variety of neural machine translation and language understanding tasks, the experimental results show consistent improvements over the baselines.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% The method name "Neural Tangent Transfer" is brace-protected so sentence-casing styles keep its capitals.
@InProceedings{pmlr-v119-liu20o,
title = {Finding trainable sparse networks through {Neural Tangent Transfer}},
author = {Liu, Tianlin and Zenke, Friedemann},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6336--6347},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/liu20o/liu20o.pdf},
url = {http://proceedings.mlr.press/v119/liu20o.html},
abstract = {Deep neural networks have dramatically transformed machine learning, but their memory and energy demands are substantial. The requirements of real biological neural networks are rather modest in comparison, and one feature that might underlie this austerity is their sparse connectivity. In deep learning, trainable sparse networks that perform well on a specific task are usually constructed using label-dependent pruning criteria. In this article, we introduce Neural Tangent Transfer, a method that instead finds trainable sparse networks in a label-free manner. Specifically, we find sparse networks whose training dynamics, as characterized by the neural tangent kernel, mimic those of dense networks in function space. Finally, we evaluate our label-agnostic approach on several standard classification tasks and show that the resulting sparse networks achieve higher classification performance while converging faster.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-locatello20a,
title = {Weakly-Supervised Disentanglement Without Compromises},
author = {Locatello, Francesco and Poole, Ben and Raetsch, Gunnar and Sch{\"o}lkopf, Bernhard and Bachem, Olivier and Tschannen, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6348--6359},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/locatello20a/locatello20a.pdf},
url = {http://proceedings.mlr.press/v119/locatello20a.html},
abstract = {Intelligent agents should be able to learn useful representations by observing changes in their environment. We model such observations as pairs of non-i.i.d. images sharing at least one of the underlying factors of variation. First, we theoretically show that only knowing how many factors have changed, but not which ones, is sufficient to learn disentangled representations. Second, we provide practical algorithms that learn disentangled representations from pairs of images without requiring annotation of groups, individual factors, or the number of factors that have changed. Third, we perform a large-scale empirical study and show that such pairs of observations are sufficient to reliably learn disentangled representations on several benchmark data sets. Finally, we evaluate our learned representations and find that they are simultaneously useful on a diverse suite of tasks, including generalization under covariate shifts, fairness, and abstract reasoning. Overall, our results demonstrate that weak supervision enables learning of useful disentangled representations in realistic scenarios.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% The third author uses the "von Last, First" form so the particle "von" belongs to the surname rather than the given names.
@InProceedings{pmlr-v119-lohaus20a,
title = {Too Relaxed to Be Fair},
author = {Lohaus, Michael and Perrot, Michael and von Luxburg, Ulrike},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6360--6369},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lohaus20a/lohaus20a.pdf},
url = {http://proceedings.mlr.press/v119/lohaus20a.html},
abstract = {We address the problem of classification under fairness constraints. Given a notion of fairness, the goal is to learn a classifier that is not discriminatory against a group of individuals. In the literature, this problem is often formulated as a constrained optimization problem and solved using relaxations of the fairness constraints. We show that many existing relaxations are unsatisfactory: even if a model satisfies the relaxed constraint, it can be surprisingly unfair. We propose a principled framework to solve this problem. This new approach uses a strongly convex formulation and comes with theoretical guarantees on the fairness of its solution. In practice, we show that this method gives promising results on real data.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% "Hamiltonian" is protected as a whole word (not a single letter), and the abstract's mismatched quotes are now TeX quotes.
@InProceedings{pmlr-v119-loizou20a,
title = {Stochastic {Hamiltonian} Gradient Methods for Smooth Games},
author = {Loizou, Nicolas and Berard, Hugo and Jolicoeur-Martineau, Alexia and Vincent, Pascal and Lacoste-Julien, Simon and Mitliagkas, Ioannis},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6370--6381},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/loizou20a/loizou20a.pdf},
url = {http://proceedings.mlr.press/v119/loizou20a.html},
abstract = {The success of adversarial formulations in machine learning has brought renewed motivation for smooth games. In this work, we focus on the class of stochastic Hamiltonian methods and provide the first convergence guarantees for certain classes of stochastic smooth games. We propose a novel unbiased estimator for the stochastic Hamiltonian gradient descent (SHGD) and highlight its benefits. Using tools from the optimization literature we show that SHGD converges linearly to the neighbourhood of a stationary point. To guarantee convergence to the exact solution, we analyze SHGD with a decreasing step-size and we also present the first stochastic variance reduced Hamiltonian method. Our results provide the first global non-asymptotic last-iterate convergence guarantees for the class of stochastic unconstrained bilinear games and for the more general class of stochastic games that satisfy a ``sufficiently bilinear'' condition, notably including some non-convex non-concave problems. We supplement our analysis with experiments on stochastic bilinear and sufficiently bilinear games, where our theory is shown to be tight, and on simple adversarial machine learning formulations.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-lopes20a,
title = {Error Estimation for Sketched {SVD} via the Bootstrap},
author = {Lopes, Miles and Erichson, N. Benjamin and Mahoney, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6382--6392},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lopes20a/lopes20a.pdf},
url = {http://proceedings.mlr.press/v119/lopes20a.html},
abstract = {In order to compute fast approximations to the singular value decompositions (SVD) of very large matrices, randomized sketching algorithms have become a leading approach. However, a key practical difficulty of sketching an SVD is that the user does not know how far the sketched singular vectors/values are from the exact ones. Indeed, the user may be forced to rely on analytical worst-case error bounds, which may not account for the unique structure of a given problem. As a result, the lack of tools for error estimation often leads to much more computation than is really necessary. To overcome these challenges, this paper develops a fully data-driven bootstrap method that numerically estimates the actual error of sketched singular vectors/values. Furthermore, the method is computationally inexpensive, because it operates only on sketched objects, and hence it requires no extra passes over the full matrix being factored.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% "Frechet" carries its accent as the escape {\'e} (matching the {\"o} escape used elsewhere in this file) instead of a raw UTF-8 byte inside braces.
@InProceedings{pmlr-v119-lou20a,
title = {Differentiating through the Fr{\'e}chet Mean},
author = {Lou, Aaron and Katsman, Isay and Jiang, Qingxuan and Belongie, Serge and Lim, Ser-Nam and De Sa, Christopher},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6393--6403},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lou20a/lou20a.pdf},
url = {http://proceedings.mlr.press/v119/lou20a.html},
abstract = {Recent advances in deep representation learning on Riemannian manifolds extend classical deep learning operations to better capture the geometry of the manifold. One possible extension is the Fr{\'e}chet mean, the generalization of the Euclidean mean; however, it has been difficult to apply because it lacks a closed form with an easily computable derivative. In this paper, we show how to differentiate through the Fr{\'e}chet mean for arbitrary Riemannian manifolds. Then, focusing on hyperbolic space, we derive explicit gradient expressions and a fast, accurate, and hyperparameter-free Fr{\'e}chet mean solver. This fully integrates the Fr{\'e}chet mean into the hyperbolic neural network pipeline. To demonstrate this integration, we present two case studies. First, we apply our Fr{\'e}chet mean to the existing Hyperbolic Graph Convolutional Network, replacing its projected aggregation to obtain state-of-the-art results on datasets with high hyperbolicity. Second, to demonstrate the Fr{\'e}chet mean’s capacity to generalize Euclidean neural network operations, we develop a hyperbolic batch normalization method that gives an improvement parallel to the one observed in the Euclidean setting.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-loynd20a,
title = {Working Memory Graphs},
author = {Loynd, Ricky and Fernandez, Roland and Celikyilmaz, Asli and Swaminathan, Adith and Hausknecht, Matthew},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6404--6414},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/loynd20a/loynd20a.pdf},
url = {http://proceedings.mlr.press/v119/loynd20a.html},
abstract = {Transformers have increasingly outperformed gated RNNs in obtaining new state-of-the-art results on supervised tasks involving text sequences. Inspired by this trend, we study the question of how Transformer-based models can improve the performance of sequential decision-making agents. We present the Working Memory Graph (WMG), an agent that employs multi-head self-attention to reason over a dynamic set of vectors representing observed and recurrent state. We evaluate WMG in three environments featuring factored observation spaces: a Pathfinding environment that requires complex reasoning over past observations, BabyAI gridworld levels that involve variable goals, and Sokoban which emphasizes future planning. We find that the combination of WMG’s Transformer-based architecture with factored observation spaces leads to significant gains in learning efficiency compared to baseline architectures across all tasks. WMG demonstrates how Transformer-based models can dramatically boost sample efficiency in RL environments for which observations can be factored.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-lu20a,
title = {Moniqua: Modulo Quantized Communication in Decentralized {SGD}},
author = {Lu, Yucheng and De Sa, Christopher},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6415--6425},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lu20a/lu20a.pdf},
url = {http://proceedings.mlr.press/v119/lu20a.html},
abstract = {Running Stochastic Gradient Descent (SGD) in a decentralized fashion has shown promising results. In this paper we propose Moniqua, a technique that allows decentralized SGD to use quantized communication. We prove in theory that Moniqua communicates a provably bounded number of bits per iteration, while converging at the same asymptotic rate as the original algorithm does with full-precision communication. Moniqua improves upon prior works in that it (1) requires zero additional memory, (2) works with 1-bit quantization, and (3) is applicable to a variety of decentralized algorithms. We demonstrate empirically that Moniqua converges faster with respect to wall clock time than other quantized decentralized algorithms. We also show that Moniqua is robust to very low bit-budgets, allowing $1$-bit-per-parameter communication without compromising validation accuracy when training ResNet20 and ResNet110 on CIFAR10.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% "ResNet" is protected as a whole word. In the abstract, hard-coded \textbf and \emph markup and a duplicated "the" are removed, and the dangling \cite of key veit2016residual is spelled out as an author-year reference (NOTE(review): confirm the author-year text against the paper).
@InProceedings{pmlr-v119-lu20b,
title = {A Mean Field Analysis Of Deep {ResNet} And Beyond: Towards Provably Optimization Via Overparameterization From Depth},
author = {Lu, Yiping and Ma, Chao and Lu, Yulong and Lu, Jianfeng and Ying, Lexing},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6426--6436},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lu20b/lu20b.pdf},
url = {http://proceedings.mlr.press/v119/lu20b.html},
abstract = {Training deep neural networks with stochastic gradient descent (SGD) can often achieve zero training loss on real-world tasks although the optimization landscape is known to be highly non-convex. To understand the success of SGD for training deep neural networks, this work presents a mean-field analysis of deep residual networks, based on a line of works which interpret the continuum limit of the deep residual network as an ordinary differential equation as the network capacity tends to infinity. Specifically, we propose a new continuum limit of deep residual networks, which enjoys a good landscape in the sense that every local minimizer is global. This characterization enables us to derive the first global convergence result for multilayer neural networks in the mean-field regime. Furthermore, our proof does not rely on the convexity of the loss landscape, but instead, an assumption on the global minimizer should achieve zero loss which can be achieved when the model shares a universal approximation property. Key to our result is the observation that a deep residual network resembles a shallow network ensemble (Veit et al., 2016), i.e., a two-layer network. We bound the difference between the shallow network and our ResNet model via the adjoint sensitivity method, which enables us to transfer previous mean-field analysis of two-layer networks to deep networks. Furthermore, we propose several novel training schemes based on our new continuous model, among which one new training procedure introduces the operation of switching the order of the residual blocks and results in strong empirical performance on benchmark datasets.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-lu20c,
title = {Countering Language Drift with Seeded Iterated Learning},
author = {Lu, Yuchen and Singhal, Soumye and Strub, Florian and Courville, Aaron and Pietquin, Olivier},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6437--6447},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lu20c/lu20c.pdf},
url = {http://proceedings.mlr.press/v119/lu20c.html},
abstract = {Pretraining on human corpus and then finetuning in a simulator has become a standard pipeline for training a goal-oriented dialogue agent. Nevertheless, as soon as the agents are finetuned to maximize task completion, they suffer from the so-called language drift phenomenon: they slowly lose syntactic and semantic properties of language as they only focus on solving the task. In this paper, we propose a generic approach to counter language drift called Seeded iterated learning (SIL). We periodically refine a pretrained student agent by imitating data sampled from a newly generated teacher agent. At each time step, the teacher is created by copying the student agent, before being finetuned to maximize task completion. SIL does not require external syntactic constraint nor semantic knowledge, making it a valuable task-agnostic finetuning protocol. We evaluate SIL in a toy-setting Lewis Game, and then scale it up to the translation game with natural language. In both settings, SIL helps counter language drift as well as it improves the task completion compared to baselines.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-lukasik20a,
title = {Does label smoothing mitigate label noise?},
author = {Lukasik, Michal and Bhojanapalli, Srinadh and Menon, Aditya and Kumar, Sanjiv},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6448--6458},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lukasik20a/lukasik20a.pdf},
url = {http://proceedings.mlr.press/v119/lukasik20a.html},
abstract = {Label smoothing is commonly used in training deep learning models, wherein one-hot training labels are mixed with uniform label vectors. Empirically, smoothing has been shown to improve both predictive performance and model calibration. In this paper, we study whether label smoothing is also effective as a means of coping with label noise. While label smoothing apparently amplifies this problem — being equivalent to injecting symmetric noise to the labels — we show how it relates to a general family of loss-correction techniques from the label noise literature. Building on this connection, we show that label smoothing is competitive with loss-correction under label noise. Further, we show that when distilling models from noisy data, label smoothing of the teacher is beneficial; this is in contrast to recent findings for noise-free problems, and sheds further light on settings where label smoothing is beneficial.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% "PageRank" is protected as a whole word, and the title's dash is the ASCII en-dash ligature "--" rather than a raw Unicode character in braces.
@InProceedings{pmlr-v119-luo20a,
title = {Improved Communication Cost in Distributed {PageRank} Computation -- A Theoretical Study},
author = {Luo, Siqiang},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6459--6467},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/luo20a/luo20a.pdf},
url = {http://proceedings.mlr.press/v119/luo20a.html},
abstract = {PageRank is a widely used approach for measuring the importance of a node in a graph. Due to the rapid growth of the graph size in the real world, the importance of computing PageRanks in a distributed environment has been increasingly recognized. However, only a few previous works can provide a provable complexity and accuracy for distributed PageRank computation. Given a constant $d\ge 1$ and a graph of $n$ nodes, the state-of-the-art approach, Radar-Push, uses $O(\log\log{n}+\log{d})$ communication rounds to approximate the PageRanks within a relative error $\Theta(\frac{1}{\log^d{n}})$ under a generalized congested clique distributed computation model. However, Radar-Push entails as large as $O(\log^{2d+3}{n})$ bits of bandwidth (e.g., the communication cost between a pair of nodes per round). In this paper, we provide a new algorithm that uses asymptotically the same communication round complexity while using only $O(d\log^3{n})$ bits of bandwidth.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
% Straight double quotes in the abstract are written as TeX quotes so they typeset as proper opening and closing quotes.
@InProceedings{pmlr-v119-luo20b,
title = {Progressive Graph Learning for Open-Set Domain Adaptation},
author = {Luo, Yadan and Wang, Zijian and Huang, Zi and Baktashmotlagh, Mahsa},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6468--6478},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/luo20b/luo20b.pdf},
url = {http://proceedings.mlr.press/v119/luo20b.html},
abstract = {Domain shift is a fundamental problem in visual recognition which typically arises when the source and target data follow different distributions. The existing domain adaptation approaches which tackle this problem work in the ``closed-set'' setting with the assumption that the source and the target data share exactly the same classes of objects. In this paper, we tackle a more realistic problem of the ``open-set'' domain shift where the target data contains additional classes that were not present in the source data. More specifically, we introduce an end-to-end Progressive Graph Learning (PGL) framework where a graph neural network with episodic training is integrated to suppress underlying conditional shift and adversarial learning is adopted to close the gap between the source and target distributions. Compared to the existing open-set adaptation approaches, our approach guarantees to achieve a tighter upper bound of the target error. Extensive experiments on three standard open-set benchmarks evidence that our approach significantly outperforms the state-of-the-arts in open-set domain adaptation.}
}
% ICML 2020 (PMLR v119) paper. The editor is written "Last, First" with surname "Daum{\'e} III"; the earlier "III, Hal ..." form made BibTeX treat "III" as the surname.
@InProceedings{pmlr-v119-luo20c,
title = {Adversarial Nonnegative Matrix Factorization},
author = {Luo, Lei and Zhang, Yanfu and Huang, Heng},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6479--6488},
year = {2020},
editor = {Daum{\'e} III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/luo20c/luo20c.pdf},
url = {http://proceedings.mlr.press/v119/luo20c.html},
abstract = {Nonnegative Matrix Factorization (NMF) has become an increasingly important research topic in machine learning. Despite all the practical success, most of existing NMF models are still vulnerable to adversarial attacks. To overcome this limitation, we propose a novel Adversarial NMF (ANMF) approach in which an adversary can exercise some control over the perturbed data generation process. Different from the traditional NMF models which focus on either the regular input or certain types of noise, our model considers potential test adversaries that are beyond the pre-defined constraints, which can cope with various noises (or perturbations). We formulate the proposed model as a bilevel optimization problem and use Alternating Direction Method of Multipliers (ADMM) to solve it with convergence analysis. Theoretically, the robustness analysis of ANMF is established under mild conditions dedicating asymptotically unbiased prediction. Extensive experiments verify that ANMF is robust to a broad categories of perturbations, and achieves state-of-the-art performances on distinct real-world benchmark datasets.}
}
@InProceedings{pmlr-v119-luz20a,
ti