@Proceedings{ICML2021,
title = {Proceedings of the 38th International Conference on Machine Learning},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
editor = {Marina Meila and Tong Zhang},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = 139
}
@InProceedings{pmlr-v139-abdolshah21a,
title = {A New Representation of Successor Features for Transfer across Dissimilar Environments},
author = {Abdolshah, Majid and Le, Hung and George, Thommen Karimpanal and Gupta, Sunil and Rana, Santu and Venkatesh, Svetha},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1--9},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/abdolshah21a/abdolshah21a.pdf},
url = {https://proceedings.mlr.press/v139/abdolshah21a.html},
abstract = {Transfer in reinforcement learning is usually achieved through generalisation across tasks. Whilst many studies have investigated transferring knowledge when the reward function changes, they have assumed that the dynamics of the environments remain consistent. Many real-world RL problems require transfer among environments with different dynamics. To address this problem, we propose an approach based on successor features in which we model successor feature functions with Gaussian Processes permitting the source successor features to be treated as noisy measurements of the target successor feature function. Our theoretical analysis proves the convergence of this approach as well as the bounded error on modelling successor feature functions with Gaussian Processes in environments with both different dynamics and rewards. We demonstrate our method on benchmark datasets and show that it outperforms current baselines.}
}
@InProceedings{pmlr-v139-abeyrathna21a,
title = {Massively Parallel and Asynchronous Tsetlin Machine Architecture Supporting Almost Constant-Time Scaling},
author = {Abeyrathna, Kuruge Darshana and Bhattarai, Bimal and Goodwin, Morten and Gorji, Saeed Rahimi and Granmo, Ole-Christoffer and Jiao, Lei and Saha, Rupsa and Yadav, Rohan K.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {10--20},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/abeyrathna21a/abeyrathna21a.pdf},
url = {https://proceedings.mlr.press/v139/abeyrathna21a.html},
abstract = {Using logical clauses to represent patterns, Tsetlin Machine (TM) have recently obtained competitive performance in terms of accuracy, memory footprint, energy, and learning speed on several benchmarks. Each TM clause votes for or against a particular class, with classification resolved using a majority vote. While the evaluation of clauses is fast, being based on binary operators, the voting makes it necessary to synchronize the clause evaluation, impeding parallelization. In this paper, we propose a novel scheme for desynchronizing the evaluation of clauses, eliminating the voting bottleneck. In brief, every clause runs in its own thread for massive native parallelism. For each training example, we keep track of the class votes obtained from the clauses in local voting tallies. The local voting tallies allow us to detach the processing of each clause from the rest of the clauses, supporting decentralized learning. This means that the TM most of the time will operate on outdated voting tallies. We evaluated the proposed parallelization across diverse learning tasks and it turns out that our decentralized TM learning algorithm copes well with working on outdated data, resulting in no significant loss in learning accuracy. Furthermore, we show that the approach provides up to 50 times faster learning. Finally, learning time is almost constant for reasonable clause amounts (employing from 20 to 7,000 clauses on a Tesla V100 GPU). For sufficiently large clause numbers, computation time increases approximately proportionally. Our parallel and asynchronous architecture thus allows processing of more massive datasets and operating with more clauses for higher accuracy.}
}
@InProceedings{pmlr-v139-acar21a,
title = {Debiasing Model Updates for Improving Personalized Federated Training},
author = {Acar, Durmus Alp Emre and Zhao, Yue and Zhu, Ruizhao and Matas, Ramon and Mattina, Matthew and Whatmough, Paul and Saligrama, Venkatesh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {21--31},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/acar21a/acar21a.pdf},
url = {https://proceedings.mlr.press/v139/acar21a.html},
abstract = {We propose a novel method for federated learning that is customized specifically to the objective of a given edge device. In our proposed method, a server trains a global meta-model by collaborating with devices without actually sharing data. The trained global meta-model is then personalized locally by each device to meet its specific objective. Different from the conventional federated learning setting, training customized models for each device is hindered by both the inherent data biases of the various devices, as well as the requirements imposed by the federated architecture. We propose gradient correction methods leveraging prior works, and explicitly de-bias the meta-model in the distributed heterogeneous data setting to learn personalized device models. We present convergence guarantees of our method for strongly convex, convex and nonconvex meta objectives. We empirically evaluate the performance of our method on benchmark datasets and demonstrate significant communication savings.}
}
@InProceedings{pmlr-v139-acar21b,
title = {Memory Efficient Online Meta Learning},
author = {Acar, Durmus Alp Emre and Zhu, Ruizhao and Saligrama, Venkatesh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {32--42},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/acar21b/acar21b.pdf},
url = {https://proceedings.mlr.press/v139/acar21b.html},
abstract = {We propose a novel algorithm for online meta learning where task instances are sequentially revealed with limited supervision and a learner is expected to meta learn them in each round, so as to allow the learner to customize a task-specific model rapidly with little task-level supervision. A fundamental concern arising in online meta-learning is the scalability of memory as more tasks are viewed over time. Heretofore, prior works have allowed for perfect recall leading to linear increase in memory with time. Different from prior works, in our method, prior task instances are allowed to be deleted. We propose to leverage prior task instances by means of a fixed-size state-vector, which is updated sequentially. Our theoretical analysis demonstrates that our proposed memory efficient online learning (MOML) method suffers sub-linear regret with convex loss functions and sub-linear local regret for nonconvex losses. On benchmark datasets we show that our method can outperform prior works even though they allow for perfect recall.}
}
@InProceedings{pmlr-v139-acharya21a,
title = {Robust Testing and Estimation under Manipulation Attacks},
author = {Acharya, Jayadev and Sun, Ziteng and Zhang, Huanyu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {43--53},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/acharya21a/acharya21a.pdf},
url = {https://proceedings.mlr.press/v139/acharya21a.html},
abstract = {We study robust testing and estimation of discrete distributions in the strong contamination model. Our results cover both centralized setting and distributed setting with general local information constraints including communication and LDP constraints. Our technique relates the strength of manipulation attacks to the earth-mover distance using Hamming distance as the metric between messages (samples) from the users. In the centralized setting, we provide optimal error bounds for both learning and testing. Our lower bounds under local information constraints build on the recent lower bound methods in distributed inference. In the communication constrained setting, we develop novel algorithms based on random hashing and an L1-L1 isometry.}
}
@InProceedings{pmlr-v139-achituve21a,
title = {GP-Tree: A Gaussian Process Classifier for Few-Shot Incremental Learning},
author = {Achituve, Idan and Navon, Aviv and Yemini, Yochai and Chechik, Gal and Fetaya, Ethan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {54--65},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/achituve21a/achituve21a.pdf},
url = {https://proceedings.mlr.press/v139/achituve21a.html},
abstract = {Gaussian processes (GPs) are non-parametric, flexible, models that work well in many tasks. Combining GPs with deep learning methods via deep kernel learning (DKL) is especially compelling due to the strong representational power induced by the network. However, inference in GPs, whether with or without DKL, can be computationally challenging on large datasets. Here, we propose GP-Tree, a novel method for multi-class classification with Gaussian processes and DKL. We develop a tree-based hierarchical model in which each internal node of the tree fits a GP to the data using the P{ó}lya-Gamma augmentation scheme. As a result, our method scales well with both the number of classes and data size. We demonstrate the effectiveness of our method against other Gaussian process training baselines, and we show how our general GP approach achieves improved accuracy on standard incremental few-shot learning benchmarks.}
}
@InProceedings{pmlr-v139-acuna21a,
title = {f-Domain Adversarial Learning: Theory and Algorithms},
author = {Acuna, David and Zhang, Guojun and Law, Marc T. and Fidler, Sanja},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {66--75},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/acuna21a/acuna21a.pdf},
url = {https://proceedings.mlr.press/v139/acuna21a.html},
abstract = {Unsupervised domain adaptation is used in many machine learning applications where, during training, a model has access to unlabeled data in the target domain, and a related labeled dataset. In this paper, we introduce a novel and general domain-adversarial framework. Specifically, we derive a novel generalization bound for domain adaptation that exploits a new measure of discrepancy between distributions based on a variational characterization of f-divergences. It recovers the theoretical results from Ben-David et al. (2010a) as a special case and supports divergences used in practice. Based on this bound, we derive a new algorithmic framework that introduces a key correction in the original adversarial training method of Ganin et al. (2016). We show that many regularizers and ad-hoc objectives introduced over the last years in this framework are then not required to achieve performance comparable to (if not better than) state-of-the-art domain-adversarial methods. Experimental analysis conducted on real-world natural language and computer vision datasets show that our framework outperforms existing baselines, and obtains the best results for f-divergences that were not considered previously in domain-adversarial learning.}
}
@InProceedings{pmlr-v139-afchar21a,
title = {Towards Rigorous Interpretations: a Formalisation of Feature Attribution},
author = {Afchar, Darius and Guigue, Vincent and Hennequin, Romain},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {76--86},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/afchar21a/afchar21a.pdf},
url = {https://proceedings.mlr.press/v139/afchar21a.html},
abstract = {Feature attribution is often loosely presented as the process of selecting a subset of relevant features as a rationale of a prediction. Task-dependent by nature, precise definitions of "relevance" encountered in the literature are however not always consistent. This lack of clarity stems from the fact that we usually do not have access to any notion of ground-truth attribution and from a more general debate on what good interpretations are. In this paper we propose to formalise feature selection/attribution based on the concept of relaxed functional dependence. In particular, we extend our notions to the instance-wise setting and derive necessary properties for candidate selection solutions, while leaving room for task-dependence. By computing ground-truth attributions on synthetic datasets, we evaluate many state-of-the-art attribution methods and show that, even when optimised, some fail to verify the proposed properties and provide wrong solutions.}
}
@InProceedings{pmlr-v139-agarwal21a,
title = {Acceleration via Fractal Learning Rate Schedules},
author = {Agarwal, Naman and Goel, Surbhi and Zhang, Cyril},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {87--99},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/agarwal21a/agarwal21a.pdf},
url = {https://proceedings.mlr.press/v139/agarwal21a.html},
abstract = {In practical applications of iterative first-order optimization, the learning rate schedule remains notoriously difficult to understand and expensive to tune. We demonstrate the presence of these subtleties even in the innocuous case when the objective is a convex quadratic. We reinterpret an iterative algorithm from the numerical analysis literature as what we call the Chebyshev learning rate schedule for accelerating vanilla gradient descent, and show that the problem of mitigating instability leads to a fractal ordering of step sizes. We provide some experiments to challenge conventional beliefs about stable learning rates in deep learning: the fractal schedule enables training to converge with locally unstable updates which make negative progress on the objective.}
}
@InProceedings{pmlr-v139-agarwal21b,
title = {A Regret Minimization Approach to Iterative Learning Control},
author = {Agarwal, Naman and Hazan, Elad and Majumdar, Anirudha and Singh, Karan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {100--109},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/agarwal21b/agarwal21b.pdf},
url = {https://proceedings.mlr.press/v139/agarwal21b.html},
abstract = {We consider the setting of iterative learning control, or model-based policy learning in the presence of uncertain, time-varying dynamics. In this setting, we propose a new performance metric, planning regret, which replaces the standard stochastic uncertainty assumptions with worst case regret. Based on recent advances in non-stochastic control, we design a new iterative algorithm for minimizing planning regret that is more robust to model mismatch and uncertainty. We provide theoretical and empirical evidence that the proposed algorithm outperforms existing methods on several benchmarks.}
}
@InProceedings{pmlr-v139-agarwal21c,
title = {Towards the Unification and Robustness of Perturbation and Gradient Based Explanations},
author = {Agarwal, Sushant and Jabbari, Shahin and Agarwal, Chirag and Upadhyay, Sohini and Wu, Steven and Lakkaraju, Himabindu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {110--119},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/agarwal21c/agarwal21c.pdf},
url = {https://proceedings.mlr.press/v139/agarwal21c.html},
abstract = {As machine learning black boxes are increasingly being deployed in critical domains such as healthcare and criminal justice, there has been a growing emphasis on developing techniques for explaining these black boxes in a post hoc manner. In this work, we analyze two popular post hoc interpretation techniques: SmoothGrad which is a gradient based method, and a variant of LIME which is a perturbation based method. More specifically, we derive explicit closed form expressions for the explanations output by these two methods and show that they both converge to the same explanation in expectation, i.e., when the number of perturbed samples used by these methods is large. We then leverage this connection to establish other desirable properties, such as robustness, for these techniques. We also derive finite sample complexity bounds for the number of perturbations required for these methods to converge to their expected explanation. Finally, we empirically validate our theory using extensive experimentation on both synthetic and real-world datasets.}
}
@InProceedings{pmlr-v139-aggarwal21a,
title = {Label Inference Attacks from Log-loss Scores},
author = {Aggarwal, Abhinav and Kasiviswanathan, Shiva and Xu, Zekun and Feyisetan, Oluwaseyi and Teissier, Nathanael},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {120--129},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/aggarwal21a/aggarwal21a.pdf},
url = {https://proceedings.mlr.press/v139/aggarwal21a.html},
abstract = {Log-loss (also known as cross-entropy loss) metric is ubiquitously used across machine learning applications to assess the performance of classification algorithms. In this paper, we investigate the problem of inferring the labels of a dataset from single (or multiple) log-loss score(s), without any other access to the dataset. Surprisingly, we show that for any finite number of label classes, it is possible to accurately infer the labels of the dataset from the reported log-loss score of a single carefully constructed prediction vector if we allow arbitrary precision arithmetic. Additionally, we present label inference algorithms (attacks) that succeed even under addition of noise to the log-loss scores and under limited precision arithmetic. All our algorithms rely on ideas from number theory and combinatorics and require no model training. We run experimental simulations on some real datasets to demonstrate the ease of running these attacks in practice.}
}
@InProceedings{pmlr-v139-aitchison21a,
title = {Deep Kernel Processes},
author = {Aitchison, Laurence and Yang, Adam and Ober, Sebastian W},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {130--140},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/aitchison21a/aitchison21a.pdf},
url = {https://proceedings.mlr.press/v139/aitchison21a.html},
abstract = {We define deep kernel processes in which positive definite Gram matrices are progressively transformed by nonlinear kernel functions and by sampling from (inverse) Wishart distributions. Remarkably, we find that deep Gaussian processes (DGPs), Bayesian neural networks (BNNs), infinite BNNs, and infinite BNNs with bottlenecks can all be written as deep kernel processes. For DGPs the equivalence arises because the Gram matrix formed by the inner product of features is Wishart distributed, and as we show, standard isotropic kernels can be written entirely in terms of this Gram matrix — we do not need knowledge of the underlying features. We define a tractable deep kernel process, the deep inverse Wishart process, and give a doubly-stochastic inducing-point variational inference scheme that operates on the Gram matrices, not on the features, as in DGPs. We show that the deep inverse Wishart process gives superior performance to DGPs and infinite BNNs on fully-connected baselines.}
}
@InProceedings{pmlr-v139-akbari21a,
title = {How Does Loss Function Affect Generalization Performance of Deep Learning? Application to Human Age Estimation},
author = {Akbari, Ali and Awais, Muhammad and Bashar, Manijeh and Kittler, Josef},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {141--151},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/akbari21a/akbari21a.pdf},
url = {https://proceedings.mlr.press/v139/akbari21a.html},
abstract = {Good generalization performance across a wide variety of domains caused by many external and internal factors is the fundamental goal of any machine learning algorithm. This paper theoretically proves that the choice of loss function matters for improving the generalization performance of deep learning-based systems. By deriving the generalization error bound for deep neural models trained by stochastic gradient descent, we pinpoint the characteristics of the loss function that is linked to the generalization error and can therefore be used for guiding the loss function selection process. In summary, our main statement in this paper is: choose a stable loss function, generalize better. Focusing on human age estimation from the face which is a challenging topic in computer vision, we then propose a novel loss function for this learning problem. We theoretically prove that the proposed loss function achieves stronger stability, and consequently a tighter generalization error bound, compared to the other common loss functions for this problem. We have supported our findings theoretically, and demonstrated the merits of the guidance process experimentally, achieving significant improvements.}
}
@InProceedings{pmlr-v139-akiyama21a,
title = {On Learnability via Gradient Method for Two-Layer ReLU Neural Networks in Teacher-Student Setting},
author = {Akiyama, Shunta and Suzuki, Taiji},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {152--162},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/akiyama21a/akiyama21a.pdf},
url = {https://proceedings.mlr.press/v139/akiyama21a.html},
abstract = {Deep learning empirically achieves high performance in many applications, but its training dynamics has not been fully understood theoretically. In this paper, we explore theoretical analysis on training two-layer ReLU neural networks in a teacher-student regression model, in which a student network learns an unknown teacher network through its outputs. We show that with a specific regularization and sufficient over-parameterization, the student network can identify the parameters of the teacher network with high probability via gradient descent with a norm dependent stepsize even though the objective function is highly non-convex. The key theoretical tool is the measure representation of the neural networks and a novel application of a dual certificate argument for sparse estimation on a measure space. We analyze the global minima and global convergence property in the measure space.}
}
@InProceedings{pmlr-v139-aladago21a,
title = {Slot Machines: Discovering Winning Combinations of Random Weights in Neural Networks},
author = {Aladago, Maxwell M and Torresani, Lorenzo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {163--174},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/aladago21a/aladago21a.pdf},
url = {https://proceedings.mlr.press/v139/aladago21a.html},
abstract = {In contrast to traditional weight optimization in a continuous space, we demonstrate the existence of effective random networks whose weights are never updated. By selecting a weight among a fixed set of random values for each individual connection, our method uncovers combinations of random weights that match the performance of traditionally-trained networks of the same capacity. We refer to our networks as "slot machines" where each reel (connection) contains a fixed set of symbols (random values). Our backpropagation algorithm "spins" the reels to seek "winning" combinations, i.e., selections of random weight values that minimize the given loss. Quite surprisingly, we find that allocating just a few random values to each connection (e.g., 8 values per connection) yields highly competitive combinations despite being dramatically more constrained compared to traditionally learned weights. Moreover, finetuning these combinations often improves performance over the trained baselines. A randomly initialized VGG-19 with 8 values per connection contains a combination that achieves 91% test accuracy on CIFAR-10. Our method also achieves an impressive performance of 98.2% on MNIST for neural networks containing only random weights.}
}
@InProceedings{pmlr-v139-alet21a,
title = {A large-scale benchmark for few-shot program induction and synthesis},
author = {Alet, Ferran and Lopez-Contreras, Javier and Koppel, James and Nye, Maxwell and Solar-Lezama, Armando and Lozano-Perez, Tomas and Kaelbling, Leslie and Tenenbaum, Joshua},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {175--186},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/alet21a/alet21a.pdf},
url = {https://proceedings.mlr.press/v139/alet21a.html},
abstract = {A landmark challenge for AI is to learn flexible, powerful representations from small numbers of examples. On an important class of tasks, hypotheses in the form of programs provide extreme generalization capabilities from surprisingly few examples. However, whereas large natural few-shot learning image benchmarks have spurred progress in meta-learning for deep networks, there is no comparably big, natural program-synthesis dataset that can play a similar role. This is because, whereas images are relatively easy to label from internet meta-data or annotated by non-experts, generating meaningful input-output examples for program induction has proven hard to scale. In this work, we propose a new way of leveraging unit tests and natural inputs for small programs as meaningful input-output examples for each sub-program of the overall program. This allows us to create a large-scale naturalistic few-shot program-induction benchmark and propose new challenges in this domain. The evaluation of multiple program induction and synthesis algorithms points to shortcomings of current methods and suggests multiple avenues for future work.}
}
@InProceedings{pmlr-v139-alieva21a,
title = {Robust Pure Exploration in Linear Bandits with Limited Budget},
author = {Alieva, Ayya and Cutkosky, Ashok and Das, Abhimanyu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {187--195},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/alieva21a/alieva21a.pdf},
url = {https://proceedings.mlr.press/v139/alieva21a.html},
abstract = {We consider the pure exploration problem in the fixed-budget linear bandit setting. We provide a new algorithm that identifies the best arm with high probability while being robust to unknown levels of observation noise as well as to moderate levels of misspecification in the linear model. Our technique combines prior approaches to pure exploration in the multi-armed bandit problem with optimal experimental design algorithms to obtain both problem dependent and problem independent bounds. Our success probability is never worse than that of an algorithm that ignores the linear structure, but seamlessly takes advantage of such structure when possible. Furthermore, we only need the number of samples to scale with the dimension of the problem rather than the number of arms. We complement our theoretical results with empirical validation.}
}
@InProceedings{pmlr-v139-alimisis21a,
title = {Communication-Efficient Distributed Optimization with Quantized Preconditioners},
author = {Alimisis, Foivos and Davies, Peter and Alistarh, Dan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {196--206},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/alimisis21a/alimisis21a.pdf},
url = {https://proceedings.mlr.press/v139/alimisis21a.html},
abstract = {We investigate fast and communication-efficient algorithms for the classic problem of minimizing a sum of strongly convex and smooth functions that are distributed among $n$ different nodes, which can communicate using a limited number of bits. Most previous communication-efficient approaches for this problem are limited to first-order optimization, and therefore have \emph{linear} dependence on the condition number in their communication complexity. We show that this dependence is not inherent: communication-efficient methods can in fact have sublinear dependence on the condition number. For this, we design and analyze the first communication-efficient distributed variants of preconditioned gradient descent for Generalized Linear Models, and for Newton’s method. Our results rely on a new technique for quantizing both the preconditioner and the descent direction at each step of the algorithms, while controlling their convergence rate. We also validate our findings experimentally, showing faster convergence and reduced communication relative to previous methods.}
}
@InProceedings{pmlr-v139-alquier21a,
title = {Non-Exponentially Weighted Aggregation: Regret Bounds for Unbounded Loss Functions},
author = {Alquier, Pierre},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {207--218},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/alquier21a/alquier21a.pdf},
url = {https://proceedings.mlr.press/v139/alquier21a.html},
abstract = {We tackle the problem of online optimization with a general, possibly unbounded, loss function. It is well known that when the loss is bounded, the exponentially weighted aggregation strategy (EWA) leads to a regret in $\sqrt{T}$ after $T$ steps. In this paper, we study a generalized aggregation strategy, where the weights no longer depend exponentially on the losses. Our strategy is based on Follow The Regularized Leader (FTRL): we minimize the expected losses plus a regularizer, that is here a $\phi$-divergence. When the regularizer is the Kullback-Leibler divergence, we obtain EWA as a special case. Using alternative divergences enables unbounded losses, at the cost of a worst regret bound in some cases.}
}
@InProceedings{pmlr-v139-alvarez-melis21a,
title = {Dataset Dynamics via Gradient Flows in Probability Space},
author = {Alvarez-Melis, David and Fusi, Nicol\`o},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {219--230},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/alvarez-melis21a/alvarez-melis21a.pdf},
url = {https://proceedings.mlr.press/v139/alvarez-melis21a.html},
abstract = {Various machine learning tasks, from generative modeling to domain adaptation, revolve around the concept of dataset transformation and manipulation. While various methods exist for transforming unlabeled datasets, principled methods to do so for labeled (e.g., classification) datasets are missing. In this work, we propose a novel framework for dataset transformation, which we cast as optimization over data-generating joint probability distributions. We approach this class of problems through Wasserstein gradient flows in probability space, and derive practical and efficient particle-based methods for a flexible but well-behaved class of objective functions. Through various experiments, we show that this framework can be used to impose constraints on classification datasets, adapt them for transfer learning, or to re-purpose fixed or black-box models to classify {—}with high accuracy{—} previously unseen datasets.}
}
@InProceedings{pmlr-v139-amanatidis21a,
title = {Submodular Maximization subject to a Knapsack Constraint: Combinatorial Algorithms with Near-optimal Adaptive Complexity},
author = {Amanatidis, Georgios and Fusco, Federico and Lazos, Philip and Leonardi, Stefano and Marchetti-Spaccamela, Alberto and Reiffenh{\"a}user, Rebecca},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {231--242},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/amanatidis21a/amanatidis21a.pdf},
url = {https://proceedings.mlr.press/v139/amanatidis21a.html},
abstract = {The growing need to deal with massive instances motivates the design of algorithms balancing the quality of the solution with applicability. For the latter, an important measure is the \emph{adaptive complexity}, capturing the number of sequential rounds of parallel computation needed. In this work we obtain the first \emph{constant factor} approximation algorithm for non-monotone submodular maximization subject to a knapsack constraint with \emph{near-optimal} $O(\log n)$ adaptive complexity. Low adaptivity by itself, however, is not enough: one needs to account for the total number of function evaluations (or value queries) as well. Our algorithm asks $\tilde{O}(n^2)$ value queries, but can be modified to run with only $\tilde{O}(n)$ instead, while retaining a low adaptive complexity of $O(\log^2n)$. Besides the above improvement in adaptivity, this is also the first \emph{combinatorial} approach with sublinear adaptive complexity for the problem and yields algorithms comparable to the state-of-the-art even for the special cases of cardinality constraints or monotone objectives. Finally, we showcase our algorithms’ applicability on real-world datasets.}
}
@InProceedings{pmlr-v139-amani21a,
title = {Safe Reinforcement Learning with Linear Function Approximation},
author = {Amani, Sanae and Thrampoulidis, Christos and Yang, Lin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {243--253},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/amani21a/amani21a.pdf},
url = {https://proceedings.mlr.press/v139/amani21a.html},
abstract = {Safety in reinforcement learning has become increasingly important in recent years. Yet, existing solutions either fail to strictly avoid choosing unsafe actions, which may lead to catastrophic results in safety-critical systems, or fail to provide regret guarantees for settings where safety constraints need to be learned. In this paper, we address both problems by first modeling safety as an unknown linear cost function of states and actions, which must always fall below a certain threshold. We then present algorithms, termed SLUCB-QVI and RSLUCB-QVI, for episodic Markov decision processes (MDPs) with linear function approximation. We show that SLUCB-QVI and RSLUCB-QVI, while with \emph{no safety violation}, achieve a $\tilde{\mathcal{O}}\left(\kappa\sqrt{d^3H^3T}\right)$ regret, nearly matching that of state-of-the-art unsafe algorithms, where $H$ is the duration of each episode, $d$ is the dimension of the feature mapping, $\kappa$ is a constant characterizing the safety constraints, and $T$ is the total number of action plays. We further present numerical simulations that corroborate our theoretical findings.}
}
@InProceedings{pmlr-v139-ambrogioni21a,
title = {Automatic variational inference with cascading flows},
author = {Ambrogioni, Luca and Silvestri, Gianluigi and van Gerven, Marcel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {254--263},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ambrogioni21a/ambrogioni21a.pdf},
url = {https://proceedings.mlr.press/v139/ambrogioni21a.html},
abstract = {The automation of probabilistic reasoning is one of the primary aims of machine learning. Recently, the confluence of variational inference and deep learning has led to powerful and flexible automatic inference methods that can be trained by stochastic gradient descent. In particular, normalizing flows are highly parameterized deep models that can fit arbitrarily complex posterior densities. However, normalizing flows struggle in highly structured probabilistic programs as they need to relearn the forward-pass of the program. Automatic structured variational inference (ASVI) remedies this problem by constructing variational programs that embed the forward-pass. Here, we combine the flexibility of normalizing flows and the prior-embedding property of ASVI in a new family of variational programs, which we named cascading flows. A cascading flows program interposes a newly designed highway flow architecture in between the conditional distributions of the prior program such as to steer it toward the observed data. These programs can be constructed automatically from an input probabilistic program and can also be amortized automatically. We evaluate the performance of the new variational programs in a series of structured inference problems. We find that cascading flows have much higher performance than both normalizing flows and ASVI in a large set of structured inference problems.}
}
@InProceedings{pmlr-v139-ament21a,
title = {Sparse Bayesian Learning via Stepwise Regression},
author = {Ament, Sebastian E. and Gomes, Carla P.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {264--274},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ament21a/ament21a.pdf},
url = {https://proceedings.mlr.press/v139/ament21a.html},
abstract = {Sparse Bayesian Learning (SBL) is a powerful framework for attaining sparsity in probabilistic models. Herein, we propose a coordinate ascent algorithm for SBL termed Relevance Matching Pursuit (RMP) and show that, as its noise variance parameter goes to zero, RMP exhibits a surprising connection to Stepwise Regression. Further, we derive novel guarantees for Stepwise Regression algorithms, which also shed light on RMP. Our guarantees for Forward Regression improve on deterministic and probabilistic results for Orthogonal Matching Pursuit with noise. Our analysis of Backward Regression culminates in a bound on the residual of the optimal solution to the subset selection problem that, if satisfied, guarantees the optimality of the result. To our knowledge, this bound is the first that can be computed in polynomial time and depends chiefly on the smallest singular value of the matrix. We report numerical experiments using a variety of feature selection algorithms. Notably, RMP and its limiting variant are both efficient and maintain strong performance with correlated features.}
}
@InProceedings{pmlr-v139-amin21a,
title = {Locally Persistent Exploration in Continuous Control Tasks with Sparse Rewards},
author = {Amin, Susan and Gomrokchi, Maziar and Aboutalebi, Hossein and Satija, Harsh and Precup, Doina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {275--285},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/amin21a/amin21a.pdf},
url = {https://proceedings.mlr.press/v139/amin21a.html},
abstract = {A major challenge in reinforcement learning is the design of exploration strategies, especially for environments with sparse reward structures and continuous state and action spaces. Intuitively, if the reinforcement signal is very scarce, the agent should rely on some form of short-term memory in order to cover its environment efficiently. We propose a new exploration method, based on two intuitions: (1) the choice of the next exploratory action should depend not only on the (Markovian) state of the environment, but also on the agent’s trajectory so far, and (2) the agent should utilize a measure of spread in the state space to avoid getting stuck in a small region. Our method leverages concepts often used in statistical physics to provide explanations for the behavior of simplified (polymer) chains in order to generate persistent (locally self-avoiding) trajectories in state space. We discuss the theoretical properties of locally self-avoiding walks and their ability to provide a kind of short-term memory through a decaying temporal correlation within the trajectory. We provide empirical evaluations of our approach in a simulated 2D navigation task, as well as higher-dimensional MuJoCo continuous control locomotion tasks with sparse rewards.}
}
@InProceedings{pmlr-v139-anand21a,
title = {Preferential Temporal Difference Learning},
author = {Anand, Nishanth and Precup, Doina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {286--296},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/anand21a/anand21a.pdf},
url = {https://proceedings.mlr.press/v139/anand21a.html},
abstract = {Temporal-Difference (TD) learning is a general and very useful tool for estimating the value function of a given policy, which in turn is required to find good policies. Generally speaking, TD learning updates states whenever they are visited. When the agent lands in a state, its value can be used to compute the TD-error, which is then propagated to other states. However, it may be interesting, when computing updates, to take into account other information than whether a state is visited or not. For example, some states might be more important than others (such as states which are frequently seen in a successful trajectory). Or, some states might have unreliable value estimates (for example, due to partial observability or lack of data), making their values less desirable as targets. We propose an approach to re-weighting states used in TD updates, both when they are the input and when they provide the target for the update. We prove that our approach converges with linear function approximation and illustrate its desirable empirical behaviour compared to other TD-style methods.}
}
@InProceedings{pmlr-v139-andino21a,
title = {Unitary Branching Programs: Learnability and Lower Bounds},
author = {Andino, Fidel Ernesto Diaz and Kokkou, Maria and De Oliveira Oliveira, Mateus and Vadiee, Farhad},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {297--306},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/andino21a/andino21a.pdf},
url = {https://proceedings.mlr.press/v139/andino21a.html},
abstract = {Bounded width branching programs are a formalism that can be used to capture the notion of non-uniform constant-space computation. In this work, we study a generalized version of bounded width branching programs where instructions are defined by unitary matrices of bounded dimension. We introduce a new learning framework for these branching programs that leverages on a combination of local search techniques with gradient descent over Riemannian manifolds. We also show that gapped, read-once branching programs of bounded dimension can be learned with a polynomial number of queries in the presence of a teacher. Finally, we provide explicit near-quadratic size lower-bounds for bounded-dimension unitary branching programs, and exponential size lower-bounds for bounded-dimension read-once gapped unitary branching programs. The first lower bound is proven using a combination of Neciporuk’s lower bound technique with classic results from algebraic geometry. The second lower bound is proven within the framework of communication complexity theory.}
}
@InProceedings{pmlr-v139-araki21a,
title = {The Logical Options Framework},
author = {Araki, Brandon and Li, Xiao and Vodrahalli, Kiran and Decastro, Jonathan and Fry, Micah and Rus, Daniela},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {307--317},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/araki21a/araki21a.pdf},
url = {https://proceedings.mlr.press/v139/araki21a.html},
abstract = {Learning composable policies for environments with complex rules and tasks is a challenging problem. We introduce a hierarchical reinforcement learning framework called the Logical Options Framework (LOF) that learns policies that are satisfying, optimal, and composable. LOF efficiently learns policies that satisfy tasks by representing the task as an automaton and integrating it into learning and planning. We provide and prove conditions under which LOF will learn satisfying, optimal policies. And lastly, we show how LOF’s learned policies can be composed to satisfy unseen tasks with only 10-50 retraining steps on our benchmarks. We evaluate LOF on four tasks in discrete and continuous domains, including a 3D pick-and-place environment.}
}
@InProceedings{pmlr-v139-arbel21a,
title = {Annealed Flow Transport Monte Carlo},
author = {Arbel, Michael and Matthews, Alex and Doucet, Arnaud},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {318--330},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arbel21a/arbel21a.pdf},
url = {https://proceedings.mlr.press/v139/arbel21a.html},
abstract = {Annealed Importance Sampling (AIS) and its Sequential Monte Carlo (SMC) extensions are state-of-the-art methods for estimating normalizing constants of probability distributions. We propose here a novel Monte Carlo algorithm, Annealed Flow Transport (AFT), that builds upon AIS and SMC and combines them with normalizing flows (NFs) for improved performance. This method transports a set of particles using not only importance sampling (IS), Markov chain Monte Carlo (MCMC) and resampling steps - as in SMC, but also relies on NFs which are learned sequentially to push particles towards the successive annealed targets. We provide limit theorems for the resulting Monte Carlo estimates of the normalizing constant and expectations with respect to the target distribution. Additionally, we show that a continuous-time scaling limit of the population version of AFT is given by a Feynman–Kac measure which simplifies to the law of a controlled diffusion for expressive NFs. We demonstrate experimentally the benefits and limitations of our methodology on a variety of applications.}
}
@InProceedings{pmlr-v139-arbour21a,
title = {Permutation Weighting},
author = {Arbour, David and Dimmery, Drew and Sondhi, Arjun},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {331--341},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arbour21a/arbour21a.pdf},
url = {https://proceedings.mlr.press/v139/arbour21a.html},
abstract = {A commonly applied approach for estimating causal effects from observational data is to apply weights which render treatments independent of observed pre-treatment covariates. Recently emphasis has been placed on deriving balancing weights which explicitly target this independence condition. In this work we introduce permutation weighting, a method for estimating balancing weights using a standard binary classifier (regardless of cardinality of treatment). A large class of probabilistic classifiers may be used in this method; the choice of loss for the classifier implies the particular definition of balance. We bound bias and variance in terms of the excess risk of the classifier, show that these disappear asymptotically, and demonstrate that our classification problem directly minimizes imbalance. Additionally, hyper-parameter tuning and model selection can be performed with standard cross-validation methods. Empirical evaluations indicate that permutation weighting provides favorable performance in comparison to existing methods.}
}
@InProceedings{pmlr-v139-arnould21a,
title = {Analyzing the tree-layer structure of Deep Forests},
author = {Arnould, Ludovic and Boyer, Claire and Scornet, Erwan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {342--350},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arnould21a/arnould21a.pdf},
url = {https://proceedings.mlr.press/v139/arnould21a.html},
abstract = {Random forests on the one hand, and neural networks on the other hand, have met great success in the machine learning community for their predictive performance. Combinations of both have been proposed in the literature, notably leading to the so-called deep forests (DF) (Zhou & Feng,2019). In this paper, our aim is not to benchmark DF performances but to investigate instead their underlying mechanisms. Additionally, DF architecture can be generally simplified into more simple and computationally efficient shallow forest networks. Despite some instability, the latter may outperform standard predictive tree-based methods. We exhibit a theoretical framework in which a shallow tree network is shown to enhance the performance of classical decision trees. In such a setting, we provide tight theoretical lower and upper bounds on its excess risk. These theoretical results show the interest of tree-network architectures for well-structured data provided that the first layer, acting as a data encoder, is rich enough.}
}
@InProceedings{pmlr-v139-arora21a,
title = {Dropout: Explicit Forms and Capacity Control},
author = {Arora, Raman and Bartlett, Peter and Mianjy, Poorya and Srebro, Nathan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {351--361},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arora21a/arora21a.pdf},
url = {https://proceedings.mlr.press/v139/arora21a.html},
abstract = {We investigate the capacity control provided by dropout in various machine learning problems. First, we study dropout for matrix completion, where it induces a distribution-dependent regularizer that equals the weighted trace-norm of the product of the factors. In deep learning, we show that the distribution-dependent regularizer due to dropout directly controls the Rademacher complexity of the underlying class of deep neural networks. These developments enable us to give concrete generalization error bounds for the dropout algorithm in both matrix completion as well as training deep neural networks.}
}
@InProceedings{pmlr-v139-artemev21a,
title = {Tighter Bounds on the Log Marginal Likelihood of Gaussian Process Regression Using Conjugate Gradients},
author = {Artemev, Artem and Burt, David R. and van der Wilk, Mark},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {362--372},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/artemev21a/artemev21a.pdf},
url = {https://proceedings.mlr.press/v139/artemev21a.html},
abstract = {We propose a lower bound on the log marginal likelihood of Gaussian process regression models that can be computed without matrix factorisation of the full kernel matrix. We show that approximate maximum likelihood learning of model parameters by maximising our lower bound retains many benefits of the sparse variational approach while reducing the bias introduced into hyperparameter learning. The basis of our bound is a more careful analysis of the log-determinant term appearing in the log marginal likelihood, as well as using the method of conjugate gradients to derive tight lower bounds on the term involving a quadratic form. Our approach is a step forward in unifying methods relying on lower bound maximisation (e.g. variational methods) and iterative approaches based on conjugate gradients for training Gaussian processes. In experiments, we show improved predictive performance with our model for a comparable amount of training time compared to other conjugate gradient based approaches.}
}
@InProceedings{pmlr-v139-arumugam21a,
title = {Deciding What to Learn: A Rate-Distortion Approach},
author = {Arumugam, Dilip and Van Roy, Benjamin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {373--382},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arumugam21a/arumugam21a.pdf},
url = {https://proceedings.mlr.press/v139/arumugam21a.html},
abstract = {Agents that learn to select optimal actions represent a prominent focus of the sequential decision-making literature. In the face of a complex environment or constraints on time and resources, however, aiming to synthesize such an optimal policy can become infeasible. These scenarios give rise to an important trade-off between the information an agent must acquire to learn and the sub-optimality of the resulting policy. While an agent designer has a preference for how this trade-off is resolved, existing approaches further require that the designer translate these preferences into a fixed learning target for the agent. In this work, leveraging rate-distortion theory, we automate this process such that the designer need only express their preferences via a single hyperparameter and the agent is endowed with the ability to compute its own learning targets that best achieve the desired trade-off. We establish a general bound on expected discounted regret for an agent that decides what to learn in this manner along with computational experiments that illustrate the expressiveness of designer preferences and even show improvements over Thompson sampling in identifying an optimal policy.}
}
@InProceedings{pmlr-v139-asi21a,
title = {Private Adaptive Gradient Methods for Convex Optimization},
author = {Asi, Hilal and Duchi, John and Fallah, Alireza and Javidbakht, Omid and Talwar, Kunal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {383--392},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/asi21a/asi21a.pdf},
url = {https://proceedings.mlr.press/v139/asi21a.html},
abstract = {We study adaptive methods for differentially private convex optimization, proposing and analyzing differentially private variants of a Stochastic Gradient Descent (SGD) algorithm with adaptive stepsizes, as well as the AdaGrad algorithm. We provide upper bounds on the regret of both algorithms and show that the bounds are (worst-case) optimal. As a consequence of our development, we show that our private versions of AdaGrad outperform adaptive SGD, which in turn outperforms traditional SGD in scenarios with non-isotropic gradients where (non-private) Adagrad provably outperforms SGD. The major challenge is that the isotropic noise typically added for privacy dominates the signal in gradient geometry for high-dimensional problems; approaches to this that effectively optimize over lower-dimensional subspaces simply ignore the actual problems that varying gradient geometries introduce. In contrast, we study non-isotropic clipping and noise addition, developing a principled theoretical approach; the consequent procedures also enjoy significantly stronger empirical performance than prior approaches.}
}
@InProceedings{pmlr-v139-asi21b,
title = {Private Stochastic Convex Optimization: Optimal Rates in L1 Geometry},
author = {Asi, Hilal and Feldman, Vitaly and Koren, Tomer and Talwar, Kunal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {393--403},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/asi21b/asi21b.pdf},
url = {https://proceedings.mlr.press/v139/asi21b.html},
abstract = {Stochastic convex optimization over an $\ell_1$-bounded domain is ubiquitous in machine learning applications such as LASSO but remains poorly understood when learning with differential privacy. We show that, up to logarithmic factors the optimal excess population loss of any $(\epsilon,\delta)$-differentially private optimizer is $\sqrt{\log(d)/n} + \sqrt{d}/\epsilon n.$ The upper bound is based on a new algorithm that combines the iterative localization approach of Feldman et al. (2020) with a new analysis of private regularized mirror descent. It applies to $\ell_p$ bounded domains for $p\in [1,2]$ and queries at most $n^{3/2}$ gradients improving over the best previously known algorithm for the $\ell_2$ case which needs $n^2$ gradients. Further, we show that when the loss functions satisfy additional smoothness assumptions, the excess loss is upper bounded (up to logarithmic factors) by $\sqrt{\log(d)/n} + (\log(d)/\epsilon n)^{2/3}.$ This bound is achieved by a new variance-reduced version of the Frank-Wolfe algorithm that requires just a single pass over the data. We also show that the lower bound in this case is the minimum of the two rates mentioned above.}
}
@InProceedings{pmlr-v139-atsidakou21a,
title = {Combinatorial Blocking Bandits with Stochastic Delays},
author = {Atsidakou, Alexia and Papadigenopoulos, Orestis and Basu, Soumya and Caramanis, Constantine and Shakkottai, Sanjay},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {404--413},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/atsidakou21a/atsidakou21a.pdf},
url = {https://proceedings.mlr.press/v139/atsidakou21a.html},
abstract = {Recent work has considered natural variations of the {\em multi-armed bandit} problem, where the reward distribution of each arm is a special function of the time passed since its last pulling. In this direction, a simple (yet widely applicable) model is that of {\em blocking bandits}, where an arm becomes unavailable for a deterministic number of rounds after each play. In this work, we extend the above model in two directions: (i) We consider the general combinatorial setting where more than one arms can be played at each round, subject to feasibility constraints. (ii) We allow the blocking time of each arm to be stochastic. We first study the computational/unconditional hardness of the above setting and identify the necessary conditions for the problem to become tractable (even in an approximate sense). Based on these conditions, we provide a tight analysis of the approximation guarantee of a natural greedy heuristic that always plays the maximum expected reward feasible subset among the available (non-blocked) arms. When the arms’ expected rewards are unknown, we adapt the above heuristic into a bandit algorithm, based on UCB, for which we provide sublinear (approximate) regret guarantees, matching the theoretical lower bounds in the limiting case of absence of delays.}
}
@InProceedings{pmlr-v139-audiffren21a,
title = {Dichotomous Optimistic Search to Quantify Human Perception},
author = {Audiffren, Julien},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {414--424},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/audiffren21a/audiffren21a.pdf},
url = {https://proceedings.mlr.press/v139/audiffren21a.html},
abstract = {In this paper we address a variant of the continuous multi-armed bandits problem, called the threshold estimation problem, which is at the heart of many psychometric experiments. Here, the objective is to estimate the sensitivity threshold for an unknown psychometric function Psi, which is assumed to be non decreasing and continuous. Our algorithm, Dichotomous Optimistic Search (DOS), efficiently solves this task by taking inspiration from hierarchical multi-armed bandits and Black-box optimization. Compared to previous approaches, DOS is model free and only makes minimal assumption on Psi smoothness, while having strong theoretical guarantees that compares favorably to recent methods from both Psychophysics and Global Optimization. We also empirically evaluate DOS and show that it significantly outperforms these methods, both in experiments that mimics the conduct of a psychometric experiment, and in tests with large pulls budgets that illustrate the faster convergence rate.}
}
@InProceedings{pmlr-v139-avdiukhin21a,
title = {Federated Learning under Arbitrary Communication Patterns},
author = {Avdiukhin, Dmitrii and Kasiviswanathan, Shiva},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {425--435},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/avdiukhin21a/avdiukhin21a.pdf},
url = {https://proceedings.mlr.press/v139/avdiukhin21a.html},
abstract = {Federated Learning is a distributed learning setting where the goal is to train a centralized model with training data distributed over a large number of heterogeneous clients, each with unreliable and relatively slow network connections. A common optimization approach used in federated learning is based on the idea of local SGD: each client runs some number of SGD steps locally and then the updated local models are averaged to form the updated global model on the coordinating server. In this paper, we investigate the performance of an asynchronous version of local SGD wherein the clients can communicate with the server at arbitrary time intervals. Our main result shows that for smooth strongly convex and smooth nonconvex functions we achieve convergence rates that match the synchronous version that requires all clients to communicate simultaneously.}
}
@InProceedings{pmlr-v139-aviv21a,
title = {Asynchronous Distributed Learning : Adapting to Gradient Delays without Prior Knowledge},
author = {Aviv, Rotem Zamir and Hakimi, Ido and Schuster, Assaf and Levy, Kfir Yehuda},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {436--445},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/aviv21a/aviv21a.pdf},
url = {https://proceedings.mlr.press/v139/aviv21a.html},
abstract = {We consider stochastic convex optimization problems, where several machines act asynchronously in parallel while sharing a common memory. We propose a robust training method for the constrained setting and derive non asymptotic convergence guarantees that do not depend on prior knowledge of update delays, objective smoothness, and gradient variance. Conversely, existing methods for this setting crucially rely on this prior knowledge, which render them unsuitable for essentially all shared-resources computational environments, such as clouds and data centers. Concretely, existing approaches are unable to accommodate changes in the delays which result from dynamic allocation of the machines, while our method implicitly adapts to such changes.}
}
@InProceedings{pmlr-v139-axiotis21a,
title = {Decomposable Submodular Function Minimization via Maximum Flow},
author = {Axiotis, Kyriakos and Karczmarz, Adam and Mukherjee, Anish and Sankowski, Piotr and Vladu, Adrian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {446--456},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/axiotis21a/axiotis21a.pdf},
url = {https://proceedings.mlr.press/v139/axiotis21a.html},
abstract = {This paper bridges discrete and continuous optimization approaches for decomposable submodular function minimization, in both the standard and parametric settings. We provide improved running times for this problem by reducing it to a number of calls to a maximum flow oracle. When each function in the decomposition acts on O(1) elements of the ground set V and is polynomially bounded, our running time is up to polylogarithmic factors equal to that of solving maximum flow in a sparse graph with O(|V|) vertices and polynomial integral capacities. We achieve this by providing a simple iterative method which can optimize to high precision any convex function defined on the submodular base polytope, provided we can efficiently minimize it on the base polytope corresponding to the cut function of a certain graph that we construct. We solve this minimization problem by lifting the solutions of a parametric cut problem, which we obtain via a new efficient combinatorial reduction to maximum flow. This reduction is of independent interest and implies some previously unknown bounds for the parametric minimum s,t-cut problem in multiple settings.}
}
@InProceedings{pmlr-v139-aydore21a,
title = {Differentially Private Query Release Through Adaptive Projection},
author = {Aydore, Sergul and Brown, William and Kearns, Michael and Kenthapadi, Krishnaram and Melis, Luca and Roth, Aaron and Siva, Ankit A},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {457--467},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/aydore21a/aydore21a.pdf},
url = {https://proceedings.mlr.press/v139/aydore21a.html},
abstract = {We propose, implement, and evaluate a new algo-rithm for releasing answers to very large numbersof statistical queries likek-way marginals, sub-ject to differential privacy. Our algorithm makesadaptive use of a continuous relaxation of thePro-jection Mechanism, which answers queries on theprivate dataset using simple perturbation, and thenattempts to find the synthetic dataset that mostclosely matches the noisy answers. We use a con-tinuous relaxation of the synthetic dataset domainwhich makes the projection loss differentiable,and allows us to use efficient ML optimizationtechniques and tooling. Rather than answering allqueries up front, we make judicious use of ourprivacy budget by iteratively finding queries forwhich our (relaxed) synthetic data has high error,and then repeating the projection. Randomizedrounding allows us to obtain synthetic data in theoriginal schema. We perform experimental evalu-ations across a range of parameters and datasets,and find that our method outperforms existingalgorithms on large query classes.}
}
@InProceedings{pmlr-v139-azulay21a,
title = {On the Implicit Bias of Initialization Shape: Beyond Infinitesimal Mirror Descent},
author = {Azulay, Shahar and Moroshko, Edward and Nacson, Mor Shpigel and Woodworth, Blake E and Srebro, Nathan and Globerson, Amir and Soudry, Daniel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {468--477},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/azulay21a/azulay21a.pdf},
url = {https://proceedings.mlr.press/v139/azulay21a.html},
abstract = {Recent work has highlighted the role of initialization scale in determining the structure of the solutions that gradient methods converge to. In particular, it was shown that large initialization leads to the neural tangent kernel regime solution, whereas small initialization leads to so called “rich regimes”. However, the initialization structure is richer than the overall scale alone and involves relative magnitudes of different weights and layers in the network. Here we show that these relative scales, which we refer to as initialization shape, play an important role in determining the learned model. We develop a novel technique for deriving the inductive bias of gradient-flow and use it to obtain closed-form implicit regularizers for multiple cases of interest.}
}
@InProceedings{pmlr-v139-babaiee21a,
title = {On-Off Center-Surround Receptive Fields for Accurate and Robust Image Classification},
author = {Babaiee, Zahra and Hasani, Ramin and Lechner, Mathias and Rus, Daniela and Grosu, Radu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {478--489},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/babaiee21a/babaiee21a.pdf},
url = {https://proceedings.mlr.press/v139/babaiee21a.html},
abstract = {Robustness to variations in lighting conditions is a key objective for any deep vision system. To this end, our paper extends the receptive field of convolutional neural networks with two residual components, ubiquitous in the visual processing system of vertebrates: On-center and off-center pathways, with an excitatory center and inhibitory surround; OOCS for short. The On-center pathway is excited by the presence of a light stimulus in its center, but not in its surround, whereas the Off-center pathway is excited by the absence of a light stimulus in its center, but not in its surround. We design OOCS pathways via a difference of Gaussians, with their variance computed analytically from the size of the receptive fields. OOCS pathways complement each other in their response to light stimuli, ensuring this way a strong edge-detection capability, and as a result an accurate and robust inference under challenging lighting conditions. We provide extensive empirical evidence showing that networks supplied with OOCS pathways gain accuracy and illumination-robustness from the novel edge representation, compared to other baselines.}
}
@InProceedings{pmlr-v139-bachmann21a,
title = {Uniform Convergence, Adversarial Spheres and a Simple Remedy},
author = {Bachmann, Gregor and Moosavi-Dezfooli, Seyed-Mohsen and Hofmann, Thomas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {490--499},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bachmann21a/bachmann21a.pdf},
url = {https://proceedings.mlr.press/v139/bachmann21a.html},
abstract = {Previous work has cast doubt on the general framework of uniform convergence and its ability to explain generalization in neural networks. By considering a specific dataset, it was observed that a neural network completely misclassifies a projection of the training data (adversarial set), rendering any existing generalization bound based on uniform convergence vacuous. We provide an extensive theoretical investigation of the previously studied data setting through the lens of infinitely-wide models. We prove that the Neural Tangent Kernel (NTK) also suffers from the same phenomenon and we uncover its origin. We highlight the important role of the output bias and show theoretically as well as empirically how a sensible choice completely mitigates the problem. We identify sharp phase transitions in the accuracy on the adversarial set and study its dependency on the training sample size. As a result, we are able to characterize critical sample sizes beyond which the effect disappears. Moreover, we study decompositions of a neural network into a clean and noisy part by considering its canonical decomposition into its different eigenfunctions and show empirically that for too small bias the adversarial phenomenon still persists.}
}
@InProceedings{pmlr-v139-backurs21a,
title = {Faster Kernel Matrix Algebra via Density Estimation},
author = {Backurs, Arturs and Indyk, Piotr and Musco, Cameron and Wagner, Tal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {500--510},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/backurs21a/backurs21a.pdf},
url = {https://proceedings.mlr.press/v139/backurs21a.html},
abstract = {We study fast algorithms for computing basic properties of an n x n positive semidefinite kernel matrix K corresponding to n points x_1,...,x_n in R^d. In particular, we consider the estimating the sum of kernel matrix entries, along with its top eigenvalue and eigenvector. These are some of the most basic problems defined over kernel matrices. We show that the sum of matrix entries can be estimated up to a multiplicative factor of 1+\epsilon in time sublinear in n and linear in d for many popular kernel functions, including the Gaussian, exponential, and rational quadratic kernels. For these kernels, we also show that the top eigenvalue (and a witnessing approximate eigenvector) can be approximated to a multiplicative factor of 1+\epsilon in time sub-quadratic in n and linear in d. Our algorithms represent significant advances in the best known runtimes for these problems. They leverage the positive definiteness of the kernel matrix, along with a recent line of work on efficient kernel density estimation.}
}
@InProceedings{pmlr-v139-badrinath21a,
title = {Robust Reinforcement Learning using Least Squares Policy Iteration with Provable Performance Guarantees},
author = {Badrinath, Kishan Panaganti and Kalathil, Dileep},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {511--520},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/badrinath21a/badrinath21a.pdf},
url = {https://proceedings.mlr.press/v139/badrinath21a.html},
abstract = {This paper addresses the problem of model-free reinforcement learning for Robust Markov Decision Process (RMDP) with large state spaces. The goal of the RMDPs framework is to find a policy that is robust against the parameter uncertainties due to the mismatch between the simulator model and real-world settings. We first propose the Robust Least Squares Policy Evaluation algorithm, which is a multi-step online model-free learning algorithm for policy evaluation. We prove the convergence of this algorithm using stochastic approximation techniques. We then propose Robust Least Squares Policy Iteration (RLSPI) algorithm for learning the optimal robust policy. We also give a general weighted Euclidean norm bound on the error (closeness to optimality) of the resulting policy. Finally, we demonstrate the performance of our RLSPI algorithm on some benchmark problems from OpenAI Gym.}
}
@InProceedings{pmlr-v139-bagaria21a,
title = {Skill Discovery for Exploration and Planning using Deep Skill Graphs},
author = {Bagaria, Akhil and Senthil, Jason K and Konidaris, George},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {521--531},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bagaria21a/bagaria21a.pdf},
url = {https://proceedings.mlr.press/v139/bagaria21a.html},
abstract = {We introduce a new skill-discovery algorithm that builds a discrete graph representation of large continuous MDPs, where nodes correspond to skill subgoals and the edges to skill policies. The agent constructs this graph during an unsupervised training phase where it interleaves discovering skills and planning using them to gain coverage over ever-increasing portions of the state-space. Given a novel goal at test time, the agent plans with the acquired skill graph to reach a nearby state, then switches to learning to reach the goal. We show that the resulting algorithm, Deep Skill Graphs, outperforms both flat and existing hierarchical reinforcement learning methods on four difficult continuous control tasks.}
}
@InProceedings{pmlr-v139-bahri21a,
title = {Locally Adaptive Label Smoothing Improves Predictive Churn},
author = {Bahri, Dara and Jiang, Heinrich},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {532--542},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bahri21a/bahri21a.pdf},
url = {https://proceedings.mlr.press/v139/bahri21a.html},
abstract = {Training modern neural networks is an inherently noisy process that can lead to high \emph{prediction churn}– disagreements between re-trainings of the same model due to factors such as randomization in the parameter initialization and mini-batches– even when the trained models all attain similar accuracies. Such prediction churn can be very undesirable in practice. In this paper, we present several baselines for reducing churn and show that training on soft labels obtained by adaptively smoothing each example’s label based on the example’s neighboring labels often outperforms the baselines on churn while improving accuracy on a variety of benchmark classification tasks and model architectures.}
}
@InProceedings{pmlr-v139-bai21a,
title = {How Important is the Train-Validation Split in Meta-Learning?},
author = {Bai, Yu and Chen, Minshuo and Zhou, Pan and Zhao, Tuo and Lee, Jason and Kakade, Sham and Wang, Huan and Xiong, Caiming},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {543--553},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bai21a/bai21a.pdf},
url = {https://proceedings.mlr.press/v139/bai21a.html},
abstract = {Meta-learning aims to perform fast adaptation on a new task through learning a “prior” from multiple existing tasks. A common practice in meta-learning is to perform a train-validation split (\emph{train-val method}) where the prior adapts to the task on one split of the data, and the resulting predictor is evaluated on another split. Despite its prevalence, the importance of the train-validation split is not well understood either in theory or in practice, particularly in comparison to the more direct \emph{train-train method}, which uses all the per-task data for both training and evaluation. We provide a detailed theoretical study on whether and when the train-validation split is helpful in the linear centroid meta-learning problem. In the agnostic case, we show that the expected loss of the train-val method is minimized at the optimal prior for meta testing, and this is not the case for the train-train method in general without structural assumptions on the data. In contrast, in the realizable case where the data are generated from linear models, we show that both the train-val and train-train losses are minimized at the optimal prior in expectation. Further, perhaps surprisingly, our main result shows that the train-train method achieves a \emph{strictly better} excess loss in this realizable case, even when the regularization parameter and split ratio are optimally tuned for both methods. Our results highlight that sample splitting may not always be preferable, especially when the data is realizable by the model. We validate our theories by experimentally showing that the train-train method can indeed outperform the train-val method, on both simulations and real meta-learning tasks.}
}
@InProceedings{pmlr-v139-bai21b,
title = {Stabilizing Equilibrium Models by Jacobian Regularization},
author = {Bai, Shaojie and Koltun, Vladlen and Kolter, Zico},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {554--565},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bai21b/bai21b.pdf},
url = {https://proceedings.mlr.press/v139/bai21b.html},
abstract = {Deep equilibrium networks (DEQs) are a new class of models that eschews traditional depth in favor of finding the fixed point of a single non-linear layer. These models have been shown to achieve performance competitive with the state-of-the-art deep networks while using significantly less memory. Yet they are also slower, brittle to architectural choices, and introduce potential instability to the model. In this paper, we propose a regularization scheme for DEQ models that explicitly regularizes the Jacobian of the fixed-point update equations to stabilize the learning of equilibrium models. We show that this regularization adds only minimal computational cost, significantly stabilizes the fixed-point convergence in both forward and backward passes, and scales well to high-dimensional, realistic domains (e.g., WikiText-103 language modeling and ImageNet classification). Using this method, we demonstrate, for the first time, an implicit-depth model that runs with approximately the same speed and level of performance as popular conventional deep networks such as ResNet-101, while still maintaining the constant memory footprint and architectural simplicity of DEQs. Code is available https://github.com/locuslab/deq.}
}
@InProceedings{pmlr-v139-bai21c,
title = {Don’t Just Blame Over-parametrization for Over-confidence: Theoretical Analysis of Calibration in Binary Classification},
author = {Bai, Yu and Mei, Song and Wang, Huan and Xiong, Caiming},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {566--576},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bai21c/bai21c.pdf},
url = {https://proceedings.mlr.press/v139/bai21c.html},
abstract = {Modern machine learning models with high accuracy are often miscalibrated—the predicted top probability does not reflect the actual accuracy, and tends to be \emph{over-confident}. It is commonly believed that such over-confidence is mainly due to \emph{over-parametrization}, in particular when the model is large enough to memorize the training data and maximize the confidence. In this paper, we show theoretically that over-parametrization is not the only reason for over-confidence. We prove that \emph{logistic regression is inherently over-confident}, in the realizable, under-parametrized setting where the data is generated from the logistic model, and the sample size is much larger than the number of parameters. Further, this over-confidence happens for general well-specified binary classification problems as long as the activation is symmetric and concave on the positive part. Perhaps surprisingly, we also show that over-confidence is not always the case—there exists another activation function (and a suitable loss function) under which the learned classifier is \emph{under-confident} at some probability values. Overall, our theory provides a precise characterization of calibration in realizable binary classification, which we verify on simulations and real data experiments.}
}
@InProceedings{pmlr-v139-bai21d,
title = {Principled Exploration via Optimistic Bootstrapping and Backward Induction},
author = {Bai, Chenjia and Wang, Lingxiao and Han, Lei and Hao, Jianye and Garg, Animesh and Liu, Peng and Wang, Zhaoran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {577--587},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bai21d/bai21d.pdf},
url = {https://proceedings.mlr.press/v139/bai21d.html},
abstract = {One principled approach for provably efficient exploration is incorporating the upper confidence bound (UCB) into the value function as a bonus. However, UCB is specified to deal with linear and tabular settings and is incompatible with Deep Reinforcement Learning (DRL). In this paper, we propose a principled exploration method for DRL through Optimistic Bootstrapping and Backward Induction (OB2I). OB2I constructs a general-purpose UCB-bonus through non-parametric bootstrap in DRL. The UCB-bonus estimates the epistemic uncertainty of state-action pairs for optimistic exploration. We build theoretical connections between the proposed UCB-bonus and the LSVI-UCB in linear setting. We propagate future uncertainty in a time-consistent manner through episodic backward update, which exploits the theoretical advantage and empirically improves the sample-efficiency. Our experiments in MNIST maze and Atari suit suggest that OB2I outperforms several state-of-the-art exploration approaches.}
}
@InProceedings{pmlr-v139-bai21e,
title = {GLSearch: Maximum Common Subgraph Detection via Learning to Search},
author = {Bai, Yunsheng and Xu, Derek and Sun, Yizhou and Wang, Wei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {588--598},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bai21e/bai21e.pdf},
url = {https://proceedings.mlr.press/v139/bai21e.html},
abstract = {Detecting the Maximum Common Subgraph (MCS) between two input graphs is fundamental for applications in drug synthesis, malware detection, cloud computing, etc. However, MCS computation is NP-hard, and state-of-the-art MCS solvers rely on heuristic search algorithms which in practice cannot find good solution for large graph pairs given a limited computation budget. We propose GLSearch, a Graph Neural Network (GNN) based learning to search model. Our model is built upon the branch and bound algorithm, which selects one pair of nodes from the two input graphs to expand at a time. We propose a novel GNN-based Deep Q-Network (DQN) to select the node pair, making the search process much faster. Experiments on synthetic and real-world graph pairs demonstrate that our model learns a search strategy that is able to detect significantly larger common subgraphs than existing MCS solvers given the same computation budget. GLSearch can be potentially extended to solve many other combinatorial problems with constraints on graphs.}
}
@InProceedings{pmlr-v139-balcilar21a,
title = {Breaking the Limits of Message Passing Graph Neural Networks},
author = {Balcilar, Muhammet and Heroux, Pierre and Gauzere, Benoit and Vasseur, Pascal and Adam, Sebastien and Honeine, Paul},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {599--608},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/balcilar21a/balcilar21a.pdf},
url = {https://proceedings.mlr.press/v139/balcilar21a.html},
abstract = {Since the Message Passing (Graph) Neural Networks (MPNNs) have a linear complexity with respect to the number of nodes when applied to sparse graphs, they have been widely implemented and still raise a lot of interest even though their theoretical expressive power is limited to the first order Weisfeiler-Lehman test (1-WL). In this paper, we show that if the graph convolution supports are designed in spectral-domain by a non-linear custom function of eigenvalues and masked with an arbitrary large receptive field, the MPNN is theoretically more powerful than the 1-WL test and experimentally as powerful as a 3-WL existing models, while remaining spatially localized. Moreover, by designing custom filter functions, outputs can have various frequency components that allow the convolution process to learn different relationships between a given input graph signal and its associated properties. So far, the best 3-WL equivalent graph neural networks have a computational complexity in $\mathcal{O}(n^3)$ with memory usage in $\mathcal{O}(n^2)$, consider non-local update mechanism and do not provide the spectral richness of output profile. The proposed method overcomes all these aforementioned problems and reaches state-of-the-art results in many downstream tasks.}
}
@InProceedings{pmlr-v139-balkanski21a,
title = {Instance Specific Approximations for Submodular Maximization},
author = {Balkanski, Eric and Qian, Sharon and Singer, Yaron},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {609--618},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/balkanski21a/balkanski21a.pdf},
url = {https://proceedings.mlr.press/v139/balkanski21a.html},
abstract = {The predominant measure for the performance of an algorithm is its worst-case approximation guarantee. While worst-case approximations give desirable robustness guarantees, they can differ significantly from the performance of an algorithm in practice. For the problem of monotone submodular maximization under a cardinality constraint, the greedy algorithm is known to obtain a 1-1/e approximation guarantee, which is optimal for a polynomial-time algorithm. However, very little is known about the approximation achieved by greedy and other submodular maximization algorithms on real instances. We develop an algorithm that gives an instance-specific approximation for any solution of an instance of monotone submodular maximization under a cardinality constraint. This algorithm uses a novel dual approach to submodular maximization. In particular, it relies on the construction of a lower bound to the dual objective that can also be exactly minimized. We use this algorithm to show that on a wide variety of real-world datasets and objectives, greedy and other algorithms find solutions that approximate the optimal solution significantly better than the 1-1/e 0.63 worst-case approximation guarantee, often exceeding 0.9.}
}
@InProceedings{pmlr-v139-ball21a,
title = {Augmented World Models Facilitate Zero-Shot Dynamics Generalization From a Single Offline Environment},
author = {Ball, Philip J and Lu, Cong and Parker-Holder, Jack and Roberts, Stephen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {619--629},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ball21a/ball21a.pdf},
url = {https://proceedings.mlr.press/v139/ball21a.html},
abstract = {Reinforcement learning from large-scale offline datasets provides us with the ability to learn policies without potentially unsafe or impractical exploration. Significant progress has been made in the past few years in dealing with the challenge of correcting for differing behavior between the data collection and learned policies. However, little attention has been paid to potentially changing dynamics when transferring a policy to the online setting, where performance can be up to 90% reduced for existing methods. In this paper we address this problem with Augmented World Models (AugWM). We augment a learned dynamics model with simple transformations that seek to capture potential changes in physical properties of the robot, leading to more robust policies. We not only train our policy in this new setting, but also provide it with the sampled augmentation as a context, allowing it to adapt to changes in the environment. At test time we learn the context in a self-supervised fashion by approximating the augmentation which corresponds to the new environment. We rigorously evaluate our approach on over 100 different changed dynamics settings, and show that this simple approach can significantly improve the zero-shot generalization of a recent state-of-the-art baseline, often achieving successful policies where the baseline fails.}
}
@InProceedings{pmlr-v139-balseiro21a,
title = {Regularized Online Allocation Problems: Fairness and Beyond},
author = {Balseiro, Santiago and Lu, Haihao and Mirrokni, Vahab},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {630--639},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/balseiro21a/balseiro21a.pdf},
url = {https://proceedings.mlr.press/v139/balseiro21a.html},
abstract = {Online allocation problems with resource constraints have a rich history in computer science and operations research. In this paper, we introduce the regularized online allocation problem, a variant that includes a non-linear regularizer acting on the total resource consumption. In this problem, requests repeatedly arrive over time and, for each request, a decision maker needs to take an action that generates a reward and consumes resources. The objective is to simultaneously maximize total rewards and the value of the regularizer subject to the resource constraints. Our primary motivation is the online allocation of internet advertisements wherein firms seek to maximize additive objectives such as the revenue or efficiency of the allocation. By introducing a regularizer, firms can account for the fairness of the allocation or, alternatively, punish under-delivery of advertisements—two common desiderata in internet advertising markets. We design an algorithm when arrivals are drawn independently from a distribution that is unknown to the decision maker. Our algorithm is simple, fast, and attains the optimal order of sub-linear regret compared to the optimal allocation with the benefit of hindsight. Numerical experiments confirm the effectiveness of the proposed algorithm and of the regularizers in an internet advertising application.}
}
@InProceedings{pmlr-v139-bao21a,
title = {Predict then Interpolate: A Simple Algorithm to Learn Stable Classifiers},
author = {Bao, Yujia and Chang, Shiyu and Barzilay, Regina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {640--650},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bao21a/bao21a.pdf},
url = {https://proceedings.mlr.press/v139/bao21a.html},
abstract = {We propose Predict then Interpolate (PI), a simple algorithm for learning correlations that are stable across environments. The algorithm follows from the intuition that when using a classifier trained on one environment to make predictions on examples from another environment, its mistakes are informative as to which correlations are unstable. In this work, we prove that by interpolating the distributions of the correct predictions and the wrong predictions, we can uncover an oracle distribution where the unstable correlation vanishes. Since the oracle interpolation coefficients are not accessible, we use group distributionally robust optimization to minimize the worst-case risk across all such interpolations. We evaluate our method on both text classification and image classification. Empirical results demonstrate that our algorithm is able to learn robust classifiers (outperforms IRM by 23.85% on synthetic environments and 12.41% on natural environments). Our code and data are available at https://github.com/YujiaBao/ Predict-then-Interpolate.}
}
@InProceedings{pmlr-v139-bao21b,
title = {Variational (Gradient) Estimate of the Score Function in Energy-based Latent Variable Models},
author = {Bao, Fan and Xu, Kun and Li, Chongxuan and Hong, Lanqing and Zhu, Jun and Zhang, Bo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {651--661},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bao21b/bao21b.pdf},
url = {https://proceedings.mlr.press/v139/bao21b.html},
abstract = {This paper presents new estimates of the score function and its gradient with respect to the model parameters in a general energy-based latent variable model (EBLVM). The score function and its gradient can be expressed as combinations of expectation and covariance terms over the (generally intractable) posterior of the latent variables. New estimates are obtained by introducing a variational posterior to approximate the true posterior in these terms. The variational posterior is trained to minimize a certain divergence (e.g., the KL divergence) between itself and the true posterior. Theoretically, the divergence characterizes upper bounds of the bias of the estimates. In principle, our estimates can be applied to a wide range of objectives, including kernelized Stein discrepancy (KSD), score matching (SM)-based methods and exact Fisher divergence with a minimal model assumption. In particular, these estimates applied to SM-based methods outperform existing methods in learning EBLVMs on several image datasets.}
}
@InProceedings{pmlr-v139-bar21a,
title = {Compositional Video Synthesis with Action Graphs},
author = {Bar, Amir and Herzig, Roei and Wang, Xiaolong and Rohrbach, Anna and Chechik, Gal and Darrell, Trevor and Globerson, Amir},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {662--673},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bar21a/bar21a.pdf},
url = {https://proceedings.mlr.press/v139/bar21a.html},
abstract = {Videos of actions are complex signals containing rich compositional structure in space and time. Current video generation methods lack the ability to condition the generation on multiple coordinated and potentially simultaneous timed actions. To address this challenge, we propose to represent the actions in a graph structure called Action Graph and present the new "Action Graph To Video" synthesis task. Our generative model for this task (AG2Vid) disentangles motion and appearance features, and by incorporating a scheduling mechanism for actions facilitates a timely and coordinated video generation. We train and evaluate AG2Vid on CATER and Something-Something V2 datasets, which results in videos that have better visual quality and semantic consistency compared to baselines. Finally, our model demonstrates zero-shot abilities by synthesizing novel compositions of the learned actions.}
}
@InProceedings{pmlr-v139-barak21a,
title = {Approximating a Distribution Using Weight Queries},
author = {Barak, Nadav and Sabato, Sivan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {674--683},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/barak21a/barak21a.pdf},
url = {https://proceedings.mlr.press/v139/barak21a.html},
abstract = {We consider a novel challenge: approximating a distribution without the ability to randomly sample from that distribution. We study how such an approximation can be obtained using *weight queries*. Given some data set of examples, a weight query presents one of the examples to an oracle, which returns the probability, according to the target distribution, of observing examples similar to the presented example. This oracle can represent, for instance, counting queries to a database of the target population, or an interface to a search engine which returns the number of results that match a given search. We propose an interactive algorithm that iteratively selects data set examples and performs corresponding weight queries. The algorithm finds a reweighting of the data set that approximates the weights according to the target distribution, using a limited number of weight queries. We derive an approximation bound on the total variation distance between the reweighting found by the algorithm and the best achievable reweighting. Our algorithm takes inspiration from the UCB approach common in multi-armed bandits problems, and combines it with a new discrepancy estimator and a greedy iterative procedure. In addition to our theoretical guarantees, we demonstrate in experiments the advantages of the proposed algorithm over several baselines. A python implementation of the proposed algorithm and of all the experiments can be found at https://github.com/Nadav-Barak/AWP.}
}
@InProceedings{pmlr-v139-baranwal21a,
title = {Graph Convolution for Semi-Supervised Classification: Improved Linear Separability and Out-of-Distribution Generalization},
author = {Baranwal, Aseem and Fountoulakis, Kimon and Jagannath, Aukosh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {684--693},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/baranwal21a/baranwal21a.pdf},
url = {https://proceedings.mlr.press/v139/baranwal21a.html},
abstract = {Recently there has been increased interest in semi-supervised classification in the presence of graphical information. A new class of learning models has emerged that relies, at its most basic level, on classifying the data after first applying a graph convolution. To understand the merits of this approach, we study the classification of a mixture of Gaussians, where the data corresponds to the node attributes of a stochastic block model. We show that graph convolution extends the regime in which the data is linearly separable by a factor of roughly $1/\sqrt{D}$, where $D$ is the expected degree of a node, as compared to the mixture model data on its own. Furthermore, we find that the linear classifier obtained by minimizing the cross-entropy loss after the graph convolution generalizes to out-of-distribution data where the unseen data can have different intra- and inter-class edge probabilities from the training data.}
}
@InProceedings{pmlr-v139-bartan21a,
title = {Training Quantized Neural Networks to Global Optimality via Semidefinite Programming},
author = {Bartan, Burak and Pilanci, Mert},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {694--704},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bartan21a/bartan21a.pdf},
url = {https://proceedings.mlr.press/v139/bartan21a.html},
abstract = {Neural networks (NNs) have been extremely successful across many tasks in machine learning. Quantization of NN weights has become an important topic due to its impact on their energy efficiency, inference time and deployment on hardware. Although post-training quantization is well-studied, training optimal quantized NNs involves combinatorial non-convex optimization problems which appear intractable. In this work, we introduce a convex optimization strategy to train quantized NNs with polynomial activations. Our method leverages hidden convexity in two-layer neural networks from the recent literature, semidefinite lifting, and Grothendieck’s identity. Surprisingly, we show that certain quantized NN problems can be solved to global optimality provably in polynomial time in all relevant parameters via tight semidefinite relaxations. We present numerical examples to illustrate the effectiveness of our method.}
}
@InProceedings{pmlr-v139-basu21a,
title = {Beyond $log^2(T)$ regret for decentralized bandits in matching markets},
author = {Basu, Soumya and Sankararaman, Karthik Abinav and Sankararaman, Abishek},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {705--715},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/basu21a/basu21a.pdf},
url = {https://proceedings.mlr.press/v139/basu21a.html},
abstract = {We design decentralized algorithms for regret minimization in the two sided matching market with one-sided bandit feedback that significantly improves upon the prior works (Liu et al.\,2020a, Sankararaman et al.\,2020, Liu et al.\,2020b). First, for general markets, for any $\varepsilon > 0$, we design an algorithm that achieves a $O(\log^{1+\varepsilon}(T))$ regret to the agent-optimal stable matching, with unknown time horizon $T$, improving upon the $O(\log^{2}(T))$ regret achieved in (Liu et al.\,2020b). Second, we provide the optimal $\Theta(\log(T))$ agent-optimal regret for markets satisfying {\em uniqueness consistency} – markets where leaving participants don’t alter the original stable matching. Previously, $\Theta(\log(T))$ regret was achievable (Sankararaman et al.\,2020, Liu et al.\,2020b) in the much restricted {\em serial dictatorship} setting, when all arms have the same preference over the agents. We propose a phase based algorithm, where in each phase, besides deleting the globally communicated dominated arms the agents locally delete arms with which they collide often. This \emph{local deletion} is pivotal in breaking deadlocks arising from rank heterogeneity of agents across arms. We further demonstrate superiority of our algorithm over existing works through simulations.}
}
@InProceedings{pmlr-v139-baudry21a,
title = {Optimal Thompson Sampling strategies for support-aware CVaR bandits},
author = {Baudry, Dorian and Gautron, Romain and Kaufmann, Emilie and Maillard, Odalric},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {716--726},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/baudry21a/baudry21a.pdf},
url = {https://proceedings.mlr.press/v139/baudry21a.html},
abstract = {In this paper we study a multi-arm bandit problem in which the quality of each arm is measured by the Conditional Value at Risk (CVaR) at some level alpha of the reward distribution. While existing works in this setting mainly focus on Upper Confidence Bound algorithms, we introduce a new Thompson Sampling approach for CVaR bandits on bounded rewards that is flexible enough to solve a variety of problems grounded on physical resources. Building on a recent work by Riou & Honda (2020), we introduce B-CVTS for continuous bounded rewards and M-CVTS for multinomial distributions. On the theoretical side, we provide a non-trivial extension of their analysis that enables to theoretically bound their CVaR regret minimization performance. Strikingly, our results show that these strategies are the first to provably achieve asymptotic optimality in CVaR bandits, matching the corresponding asymptotic lower bounds for this setting. Further, we illustrate empirically the benefit of Thompson Sampling approaches both in a realistic environment simulating a use-case in agriculture and on various synthetic examples.}
}
@InProceedings{pmlr-v139-baudry21b,
title = {On Limited-Memory Subsampling Strategies for Bandits},
author = {Baudry, Dorian and Russac, Yoan and Capp{\'e}, Olivier},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {727--737},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/baudry21b/baudry21b.pdf},
url = {https://proceedings.mlr.press/v139/baudry21b.html},
abstract = {There has been a recent surge of interest in non-parametric bandit algorithms based on subsampling. One drawback however of these approaches is the additional complexity required by random subsampling and the storage of the full history of rewards. Our first contribution is to show that a simple deterministic subsampling rule, proposed in the recent work of \citet{baudry2020sub} under the name of “last-block subsampling”, is asymptotically optimal in one-parameter exponential families. In addition, we prove that these guarantees also hold when limiting the algorithm memory to a polylogarithmic function of the time horizon. These findings open up new perspectives, in particular for non-stationary scenarios in which the arm distributions evolve over time. We propose a variant of the algorithm in which only the most recent observations are used for subsampling, achieving optimal regret guarantees under the assumption of a known number of abrupt changes. Extensive numerical simulations highlight the merits of this approach, particularly when the changes are not only affecting the means of the rewards.}
}
@InProceedings{pmlr-v139-bauer21a,
title = {Generalized Doubly Reparameterized Gradient Estimators},
author = {Bauer, Matthias and Mnih, Andriy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {738--747},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bauer21a/bauer21a.pdf},
url = {https://proceedings.mlr.press/v139/bauer21a.html},
abstract = {Efficient low-variance gradient estimation enabled by the reparameterization trick (RT) has been essential to the success of variational autoencoders. Doubly-reparameterized gradients (DReGs) improve on the RT for multi-sample variational bounds by applying reparameterization a second time for an additional reduction in variance. Here, we develop two generalizations of the DReGs estimator and show that they can be used to train conditional and hierarchical VAEs on image modelling tasks more effectively. We first extend the estimator to hierarchical models with several stochastic layers by showing how to treat additional score function terms due to the hierarchical variational posterior. We then generalize DReGs to score functions of arbitrary distributions instead of just those of the sampling distribution, which makes the estimator applicable to the parameters of the prior in addition to those of the posterior.}
}
@InProceedings{pmlr-v139-beaini21a,
title = {Directional Graph Networks},
author = {Beaini, Dominique and Passaro, Saro and L{\'e}tourneau, Vincent and Hamilton, Will and Corso, Gabriele and Li{\'o}, Pietro},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {748--758},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/beaini21a/beaini21a.pdf},
url = {https://proceedings.mlr.press/v139/beaini21a.html},
abstract = {The lack of anisotropic kernels in graph neural networks (GNNs) strongly limits their expressiveness, contributing to well-known issues such as over-smoothing. To overcome this limitation, we propose the first globally consistent anisotropic kernels for GNNs, allowing for graph convolutions that are defined according to topologicaly-derived directional flows. First, by defining a vector field in the graph, we develop a method of applying directional derivatives and smoothing by projecting node-specific messages into the field. Then, we propose the use of the Laplacian eigenvectors as such vector field. We show that the method generalizes CNNs on an $n$-dimensional grid and is provably more discriminative than standard GNNs regarding the Weisfeiler-Lehman 1-WL test. We evaluate our method on different standard benchmarks and see a relative error reduction of 8% on the CIFAR10 graph dataset and 11% to 32% on the molecular ZINC dataset, and a relative increase in precision of 1.6% on the MolPCBA dataset. An important outcome of this work is that it enables graph networks to embed directions in an unsupervised way, thus allowing a better representation of the anisotropic features in different physical or biological problems.}
}
@InProceedings{pmlr-v139-bellot21a,
title = {Policy Analysis using Synthetic Controls in Continuous-Time},
author = {Bellot, Alexis and van der Schaar, Mihaela},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {759--768},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bellot21a/bellot21a.pdf},
url = {https://proceedings.mlr.press/v139/bellot21a.html},
abstract = {Counterfactual estimation using synthetic controls is one of the most successful recent methodological developments in causal inference. Despite its popularity, the current description only considers time series aligned across units and synthetic controls expressed as linear combinations of observed control units. We propose a continuous-time alternative that models the latent counterfactual path explicitly using the formalism of controlled differential equations. This model is directly applicable to the general setting of irregularly-aligned multivariate time series and may be optimized in rich function spaces – thereby improving on some limitations of existing approaches.}
}
@InProceedings{pmlr-v139-benton21a,
title = {Loss Surface Simplexes for Mode Connecting Volumes and Fast Ensembling},
author = {Benton, Gregory and Maddox, Wesley and Lotfi, Sanae and Wilson, Andrew Gordon Gordon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {769--779},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/benton21a/benton21a.pdf},
url = {https://proceedings.mlr.press/v139/benton21a.html},
abstract = {With a better understanding of the loss surfaces for multilayer networks, we can build more robust and accurate training procedures. Recently it was discovered that independently trained SGD solutions can be connected along one-dimensional paths of near-constant training loss. In this paper, we in fact demonstrate the existence of mode-connecting simplicial complexes that form multi-dimensional manifolds of low loss, connecting many independently trained models. Building on this discovery, we show how to efficiently construct simplicial complexes for fast ensembling, outperforming independently trained deep ensembles in accuracy, calibration, and robustness to dataset shift. Notably, our approach is easy to apply and only requires a few training epochs to discover a low-loss simplex.}
}
@InProceedings{pmlr-v139-berabi21a,
title = {TFix: Learning to Fix Coding Errors with a Text-to-Text Transformer},
author = {Berabi, Berkay and He, Jingxuan and Raychev, Veselin and Vechev, Martin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {780--791},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/berabi21a/berabi21a.pdf},
url = {https://proceedings.mlr.press/v139/berabi21a.html},
abstract = {The problem of fixing errors in programs has attracted substantial interest over the years. The key challenge for building an effective code fixing tool is to capture a wide range of errors and meanwhile maintain high accuracy. In this paper, we address this challenge and present a new learning-based system, called TFix. TFix works directly on program text and phrases the problem of code fixing as a text-to-text task. In turn, this enables it to leverage a powerful Transformer based model pre-trained on natural language and fine-tuned to generate code fixes (via a large, high-quality dataset obtained from GitHub commits). TFix is not specific to a particular programming language or class of defects and, in fact, improved its precision by simultaneously fine-tuning on 52 different error types reported by a popular static analyzer. Our evaluation on a massive dataset of JavaScript programs shows that TFix is practically effective: it is able to synthesize code that fixes the error in 67 percent of cases and significantly outperforms existing learning-based approaches.}
}
@InProceedings{pmlr-v139-berrevoets21a,
title = {Learning Queueing Policies for Organ Transplantation Allocation using Interpretable Counterfactual Survival Analysis},
author = {Berrevoets, Jeroen and Alaa, Ahmed and Qian, Zhaozhi and Jordon, James and Gimson, Alexander E. S. and van der Schaar, Mihaela},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {792--802},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/berrevoets21a/berrevoets21a.pdf},
url = {https://proceedings.mlr.press/v139/berrevoets21a.html},
abstract = {Organ transplantation is often the last resort for treating end-stage illnesses, but managing transplant wait-lists is challenging because of organ scarcity and the complexity of assessing donor-recipient compatibility. In this paper, we develop a data-driven model for (real-time) organ allocation using observational data for transplant outcomes. Our model integrates a queuing-theoretic framework with unsupervised learning to cluster the organs into “organ types”, and then construct priority queues (associated with each organ type) wherein incoming patients are assigned. To reason about organ allocations, the model uses synthetic controls to infer a patient’s survival outcomes under counterfactual allocations to the different organ types{–} the model is trained end-to-end to optimise the trade-off between patient waiting time and expected survival time. The usage of synthetic controls enable patient-level interpretations of allocation decisions that can be presented and understood by clinicians. We test our model on multiple data sets, and show that it outperforms other organ-allocation policies in terms of added life-years, and death count. Furthermore, we introduce a novel organ-allocation simulator to accurately test new policies.}
}
@InProceedings{pmlr-v139-bertail21a,
title = {Learning from Biased Data: A Semi-Parametric Approach},
author = {Bertail, Patrice and Cl{\'e}men{\c{c}}on, Stephan and Guyonvarch, Yannick and Noiry, Nathan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {803--812},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bertail21a/bertail21a.pdf},
url = {https://proceedings.mlr.press/v139/bertail21a.html},
abstract = {We consider risk minimization problems where the (source) distribution $P_S$ of the training observations $Z_1, \ldots, Z_n$ differs from the (target) distribution $P_T$ involved in the risk that one seeks to minimize. Under the natural assumption that $P_S$ dominates $P_T$, \textit{i.e.} $P_T< \! \! 1$ (including $p = \infty$).}
}
@InProceedings{pmlr-v139-bhattacharyya21a,
title = {Finding k in Latent $k-$ polytope},
author = {Bhattacharyya, Chiranjib and Kannan, Ravindran and Kumar, Amit},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {894--903},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bhattacharyya21a/bhattacharyya21a.pdf},
url = {https://proceedings.mlr.press/v139/bhattacharyya21a.html},
abstract = {The recently introduced Latent $k-$ Polytope($\LkP$) encompasses several stochastic Mixed Membership models including Topic Models. The problem of finding $k$, the number of extreme points of $\LkP$, is a fundamental challenge and includes several important open problems such as determination of number of components in Ad-mixtures. This paper addresses this challenge by introducing Interpolative Convex Rank(\INR) of a matrix defined as the minimum number of its columns whose convex hull is within Hausdorff distance $\varepsilon$ of the convex hull of all columns. The first important contribution of this paper is to show that under \emph{standard assumptions} $k$ equals the \INR of a \emph{subset smoothed data matrix} defined from Data generated from an $\LkP$. The second important contribution of the paper is a polynomial time algorithm for finding $k$ under standard assumptions. An immediate corollary is the first polynomial time algorithm for finding the \emph{inner dimension} in Non-negative matrix factorisation(NMF) with assumptions which are qualitatively different than existing ones such as \emph{Separability}. %An immediate corollary is the first polynomial time algorithm for finding the \emph{inner dimension} in Non-negative matrix factorisation(NMF) with assumptions considerably weaker than \emph{Separability}.}
}
@InProceedings{pmlr-v139-bi21a,
title = {Non-Autoregressive Electron Redistribution Modeling for Reaction Prediction},
author = {Bi, Hangrui and Wang, Hengyi and Shi, Chence and Coley, Connor and Tang, Jian and Guo, Hongyu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {904--913},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bi21a/bi21a.pdf},
url = {https://proceedings.mlr.press/v139/bi21a.html},
abstract = {Reliably predicting the products of chemical reactions presents a fundamental challenge in synthetic chemistry. Existing machine learning approaches typically produce a reaction product by sequentially forming its subparts or intermediate molecules. Such autoregressive methods, however, not only require a pre-defined order for the incremental construction but preclude the use of parallel decoding for efficient computation. To address these issues, we devise a non-autoregressive learning paradigm that predicts reaction in one shot. Leveraging the fact that chemical reactions can be described as a redistribution of electrons in molecules, we formulate a reaction as an arbitrary electron flow and predict it with a novel multi-pointer decoding network. Experiments on the USPTO-MIT dataset show that our approach has established a new state-of-the-art top-1 accuracy and achieves at least 27 times inference speedup over the state-of-the-art methods. Also, our predictions are easier for chemists to interpret owing to predicting the electron flows.}
}
@InProceedings{pmlr-v139-biedenkapp21a,
title = {TempoRL: Learning When to Act},
author = {Biedenkapp, Andr{\'e} and Rajan, Raghu and Hutter, Frank and Lindauer, Marius},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {914--924},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/biedenkapp21a/biedenkapp21a.pdf},
url = {https://proceedings.mlr.press/v139/biedenkapp21a.html},
abstract = {Reinforcement learning is a powerful approach to learn behaviour through interactions with an environment. However, behaviours are usually learned in a purely reactive fashion, where an appropriate action is selected based on an observation. In this form, it is challenging to learn when it is necessary to execute new decisions. This makes learning inefficient especially in environments that need various degrees of fine and coarse control. To address this, we propose a proactive setting in which the agent not only selects an action in a state but also for how long to commit to that action. Our TempoRL approach introduces skip connections between states and learns a skip-policy for repeating the same action along these skips. We demonstrate the effectiveness of TempoRL on a variety of traditional and deep RL environments, showing that our approach is capable of learning successful policies up to an order of magnitude faster than vanilla Q-learning.}
}
@InProceedings{pmlr-v139-bielawski21a,
title = {Follow-the-Regularized-Leader Routes to Chaos in Routing Games},
author = {Bielawski, Jakub and Chotibut, Thiparat and Falniowski, Fryderyk and Kosiorowski, Grzegorz and Misiurewicz, Micha{\l} and Piliouras, Georgios},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {925--935},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bielawski21a/bielawski21a.pdf},
url = {https://proceedings.mlr.press/v139/bielawski21a.html},
abstract = {We study the emergence of chaotic behavior of Follow-the-Regularized Leader (FoReL) dynamics in games. We focus on the effects of increasing the population size or the scale of costs in congestion games, and generalize recent results on unstable, chaotic behaviors in the Multiplicative Weights Update dynamics to a much larger class of FoReL dynamics. We establish that, even in simple linear non-atomic congestion games with two parallel links and \emph{any} fixed learning rate, unless the game is fully symmetric, increasing the population size or the scale of costs causes learning dynamics to becomes unstable and eventually chaotic, in the sense of Li-Yorke and positive topological entropy. Furthermore, we prove the existence of novel non-standard phenomena such as the coexistence of stable Nash equilibria and chaos in the same game. We also observe the simultaneous creation of a chaotic attractor as another chaotic attractor gets destroyed. Lastly, although FoReL dynamics can be strange and non-equilibrating, we prove that the time average still converges to an \emph{exact} equilibrium for any choice of learning rate and any scale of costs.}
}
@InProceedings{pmlr-v139-biggio21a,
title = {Neural Symbolic Regression that scales},
author = {Biggio, Luca and Bendinelli, Tommaso and Neitz, Alexander and Lucchi, Aurelien and Parascandolo, Giambattista},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {936--945},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/biggio21a/biggio21a.pdf},
url = {https://proceedings.mlr.press/v139/biggio21a.html},
abstract = {Symbolic equations are at the core of scientific discovery. The task of discovering the underlying equation from a set of input-output pairs is called symbolic regression. Traditionally, symbolic regression methods use hand-designed strategies that do not improve with experience. In this paper, we introduce the first symbolic regression method that leverages large scale pre-training. We procedurally generate an unbounded set of equations, and simultaneously pre-train a Transformer to predict the symbolic equation from a corresponding set of input-output-pairs. At test time, we query the model on a new set of points and use its output to guide the search for the equation. We show empirically that this approach can re-discover a set of well-known physical equations, and that it improves over time with more data and compute.}
}
@InProceedings{pmlr-v139-biggs21a,
title = {Model Distillation for Revenue Optimization: Interpretable Personalized Pricing},
author = {Biggs, Max and Sun, Wei and Ettl, Markus},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {946--956},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/biggs21a/biggs21a.pdf},
url = {https://proceedings.mlr.press/v139/biggs21a.html},
abstract = {Data-driven pricing strategies are becoming increasingly common, where customers are offered a personalized price based on features that are predictive of their valuation of a product. It is desirable for this pricing policy to be simple and interpretable, so it can be verified, checked for fairness, and easily implemented. However, efforts to incorporate machine learning into a pricing framework often lead to complex pricing policies that are not interpretable, resulting in slow adoption in practice. We present a novel, customized, prescriptive tree-based algorithm that distills knowledge from a complex black-box machine learning algorithm, segments customers with similar valuations and prescribes prices in such a way that maximizes revenue while maintaining interpretability. We quantify the regret of a resulting policy and demonstrate its efficacy in applications with both synthetic and real-world datasets.}
}
@InProceedings{pmlr-v139-bilos21a,
title = {Scalable Normalizing Flows for Permutation Invariant Densities},
author = {Bilo{\v{s}}, Marin and G{\"u}nnemann, Stephan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {957--967},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bilos21a/bilos21a.pdf},
url = {https://proceedings.mlr.press/v139/bilos21a.html},
abstract = {Modeling sets is an important problem in machine learning since this type of data can be found in many domains. A promising approach defines a family of permutation invariant densities with continuous normalizing flows. This allows us to maximize the likelihood directly and sample new realizations with ease. In this work, we demonstrate how calculating the trace, a crucial step in this method, raises issues that occur both during training and inference, limiting its practicality. We propose an alternative way of defining permutation equivariant transformations that give closed form trace. This leads not only to improvements while training, but also to better final performance. We demonstrate the benefits of our approach on point processes and general set modeling.}
}
@InProceedings{pmlr-v139-bistritz21a,
title = {Online Learning for Load Balancing of Unknown Monotone Resource Allocation Games},
author = {Bistritz, Ilai and Bambos, Nicholas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {968--979},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bistritz21a/bistritz21a.pdf},
url = {https://proceedings.mlr.press/v139/bistritz21a.html},
abstract = {Consider N players that each uses a mixture of K resources. Each of the players’ reward functions includes a linear pricing term for each resource that is controlled by the game manager. We assume that the game is strongly monotone, so if each player runs gradient descent, the dynamics converge to a unique Nash equilibrium (NE). Unfortunately, this NE can be inefficient since the total load on a given resource can be very high. In principle, we can control the total loads by tuning the coefficients of the pricing terms. However, finding pricing coefficients that balance the loads requires knowing the players’ reward functions and their action sets. Obtaining this game structure information is infeasible in a large-scale network and violates the users’ privacy. To overcome this, we propose a simple algorithm that learns to shift the NE of the game to meet the total load constraints by adjusting the pricing coefficients in an online manner. Our algorithm only requires the total load per resource as feedback and does not need to know the reward functions or the action sets. We prove that our algorithm guarantees convergence in L2 to a NE that meets target total load constraints. Simulations show the effectiveness of our approach when applied to smart grid demand-side management or power control in wireless networks.}
}
@InProceedings{pmlr-v139-bjorck21a,
title = {Low-Precision Reinforcement Learning: Running Soft Actor-Critic in Half Precision},
author = {Bj{\"o}rck, Johan and Chen, Xiangyu and De Sa, Christopher and Gomes, Carla P and Weinberger, Kilian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {980--991},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bjorck21a/bjorck21a.pdf},
url = {https://proceedings.mlr.press/v139/bjorck21a.html},
abstract = {Low-precision training has become a popular approach to reduce compute requirements, memory footprint, and energy consumption in supervised learning. In contrast, this promising approach has not yet enjoyed similarly widespread adoption within the reinforcement learning (RL) community, partly because RL agents can be notoriously hard to train even in full precision. In this paper we consider continuous control with the state-of-the-art SAC agent and demonstrate that a naïve adaptation of low-precision methods from supervised learning fails. We propose a set of six modifications, all straightforward to implement, that leaves the underlying agent and its hyperparameters unchanged but improves the numerical stability dramatically. The resulting modified SAC agent has lower memory and compute requirements while matching full-precision rewards, demonstrating that low-precision training can substantially accelerate state-of-the-art RL without parameter tuning.}
}
@InProceedings{pmlr-v139-blalock21a,
title = {Multiplying Matrices Without Multiplying},
author = {Blalock, Davis and Guttag, John},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {992--1004},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/blalock21a/blalock21a.pdf},
url = {https://proceedings.mlr.press/v139/blalock21a.html},
abstract = {Multiplying matrices is among the most fundamental and most computationally demanding operations in machine learning and scientific computing. Consequently, the task of efficiently approximating matrix products has received significant attention. We introduce a learning-based algorithm for this task that greatly outperforms existing methods. Experiments using hundreds of matrices from diverse domains show that it often runs 10x faster than alternatives at a given level of error, as well as 100x faster than exact matrix multiplication. In the common case that one matrix is known ahead of time, our method also has the interesting property that it requires zero multiply-adds. These results suggest that a mixture of hashing, averaging, and byte shuffling{—}the core operations of our method{—}could be a more promising building block for machine learning than the sparsified, factorized, and/or scalar quantized matrix products that have recently been the focus of substantial research and hardware investment.}
}
@InProceedings{pmlr-v139-blum21a,
title = {One for One, or All for All: Equilibria and Optimality of Collaboration in Federated Learning},
author = {Blum, Avrim and Haghtalab, Nika and Phillips, Richard Lanas and Shao, Han},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1005--1014},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/blum21a/blum21a.pdf},
url = {https://proceedings.mlr.press/v139/blum21a.html},
abstract = {In recent years, federated learning has been embraced as an approach for bringing about collaboration across large populations of learning agents. However, little is known about how collaboration protocols should take agents’ incentives into account when allocating individual resources for communal learning in order to maintain such collaborations. Inspired by game theoretic notions, this paper introduces a framework for incentive-aware learning and data sharing in federated learning. Our stable and envy-free equilibria capture notions of collaboration in the presence of agents interested in meeting their learning objectives while keeping their own sample collection burden low. For example, in an envy-free equilibrium, no agent would wish to swap their sampling burden with any other agent and in a stable equilibrium, no agent would wish to unilaterally reduce their sampling burden. In addition to formalizing this framework, our contributions include characterizing the structural properties of such equilibria, proving when they exist, and showing how they can be computed. Furthermore, we compare the sample complexity of incentive-aware collaboration with that of optimal collaboration when one ignores agents’ incentives.}
}
@InProceedings{pmlr-v139-bodin21a,
title = {Black-box density function estimation using recursive partitioning},
author = {Bodin, Erik and Dai, Zhenwen and Campbell, Neill and Ek, Carl Henrik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1015--1025},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bodin21a/bodin21a.pdf},
url = {https://proceedings.mlr.press/v139/bodin21a.html},
abstract = {We present a novel approach to Bayesian inference and general Bayesian computation that is defined through a sequential decision loop. Our method defines a recursive partitioning of the sample space. It neither relies on gradients nor requires any problem-specific tuning, and is asymptotically exact for any density function with a bounded domain. The output is an approximation to the whole density function including the normalisation constant, via partitions organised in efficient data structures. Such approximations may be used for evidence estimation or fast posterior sampling, but also as building blocks to treat a larger class of estimation problems. The algorithm shows competitive performance to recent state-of-the-art methods on synthetic and real-world problems including parameter inference for gravitational-wave physics.}
}
@InProceedings{pmlr-v139-bodnar21a,
title = {Weisfeiler and Lehman Go Topological: Message Passing Simplicial Networks},
author = {Bodnar, Cristian and Frasca, Fabrizio and Wang, Yuguang and Otter, Nina and Montufar, Guido F and Li{\'o}, Pietro and Bronstein, Michael},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1026--1037},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bodnar21a/bodnar21a.pdf},
url = {https://proceedings.mlr.press/v139/bodnar21a.html},
abstract = {The pairwise interaction paradigm of graph machine learning has predominantly governed the modelling of relational systems. However, graphs alone cannot capture the multi-level interactions present in many complex systems and the expressive power of such schemes was proven to be limited. To overcome these limitations, we propose Message Passing Simplicial Networks (MPSNs), a class of models that perform message passing on simplicial complexes (SCs). To theoretically analyse the expressivity of our model we introduce a Simplicial Weisfeiler-Lehman (SWL) colouring procedure for distinguishing non-isomorphic SCs. We relate the power of SWL to the problem of distinguishing non-isomorphic graphs and show that SWL and MPSNs are strictly more powerful than the WL test and not less powerful than the 3-WL test. We deepen the analysis by comparing our model with traditional graph neural networks (GNNs) with ReLU activations in terms of the number of linear regions of the functions they can represent. We empirically support our theoretical claims by showing that MPSNs can distinguish challenging strongly regular graphs for which GNNs fail and, when equipped with orientation equivariant layers, they can improve classification accuracy in oriented SCs compared to a GNN baseline.}
}
@InProceedings{pmlr-v139-bondesan21a,
title = {The Hintons in your Neural Network: a Quantum Field Theory View of Deep Learning},
author = {Bondesan, Roberto and Welling, Max},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1038--1048},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bondesan21a/bondesan21a.pdf},
url = {https://proceedings.mlr.press/v139/bondesan21a.html},
abstract = {In this work we develop a quantum field theory formalism for deep learning, where input signals are encoded in Gaussian states, a generalization of Gaussian processes which encode the agent’s uncertainty about the input signal. We show how to represent linear and non-linear layers as unitary quantum gates, and interpret the fundamental excitations of the quantum model as particles, dubbed “Hintons”. On top of opening a new perspective and techniques for studying neural networks, the quantum formulation is well suited for optical quantum computing, and provides quantum deformations of neural networks that can be run efficiently on those devices. Finally, we discuss a semi-classical limit of the quantum deformed models which is amenable to classical simulation.}
}
@InProceedings{pmlr-v139-brandfonbrener21a,
title = {Offline Contextual Bandits with Overparameterized Models},
author = {Brandfonbrener, David and Whitney, William and Ranganath, Rajesh and Bruna, Joan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1049--1058},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brandfonbrener21a/brandfonbrener21a.pdf},
url = {https://proceedings.mlr.press/v139/brandfonbrener21a.html},
abstract = {Recent results in supervised learning suggest that while overparameterized models have the capacity to overfit, they in fact generalize quite well. We ask whether the same phenomenon occurs for offline contextual bandits. Our results are mixed. Value-based algorithms benefit from the same generalization behavior as overparameterized supervised learning, but policy-based algorithms do not. We show that this discrepancy is due to the \emph{action-stability} of their objectives. An objective is action-stable if there exists a prediction (action-value vector or action distribution) which is optimal no matter which action is observed. While value-based objectives are action-stable, policy-based objectives are unstable. We formally prove upper bounds on the regret of overparameterized value-based learning and lower bounds on the regret for policy-based algorithms. In our experiments with large neural networks, this gap between action-stable value-based objectives and unstable policy-based objectives leads to significant performance differences.}
}
@InProceedings{pmlr-v139-brock21a,
title = {High-Performance Large-Scale Image Recognition Without Normalization},
author = {Brock, Andy and De, Soham and Smith, Samuel L and Simonyan, Karen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1059--1071},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brock21a/brock21a.pdf},
url = {https://proceedings.mlr.press/v139/brock21a.html},
abstract = {Batch normalization is a key component of most image classification models, but it has many undesirable properties stemming from its dependence on the batch size and interactions between examples. Although recent work has succeeded in training deep ResNets without normalization layers, these models do not match the test accuracies of the best batch-normalized networks, and are often unstable for large learning rates or strong data augmentations. In this work, we develop an adaptive gradient clipping technique which overcomes these instabilities, and design a significantly improved class of Normalizer-Free ResNets. Our smaller models match the test accuracy of an EfficientNet-B7 on ImageNet while being up to 8.7x faster to train, and our largest models attain a new state-of-the-art top-1 accuracy of 86.5%. In addition, Normalizer-Free models attain significantly better performance than their batch-normalized counterparts when fine-tuning on ImageNet after large-scale pre-training on a dataset of 300 million labeled images, with our best models obtaining an accuracy of 89.2%.}
}
@InProceedings{pmlr-v139-brofos21a,
title = {Evaluating the Implicit Midpoint Integrator for Riemannian Hamiltonian Monte Carlo},
author = {Brofos, James and Lederman, Roy R},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1072--1081},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brofos21a/brofos21a.pdf},
url = {https://proceedings.mlr.press/v139/brofos21a.html},
abstract = {Riemannian manifold Hamiltonian Monte Carlo is traditionally carried out using the generalized leapfrog integrator. However, this integrator is not the only choice and other integrators yielding valid Markov chain transition operators may be considered. In this work, we examine the implicit midpoint integrator as an alternative to the generalized leapfrog integrator. We discuss advantages and disadvantages of the implicit midpoint integrator for Hamiltonian Monte Carlo, its theoretical properties, and an empirical assessment of the critical attributes of such an integrator for Hamiltonian Monte Carlo: energy conservation, volume preservation, and reversibility. Empirically, we find that while leapfrog iterations are faster, the implicit midpoint integrator has better energy conservation, leading to higher acceptance rates, as well as better conservation of volume and better reversibility, arguably yielding a more accurate sampling procedure.}
}
@InProceedings{pmlr-v139-brooks21a,
title = {Reinforcement Learning of Implicit and Explicit Control Flow Instructions},
author = {Brooks, Ethan and Rajendran, Janarthanan and Lewis, Richard L and Singh, Satinder},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1082--1091},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brooks21a/brooks21a.pdf},
url = {https://proceedings.mlr.press/v139/brooks21a.html},
abstract = {Learning to flexibly follow task instructions in dynamic environments poses interesting challenges for reinforcement learning agents. We focus here on the problem of learning control flow that deviates from a strict step-by-step execution of instructions{—}that is, control flow that may skip forward over parts of the instructions or return backward to previously completed or skipped steps. Demand for such flexible control arises in two fundamental ways: explicitly when control is specified in the instructions themselves (such as conditional branching and looping) and implicitly when stochastic environment dynamics require re-completion of instructions whose effects have been perturbed, or opportunistic skipping of instructions whose effects are already present. We formulate an attention-based architecture that meets these challenges by learning, from task reward only, to flexibly attend to and condition behavior on an internal encoding of the instructions. We test the architecture’s ability to learn both explicit and implicit control in two illustrative domains—one inspired by Minecraft and the other by StarCraft—and show that the architecture exhibits zero-shot generalization to novel instructions of length greater than those in a training set, at a performance level unmatched by three baseline recurrent architectures and one ablation architecture.}
}
@InProceedings{pmlr-v139-brophy21a,
title = {Machine Unlearning for Random Forests},
author = {Brophy, Jonathan and Lowd, Daniel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1092--1104},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brophy21a/brophy21a.pdf},
url = {https://proceedings.mlr.press/v139/brophy21a.html},
abstract = {Responding to user data deletion requests, removing noisy examples, or deleting corrupted training data are just a few reasons for wanting to delete instances from a machine learning (ML) model. However, efficiently removing this data from an ML model is generally difficult. In this paper, we introduce data removal-enabled (DaRE) forests, a variant of random forests that enables the removal of training data with minimal retraining. Model updates for each DaRE tree in the forest are exact, meaning that removing instances from a DaRE model yields exactly the same model as retraining from scratch on updated data. DaRE trees use randomness and caching to make data deletion efficient. The upper levels of DaRE trees use random nodes, which choose split attributes and thresholds uniformly at random. These nodes rarely require updates because they only minimally depend on the data. At the lower levels, splits are chosen to greedily optimize a split criterion such as Gini index or mutual information. DaRE trees cache statistics at each node and training data at each leaf, so that only the necessary subtrees are updated as data is removed. For numerical attributes, greedy nodes optimize over a random subset of thresholds, so that they can maintain statistics while approximating the optimal threshold. By adjusting the number of thresholds considered for greedy nodes, and the number of random nodes, DaRE trees can trade off between more accurate predictions and more efficient updates. In experiments on 13 real-world datasets and one synthetic dataset, we find DaRE forests delete data orders of magnitude faster than retraining from scratch while sacrificing little to no predictive power.}
}
@InProceedings{pmlr-v139-brown21a,
title = {Value Alignment Verification},
author = {Brown, Daniel S and Schneider, Jordan and Dragan, Anca and Niekum, Scott},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1105--1115},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/brown21a/brown21a.pdf},
url = {https://proceedings.mlr.press/v139/brown21a.html},
abstract = {As humans interact with autonomous agents to perform increasingly complicated, potentially risky tasks, it is important to be able to efficiently evaluate an agent’s performance and correctness. In this paper we formalize and theoretically analyze the problem of efficient value alignment verification: how to efficiently test whether the behavior of another agent is aligned with a human’s values? The goal is to construct a kind of "driver’s test" that a human can give to any agent which will verify value alignment via a minimal number of queries. We study alignment verification problems with both idealized humans that have an explicit reward function as well as problems where they have implicit values. We analyze verification of exact value alignment for rational agents, propose and test heuristics for value alignment verification in gridworlds and a continuous autonomous driving domain, and prove that there exist sufficient conditions such that we can verify epsilon-alignment in any environment via a constant-query-complexity alignment test.}
}
@InProceedings{pmlr-v139-bruns-smith21a,
title = {Model-Free and Model-Based Policy Evaluation when Causality is Uncertain},
author = {Bruns-Smith, David A},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1116--1126},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bruns-smith21a/bruns-smith21a.pdf},
url = {https://proceedings.mlr.press/v139/bruns-smith21a.html},
abstract = {When decision-makers can directly intervene, policy evaluation algorithms give valid causal estimates. In off-policy evaluation (OPE), there may exist unobserved variables that both impact the dynamics and are used by the unknown behavior policy. These “confounders” will introduce spurious correlations and naive estimates for a new policy will be biased. We develop worst-case bounds to assess sensitivity to these unobserved confounders in finite horizons when confounders are drawn iid each period. We demonstrate that a model-based approach with robust MDPs gives sharper lower bounds by exploiting domain knowledge about the dynamics. Finally, we show that when unobserved confounders are persistent over time, OPE is far more difficult and existing techniques produce extremely conservative bounds.}
}
@InProceedings{pmlr-v139-buet-golfouse21a,
title = {Narrow Margins: Classification, Margins and Fat Tails},
author = {Buet-Golfouse, Francois},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1127--1135},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/buet-golfouse21a/buet-golfouse21a.pdf},
url = {https://proceedings.mlr.press/v139/buet-golfouse21a.html},
abstract = {It is well-known that, for separable data, the regularised two-class logistic regression or support vector machine re-normalised estimate converges to the maximal margin classifier as the regularisation hyper-parameter $\lambda$ goes to 0. The fact that different loss functions may lead to the same solution is of theoretical and practical relevance as margin maximisation allows more straightforward considerations in terms of generalisation and geometric interpretation. We investigate the case where this convergence property is not guaranteed to hold and show that it can be fully characterised by the distribution of error terms in the latent variable interpretation of linear classifiers. In particular, if errors follow a regularly varying distribution, then the regularised and re-normalised estimate does not converge to the maximal margin classifier. This shows that classification with fat tails has a qualitatively different behaviour, which should be taken into account when considering real-life data.}
}
@InProceedings{pmlr-v139-bun21a,
title = {Differentially Private Correlation Clustering},
author = {Bun, Mark and Elias, Marek and Kulkarni, Janardhan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1136--1146},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/bun21a/bun21a.pdf},
url = {https://proceedings.mlr.press/v139/bun21a.html},
abstract = {Correlation clustering is a widely used technique in unsupervised machine learning. Motivated by applications where individual privacy is a concern, we initiate the study of differentially private correlation clustering. We propose an algorithm that achieves subquadratic additive error compared to the optimal cost. In contrast, straightforward adaptations of existing non-private algorithms all lead to a trivial quadratic error. Finally, we give a lower bound showing that any pure differentially private algorithm for correlation clustering requires additive error $\Omega$(n).}
}
@InProceedings{pmlr-v139-cabannnes21a,
title = {Disambiguation of Weak Supervision leading to Exponential Convergence rates},
author = {Cabannnes, Vivien A and Bach, Francis and Rudi, Alessandro},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1147--1157},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cabannnes21a/cabannnes21a.pdf},
url = {https://proceedings.mlr.press/v139/cabannnes21a.html},
abstract = {Machine learning approached through supervised learning requires expensive annotation of data. This motivates weakly supervised learning, where data are annotated with incomplete yet discriminative information. In this paper, we focus on partial labelling, an instance of weak supervision where, from a given input, we are given a set of potential targets. We review a disambiguation principle to recover full supervision from weak supervision, and propose an empirical disambiguation algorithm. We prove exponential convergence rates of our algorithm under classical learnability assumptions, and we illustrate the usefulness of our method on practical examples.}
}
@InProceedings{pmlr-v139-cai21a,
title = {Finite mixture models do not reliably learn the number of components},
author = {Cai, Diana and Campbell, Trevor and Broderick, Tamara},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1158--1169},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21a/cai21a.pdf},
url = {https://proceedings.mlr.press/v139/cai21a.html},
abstract = {Scientists and engineers are often interested in learning the number of subpopulations (or components) present in a data set. A common suggestion is to use a finite mixture model (FMM) with a prior on the number of components. Past work has shown the resulting FMM component-count posterior is consistent; that is, the posterior concentrates on the true, generating number of components. But consistency requires the assumption that the component likelihoods are perfectly specified, which is unrealistic in practice. In this paper, we add rigor to data-analysis folk wisdom by proving that under even the slightest model misspecification, the FMM component-count posterior diverges: the posterior probability of any particular finite number of components converges to 0 in the limit of infinite data. Contrary to intuition, posterior-density consistency is not sufficient to establish this result. We develop novel sufficient conditions that are more realistic and easily checkable than those common in the asymptotics literature. We illustrate practical consequences of our theory on simulated and real data.}
}
@InProceedings{pmlr-v139-cai21b,
title = {A Theory of Label Propagation for Subpopulation Shift},
author = {Cai, Tianle and Gao, Ruiqi and Lee, Jason and Lei, Qi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1170--1182},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21b/cai21b.pdf},
url = {https://proceedings.mlr.press/v139/cai21b.html},
abstract = {One of the central problems in machine learning is domain adaptation. Different from past theoretical works, we consider a new model of subpopulation shift in the input or representation space. In this work, we propose a provably effective framework based on label propagation by using an input consistency loss. In our analysis we used a simple but realistic “expansion” assumption, which has been proposed in \citet{wei2021theoretical}. It turns out that based on a teacher classifier on the source domain, the learned classifier can not only propagate to the target domain but also improve upon the teacher. By leveraging existing generalization bounds, we also obtain end-to-end finite-sample guarantees on deep neural networks. In addition, we extend our theoretical framework to a more general setting of source-to-target transfer based on an additional unlabeled dataset, which can be easily applied to various learning scenarios. Inspired by our theory, we adapt consistency-based semi-supervised learning methods to domain adaptation settings and gain significant improvements.}
}
@InProceedings{pmlr-v139-cai21c,
title = {Lenient Regret and Good-Action Identification in Gaussian Process Bandits},
author = {Cai, Xu and Gomes, Selwyn and Scarlett, Jonathan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1183--1192},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21c/cai21c.pdf},
url = {https://proceedings.mlr.press/v139/cai21c.html},
abstract = {In this paper, we study the problem of Gaussian process (GP) bandits under relaxed optimization criteria stating that any function value above a certain threshold is “good enough”. On the theoretical side, we study various {\em lenient regret} notions in which all near-optimal actions incur zero penalty, and provide upper bounds on the lenient regret for GP-UCB and an elimination algorithm, circumventing the usual $O(\sqrt{T})$ term (with time horizon $T$) resulting from zooming extremely close towards the function maximum. In addition, we complement these upper bounds with algorithm-independent lower bounds. On the practical side, we consider the problem of finding a single “good action” according to a known pre-specified threshold, and introduce several good-action identification algorithms that exploit knowledge of the threshold. We experimentally find that such algorithms can typically find a good action faster than standard optimization-based approaches.}
}
@InProceedings{pmlr-v139-cai21d,
title = {A Zeroth-Order Block Coordinate Descent Algorithm for Huge-Scale Black-Box Optimization},
author = {Cai, Hanqin and Lou, Yuchen and Mckenzie, Daniel and Yin, Wotao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1193--1203},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21d/cai21d.pdf},
url = {https://proceedings.mlr.press/v139/cai21d.html},
abstract = {We consider the zeroth-order optimization problem in the huge-scale setting, where the dimension of the problem is so large that performing even basic vector operations on the decision variables is infeasible. In this paper, we propose a novel algorithm, coined ZO-BCD, that exhibits favorable overall query complexity and has a much smaller per-iteration computational complexity. In addition, we discuss how the memory footprint of ZO-BCD can be reduced even further by the clever use of circulant measurement matrices. As an application of our new method, we propose the idea of crafting adversarial attacks on neural network based classifiers in a wavelet domain, which can result in problem dimensions of over one million. In particular, we show that crafting adversarial examples to audio classifiers in a wavelet domain can achieve the state-of-the-art attack success rate of 97.9% with significantly less distortion.}
}
@InProceedings{pmlr-v139-cai21e,
title = {GraphNorm: A Principled Approach to Accelerating Graph Neural Network Training},
author = {Cai, Tianle and Luo, Shengjie and Xu, Keyulu and He, Di and Liu, Tie-Yan and Wang, Liwei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1204--1215},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21e/cai21e.pdf},
url = {https://proceedings.mlr.press/v139/cai21e.html},
abstract = {Normalization is known to help the optimization of deep neural networks. Curiously, different architectures require specialized normalization methods. In this paper, we study what normalization is effective for Graph Neural Networks (GNNs). First, we adapt and evaluate the existing methods from other domains to GNNs. Faster convergence is achieved with InstanceNorm compared to BatchNorm and LayerNorm. We provide an explanation by showing that InstanceNorm serves as a preconditioner for GNNs, but such preconditioning effect is weaker with BatchNorm due to the heavy batch noise in graph datasets. Second, we show that the shift operation in InstanceNorm results in an expressiveness degradation of GNNs for highly regular graphs. We address this issue by proposing GraphNorm with a learnable shift. Empirically, GNNs with GraphNorm converge faster compared to GNNs using other normalization. GraphNorm also improves the generalization of GNNs, achieving better performance on graph classification benchmarks.}
}
@InProceedings{pmlr-v139-cai21f,
title = {On Lower Bounds for Standard and Robust Gaussian Process Bandit Optimization},
author = {Cai, Xu and Scarlett, Jonathan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1216--1226},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cai21f/cai21f.pdf},
url = {https://proceedings.mlr.press/v139/cai21f.html},
abstract = {In this paper, we consider algorithm independent lower bounds for the problem of black-box optimization of functions having a bounded norm is some Reproducing Kernel Hilbert Space (RKHS), which can be viewed as a non-Bayesian Gaussian process bandit problem. In the standard noisy setting, we provide a novel proof technique for deriving lower bounds on the regret, with benefits including simplicity, versatility, and an improved dependence on the error probability. In a robust setting in which the final point is perturbed by an adversary, we strengthen an existing lower bound that only holds for target success probabilities very close to one, by allowing for arbitrary target success probabilities in (0, 1). Furthermore, in a distinct robust setting in which every sampled point may be perturbed by a constrained adversary, we provide a novel lower bound for deterministic strategies, demonstrating an inevitable joint dependence of the cumulative regret on the corruption level and the time horizon, in contrast with existing lower bounds that only characterize the individual dependencies.}
}
@InProceedings{pmlr-v139-camilleri21a,
title = {High-dimensional Experimental Design and Kernel Bandits},
author = {Camilleri, Romain and Jamieson, Kevin and Katz-Samuels, Julian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1227--1237},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/camilleri21a/camilleri21a.pdf},
url = {https://proceedings.mlr.press/v139/camilleri21a.html},
abstract = {In recent years methods from optimal linear experimental design have been leveraged to obtain state of the art results for linear bandits. A design returned from an objective such as G-optimal design is actually a probability distribution over a pool of potential measurement vectors. Consequently, one nuisance of the approach is the task of converting this continuous probability distribution into a discrete assignment of N measurements. While sophisticated rounding techniques have been proposed, in d dimensions they require N to be at least d, d log(log(d)), or d^2 based on the sub-optimality of the solution. In this paper we are interested in settings where N may be much less than d, such as in experimental design in an RKHS where d may be effectively infinite. In this work, we propose a rounding procedure that frees N of any dependence on the dimension d, while achieving nearly the same performance guarantees of existing rounding procedures. We evaluate the procedure against a baseline that projects the problem to a lower dimensional space and performs rounding there, which requires N to just be at least a notion of the effective dimension. We also leverage our new approach in a new algorithm for kernelized bandits to obtain state of the art results for regret minimization and pure exploration. An advantage of our approach over existing UCB-like approaches is that our kernel bandit algorithms are provably robust to model misspecification.}
}
@InProceedings{pmlr-v139-campbell21a,
title = {A Gradient Based Strategy for Hamiltonian Monte Carlo Hyperparameter Optimization},
author = {Campbell, Andrew and Chen, Wenlong and Stimper, Vincent and Hernandez-Lobato, Jose Miguel and Zhang, Yichuan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1238--1248},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/campbell21a/campbell21a.pdf},
url = {https://proceedings.mlr.press/v139/campbell21a.html},
abstract = {Hamiltonian Monte Carlo (HMC) is one of the most successful sampling methods in machine learning. However, its performance is significantly affected by the choice of hyperparameter values. Existing approaches for optimizing the HMC hyperparameters either optimize a proxy for mixing speed or consider the HMC chain as an implicit variational distribution and optimize a tractable lower bound that can be very loose in practice. Instead, we propose to optimize an objective that quantifies directly the speed of convergence to the target distribution. Our objective can be easily optimized using stochastic gradient descent. We evaluate our proposed method and compare to baselines on a variety of problems including sampling from synthetic 2D distributions, reconstructing sparse signals, learning deep latent variable models and sampling molecular configurations from the Boltzmann distribution of a 22 atom molecule. We find that our method is competitive with or improves upon alternative baselines in all these experiments.}
}
@InProceedings{pmlr-v139-camuto21a,
title = {Asymmetric Heavy Tails and Implicit Bias in Gaussian Noise Injections},
author = {Camuto, Alexander and Wang, Xiaoyu and Zhu, Lingjiong and Holmes, Chris and Gurbuzbalaban, Mert and Simsekli, Umut},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1249--1260},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/camuto21a/camuto21a.pdf},
url = {https://proceedings.mlr.press/v139/camuto21a.html},
abstract = {Gaussian noise injections (GNIs) are a family of simple and widely-used regularisation methods for training neural networks, where one injects additive or multiplicative Gaussian noise to the network activations at every iteration of the optimisation algorithm, which is typically chosen as stochastic gradient descent (SGD). In this paper, we focus on the so-called ‘implicit effect’ of GNIs, which is the effect of the injected noise on the dynamics of SGD. We show that this effect induces an \emph{asymmetric heavy-tailed noise} on SGD gradient updates. In order to model this modified dynamics, we first develop a Langevin-like stochastic differential equation that is driven by a general family of \emph{asymmetric} heavy-tailed noise. Using this model we then formally prove that GNIs induce an ‘implicit bias’, which varies depending on the heaviness of the tails and the level of asymmetry. Our empirical results confirm that different types of neural networks trained with GNIs are well-modelled by the proposed dynamics and that the implicit effect of these injections induces a bias that degrades the performance of networks.}
}
@InProceedings{pmlr-v139-cao21a,
title = {Fold2Seq: A Joint Sequence(1D)-Fold(3D) Embedding-based Generative Model for Protein Design},
author = {Cao, Yue and Das, Payel and Chenthamarakshan, Vijil and Chen, Pin-Yu and Melnyk, Igor and Shen, Yang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1261--1271},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cao21a/cao21a.pdf},
url = {https://proceedings.mlr.press/v139/cao21a.html},
abstract = {Designing novel protein sequences for a desired 3D topological fold is a fundamental yet non-trivial task in protein engineering. Challenges exist due to the complex sequence–fold relationship, as well as the difficulties to capture the diversity of the sequences (therefore structures and functions) within a fold. To overcome these challenges, we propose Fold2Seq, a novel transformer-based generative framework for designing protein sequences conditioned on a specific target fold. To model the complex sequence–structure relationship, Fold2Seq jointly learns a sequence embedding using a transformer and a fold embedding from the density of secondary structural elements in 3D voxels. On test sets with single, high-resolution and complete structure inputs for individual folds, our experiments demonstrate improved or comparable performance of Fold2Seq in terms of speed, coverage, and reliability for sequence design, when compared to existing state-of-the-art methods that include data-driven deep generative models and physics-based RosettaDesign. The unique advantages of fold-based Fold2Seq, in comparison to a structure-based deep model and RosettaDesign, become more evident on three additional real-world challenges originating from low-quality, incomplete, or ambiguous input structures. Source code and data are available at https://github.com/IBM/fold2seq.}
}
@InProceedings{pmlr-v139-cao21b,
title = {Learning from Similarity-Confidence Data},
author = {Cao, Yuzhou and Feng, Lei and Xu, Yitian and An, Bo and Niu, Gang and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1272--1282},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cao21b/cao21b.pdf},
url = {https://proceedings.mlr.press/v139/cao21b.html},
abstract = {Weakly supervised learning has drawn considerable attention recently to reduce the expensive time and labor consumption of labeling massive data. In this paper, we investigate a novel weakly supervised learning problem of learning from similarity-confidence (Sconf) data, where only unlabeled data pairs equipped with confidence that illustrates their degree of similarity (two examples are similar if they belong to the same class) are needed for training a discriminative binary classifier. We propose an unbiased estimator of the classification risk that can be calculated from only Sconf data and show that the estimation error bound achieves the optimal convergence rate. To alleviate potential overfitting when flexible models are used, we further employ a risk correction scheme on the proposed risk estimator. Experimental results demonstrate the effectiveness of the proposed methods.}
}
@InProceedings{pmlr-v139-carderera21a,
title = {Parameter-free Locally Accelerated Conditional Gradients},
author = {Carderera, Alejandro and Diakonikolas, Jelena and Lin, Cheuk Yin and Pokutta, Sebastian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1283--1293},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/carderera21a/carderera21a.pdf},
url = {https://proceedings.mlr.press/v139/carderera21a.html},
abstract = {Projection-free conditional gradient (CG) methods are the algorithms of choice for constrained optimization setups in which projections are often computationally prohibitive but linear optimization over the constraint set remains computationally feasible. Unlike in projection-based methods, globally accelerated convergence rates are in general unattainable for CG. However, a very recent work on Locally accelerated CG (LaCG) has demonstrated that local acceleration for CG is possible for many settings of interest. The main downside of LaCG is that it requires knowledge of the smoothness and strong convexity parameters of the objective function. We remove this limitation by introducing a novel, Parameter-Free Locally accelerated CG (PF-LaCG) algorithm, for which we provide rigorous convergence guarantees. Our theoretical results are complemented by numerical experiments, which demonstrate local acceleration and showcase the practical improvements of PF-LaCG over non-accelerated algorithms, both in terms of iteration count and wall-clock time.}
}
@InProceedings{pmlr-v139-carriere21a,
title = {Optimizing persistent homology based functions},
author = {Carriere, Mathieu and Chazal, Frederic and Glisse, Marc and Ike, Yuichi and Kannan, Hariprasad and Umeda, Yuhei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1294--1303},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/carriere21a/carriere21a.pdf},
url = {https://proceedings.mlr.press/v139/carriere21a.html},
abstract = {Solving optimization tasks based on functions and losses with a topological flavor is a very active and growing field of research in data science and Topological Data Analysis, with applications in non-convex optimization, statistics and machine learning. However, the approaches proposed in the literature are usually anchored to a specific application and/or topological construction, and do not come with theoretical guarantees. To address this issue, we study the differentiability of a general map associated with the most common topological construction, that is, the persistence map. Building on real analytic geometry arguments, we propose a general framework that allows us to define and compute gradients for persistence-based functions in a very simple way. We also provide a simple, explicit and sufficient condition for convergence of stochastic subgradient methods for such functions. This result encompasses all the constructions and applications of topological optimization in the literature. Finally, we provide associated code, that is easy to handle and to mix with other non-topological methods and constraints, as well as some experiments showcasing the versatility of our approach.}
}
@InProceedings{pmlr-v139-cassel21a,
title = {Online Policy Gradient for Model Free Learning of Linear Quadratic Regulators with $\sqrt{}$T Regret},
author = {Cassel, Asaf B and Koren, Tomer},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1304--1313},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cassel21a/cassel21a.pdf},
url = {https://proceedings.mlr.press/v139/cassel21a.html},
abstract = {We consider the task of learning to control a linear dynamical system under fixed quadratic costs, known as the Linear Quadratic Regulator (LQR) problem. While model-free approaches are often favorable in practice, thus far only model-based methods, which rely on costly system identification, have been shown to achieve regret that scales with the optimal dependence on the time horizon T. We present the first model-free algorithm that achieves similar regret guarantees. Our method relies on an efficient policy gradient scheme, and a novel and tighter analysis of the cost of exploration in policy space in this setting.}
}
@InProceedings{pmlr-v139-castiglioni21a,
title = {Multi-Receiver Online Bayesian Persuasion},
author = {Castiglioni, Matteo and Marchesi, Alberto and Celli, Andrea and Gatti, Nicola},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1314--1323},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/castiglioni21a/castiglioni21a.pdf},
url = {https://proceedings.mlr.press/v139/castiglioni21a.html},
abstract = {Bayesian persuasion studies how an informed sender should partially disclose information to influence the behavior of a self-interested receiver. Classical models make the stringent assumption that the sender knows the receiver’s utility. This can be relaxed by considering an online learning framework in which the sender repeatedly faces a receiver of an unknown, adversarially selected type. We study, for the first time, an online Bayesian persuasion setting with multiple receivers. We focus on the case with no externalities and binary actions, as customary in offline models. Our goal is to design no-regret algorithms for the sender with polynomial per-iteration running time. First, we prove a negative result: for any 0 < $\alpha$ $\leq$ 1, there is no polynomial-time no-$\alpha$-regret algorithm when the sender’s utility function is supermodular or anonymous. Then, we focus on the setting of submodular sender’s utility functions and we show that, in this case, it is possible to design a polynomial-time no-(1-1/e)-regret algorithm. To do so, we introduce a general online gradient descent framework to handle online learning problems with a finite number of possible loss functions. This requires the existence of an approximate projection oracle. We show that, in our setting, there exists one such projection oracle which can be implemented in polynomial time.}
}
@InProceedings{pmlr-v139-catav21a,
title = {Marginal Contribution Feature Importance - an Axiomatic Approach for Explaining Data},
author = {Catav, Amnon and Fu, Boyang and Zoabi, Yazeed and Meilik, Ahuva Libi Weiss and Shomron, Noam and Ernst, Jason and Sankararaman, Sriram and Gilad-Bachrach, Ran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1324--1335},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/catav21a/catav21a.pdf},
url = {https://proceedings.mlr.press/v139/catav21a.html},
abstract = {In recent years, methods were proposed for assigning feature importance scores to measure the contribution of individual features. While in some cases the goal is to understand a specific model, in many cases the goal is to understand the contribution of certain properties (features) to a real-world phenomenon. Thus, a distinction has been made between feature importance scores that explain a model and scores that explain the data. When explaining the data, machine learning models are used as proxies in settings where conducting many real-world experiments is expensive or prohibited. While existing feature importance scores show great success in explaining models, we demonstrate their limitations when explaining the data, especially in the presence of correlations between features. Therefore, we develop a set of axioms to capture properties expected from a feature importance score when explaining data and prove that there exists only one score that satisfies all of them, the Marginal Contribution Feature Importance (MCI). We analyze the theoretical properties of this score function and demonstrate its merits empirically.}
}
@InProceedings{pmlr-v139-caucheteux21a,
title = {Disentangling syntax and semantics in the brain with deep networks},
author = {Caucheteux, Charlotte and Gramfort, Alexandre and King, Jean-Remi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1336--1348},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/caucheteux21a/caucheteux21a.pdf},
url = {https://proceedings.mlr.press/v139/caucheteux21a.html},
abstract = {The activations of language transformers like GPT-2 have been shown to linearly map onto brain activity during speech comprehension. However, the nature of these activations remains largely unknown and presumably conflate distinct linguistic classes. Here, we propose a taxonomy to factorize the high-dimensional activations of language models into four combinatorial classes: lexical, compositional, syntactic, and semantic representations. We then introduce a statistical method to decompose, through the lens of GPT-2’s activations, the brain activity of 345 subjects recorded with functional magnetic resonance imaging (fMRI) during the listening of 4.6 hours of narrated text. The results highlight two findings. First, compositional representations recruit a more widespread cortical network than lexical ones, and encompass the bilateral temporal, parietal and prefrontal cortices. Second, contrary to previous claims, syntax and semantics are not associated with separated modules, but, instead, appear to share a common and distributed neural substrate. Overall, this study introduces a versatile framework to isolate, in the brain activity, the distributed representations of linguistic constructs.}
}
@InProceedings{pmlr-v139-celis21a,
title = {Fair Classification with Noisy Protected Attributes: A Framework with Provable Guarantees},
author = {Celis, L. Elisa and Huang, Lingxiao and Keswani, Vijay and Vishnoi, Nisheeth K.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1349--1361},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/celis21a/celis21a.pdf},
url = {https://proceedings.mlr.press/v139/celis21a.html},
abstract = {We present an optimization framework for learning a fair classifier in the presence of noisy perturbations in the protected attributes. Compared to prior work, our framework can be employed with a very general class of linear and linear-fractional fairness constraints, can handle multiple, non-binary protected attributes, and outputs a classifier that comes with provable guarantees on both accuracy and fairness. Empirically, we show that our framework can be used to attain either statistical rate or false positive rate fairness guarantees with a minimal loss in accuracy, even when the noise is large, in two real-world datasets.}
}
@InProceedings{pmlr-v139-cella21a,
title = {Best Model Identification: A Rested Bandit Formulation},
author = {Cella, Leonardo and Pontil, Massimiliano and Gentile, Claudio},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1362--1372},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cella21a/cella21a.pdf},
url = {https://proceedings.mlr.press/v139/cella21a.html},
abstract = {We introduce and analyze a best arm identification problem in the rested bandit setting, wherein arms are themselves learning algorithms whose expected losses decrease with the number of times the arm has been played. The shape of the expected loss functions is similar across arms, and is assumed to be available up to unknown parameters that have to be learned on the fly. We define a novel notion of regret for this problem, where we compare to the policy that always plays the arm having the smallest expected loss at the end of the game. We analyze an arm elimination algorithm whose regret vanishes as the time horizon increases. The actual rate of convergence depends in a detailed way on the postulated functional form of the expected losses. We complement our analysis with lower bounds, indicating strengths and limitations of the proposed solution.}
}
@InProceedings{pmlr-v139-ceron21a,
title = {Revisiting Rainbow: Promoting more insightful and inclusive deep reinforcement learning research},
author = {Ceron, Johan Samir Obando and Castro, Pablo Samuel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1373--1383},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ceron21a/ceron21a.pdf},
url = {https://proceedings.mlr.press/v139/ceron21a.html},
abstract = {Since the introduction of DQN, a vast majority of reinforcement learning research has focused on reinforcement learning with deep neural networks as function approximators. New methods are typically evaluated on a set of environments that have now become standard, such as Atari 2600 games. While these benchmarks help standardize evaluation, their computational cost has the unfortunate side effect of widening the gap between those with ample access to computational resources, and those without. In this work we argue that, despite the community’s emphasis on large-scale environments, the traditional small-scale environments can still yield valuable scientific insights and can help reduce the barriers to entry for underprivileged communities. To substantiate our claims, we empirically revisit the paper which introduced the Rainbow algorithm [Hessel et al., 2018] and present some new insights into the algorithms used by Rainbow.}
}
@InProceedings{pmlr-v139-cetin21a,
title = {Learning Routines for Effective Off-Policy Reinforcement Learning},
author = {Cetin, Edoardo and Celiktutan, Oya},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1384--1394},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cetin21a/cetin21a.pdf},
url = {https://proceedings.mlr.press/v139/cetin21a.html},
abstract = {The performance of reinforcement learning depends upon designing an appropriate action space, where the effect of each action is measurable, yet, granular enough to permit flexible behavior. So far, this process involved non-trivial user choices in terms of the available actions and their execution frequency. We propose a novel framework for reinforcement learning that effectively lifts such constraints. Within our framework, agents learn effective behavior over a routine space: a new, higher-level action space, where each routine represents a set of ’equivalent’ sequences of granular actions with arbitrary length. Our routine space is learned end-to-end to facilitate the accomplishment of underlying off-policy reinforcement learning objectives. We apply our framework to two state-of-the-art off-policy algorithms and show that the resulting agents obtain relevant performance improvements while requiring fewer interactions with the environment per episode, improving computational efficiency.}
}
@InProceedings{pmlr-v139-ceylan21a,
title = {Learning Node Representations Using Stationary Flow Prediction on Large Payment and Cash Transaction Networks},
author = {Ceylan, Ciwan and Franz{\'e}n, Salla and Pokorny, Florian T.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1395--1406},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ceylan21a/ceylan21a.pdf},
url = {https://proceedings.mlr.press/v139/ceylan21a.html},
abstract = {Banks are required to analyse large transaction datasets as a part of the fight against financial crime. Today, this analysis is either performed manually by domain experts or using expensive feature engineering. Gradient flow analysis allows for basic representation learning as node potentials can be inferred directly from network transaction data. However, the gradient model has a fundamental limitation: it cannot represent all types of of network flows. Furthermore, standard methods for learning the gradient flow are not appropriate for flow signals that span multiple orders of magnitude and contain outliers, i.e. transaction data. In this work, the gradient model is extended to a gated version and we prove that it, unlike the gradient model, is a universal approximator for flows on graphs. To tackle the mentioned challenges of transaction data, we propose a multi-scale and outlier robust loss function based on the Student-t log-likelihood. Ethereum transaction data is used for evaluation and the gradient models outperform MLP models using hand-engineered and node2vec features in terms of relative error. These results extend to 60 synthetic datasets, with experiments also showing that the gated gradient model learns qualitative information about the underlying synthetic generative flow distributions.}
}
@InProceedings{pmlr-v139-chamberlain21a,
title = {GRAND: Graph Neural Diffusion},
author = {Chamberlain, Ben and Rowbottom, James and Gorinova, Maria I and Bronstein, Michael and Webb, Stefan and Rossi, Emanuele},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1407--1418},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chamberlain21a/chamberlain21a.pdf},
url = {https://proceedings.mlr.press/v139/chamberlain21a.html},
abstract = {We present Graph Neural Diffusion (GRAND) that approaches deep learning on graphs as a continuous diffusion process and treats Graph Neural Networks (GNNs) as discretisations of an underlying PDE. In our model, the layer structure and topology correspond to the discretisation choices of temporal and spatial operators. Our approach allows a principled development of a broad new class of GNNs that are able to address the common plights of graph learning models such as depth, oversmoothing, and bottlenecks. Key to the success of our models are stability with respect to perturbations in the data and this is addressed for both implicit and explicit discretisation schemes. We develop linear and nonlinear versions of GRAND, which achieve competitive results on many standard graph benchmarks.}
}
@InProceedings{pmlr-v139-chami21a,
title = {HoroPCA: Hyperbolic Dimensionality Reduction via Horospherical Projections},
author = {Chami, Ines and Gu, Albert and Nguyen, Dat P and Re, Christopher},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1419--1429},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chami21a/chami21a.pdf},
url = {https://proceedings.mlr.press/v139/chami21a.html},
abstract = {This paper studies Principal Component Analysis (PCA) for data lying in hyperbolic spaces. Given directions, PCA relies on: (1) a parameterization of subspaces spanned by these directions, (2) a method of projection onto subspaces that preserves information in these directions, and (3) an objective to optimize, namely the variance explained by projections. We generalize each of these concepts to the hyperbolic space and propose HoroPCA, a method for hyperbolic dimensionality reduction. By focusing on the core problem of extracting principal directions, HoroPCA theoretically better preserves information in the original data such as distances, compared to previous generalizations of PCA. Empirically, we validate that HoroPCA outperforms existing dimensionality reduction methods, significantly reducing error in distance preservation. As a data whitening method, it improves downstream classification by up to 3.9% compared to methods that don’t use whitening. Finally, we show that HoroPCA can be used to visualize hyperbolic data in two dimensions.}
}
@InProceedings{pmlr-v139-chane-sane21a,
title = {Goal-Conditioned Reinforcement Learning with Imagined Subgoals},
author = {Chane-Sane, Elliot and Schmid, Cordelia and Laptev, Ivan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1430--1440},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chane-sane21a/chane-sane21a.pdf},
url = {https://proceedings.mlr.press/v139/chane-sane21a.html},
abstract = {Goal-conditioned reinforcement learning endows an agent with a large variety of skills, but it often struggles to solve tasks that require more temporally extended reasoning. In this work, we propose to incorporate imagined subgoals into policy learning to facilitate learning of complex tasks. Imagined subgoals are predicted by a separate high-level policy, which is trained simultaneously with the policy and its critic. This high-level policy predicts intermediate states halfway to the goal using the value function as a reachability metric. We don’t require the policy to reach these subgoals explicitly. Instead, we use them to define a prior policy, and incorporate this prior into a KL-constrained policy iteration scheme to speed up and regularize learning. Imagined subgoals are used during policy learning, but not during test time, where we only apply the learned policy. We evaluate our approach on complex robotic navigation and manipulation tasks and show that it outperforms existing methods by a large margin.}
}
@InProceedings{pmlr-v139-chang21a,
title = {Locally Private k-Means in One Round},
author = {Chang, Alisa and Ghazi, Badih and Kumar, Ravi and Manurangsi, Pasin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1441--1451},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chang21a/chang21a.pdf},
url = {https://proceedings.mlr.press/v139/chang21a.html},
abstract = {We provide an approximation algorithm for k-means clustering in the \emph{one-round} (aka \emph{non-interactive}) local model of differential privacy (DP). Our algorithm achieves an approximation ratio arbitrarily close to the best \emph{non private} approximation algorithm, improving upon previously known algorithms that only guarantee large (constant) approximation ratios. Furthermore, ours is the first constant-factor approximation algorithm for k-means that requires only \emph{one} round of communication in the local DP model, positively resolving an open question of Stemmer (SODA 2020). Our algorithmic framework is quite flexible; we demonstrate this by showing that it also yields a similar near-optimal approximation algorithm in the (one-round) shuffle DP model.}
}
@InProceedings{pmlr-v139-chang21b,
title = {Modularity in Reinforcement Learning via Algorithmic Independence in Credit Assignment},
author = {Chang, Michael and Kaushik, Sid and Levine, Sergey and Griffiths, Tom},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1452--1462},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chang21b/chang21b.pdf},
url = {https://proceedings.mlr.press/v139/chang21b.html},
abstract = {Many transfer problems require re-using previously optimal decisions for solving new tasks, which suggests the need for learning algorithms that can modify the mechanisms for choosing certain actions independently of those for choosing others. However, there is currently no formalism nor theory for how to achieve this kind of modular credit assignment. To answer this question, we define modular credit assignment as a constraint on minimizing the algorithmic mutual information among feedback signals for different decisions. We introduce what we call the modularity criterion for testing whether a learning algorithm satisfies this constraint by performing causal analysis on the algorithm itself. We generalize the recently proposed societal decision-making framework as a more granular formalism than the Markov decision process to prove that for decision sequences that do not contain cycles, certain single-step temporal difference action-value methods meet this criterion while all policy-gradient methods do not. Empirical evidence suggests that such action-value methods are more sample efficient than policy-gradient methods on transfer problems that require only sparse changes to a sequence of previously optimal decisions.}
}
@InProceedings{pmlr-v139-chang21c,
title = {Image-Level or Object-Level? A Tale of Two Resampling Strategies for Long-Tailed Detection},
author = {Chang, Nadine and Yu, Zhiding and Wang, Yu-Xiong and Anandkumar, Animashree and Fidler, Sanja and Alvarez, Jose M},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1463--1472},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chang21c/chang21c.pdf},
url = {https://proceedings.mlr.press/v139/chang21c.html},
abstract = {Training on datasets with long-tailed distributions has been challenging for major recognition tasks such as classification and detection. To deal with this challenge, image resampling is typically introduced as a simple but effective approach. However, we observe that long-tailed detection differs from classification since multiple classes may be present in one image. As a result, image resampling alone is not enough to yield a sufficiently balanced distribution at the object-level. We address object-level resampling by introducing an object-centric sampling strategy based on a dynamic, episodic memory bank. Our proposed strategy has two benefits: 1) convenient object-level resampling without significant extra computation, and 2) implicit feature-level augmentation from model updates. We show that image-level and object-level resamplings are both important, and thus unify them with a joint resampling strategy. Our method achieves state-of-the-art performance on the rare categories of LVIS, with 1.89% and 3.13% relative improvements over Forest R-CNN on detection and instance segmentation.}
}
@InProceedings{pmlr-v139-chanpuriya21a,
title = {DeepWalking Backwards: From Embeddings Back to Graphs},
author = {Chanpuriya, Sudhanshu and Musco, Cameron and Sotiropoulos, Konstantinos and Tsourakakis, Charalampos},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1473--1483},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chanpuriya21a/chanpuriya21a.pdf},
url = {https://proceedings.mlr.press/v139/chanpuriya21a.html},
abstract = {Low-dimensional node embeddings play a key role in analyzing graph datasets. However, little work studies exactly what information is encoded by popular embedding methods, and how this information correlates with performance in downstream learning tasks. We tackle this question by studying whether embeddings can be inverted to (approximately) recover the graph used to generate them. Focusing on a variant of the popular DeepWalk method \cite{PerozziAl-RfouSkiena:2014, QiuDongMa:2018}, we present algorithms for accurate embedding inversion – i.e., from the low-dimensional embedding of a graph $G$, we can find a graph $\tilde G$ with a very similar embedding. We perform numerous experiments on real-world networks, observing that significant information about $G$, such as specific edges and bulk properties like triangle density, is often lost in $\tilde G$. However, community structure is often preserved or even enhanced. Our findings are a step towards a more rigorous understanding of exactly what information embeddings encode about the input graph, and why this information is useful for learning tasks.}
}
@InProceedings{pmlr-v139-chaplot21a,
title = {Differentiable Spatial Planning using Transformers},
author = {Chaplot, Devendra Singh and Pathak, Deepak and Malik, Jitendra},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1484--1495},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chaplot21a/chaplot21a.pdf},
url = {https://proceedings.mlr.press/v139/chaplot21a.html},
abstract = {We consider the problem of spatial path planning. In contrast to the classical solutions which optimize a new plan from scratch and assume access to the full map with ground truth obstacle locations, we learn a planner from the data in a differentiable manner that allows us to leverage statistical regularities from past data. We propose Spatial Planning Transformers (SPT), which given an obstacle map learns to generate actions by planning over long-range spatial dependencies, unlike prior data-driven planners that propagate information locally via convolutional structure in an iterative manner. In the setting where the ground truth map is not known to the agent, we leverage pre-trained SPTs in an end-to-end framework that has the structure of mapper and planner built into it which allows seamless generalization to out-of-distribution maps and goals. SPTs outperform prior state-of-the-art differentiable planners across all the setups for both manipulation and navigation tasks, leading to an absolute improvement of 7-19%.}
}
@InProceedings{pmlr-v139-charlesworth21a,
title = {Solving Challenging Dexterous Manipulation Tasks With Trajectory Optimisation and Reinforcement Learning},
author = {Charlesworth, Henry J and Montana, Giovanni},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1496--1506},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/charlesworth21a/charlesworth21a.pdf},
url = {https://proceedings.mlr.press/v139/charlesworth21a.html},
abstract = {Training agents to autonomously control anthropomorphic robotic hands has the potential to lead to systems capable of performing a multitude of complex manipulation tasks in unstructured and uncertain environments. In this work, we first introduce a suite of challenging simulated manipulation tasks where current reinforcement learning and trajectory optimisation techniques perform poorly. These include environments where two simulated hands have to pass or throw objects between each other, as well as an environment where the agent must learn to spin a long pen between its fingers. We then introduce a simple trajectory optimisation algorithm that performs significantly better than existing methods on these environments. Finally, on the most challenging “PenSpin" task, we combine sub-optimal demonstrations generated through trajectory optimisation with off-policy reinforcement learning, obtaining performance that far exceeds either of these approaches individually. Videos of all of our results are available at: https://dexterous-manipulation.github.io}
}
@InProceedings{pmlr-v139-charoenphakdee21a,
title = {Classification with Rejection Based on Cost-sensitive Classification},
author = {Charoenphakdee, Nontawat and Cui, Zhenghang and Zhang, Yivan and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1507--1517},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/charoenphakdee21a/charoenphakdee21a.pdf},
url = {https://proceedings.mlr.press/v139/charoenphakdee21a.html},
abstract = {The goal of classification with rejection is to avoid risky misclassification in error-critical applications such as medical diagnosis and product inspection. In this paper, based on the relationship between classification with rejection and cost-sensitive classification, we propose a novel method of classification with rejection by learning an ensemble of cost-sensitive classifiers, which satisfies all the following properties: (i) it can avoid estimating class-posterior probabilities, resulting in improved classification accuracy. (ii) it allows a flexible choice of losses including non-convex ones, (iii) it does not require complicated modifications when using different losses, (iv) it is applicable to both binary and multiclass cases, and (v) it is theoretically justifiable for any classification-calibrated loss. Experimental results demonstrate the usefulness of our proposed approach in clean-labeled, noisy-labeled, and positive-unlabeled classification.}
}
@InProceedings{pmlr-v139-chebotar21a,
title = {Actionable Models: Unsupervised Offline Reinforcement Learning of Robotic Skills},
author = {Chebotar, Yevgen and Hausman, Karol and Lu, Yao and Xiao, Ted and Kalashnikov, Dmitry and Varley, Jacob and Irpan, Alex and Eysenbach, Benjamin and Julian, Ryan C and Finn, Chelsea and Levine, Sergey},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1518--1528},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chebotar21a/chebotar21a.pdf},
url = {https://proceedings.mlr.press/v139/chebotar21a.html},
abstract = {We consider the problem of learning useful robotic skills from previously collected offline data without access to manually specified rewards or additional online exploration, a setting that is becoming increasingly important for scaling robot learning by reusing past robotic data. In particular, we propose the objective of learning a functional understanding of the environment by learning to reach any goal state in a given dataset. We employ goal-conditioned Q-learning with hindsight relabeling and develop several techniques that enable training in a particularly challenging offline setting. We find that our method can operate on high-dimensional camera images and learn a variety of skills on real robots that generalize to previously unseen scenes and objects. We also show that our method can learn to reach long-horizon goals across multiple episodes through goal chaining, and learn rich representations that can help with downstream tasks through pre-training or auxiliary objectives.}
}
@InProceedings{pmlr-v139-chen21a,
title = {Unified Robust Semi-Supervised Variational Autoencoder},
author = {Chen, Xu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1529--1538},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21a/chen21a.pdf},
url = {https://proceedings.mlr.press/v139/chen21a.html},
abstract = {In this paper, we propose a novel noise-robust semi-supervised deep generative model by jointly tackling noisy labels and outliers simultaneously in a unified robust semi-supervised variational autoencoder (URSVAE). Typically, the uncertainty of of input data is characterized by placing uncertainty prior on the parameters of the probability density distributions in order to ensure the robustness of the variational encoder towards outliers. Subsequently, a noise transition model is integrated naturally into our model to alleviate the detrimental effects of noisy labels. Moreover, a robust divergence measure is employed to further enhance the robustness, where a novel variational lower bound is derived and optimized to infer the network parameters. By proving the influence function on the proposed evidence lower bound is bounded, the enormous potential of the proposed model in the classification in the presence of the compound noise is demonstrated. The experimental results highlight the superiority of the proposed framework by the evaluating on image classification tasks and comparing with the state-of-the-art approaches.}
}
@InProceedings{pmlr-v139-chen21b,
title = {Unsupervised Learning of Visual 3D Keypoints for Control},
author = {Chen, Boyuan and Abbeel, Pieter and Pathak, Deepak},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1539--1549},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21b/chen21b.pdf},
url = {https://proceedings.mlr.press/v139/chen21b.html},
abstract = {Learning sensorimotor control policies from high-dimensional images crucially relies on the quality of the underlying visual representations. Prior works show that structured latent space such as visual keypoints often outperforms unstructured representations for robotic control. However, most of these representations, whether structured or unstructured are learned in a 2D space even though the control tasks are usually performed in a 3D environment. In this work, we propose a framework to learn such a 3D geometric structure directly from images in an end-to-end unsupervised manner. The input images are embedded into latent 3D keypoints via a differentiable encoder which is trained to optimize both a multi-view consistency loss and downstream task objective. These discovered 3D keypoints tend to meaningfully capture robot joints as well as object movements in a consistent manner across both time and 3D space. The proposed approach outperforms prior state-of-art methods across a variety of reinforcement learning benchmarks. Code and videos at https://buoyancy99.github.io/unsup-3d-keypoints/.}
}
@InProceedings{pmlr-v139-chen21c,
title = {Integer Programming for Causal Structure Learning in the Presence of Latent Variables},
author = {Chen, Rui and Dash, Sanjeeb and Gao, Tian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1550--1560},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21c/chen21c.pdf},
url = {https://proceedings.mlr.press/v139/chen21c.html},
abstract = {The problem of finding an ancestral acyclic directed mixed graph (ADMG) that represents the causal relationships between a set of variables is an important area of research on causal inference. Most existing score-based structure learning methods focus on learning directed acyclic graph (DAG) models without latent variables. A number of score-based methods have recently been proposed for the ADMG learning, yet they are heuristic in nature and do not guarantee an optimal solution. We propose a novel exact score-based method that solves an integer programming (IP) formulation and returns a score-maximizing ancestral ADMG for a set of continuous variables that follow a multivariate Gaussian distribution. We generalize the state-of-the-art IP model for DAG learning problems and derive new classes of valid inequalities to formulate an IP model for ADMG learning. Empirically, our model can be solved efficiently for medium-sized problems and achieves better accuracy than state-of-the-art score-based methods as well as benchmark constraint-based methods.}
}
@InProceedings{pmlr-v139-chen21d,
title = {Improved Corruption Robust Algorithms for Episodic Reinforcement Learning},
author = {Chen, Yifang and Du, Simon and Jamieson, Kevin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1561--1570},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21d/chen21d.pdf},
url = {https://proceedings.mlr.press/v139/chen21d.html},
abstract = {We study episodic reinforcement learning under unknown adversarial corruptions in both the rewards and the transition probabilities of the underlying system. We propose new algorithms which, compared to the existing results in \cite{lykouris2020corruption}, achieve strictly better regret bounds in terms of total corruptions for the tabular setting. To be specific, firstly, our regret bounds depend on more precise numerical values of total rewards corruptions and transition corruptions, instead of only on the total number of corrupted episodes. Secondly, our regret bounds are the first of their kind in the reinforcement learning setting to have the number of corruptions show up additively with respect to $\min\{ \sqrt{T},\text{PolicyGapComplexity} \}$ rather than multiplicatively. Our results follow from a general algorithmic framework that combines corruption-robust policy elimination meta-algorithms, and plug-in reward-free exploration sub-algorithms. Replacing the meta-algorithm or sub-algorithm may extend the framework to address other corrupted settings with potentially more structure.}
}
@InProceedings{pmlr-v139-fan21d,
title = {Scalable Computations of Wasserstein Barycenter via Input Convex Neural Networks},
author = {Fan, Jiaojiao, Taghvaei, Amirhossein and Chen, Yongxin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1571--1581},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fan21d/fan21d.pdf},
url = {https://proceedings.mlr.press/v139/fan21d.html},
abstract = {Wasserstein Barycenter is a principled approach to represent the weighted mean of a given set of probability distributions, utilizing the geometry induced by optimal transport. In this work, we present a novel scalable algorithm to approximate the Wasserstein Barycenters aiming at high-dimensional applications in machine learning. Our proposed algorithm is based on the Kantorovich dual formulation of the Wasserstein-2 distance as well as a recent neural network architecture, input convex neural network, that is known to parametrize convex functions. The distinguishing features of our method are: i) it only requires samples from the marginal distributions; ii) unlike the existing approaches, it represents the Barycenter with a generative model and can thus generate infinite samples from the barycenter without querying the marginal distributions; iii) it works similar to Generative Adversarial Model in one marginal case. We demonstratethe efficacy of our algorithm by comparing it with the state-of-art methods in multiple experiments.}
}
@InProceedings{pmlr-v139-chen21f,
title = {Neural Feature Matching in Implicit 3D Representations},
author = {Chen, Yunlu and Fernando, Basura and Bilen, Hakan and Mensink, Thomas and Gavves, Efstratios},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1582--1593},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21f/chen21f.pdf},
url = {https://proceedings.mlr.press/v139/chen21f.html},
abstract = {Recently, neural implicit functions have achieved impressive results for encoding 3D shapes. Conditioning on low-dimensional latent codes generalises a single implicit function to learn shared representation space for a variety of shapes, with the advantage of smooth interpolation. While the benefits from the global latent space do not correspond to explicit points at local level, we propose to track the continuous point trajectory by matching implicit features with the latent code interpolating between shapes, from which we corroborate the hierarchical functionality of the deep implicit functions, where early layers map the latent code to fitting the coarse shape structure, and deeper layers further refine the shape details. Furthermore, the structured representation space of implicit functions enables to apply feature matching for shape deformation, with the benefits to handle topology and semantics inconsistency, such as from an armchair to a chair with no arms, without explicit flow functions or manual annotations.}
}
@InProceedings{pmlr-v139-chen21g,
title = {Decentralized Riemannian Gradient Descent on the Stiefel Manifold},
author = {Chen, Shixiang and Garcia, Alfredo and Hong, Mingyi and Shahrampour, Shahin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1594--1605},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21g/chen21g.pdf},
url = {https://proceedings.mlr.press/v139/chen21g.html},
abstract = {We consider a distributed non-convex optimization where a network of agents aims at minimizing a global function over the Stiefel manifold. The global function is represented as a finite sum of smooth local functions, where each local function is associated with one agent and agents communicate with each other over an undirected connected graph. The problem is non-convex as local functions are possibly non-convex (but smooth) and the Steifel manifold is a non-convex set. We present a decentralized Riemannian stochastic gradient method (DRSGD) with the convergence rate of $\mathcal{O}(1/\sqrt{K})$ to a stationary point. To have exact convergence with constant stepsize, we also propose a decentralized Riemannian gradient tracking algorithm (DRGTA) with the convergence rate of $\mathcal{O}(1/K)$ to a stationary point. We use multi-step consensus to preserve the iteration in the local (consensus) region. DRGTA is the first decentralized algorithm with exact convergence for distributed optimization on Stiefel manifold.}
}
@InProceedings{pmlr-v139-chen21h,
title = {Learning Self-Modulating Attention in Continuous Time Space with Applications to Sequential Recommendation},
author = {Chen, Chao and Geng, Haoyu and Yang, Nianzu and Yan, Junchi and Xue, Daiyue and Yu, Jianping and Yang, Xiaokang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1606--1616},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21h/chen21h.pdf},
url = {https://proceedings.mlr.press/v139/chen21h.html},
abstract = {User interests are usually dynamic in the real world, which poses both theoretical and practical challenges for learning accurate preferences from rich behavior data. Among existing user behavior modeling solutions, attention networks are widely adopted for its effectiveness and relative simplicity. Despite being extensively studied, existing attentions still suffer from two limitations: i) conventional attentions mainly take into account the spatial correlation between user behaviors, regardless the distance between those behaviors in the continuous time space; and ii) these attentions mostly provide a dense and undistinguished distribution over all past behaviors then attentively encode them into the output latent representations. This is however not suitable in practical scenarios where a user’s future actions are relevant to a small subset of her/his historical behaviors. In this paper, we propose a novel attention network, named \textit{self-modulating attention}, that models the complex and non-linearly evolving dynamic user preferences. We empirically demonstrate the effectiveness of our method on top-N sequential recommendation tasks, and the results on three large-scale real-world datasets show that our model can achieve state-of-the-art performance.}
}
@InProceedings{pmlr-v139-chen21i,
title = {Mandoline: Model Evaluation under Distribution Shift},
author = {Chen, Mayee and Goel, Karan and Sohoni, Nimit S and Poms, Fait and Fatahalian, Kayvon and Re, Christopher},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1617--1629},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21i/chen21i.pdf},
url = {https://proceedings.mlr.press/v139/chen21i.html},
abstract = {Machine learning models are often deployed in different settings than they were trained and validated on, posing a challenge to practitioners who wish to predict how well the deployed model will perform on a target distribution. If an unlabeled sample from the target distribution is available, along with a labeled sample from a possibly different source distribution, standard approaches such as importance weighting can be applied to estimate performance on the target. However, importance weighting struggles when the source and target distributions have non-overlapping support or are high-dimensional. Taking inspiration from fields such as epidemiology and polling, we develop Mandoline, a new evaluation framework that mitigates these issues. Our key insight is that practitioners may have prior knowledge about the ways in which the distribution shifts, which we can use to better guide the importance weighting procedure. Specifically, users write simple "slicing functions" {–} noisy, potentially correlated binary functions intended to capture possible axes of distribution shift {–} to compute reweighted performance estimates. We further describe a density ratio estimation framework for the slices and show how its estimation error scales with slice quality and dataset size. Empirical validation on NLP and vision tasks shows that Mandoline can estimate performance on the target distribution up to 3x more accurately compared to standard baselines.}
}
@InProceedings{pmlr-v139-chen21j,
title = {Order Matters: Probabilistic Modeling of Node Sequence for Graph Generation},
author = {Chen, Xiaohui and Han, Xu and Hu, Jiajing and Ruiz, Francisco and Liu, Liping},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1630--1639},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21j/chen21j.pdf},
url = {https://proceedings.mlr.press/v139/chen21j.html},
abstract = {A graph generative model defines a distribution over graphs. Typically, the model consists of a sequential process that creates and adds nodes and edges. Such sequential process defines an ordering of the nodes in the graph. The computation of the model’s likelihood requires to marginalize the node orderings; this makes maximum likelihood estimation (MLE) challenging due to the (factorial) number of possible permutations. In this work, we provide an expression for the likelihood of a graph generative model and show that its calculation is closely related to the problem of graph automorphism. In addition, we derive a variational inference (VI) algorithm for fitting a graph generative model that is based on the maximization of a variational bound of the log-likelihood. This allows the model to be trained with node orderings from the approximate posterior instead of ad-hoc orderings. Our experiments show that our log-likelihood bound is significantly tighter than the bound of previous schemes. The models fitted with the VI algorithm are able to generate high-quality graphs that match the structures of target graphs not seen during training.}
}
@InProceedings{pmlr-v139-chen21k,
title = {CARTL: Cooperative Adversarially-Robust Transfer Learning},
author = {Chen, Dian and Hu, Hongxin and Wang, Qian and Yinli, Li and Wang, Cong and Shen, Chao and Li, Qi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1640--1650},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21k/chen21k.pdf},
url = {https://proceedings.mlr.press/v139/chen21k.html},
abstract = {Transfer learning eases the burden of training a well-performed model from scratch, especially when training data is scarce and computation power is limited. In deep learning, a typical strategy for transfer learning is to freeze the early layers of a pre-trained model and fine-tune the rest of its layers on the target domain. Previous work focuses on the accuracy of the transferred model but neglects the transfer of adversarial robustness. In this work, we first show that transfer learning improves the accuracy on the target domain but degrades the inherited robustness of the target model. To address such a problem, we propose a novel cooperative adversarially-robust transfer learning (CARTL) by pre-training the model via feature distance minimization and fine-tuning the pre-trained model with non-expansive fine-tuning for target domain tasks. Empirical results show that CARTL improves the inherited robustness by about 28% at most compared with the baseline with the same degree of accuracy. Furthermore, we study the relationship between the batch normalization (BN) layers and the robustness in the context of transfer learning, and we reveal that freezing BN layers can further boost the robustness transfer.}
}
@InProceedings{pmlr-v139-chen21l,
title = {Finding the Stochastic Shortest Path with Low Regret: the Adversarial Cost and Unknown Transition Case},
author = {Chen, Liyu and Luo, Haipeng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1651--1660},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21l/chen21l.pdf},
url = {https://proceedings.mlr.press/v139/chen21l.html},
abstract = {We make significant progress toward the stochastic shortest path problem with adversarial costs and unknown transition. Specifically, we develop algorithms that achieve $O(\sqrt{S^2ADT_\star K})$ regret for the full-information setting and $O(\sqrt{S^3A^2DT_\star K})$ regret for the bandit feedback setting, where $D$ is the diameter, $T_\star$ is the expected hitting time of the optimal policy, $S$ is the number of states, $A$ is the number of actions, and $K$ is the number of episodes. Our work strictly improves (Rosenberg and Mansour, 2020) in the full information setting, extends (Chen et al., 2020) from known transition to unknown transition, and is also the first to consider the most challenging combination: bandit feedback with adversarial costs and unknown transition. To remedy the gap between our upper bounds and the current best lower bounds constructed via a stochastically oblivious adversary, we also propose algorithms with near-optimal regret for this special case.}
}
@InProceedings{pmlr-v139-chen21m,
title = {SpreadsheetCoder: Formula Prediction from Semi-structured Context},
author = {Chen, Xinyun and Maniatis, Petros and Singh, Rishabh and Sutton, Charles and Dai, Hanjun and Lin, Max and Zhou, Denny},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1661--1672},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21m/chen21m.pdf},
url = {https://proceedings.mlr.press/v139/chen21m.html},
abstract = {Spreadsheet formula prediction has been an important program synthesis problem with many real-world applications. Previous works typically utilize input-output examples as the specification for spreadsheet formula synthesis, where each input-output pair simulates a separate row in the spreadsheet. However, this formulation does not fully capture the rich context in real-world spreadsheets. First, spreadsheet data entries are organized as tables, thus rows and columns are not necessarily independent from each other. In addition, many spreadsheet tables include headers, which provide high-level descriptions of the cell data. However, previous synthesis approaches do not consider headers as part of the specification. In this work, we present the first approach for synthesizing spreadsheet formulas from tabular context, which includes both headers and semi-structured tabular data. In particular, we propose SpreadsheetCoder, a BERT-based model architecture to represent the tabular context in both row-based and column-based formats. We train our model on a large dataset of spreadsheets, and demonstrate that SpreadsheetCoder achieves top-1 prediction accuracy of 42.51%, which is a considerable improvement over baselines that do not employ rich tabular context. Compared to the rule-based system, SpreadsheetCoder assists 82% more users in composing formulas on Google Sheets.}
}
@InProceedings{pmlr-v139-chen21n,
title = {Large-Margin Contrastive Learning with Distance Polarization Regularizer},
author = {Chen, Shuo and Niu, Gang and Gong, Chen and Li, Jun and Yang, Jian and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1673--1683},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21n/chen21n.pdf},
url = {https://proceedings.mlr.press/v139/chen21n.html},
abstract = {\emph{Contrastive learning} (CL) pretrains models in a pairwise manner, where given a data point, other data points are all regarded as dissimilar, including some that are \emph{semantically} similar. The issue has been addressed by properly weighting similar and dissimilar pairs as in \emph{positive-unlabeled learning}, so that the objective of CL is \emph{unbiased} and CL is \emph{consistent}. However, in this paper, we argue that this great solution is still not enough: its weighted objective \emph{hides} the issue where the semantically similar pairs are still pushed away; as CL is pretraining, this phenomenon is not our desideratum and might affect downstream tasks. To this end, we propose \emph{large-margin contrastive learning} (LMCL) with \emph{distance polarization regularizer}, motivated by the distribution characteristic of pairwise distances in \emph{metric learning}. In LMCL, we can distinguish between \emph{intra-cluster} and \emph{inter-cluster} pairs, and then only push away inter-cluster pairs, which \emph{solves} the above issue explicitly. Theoretically, we prove a tighter error bound for LMCL; empirically, the superiority of LMCL is demonstrated across multiple domains, \emph{i.e.}, image classification, sentence representation, and reinforcement learning.}
}
@InProceedings{pmlr-v139-chen21o,
title = {Z-GCNETs: Time Zigzags at Graph Convolutional Networks for Time Series Forecasting},
author = {Chen, Yuzhou and Segovia, Ignacio and Gel, Yulia R.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1684--1694},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21o/chen21o.pdf},
url = {https://proceedings.mlr.press/v139/chen21o.html},
abstract = {There recently has been a surge of interest in developing a new class of deep learning (DL) architectures that integrate an explicit time dimension as a fundamental building block of learning and representation mechanisms. In turn, many recent results show that topological descriptors of the observed data, encoding information on the shape of the dataset in a topological space at different scales, that is, persistent homology of the data, may contain important complementary information, improving both performance and robustness of DL. As convergence of these two emerging ideas, we propose to enhance DL architectures with the most salient time-conditioned topological information of the data and introduce the concept of zigzag persistence into time-aware graph convolutional networks (GCNs). Zigzag persistence provides a systematic and mathematically rigorous framework to track the most important topological features of the observed data that tend to manifest themselves over time. To integrate the extracted time-conditioned topological descriptors into DL, we develop a new topological summary, zigzag persistence image, and derive its theoretical stability guarantees. We validate the new GCNs with a time-aware zigzag topological layer (Z-GCNETs), in application to traffic forecasting and Ethereum blockchain price prediction. Our results indicate that Z-GCNET outperforms 13 state-of-the-art methods on 4 time series datasets.}
}
@InProceedings{pmlr-v139-chen21p,
title = {A Unified Lottery Ticket Hypothesis for Graph Neural Networks},
author = {Chen, Tianlong and Sui, Yongduo and Chen, Xuxi and Zhang, Aston and Wang, Zhangyang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1695--1706},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21p/chen21p.pdf},
url = {https://proceedings.mlr.press/v139/chen21p.html},
abstract = {With graphs rapidly growing in size and deeper graph neural networks (GNNs) emerging, the training and inference of GNNs become increasingly expensive. Existing network weight pruning algorithms cannot address the main space and computational bottleneck in GNNs, caused by the size and connectivity of the graph. To this end, this paper first presents a unified GNN sparsification (UGS) framework that simultaneously prunes the graph adjacency matrix and the model weights, for effectively accelerating GNN inference on large-scale graphs. Leveraging this new tool, we further generalize the recently popular lottery ticket hypothesis to GNNs for the first time, by defining a graph lottery ticket (GLT) as a pair of core sub-dataset and sparse sub-network, which can be jointly identified from the original GNN and the full dense graph by iteratively applying UGS. Like its counterpart in convolutional neural networks, GLT can be trained in isolation to match the performance of training with the full model and graph, and can be drawn from both randomly initialized and self-supervised pre-trained GNNs. Our proposal has been experimentally verified across various GNN architectures and diverse tasks, on both small-scale graph datasets (Cora, Citeseer and PubMed), and large-scale datasets from the challenging Open Graph Benchmark (OGB). Specifically, for node classification, our found GLTs achieve the same accuracies with 20% 98% MACs saving on small graphs and 25% 85% MACs saving on large ones. For link prediction, GLTs lead to 48% 97% and 70% MACs saving on small and large graph datasets, respectively, without compromising predictive performance. Codes are at https://github.com/VITA-Group/Unified-LTH-GNN.}
}
@InProceedings{pmlr-v139-chen21q,
title = {Network Inference and Influence Maximization from Samples},
author = {Chen, Wei and Sun, Xiaoming and Zhang, Jialin and Zhang, Zhijie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1707--1716},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21q/chen21q.pdf},
url = {https://proceedings.mlr.press/v139/chen21q.html},
abstract = {Influence maximization is the task of selecting a small number of seed nodes in a social network to maximize the spread of the influence from these seeds, and it has been widely investigated in the past two decades. In the canonical setting, the whole social network as well as its diffusion parameters is given as input. In this paper, we consider the more realistic sampling setting where the network is unknown and we only have a set of passively observed cascades that record the set of activated nodes at each diffusion step. We study the task of influence maximization from these cascade samples (IMS), and present constant approximation algorithms for this task under mild conditions on the seed set distribution. To achieve the optimization goal, we also provide a novel solution to the network inference problem, that is, learning diffusion parameters and the network structure from the cascade data. Comparing with prior solutions, our network inference algorithm requires weaker assumptions and does not rely on maximum-likelihood estimation and convex programming. Our IMS algorithms enhance the learning-and-then-optimization approach by allowing a constant approximation ratio even when the diffusion parameters are hard to learn, and we do not need any assumption related to the network structure or diffusion parameters.}
}
@InProceedings{pmlr-v139-chen21r,
title = {Data-driven Prediction of General Hamiltonian Dynamics via Learning Exactly-Symplectic Maps},
author = {Chen, Renyi and Tao, Molei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1717--1727},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21r/chen21r.pdf},
url = {https://proceedings.mlr.press/v139/chen21r.html},
abstract = {We consider the learning and prediction of nonlinear time series generated by a latent symplectic map. A special case is (not necessarily separable) Hamiltonian systems, whose solution flows give such symplectic maps. For this special case, both generic approaches based on learning the vector field of the latent ODE and specialized approaches based on learning the Hamiltonian that generates the vector field exist. Our method, however, is different as it does not rely on the vector field nor assume its existence; instead, it directly learns the symplectic evolution map in discrete time. Moreover, we do so by representing the symplectic map via a generating function, which we approximate by a neural network (hence the name GFNN). This way, our approximation of the evolution map is always \emph{exactly} symplectic. This additional geometric structure allows the local prediction error at each step to accumulate in a controlled fashion, and we will prove, under reasonable assumptions, that the global prediction error grows at most \emph{linearly} with long prediction time, which significantly improves an otherwise exponential growth. In addition, as a map-based and thus purely data-driven method, GFNN avoids two additional sources of inaccuracies common in vector-field based approaches, namely the error in approximating the vector field by finite difference of the data, and the error in numerical integration of the vector field for making predictions. Numerical experiments further demonstrate our claims.}
}
@InProceedings{pmlr-v139-chen21s,
title = {Analysis of stochastic Lanczos quadrature for spectrum approximation},
author = {Chen, Tyler and Trogdon, Thomas and Ubaru, Shashanka},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1728--1739},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21s/chen21s.pdf},
url = {https://proceedings.mlr.press/v139/chen21s.html},
abstract = {The cumulative empirical spectral measure (CESM) $\Phi[\mathbf{A}] : \mathbb{R} \to [0,1]$ of a $n\times n$ symmetric matrix $\mathbf{A}$ is defined as the fraction of eigenvalues of $\mathbf{A}$ less than a given threshold, i.e., $\Phi[\mathbf{A}](x) := \sum_{i=1}^{n} \frac{1}{n} {\large\unicode{x1D7D9}}[ \lambda_i[\mathbf{A}]\leq x]$. Spectral sums $\operatorname{tr}(f[\mathbf{A}])$ can be computed as the Riemann–Stieltjes integral of $f$ against $\Phi[\mathbf{A}]$, so the task of estimating CESM arises frequently in a number of applications, including machine learning. We present an error analysis for stochastic Lanczos quadrature (SLQ). We show that SLQ obtains an approximation to the CESM within a Wasserstein distance of $t \: | \lambda_{\text{max}}[\mathbf{A}] - \lambda_{\text{min}}[\mathbf{A}] |$ with probability at least $1-\eta$, by applying the Lanczos algorithm for $\lceil 12 t^{-1} + \frac{1}{2} \rceil$ iterations to $\lceil 4 ( n+2 )^{-1}t^{-2} \ln(2n\eta^{-1}) \rceil$ vectors sampled independently and uniformly from the unit sphere. We additionally provide (matrix-dependent) a posteriori error bounds for the Wasserstein and Kolmogorov–Smirnov distances between the output of this algorithm and the true CESM. The quality of our bounds is demonstrated using numerical experiments.}
}
@InProceedings{pmlr-v139-chen21t,
title = {Large-Scale Multi-Agent Deep FBSDEs},
author = {Chen, Tianrong and Wang, Ziyi O and Exarchos, Ioannis and Theodorou, Evangelos},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1740--1748},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21t/chen21t.pdf},
url = {https://proceedings.mlr.press/v139/chen21t.html},
abstract = {In this paper we present a scalable deep learning framework for finding Markovian Nash Equilibria in multi-agent stochastic games using fictitious play. The motivation is inspired by theoretical analysis of Forward Backward Stochastic Differential Equations and their implementation in a deep learning setting, which is the source of our algorithm’s sample efficiency improvement. By taking advantage of the permutation-invariant property of agents in symmetric games, the scalability and performance is further enhanced significantly. We showcase superior performance of our framework over the state-of-the-art deep fictitious play algorithm on an inter-bank lending/borrowing problem in terms of multiple metrics. More importantly, our approach scales up to 3000 agents in simulation, a scale which, to the best of our knowledge, represents a new state-of-the-art. We also demonstrate the applicability of our framework in robotics on a belief space autonomous racing problem.}
}
@InProceedings{pmlr-v139-chen21u,
title = {Representation Subspace Distance for Domain Adaptation Regression},
author = {Chen, Xinyang and Wang, Sinan and Wang, Jianmin and Long, Mingsheng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1749--1759},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21u/chen21u.pdf},
url = {https://proceedings.mlr.press/v139/chen21u.html},
abstract = {Regression, as a counterpart to classification, is a major paradigm with a wide range of applications. Domain adaptation regression extends it by generalizing a regressor from a labeled source domain to an unlabeled target domain. Existing domain adaptation regression methods have achieved positive results limited only to the shallow regime. A question arises: Why learning invariant representations in the deep regime less pronounced? A key finding of this paper is that classification is robust to feature scaling but regression is not, and aligning the distributions of deep representations will alter feature scale and impede domain adaptation regression. Based on this finding, we propose to close the domain gap through orthogonal bases of the representation spaces, which are free from feature scaling. Inspired by Riemannian geometry of Grassmann manifold, we define a geometrical distance over representation subspaces and learn deep transferable representations by minimizing it. To avoid breaking the geometrical properties of deep representations, we further introduce the bases mismatch penalization to match the ordering of orthogonal bases across representation subspaces. Our method is evaluated on three domain adaptation regression benchmarks, two of which are introduced in this paper. Our method outperforms the state-of-the-art methods significantly, forming early positive results in the deep regime.}
}
@InProceedings{pmlr-v139-chen21v,
title = {Overcoming Catastrophic Forgetting by Bayesian Generative Regularization},
author = {Chen, Pei-Hung and Wei, Wei and Hsieh, Cho-Jui and Dai, Bo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1760--1770},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21v/chen21v.pdf},
url = {https://proceedings.mlr.press/v139/chen21v.html},
abstract = {In this paper, we propose a new method to over-come catastrophic forgetting by adding generative regularization to Bayesian inference frame-work. Bayesian method provides a general frame-work for continual learning. We could further construct a generative regularization term for all given classification models by leveraging energy-based models and Langevin dynamic sampling to enrich the features learned in each task. By combining discriminative and generative loss together, we empirically show that the proposed method outperforms state-of-the-art methods on a variety of tasks, avoiding catastrophic forgetting in continual learning. In particular, the proposed method outperforms baseline methods over 15%on the Fashion-MNIST dataset and 10%on the CUB dataset.}
}
@InProceedings{pmlr-v139-chen21w,
title = {Cyclically Equivariant Neural Decoders for Cyclic Codes},
author = {Chen, Xiangyu and Ye, Min},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1771--1780},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21w/chen21w.pdf},
url = {https://proceedings.mlr.press/v139/chen21w.html},
abstract = {Neural decoders were introduced as a generalization of the classic Belief Propagation (BP) decoding algorithms, where the Trellis graph in the BP algorithm is viewed as a neural network, and the weights in the Trellis graph are optimized by training the neural network. In this work, we propose a novel neural decoder for cyclic codes by exploiting their cyclically invariant property. More precisely, we impose a shift invariant structure on the weights of our neural decoder so that any cyclic shift of inputs results in the same cyclic shift of outputs. Extensive simulations with BCH codes and punctured Reed-Muller (RM) codes show that our new decoder consistently outperforms previous neural decoders when decoding cyclic codes. Finally, we propose a list decoding procedure that can significantly reduce the decoding error probability for BCH codes and punctured RM codes. For certain high-rate codes, the gap between our list decoder and the Maximum Likelihood decoder is less than $0.1$dB. Code available at github.com/cyclicallyneuraldecoder}
}
@InProceedings{pmlr-v139-chen21x,
title = {A Receptor Skeleton for Capsule Neural Networks},
author = {Chen, Jintai and Yu, Hongyun and Qian, Chengde and Chen, Danny Z and Wu, Jian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1781--1790},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21x/chen21x.pdf},
url = {https://proceedings.mlr.press/v139/chen21x.html},
abstract = {In previous Capsule Neural Networks (CapsNets), routing algorithms often performed clustering processes to assemble the child capsules’ representations into parent capsules. Such routing algorithms were typically implemented with iterative processes and incurred high computing complexity. This paper presents a new capsule structure, which contains a set of optimizable receptors and a transmitter is devised on the capsule’s representation. Specifically, child capsules’ representations are sent to the parent capsules whose receptors match well the transmitters of the child capsules’ representations, avoiding applying computationally complex routing algorithms. To ensure the receptors in a CapsNet work cooperatively, we build a skeleton to organize the receptors in different capsule layers in a CapsNet. The receptor skeleton assigns a share-out objective for each receptor, making the CapsNet perform as a hierarchical agglomerative clustering process. Comprehensive experiments verify that our approach facilitates efficient clustering processes, and CapsNets with our approach significantly outperform CapsNets with previous routing algorithms on image classification, affine transformation generalization, overlapped object recognition, and representation semantic decoupling.}
}
@InProceedings{pmlr-v139-chen21y,
title = {Accelerating Gossip SGD with Periodic Global Averaging},
author = {Chen, Yiming and Yuan, Kun and Zhang, Yingya and Pan, Pan and Xu, Yinghui and Yin, Wotao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1791--1802},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21y/chen21y.pdf},
url = {https://proceedings.mlr.press/v139/chen21y.html},
abstract = {Communication overhead hinders the scalability of large-scale distributed training. Gossip SGD, where each node averages only with its neighbors, is more communication-efficient than the prevalent parallel SGD. However, its convergence rate is reversely proportional to quantity $1-\beta$ which measures the network connectivity. On large and sparse networks where $1-\beta \to 0$, Gossip SGD requires more iterations to converge, which offsets against its communication benefit. This paper introduces Gossip-PGA, which adds Periodic Global Averaging to accelerate Gossip SGD. Its transient stage, i.e., the iterations required to reach asymptotic linear speedup stage, improves from $\Omega(\beta^4 n^3/(1-\beta)^4)$ to $\Omega(\beta^4 n^3 H^4)$ for non-convex problems. The influence of network topology in Gossip-PGA can be controlled by the averaging period $H$. Its transient-stage complexity is also superior to local SGD which has order $\Omega(n^3 H^4)$. Empirical results of large-scale training on image classification (ResNet50) and language modeling (BERT) validate our theoretical findings.}
}
@InProceedings{pmlr-v139-chen21z,
title = {ActNN: Reducing Training Memory Footprint via 2-Bit Activation Compressed Training},
author = {Chen, Jianfei and Zheng, Lianmin and Yao, Zhewei and Wang, Dequan and Stoica, Ion and Mahoney, Michael and Gonzalez, Joseph},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1803--1813},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chen21z/chen21z.pdf},
url = {https://proceedings.mlr.press/v139/chen21z.html},
abstract = {The increasing size of neural network models has been critical for improvements in their accuracy, but device memory is not growing at the same rate. This creates fundamental challenges for training neural networks within limited memory environments. In this work, we propose ActNN, a memory-efficient training framework that stores randomly quantized activations for back propagation. We prove the convergence of ActNN for general network architectures, and we characterize the impact of quantization on the convergence via an exact expression for the gradient variance. Using our theory, we propose novel mixed-precision quantization strategies that exploit the activation’s heterogeneity across feature dimensions, samples, and layers. These techniques can be readily applied to existing dynamic graph frameworks, such as PyTorch, simply by substituting the layers. We evaluate ActNN on mainstream computer vision models for classification, detection, and segmentation tasks. On all these tasks, ActNN compresses the activation to 2 bits on average, with negligible accuracy loss. ActNN reduces the memory footprint of the activation by 12x, and it enables training with a 6.6x to 14x larger batch size.}
}
@InProceedings{pmlr-v139-cheng21a,
title = {SPADE: A Spectral Method for Black-Box Adversarial Robustness Evaluation},
author = {Cheng, Wuxinlin and Deng, Chenhui and Zhao, Zhiqiang and Cai, Yaohui and Zhang, Zhiru and Feng, Zhuo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1814--1824},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cheng21a/cheng21a.pdf},
url = {https://proceedings.mlr.press/v139/cheng21a.html},
abstract = {A black-box spectral method is introduced for evaluating the adversarial robustness of a given machine learning (ML) model. Our approach, named SPADE, exploits bijective distance mapping between the input/output graphs constructed for approximating the manifolds corresponding to the input/output data. By leveraging the generalized Courant-Fischer theorem, we propose a SPADE score for evaluating the adversarial robustness of a given model, which is proved to be an upper bound of the best Lipschitz constant under the manifold setting. To reveal the most non-robust data samples highly vulnerable to adversarial attacks, we develop a spectral graph embedding procedure leveraging dominant generalized eigenvectors. This embedding step allows assigning each data point a robustness score that can be further harnessed for more effective adversarial training of ML models. Our experiments show promising empirical results for neural networks trained with the MNIST and CIFAR-10 data sets.}
}
@InProceedings{pmlr-v139-cheng21b,
title = {Self-supervised and Supervised Joint Training for Resource-rich Machine Translation},
author = {Cheng, Yong and Wang, Wei and Jiang, Lu and Macherey, Wolfgang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1825--1835},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cheng21b/cheng21b.pdf},
url = {https://proceedings.mlr.press/v139/cheng21b.html},
abstract = {Self-supervised pre-training of text representations has been successfully applied to low-resource Neural Machine Translation (NMT). However, it usually fails to achieve notable gains on resource-rich NMT. In this paper, we propose a joint training approach, F2-XEnDec, to combine self-supervised and supervised learning to optimize NMT models. To exploit complementary self-supervised signals for supervised learning, NMT models are trained on examples that are interbred from monolingual and parallel sentences through a new process called crossover encoder-decoder. Experiments on two resource-rich translation benchmarks, WMT’14 English-German and WMT’14 English-French, demonstrate that our approach achieves substantial improvements over several strong baseline methods and obtains a new state of the art of 46.19 BLEU on English-French when incorporating back translation. Results also show that our approach is capable of improving model robustness to input perturbations such as code-switching noise which frequently appears on the social media.}
}
@InProceedings{pmlr-v139-cherubin21a,
title = {Exact Optimization of Conformal Predictors via Incremental and Decremental Learning},
author = {Cherubin, Giovanni and Chatzikokolakis, Konstantinos and Jaggi, Martin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1836--1845},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cherubin21a/cherubin21a.pdf},
url = {https://proceedings.mlr.press/v139/cherubin21a.html},
abstract = {Conformal Predictors (CP) are wrappers around ML models, providing error guarantees under weak assumptions on the data distribution. They are suitable for a wide range of problems, from classification and regression to anomaly detection. Unfortunately, their very high computational complexity limits their applicability to large datasets. In this work, we show that it is possible to speed up a CP classifier considerably, by studying it in conjunction with the underlying ML method, and by exploiting incremental&decremental learning. For methods such as k-NN, KDE, and kernel LS-SVM, our approach reduces the running time by one order of magnitude, whilst producing exact solutions. With similar ideas, we also achieve a linear speed up for the harder case of bootstrapping. Finally, we extend these techniques to improve upon an optimization of k-NN CP for regression. We evaluate our findings empirically, and discuss when methods are suitable for CP optimization.}
}
@InProceedings{pmlr-v139-cheshire21a,
title = {Problem Dependent View on Structured Thresholding Bandit Problems},
author = {Cheshire, James and Menard, Pierre and Carpentier, Alexandra},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1846--1854},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cheshire21a/cheshire21a.pdf},
url = {https://proceedings.mlr.press/v139/cheshire21a.html},
abstract = {We investigate the \textit{problem dependent regime} in the stochastic \emph{Thresholding Bandit problem} (\tbp) under several \emph{shape constraints}. In the \tbp the objective of the learner is to output, after interacting with the environment, the set of arms whose means are above a given threshold. The vanilla, unstructured, case is already well studied in the literature. Taking $K$ as the number of arms, we consider the case where (i) the sequence of arm’s means $(\mu_k){k=1}^K$ is monotonically increasing (\textit{MTBP}) and (ii) the case where $(\mu_k){k=1}^K$ is concave (\textit{CTBP}). We consider both cases in the \emph{problem dependent} regime and study the probability of error - i.e. the probability to mis-classify at least one arm. In the fixed budget setting, we provide nearly matching upper and lower bounds for the probability of error in both the concave and monotone settings, as well as associated algorithms. Of interest, is that for both the monotone and concave cases, optimal bounds on probability of error are of the same order as those for the two armed bandit problem.}
}
@InProceedings{pmlr-v139-cheung21a,
title = {Online Optimization in Games via Control Theory: Connecting Regret, Passivity and Poincar{é} Recurrence},
author = {Cheung, Yun Kuen and Piliouras, Georgios},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1855--1865},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cheung21a/cheung21a.pdf},
url = {https://proceedings.mlr.press/v139/cheung21a.html},
abstract = {We present a novel control-theoretic understanding of online optimization and learning in games, via the notion of passivity. Passivity is a fundamental concept in control theory, which abstracts energy conservation and dissipation in physical systems. It has become a standard tool in analysis of general feedback systems, to which game dynamics belong. Our starting point is to show that all continuous-time Follow-the-Regularized-Leader (FTRL) dynamics, which include the well-known Replicator Dynamic, are lossless, i.e. it is passive with no energy dissipation. Interestingly, we prove that passivity implies bounded regret, connecting two fundamental primitives of control theory and online optimization. The observation of energy conservation in FTRL inspires us to present a family of lossless learning dynamics, each of which has an underlying energy function with a simple gradient structure. This family is closed under convex combination; as an immediate corollary, any convex combination of FTRL dynamics is lossless and thus has bounded regret. This allows us to extend the framework of Fox & Shamma [Games 2013] to prove not just global asymptotic stability results for game dynamics, but Poincar{é} recurrence results as well. Intuitively, when a lossless game (e.g. graphical constant-sum game) is coupled with lossless learning dynamic, their interconnection is also lossless, which results in a pendulum-like energy-preserving recurrent behavior, generalizing Piliouras & Shamma [SODA 2014] and Mertikopoulos et al. [SODA 2018].}
}
@InProceedings{pmlr-v139-chi21a,
title = {Understanding and Mitigating Accuracy Disparity in Regression},
author = {Chi, Jianfeng and Tian, Yuan and Gordon, Geoffrey J. and Zhao, Han},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1866--1876},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chi21a/chi21a.pdf},
url = {https://proceedings.mlr.press/v139/chi21a.html},
abstract = {With the widespread deployment of large-scale prediction systems in high-stakes domains, e.g., face recognition, criminal justice, etc., disparity on prediction accuracy between different demographic subgroups has called for fundamental understanding on the source of such disparity and algorithmic intervention to mitigate it. In this paper, we study the accuracy disparity problem in regression. To begin with, we first propose an error decomposition theorem, which decomposes the accuracy disparity into the distance between marginal label distributions and the distance between conditional representations, to help explain why such accuracy disparity appears in practice. Motivated by this error decomposition and the general idea of distribution alignment with statistical distances, we then propose an algorithm to reduce this disparity, and analyze its game-theoretic optima of the proposed objective functions. To corroborate our theoretical findings, we also conduct experiments on five benchmark datasets. The experimental results suggest that our proposed algorithms can effectively mitigate accuracy disparity while maintaining the predictive power of the regression models.}
}
@InProceedings{pmlr-v139-chien21a,
title = {Private Alternating Least Squares: Practical Private Matrix Completion with Tighter Rates},
author = {Chien, Steve and Jain, Prateek and Krichene, Walid and Rendle, Steffen and Song, Shuang and Thakurta, Abhradeep and Zhang, Li},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1877--1887},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chien21a/chien21a.pdf},
url = {https://proceedings.mlr.press/v139/chien21a.html},
abstract = {We study the problem of differentially private (DP) matrix completion under user-level privacy. We design a joint differentially private variant of the popular Alternating-Least-Squares (ALS) method that achieves: i) (nearly) optimal sample complexity for matrix completion (in terms of number of items, users), and ii) the best known privacy/utility trade-off both theoretically, as well as on benchmark data sets. In particular, we provide the first global convergence analysis of ALS with noise introduced to ensure DP, and show that, in comparison to the best known alternative (the Private Frank-Wolfe algorithm by Jain et al. (2018)), our error bounds scale significantly better with respect to the number of items and users, which is critical in practical problems. Extensive validation on standard benchmarks demonstrate that the algorithm, in combination with carefully designed sampling procedures, is significantly more accurate than existing techniques, thus promising to be the first practical DP embedding model.}
}
@InProceedings{pmlr-v139-chierichetti21a,
title = {Light RUMs},
author = {Chierichetti, Flavio and Kumar, Ravi and Tomkins, Andrew},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1888--1897},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chierichetti21a/chierichetti21a.pdf},
url = {https://proceedings.mlr.press/v139/chierichetti21a.html},
abstract = {A Random Utility Model (RUM) is a distribution on permutations over a universe of items. For each subset of the universe, a RUM induces a natural distribution of the winner in the subset: choose a permutation according to the RUM distribution and pick the maximum item in the subset according to the chosen permutation. RUMs are widely used in the theory of discrete choice. In this paper we consider the question of the (lossy) compressibility of RUMs on a universe of size $n$, i.e., the minimum number of bits required to approximate the winning probabilities of each slate. Our main result is that RUMs can be approximated using $\tilde{O}(n^2)$ bits, an exponential improvement over the standard representation; furthermore, we show that this bound is optimal. En route, we sharpen the classical existential result of McFadden and Train (2000) by showing that the minimum size of a mixture of multinomial logits required to can approximate a general RUM is $\tilde{\Theta}(n)$.}
}
@InProceedings{pmlr-v139-chilkuri21a,
title = {Parallelizing Legendre Memory Unit Training},
author = {Chilkuri, Narsimha Reddy and Eliasmith, Chris},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1898--1907},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chilkuri21a/chilkuri21a.pdf},
url = {https://proceedings.mlr.press/v139/chilkuri21a.html},
abstract = {Recently, a new recurrent neural network (RNN) named the Legendre Memory Unit (LMU) was proposed and shown to achieve state-of-the-art performance on several benchmark datasets. Here we leverage the linear time-invariant (LTI) memory component of the LMU to construct a simplified variant that can be parallelized during training (and yet executed as an RNN during inference), resulting in up to 200 times faster training. We note that our efficient parallelizing scheme is general and is applicable to any deep network whose recurrent components are linear dynamical systems. We demonstrate the improved accuracy of our new architecture compared to the original LMU and a variety of published LSTM and transformer networks across seven benchmarks. For instance, our LMU sets a new state-of-the-art result on psMNIST, and uses half the parameters while outperforming DistilBERT and LSTM models on IMDB sentiment analysis.}
}
@InProceedings{pmlr-v139-chitra21a,
title = {Quantifying and Reducing Bias in Maximum Likelihood Estimation of Structured Anomalies},
author = {Chitra, Uthsav and Ding, Kimberly and Lee, Jasper C.H. and Raphael, Benjamin J},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1908--1919},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chitra21a/chitra21a.pdf},
url = {https://proceedings.mlr.press/v139/chitra21a.html},
abstract = {Anomaly estimation, or the problem of finding a subset of a dataset that differs from the rest of the dataset, is a classic problem in machine learning and data mining. In both theoretical work and in applications, the anomaly is assumed to have a specific structure defined by membership in an anomaly family. For example, in temporal data the anomaly family may be time intervals, while in network data the anomaly family may be connected subgraphs. The most prominent approach for anomaly estimation is to compute the Maximum Likelihood Estimator (MLE) of the anomaly; however, it was recently observed that for normally distributed data, the MLE is a biased estimator for some anomaly families. In this work, we demonstrate that in the normal means setting, the bias of the MLE depends on the size of the anomaly family. We prove that if the number of sets in the anomaly family that contain the anomaly is sub-exponential, then the MLE is asymptotically unbiased. We also provide empirical evidence that the converse is true: if the number of such sets is exponential, then the MLE is asymptotically biased. Our analysis unifies a number of earlier results on the bias of the MLE for specific anomaly families. Next, we derive a new anomaly estimator using a mixture model, and we prove that our anomaly estimator is asymptotically unbiased regardless of the size of the anomaly family. We illustrate the advantages of our estimator versus the MLE on disease outbreak data and highway traffic data.}
}
@InProceedings{pmlr-v139-chledowski21a,
title = {Robust Learning-Augmented Caching: An Experimental Study},
author = {Ch{\l}{\k{e}}dowski, Jakub and Polak, Adam and Szabucki, Bartosz and {\.Z}o{\l}na, Konrad Tomasz},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1920--1930},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chledowski21a/chledowski21a.pdf},
url = {https://proceedings.mlr.press/v139/chledowski21a.html},
abstract = {Effective caching is crucial for performance of modern-day computing systems. A key optimization problem arising in caching – which item to evict to make room for a new item – cannot be optimally solved without knowing the future. There are many classical approximation algorithms for this problem, but more recently researchers started to successfully apply machine learning to decide what to evict by discovering implicit input patterns and predicting the future. While machine learning typically does not provide any worst-case guarantees, the new field of learning-augmented algorithms proposes solutions which leverage classical online caching algorithms to make the machine-learned predictors robust. We are the first to comprehensively evaluate these learning-augmented algorithms on real-world caching datasets and state-of-the-art machine-learned predictors. We show that a straightforward method – blindly following either a predictor or a classical robust algorithm, and switching whenever one becomes worse than the other – has only a low overhead over a well-performing predictor, while competing with classical methods when the coupled predictor fails, thus providing a cheap worst-case insurance.}
}
@InProceedings{pmlr-v139-cho21a,
title = {Unifying Vision-and-Language Tasks via Text Generation},
author = {Cho, Jaemin and Lei, Jie and Tan, Hao and Bansal, Mohit},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1931--1942},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cho21a/cho21a.pdf},
url = {https://proceedings.mlr.press/v139/cho21a.html},
abstract = {Existing methods for vision-and-language learning typically require designing task-specific architectures and objectives for each task. For example, a multi-label answer classifier for visual question answering, a region scorer for referring expression comprehension, and a language decoder for image captioning, etc. To alleviate these hassles, in this work, we propose a unified framework that learns different tasks in a single architecture with the same language modeling objective, i.e., multimodal conditional text generation, where our models learn to generate labels in text based on the visual and textual inputs. On 7 popular vision-and-language benchmarks, including visual question answering, referring expression comprehension, visual commonsense reasoning, most of which have been previously modeled as discriminative tasks, our generative approach (with a single unified architecture) reaches comparable performance to recent task-specific state-of-the-art vision-and-language models. Moreover, our generative approach shows better generalization ability on questions that have rare answers. Also, we show that our framework allows multi-task learning in a single architecture with a single set of parameters, achieving similar performance to separately optimized single-task models. Our code is publicly available at: https://github.com/j-min/VL-T5}
}
@InProceedings{pmlr-v139-choi21a,
title = {Learning from Nested Data with Ornstein Auto-Encoders},
author = {Choi, Youngwon and Lee, Sungdong and Won, Joong-Ho},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1943--1952},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/choi21a/choi21a.pdf},
url = {https://proceedings.mlr.press/v139/choi21a.html},
abstract = {Many of real-world data, e.g., the VGGFace2 dataset, which is a collection of multiple portraits of individuals, come with nested structures due to grouped observation. The Ornstein auto-encoder (OAE) is an emerging framework for representation learning from nested data, based on an optimal transport distance between random processes. An attractive feature of OAE is its ability to generate new variations nested within an observational unit, whether or not the unit is known to the model. A previously proposed algorithm for OAE, termed the random-intercept OAE (RIOAE), showed an impressive performance in learning nested representations, yet lacks theoretical justification. In this work, we show that RIOAE minimizes a loose upper bound of the employed optimal transport distance. After identifying several issues with RIOAE, we present the product-space OAE (PSOAE) that minimizes a tighter upper bound of the distance and achieves orthogonality in the representation space. PSOAE alleviates the instability of RIOAE and provides more flexible representation of nested data. We demonstrate the high performance of PSOAE in the three key tasks of generative models: exemplar generation, style transfer, and new concept generation.}
}
@InProceedings{pmlr-v139-choi21b,
title = {Variational Empowerment as Representation Learning for Goal-Conditioned Reinforcement Learning},
author = {Choi, Jongwook and Sharma, Archit and Lee, Honglak and Levine, Sergey and Gu, Shixiang Shane},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1953--1963},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/choi21b/choi21b.pdf},
url = {https://proceedings.mlr.press/v139/choi21b.html},
abstract = {Learning to reach goal states and learning diverse skills through mutual information maximization have been proposed as principled frameworks for unsupervised reinforcement learning, allowing agents to acquire broadly applicable multi-task policies with minimal reward engineering. In this paper, we discuss how these two approaches {—} goal-conditioned RL (GCRL) and MI-based RL {—} can be generalized into a single family of methods, interpreting mutual information maximization and variational empowerment as representation learning methods that acquire function-ally aware state representations for goal reaching.Starting from a simple observation that the standard GCRL is encapsulated by the optimization objective of variational empowerment, we can derive novel variants of GCRL and variational empowerment under a single, unified optimization objective, such as adaptive-variance GCRL and linear-mapping GCRL, and study the characteristics of representation learning each variant provides. Furthermore, through the lens of GCRL, we show that adapting powerful techniques fromGCRL such as goal relabeling into the variationalMI context as well as proper regularization on the variational posterior provides substantial gains in algorithm performance, and propose a novel evaluation metric named latent goal reaching (LGR)as an objective measure for evaluating empowerment algorithms akin to goal-based RL. Through principled mathematical derivations and careful experimental validations, our work lays a novel foundation from which representation learning can be evaluated and analyzed in goal-based RL}
}
@InProceedings{pmlr-v139-choquette-choo21a,
title = {Label-Only Membership Inference Attacks},
author = {Choquette-Choo, Christopher A. and Tramer, Florian and Carlini, Nicholas and Papernot, Nicolas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1964--1974},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/choquette-choo21a/choquette-choo21a.pdf},
url = {https://proceedings.mlr.press/v139/choquette-choo21a.html},
abstract = {Membership inference is one of the simplest privacy threats faced by machine learning models that are trained on private sensitive data. In this attack, an adversary infers whether a particular point was used to train the model, or not, by observing the model’s predictions. Whereas current attack methods all require access to the model’s predicted confidence score, we introduce a label-only attack that instead evaluates the robustness of the model’s predicted (hard) labels under perturbations of the input, to infer membership. Our label-only attack is not only as-effective as attacks requiring access to confidence scores, it also demonstrates that a class of defenses against membership inference, which we call “confidence masking” because they obfuscate the confidence scores to thwart attacks, are insufficient to prevent the leakage of private information. Our experiments show that training with differential privacy or strong L2 regularization are the only current defenses that meaningfully decrease leakage of private information, even for points that are outliers of the training distribution.}
}
@InProceedings{pmlr-v139-chowdhury21a,
title = {Modeling Hierarchical Structures with Continuous Recursive Neural Networks},
author = {Chowdhury, Jishnu Ray and Caragea, Cornelia},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1975--1988},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chowdhury21a/chowdhury21a.pdf},
url = {https://proceedings.mlr.press/v139/chowdhury21a.html},
abstract = {Recursive Neural Networks (RvNNs), which compose sequences according to their underlying hierarchical syntactic structure, have performed well in several natural language processing tasks compared to similar models without structural biases. However, traditional RvNNs are incapable of inducing the latent structure in a plain text sequence on their own. Several extensions have been proposed to overcome this limitation. Nevertheless, these extensions tend to rely on surrogate gradients or reinforcement learning at the cost of higher bias or variance. In this work, we propose Continuous Recursive Neural Network (CRvNN) as a backpropagation-friendly alternative to address the aforementioned limitations. This is done by incorporating a continuous relaxation to the induced structure. We demonstrate that CRvNN achieves strong performance in challenging synthetic tasks such as logical inference (Bowman et al., 2015b) and ListOps (Nangia & Bowman, 2018). We also show that CRvNN performs comparably or better than prior latent structure models on real-world tasks such as sentiment analysis and natural language inference.}
}
@InProceedings{pmlr-v139-christianos21a,
title = {Scaling Multi-Agent Reinforcement Learning with Selective Parameter Sharing},
author = {Christianos, Filippos and Papoudakis, Georgios and Rahman, Muhammad A and Albrecht, Stefano V},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1989--1998},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/christianos21a/christianos21a.pdf},
url = {https://proceedings.mlr.press/v139/christianos21a.html},
abstract = {Sharing parameters in multi-agent deep reinforcement learning has played an essential role in allowing algorithms to scale to a large number of agents. Parameter sharing between agents significantly decreases the number of trainable parameters, shortening training times to tractable levels, and has been linked to more efficient learning. However, having all agents share the same parameters can also have a detrimental effect on learning. We demonstrate the impact of parameter sharing methods on training speed and converged returns, establishing that when applied indiscriminately, their effectiveness is highly dependent on the environment. We propose a novel method to automatically identify agents which may benefit from sharing parameters by partitioning them based on their abilities and goals. Our approach combines the increased sample efficiency of parameter sharing with the representational capacity of multiple independent networks to reduce training time and increase final returns.}
}
@InProceedings{pmlr-v139-chung21a,
title = {Beyond Variance Reduction: Understanding the True Impact of Baselines on Policy Optimization},
author = {Chung, Wesley and Thomas, Valentin and Machado, Marlos C. and Roux, Nicolas Le},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {1999--2009},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/chung21a/chung21a.pdf},
url = {https://proceedings.mlr.press/v139/chung21a.html},
abstract = {Bandit and reinforcement learning (RL) problems can often be framed as optimization problems where the goal is to maximize average performance while having access only to stochastic estimates of the true gradient. Traditionally, stochastic optimization theory predicts that learning dynamics are governed by the curvature of the loss function and the noise of the gradient estimates. In this paper we demonstrate that the standard view is too limited for bandit and RL problems. To allow our analysis to be interpreted in light of multi-step MDPs, we focus on techniques derived from stochastic optimization principles (e.g., natural policy gradient and EXP3) and we show that some standard assumptions from optimization theory are violated in these problems. We present theoretical results showing that, at least for bandit problems, curvature and noise are not sufficient to explain the learning dynamics and that seemingly innocuous choices like the baseline can determine whether an algorithm converges. These theoretical findings match our empirical evaluation, which we extend to multi-state MDPs.}
}
@InProceedings{pmlr-v139-clement21a,
title = {First-Order Methods for Wasserstein Distributionally Robust MDP},
author = {Clement, Julien Grand and Kroer, Christian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2010--2019},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/clement21a/clement21a.pdf},
url = {https://proceedings.mlr.press/v139/clement21a.html},
abstract = {Markov decision processes (MDPs) are known to be sensitive to parameter specification. Distributionally robust MDPs alleviate this issue by allowing for \textit{ambiguity sets} which give a set of possible distributions over parameter sets. The goal is to find an optimal policy with respect to the worst-case parameter distribution. We propose a framework for solving Distributionally robust MDPs via first-order methods, and instantiate it for several types of Wasserstein ambiguity sets. By developing efficient proximal updates, our algorithms achieve a convergence rate of $O\left(NA^{2.5}S^{3.5}\log(S)\log(\epsilon^{-1})\epsilon^{-1.5} \right)$ for the number of kernels $N$ in the support of the nominal distribution, states $S$, and actions $A$; this rate varies slightly based on the Wasserstein setup. Our dependence on $N,A$ and $S$ is significantly better than existing methods, which have a complexity of $O\left(N^{3.5}A^{3.5}S^{4.5}\log^{2}(\epsilon^{-1}) \right)$. Numerical experiments show that our algorithm is significantly more scalable than state-of-the-art approaches across several domains.}
}
@InProceedings{pmlr-v139-cobbe21a,
title = {Phasic Policy Gradient},
author = {Cobbe, Karl W and Hilton, Jacob and Klimov, Oleg and Schulman, John},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2020--2027},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cobbe21a/cobbe21a.pdf},
url = {https://proceedings.mlr.press/v139/cobbe21a.html},
abstract = {We introduce Phasic Policy Gradient (PPG), a reinforcement learning framework which modifies traditional on-policy actor-critic methods by separating policy and value function training into distinct phases. In prior methods, one must choose between using a shared network or separate networks to represent the policy and value function. Using separate networks avoids interference between objectives, while using a shared network allows useful features to be shared. PPG is able to achieve the best of both worlds by splitting optimization into two phases, one that advances training and one that distills features. PPG also enables the value function to be more aggressively optimized with a higher level of sample reuse. Compared to PPO, we find that PPG significantly improves sample efficiency on the challenging Procgen Benchmark.}
}
@InProceedings{pmlr-v139-cohen21a,
title = {Riemannian Convex Potential Maps},
author = {Cohen, Samuel and Amos, Brandon and Lipman, Yaron},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2028--2038},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cohen21a/cohen21a.pdf},
url = {https://proceedings.mlr.press/v139/cohen21a.html},
abstract = {Modeling distributions on Riemannian manifolds is a crucial component in understanding non-Euclidean data that arises, e.g., in physics and geology. The budding approaches in this space are limited by representational and computational tradeoffs. We propose and study a class of flows that uses convex potentials from Riemannian optimal transport. These are universal and can model distributions on any compact Riemannian manifold without requiring domain knowledge of the manifold to be integrated into the architecture. We demonstrate that these flows can model standard distributions on spheres, and tori, on synthetic and geological data.}
}
@InProceedings{pmlr-v139-cohen21b,
title = {Scaling Properties of Deep Residual Networks},
author = {Cohen, Alain-Sam and Cont, Rama and Rossier, Alain and Xu, Renyuan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2039--2048},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cohen21b/cohen21b.pdf},
url = {https://proceedings.mlr.press/v139/cohen21b.html},
abstract = {Residual networks (ResNets) have displayed impressive results in pattern recognition and, recently, have garnered considerable theoretical interest due to a perceived link with neural ordinary differential equations (neural ODEs). This link relies on the convergence of network weights to a smooth function as the number of layers increases. We investigate the properties of weights trained by stochastic gradient descent and their scaling with network depth through detailed numerical experiments. We observe the existence of scaling regimes markedly different from those assumed in neural ODE literature. Depending on certain features of the network architecture, such as the smoothness of the activation function, one may obtain an alternative ODE limit, a stochastic differential equation or neither of these. These findings cast doubts on the validity of the neural ODE model as an adequate asymptotic description of deep ResNets and point to an alternative class of differential equations as a better description of the deep network limit.}
}
@InProceedings{pmlr-v139-cohen21c,
title = {Differentially-Private Clustering of Easy Instances},
author = {Cohen, Edith and Kaplan, Haim and Mansour, Yishay and Stemmer, Uri and Tsfadia, Eliad},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2049--2059},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cohen21c/cohen21c.pdf},
url = {https://proceedings.mlr.press/v139/cohen21c.html},
abstract = {Clustering is a fundamental problem in data analysis. In differentially private clustering, the goal is to identify k cluster centers without disclosing information on individual data points. Despite significant research progress, the problem had so far resisted practical solutions. In this work we aim at providing simple implementable differentrially private clustering algorithms when the the data is "easy," e.g., when there exists a significant separation between the clusters. For the easy instances we consider, we have a simple implementation based on utilizing non-private clustering algorithms, and combining them privately. We are able to get improved sample complexity bounds in some cases of Gaussian mixtures and k-means. We complement our theoretical algorithms with experiments of simulated data.}
}
@InProceedings{pmlr-v139-cohen-addad21a,
title = {Improving Ultrametrics Embeddings Through Coresets},
author = {Cohen-Addad, Vincent and De Joannis De Verclos, R{\'e}mi and Lagarde, Guillaume},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2060--2068},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cohen-addad21a/cohen-addad21a.pdf},
url = {https://proceedings.mlr.press/v139/cohen-addad21a.html},
abstract = {To tackle the curse of dimensionality in data analysis and unsupervised learning, it is critical to be able to efficiently compute “simple” faithful representations of the data that helps extract information, improves understanding and visualization of the structure. When the dataset consists of $d$-dimensional vectors, simple representations of the data may consist in trees or ultrametrics, and the goal is to best preserve the distances (i.e.: dissimilarity values) between data elements. To circumvent the quadratic running times of the most popular methods for fitting ultrametrics, such as average, single, or complete linkage, \citet{CKL20} recently presented a new algorithm that for any $c \ge 1$, outputs in time $n^{1+O(1/c^2)}$ an ultrametric $\Delta$ such that for any two points $u, v$, $\Delta(u, v)$ is within a multiplicative factor of $5c$ to the distance between $u$ and $v$ in the “best” ultrametric representation. We improve the above result and show how to improve the above guarantee from $5c$ to $\sqrt{2}c + \varepsilon$ while achieving the same asymptotic running time. To complement the improved theoretical bound, we additionally show that the performances of our algorithm are significantly better for various real-world datasets.}
}
@InProceedings{pmlr-v139-cohen-addad21b,
title = {Correlation Clustering in Constant Many Parallel Rounds},
author = {Cohen-Addad, Vincent and Lattanzi, Silvio and Mitrovi{\'c}, Slobodan and Norouzi-Fard, Ashkan and Parotsidis, Nikos and Tarnawski, Jakub},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2069--2078},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cohen-addad21b/cohen-addad21b.pdf},
url = {https://proceedings.mlr.press/v139/cohen-addad21b.html},
abstract = {Correlation clustering is a central topic in unsupervised learning, with many applications in ML and data mining. In correlation clustering, one receives as input a signed graph and the goal is to partition it to minimize the number of disagreements. In this work we propose a massively parallel computation (MPC) algorithm for this problem that is considerably faster than prior work. In particular, our algorithm uses machines with memory sublinear in the number of nodes in the graph and returns a constant approximation while running only for a constant number of rounds. To the best of our knowledge, our algorithm is the first that can provably approximate a clustering problem using only a constant number of MPC rounds in the sublinear memory regime. We complement our analysis with an experimental scalability evaluation of our techniques.}
}
@InProceedings{pmlr-v139-collas21a,
title = {Concentric mixtures of Mallows models for top-$k$ rankings: sampling and identifiability},
author = {Collas, Fabien and Irurozki, Ekhine},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2079--2088},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/collas21a/collas21a.pdf},
url = {https://proceedings.mlr.press/v139/collas21a.html},
abstract = {In this paper, we study mixtures of two Mallows models for top-$k$ rankings with equal location parameters but with different scale parameters (a mixture of concentric Mallows models). These models arise when we have a heterogeneous population of voters formed by two populations, one of which is a subpopulation of expert voters. We show the identifiability of both components and the learnability of their respective parameters. These results are based upon, first, bounding the sample complexity for the Borda algorithm with top-$k$ rankings. Second, we characterize the distances between rankings, showing that an off-the-shelf clustering algorithm separates the rankings by components with high probability -provided the scales are well-separated.As a by-product, we include an efficient sampling algorithm for Mallows top-$k$ rankings. Finally, since the rank aggregation will suffer from a large amount of noise introduced by the non-expert voters, we adapt the Borda algorithm to be able to recover the ground truth consensus ranking which is especially consistent with the expert rankings.}
}
@InProceedings{pmlr-v139-collins21a,
title = {Exploiting Shared Representations for Personalized Federated Learning},
author = {Collins, Liam and Hassani, Hamed and Mokhtari, Aryan and Shakkottai, Sanjay},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2089--2099},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/collins21a/collins21a.pdf},
url = {https://proceedings.mlr.press/v139/collins21a.html},
abstract = {Deep neural networks have shown the ability to extract universal feature representations from data such as images and text that have been useful for a variety of learning tasks. However, the fruits of representation learning have yet to be fully-realized in federated settings. Although data in federated settings is often non-i.i.d. across clients, the success of centralized deep learning suggests that data often shares a global {\em feature representation}, while the statistical heterogeneity across clients or tasks is concentrated in the {\em labels}. Based on this intuition, we propose a novel federated learning framework and algorithm for learning a shared data representation across clients and unique local heads for each client. Our algorithm harnesses the distributed computational power across clients to perform many local-updates with respect to the low-dimensional local parameters for every update of the representation. We prove that this method obtains linear convergence to the ground-truth representation with near-optimal sample complexity in a linear setting, demonstrating that it can efficiently reduce the problem dimension for each client. Further, we provide extensive experimental results demonstrating the improvement of our method over alternative personalized federated learning approaches in heterogeneous settings.}
}
@InProceedings{pmlr-v139-corenflos21a,
title = {Differentiable Particle Filtering via Entropy-Regularized Optimal Transport},
author = {Corenflos, Adrien and Thornton, James and Deligiannidis, George and Doucet, Arnaud},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2100--2111},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/corenflos21a/corenflos21a.pdf},
url = {https://proceedings.mlr.press/v139/corenflos21a.html},
abstract = {Particle Filtering (PF) methods are an established class of procedures for performing inference in non-linear state-space models. Resampling is a key ingredient of PF necessary to obtain low variance likelihood and states estimates. However, traditional resampling methods result in PF-based loss functions being non-differentiable with respect to model and PF parameters. In a variational inference context, resampling also yields high variance gradient estimates of the PF-based evidence lower bound. By leveraging optimal transport ideas, we introduce a principled differentiable particle filter and provide convergence results. We demonstrate this novel method on a variety of applications.}
}
@InProceedings{pmlr-v139-correa21a,
title = {Fairness and Bias in Online Selection},
author = {Correa, Jose and Cristi, Andres and Duetting, Paul and Norouzi-Fard, Ashkan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2112--2121},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/correa21a/correa21a.pdf},
url = {https://proceedings.mlr.press/v139/correa21a.html},
abstract = {There is growing awareness and concern about fairness in machine learning and algorithm design. This is particularly true in online selection problems where decisions are often biased, for example, when assessing credit risks or hiring staff. We address the issues of fairness and bias in online selection by introducing multi-color versions of the classic secretary and prophet problem. Interestingly, existing algorithms for these problems are either very unfair or very inefficient, so we develop optimal fair algorithms for these new problems and provide tight bounds on their competitiveness. We validate our theoretical findings on real-world data.}
}
@InProceedings{pmlr-v139-cortes21a,
title = {Relative Deviation Margin Bounds},
author = {Cortes, Corinna and Mohri, Mehryar and Suresh, Ananda Theertha},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2122--2131},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cortes21a/cortes21a.pdf},
url = {https://proceedings.mlr.press/v139/cortes21a.html},
abstract = {We present a series of new and more favorable margin-based learning guarantees that depend on the empirical margin loss of a predictor. e give two types of learning bounds, in terms of either the Rademacher complexity or the empirical $\ell_\infty$-covering number of the hypothesis set used, both distribution-dependent and valid for general families. Furthermore, using our relative deviation margin bounds, we derive distribution-dependent generalization bounds for unbounded loss functions under the assumption of a finite moment. We also briefly highlight several applications of these bounds and discuss their connection with existing results.}
}
@InProceedings{pmlr-v139-cortes21b,
title = {A Discriminative Technique for Multiple-Source Adaptation},
author = {Cortes, Corinna and Mohri, Mehryar and Suresh, Ananda Theertha and Zhang, Ningshan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2132--2143},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cortes21b/cortes21b.pdf},
url = {https://proceedings.mlr.press/v139/cortes21b.html},
abstract = {We present a new discriminative technique for the multiple-source adaptation (MSA) problem. Unlike previous work, which relies on density estimation for each source domain, our solution only requires conditional probabilities that can be straightforwardly accurately estimated from unlabeled data from the source domains. We give a detailed analysis of our new technique, including general guarantees based on Rényi divergences, and learning bounds when conditional Maxent is used for estimating conditional probabilities for a point to belong to a source domain. We show that these guarantees compare favorably to those that can be derived for the generative solution, using kernel density estimation. Our experiments with real-world applications further demonstrate that our new discriminative MSA algorithm outperforms the previous generative solution as well as other domain adaptation baselines.}
}
@InProceedings{pmlr-v139-coston21a,
title = {Characterizing Fairness Over the Set of Good Models Under Selective Labels},
author = {Coston, Amanda and Rambachan, Ashesh and Chouldechova, Alexandra},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2144--2155},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/coston21a/coston21a.pdf},
url = {https://proceedings.mlr.press/v139/coston21a.html},
abstract = {Algorithmic risk assessments are used to inform decisions in a wide variety of high-stakes settings. Often multiple predictive models deliver similar overall performance but differ markedly in their predictions for individual cases, an empirical phenomenon known as the “Rashomon Effect.” These models may have different properties over various groups, and therefore have different predictive fairness properties. We develop a framework for characterizing predictive fairness properties over the set of models that deliver similar overall performance, or “the set of good models.” Our framework addresses the empirically relevant challenge of selectively labelled data in the setting where the selection decision and outcome are unconfounded given the observed data features. Our framework can be used to 1) audit for predictive bias; or 2) replace an existing model with one that has better fairness properties. We illustrate these use cases on a recidivism prediction task and a real-world credit-scoring task.}
}
@InProceedings{pmlr-v139-couillet21a,
title = {Two-way kernel matrix puncturing: towards resource-efficient PCA and spectral clustering},
author = {Couillet, Romain and Chatelain, Florent and Bihan, Nicolas Le},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2156--2165},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/couillet21a/couillet21a.pdf},
url = {https://proceedings.mlr.press/v139/couillet21a.html},
abstract = {The article introduces an elementary cost and storage reduction method for spectral clustering and principal component analysis. The method consists in randomly “puncturing” both the data matrix $X\in\mathbb{C}^{p\times n}$ (or $\mathbb{R}^{p\times n}$) and its corresponding kernel (Gram) matrix $K$ through Bernoulli masks: $S\in\{0,1\}^{p\times n}$ for $X$ and $B\in\{0,1\}^{n\times n}$ for $K$. The resulting “two-way punctured” kernel is thus given by $K=\frac1p[(X\odot S)^\H (X\odot S)]\odot B$. We demonstrate that, for $X$ composed of independent columns drawn from a Gaussian mixture model, as $n,p\to\infty$ with $p/n\to c_0\in(0,\infty)$, the spectral behavior of $K$ – its limiting eigenvalue distribution, as well as its isolated eigenvalues and eigenvectors – is fully tractable and exhibits a series of counter-intuitive phenomena. We notably prove, and empirically confirm on various image databases, that it is possible to drastically puncture the data, thereby providing possibly huge computational and storage gains, for a virtually constant (clustering or PCA) performance. This preliminary study opens as such the path towards rethinking, from a large dimensional standpoint, computational and storage costs in elementary machine learning models.}
}
@InProceedings{pmlr-v139-crabbe21a,
title = {Explaining Time Series Predictions with Dynamic Masks},
author = {Crabb{\'e}, Jonathan and Van Der Schaar, Mihaela},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2166--2177},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/crabbe21a/crabbe21a.pdf},
url = {https://proceedings.mlr.press/v139/crabbe21a.html},
abstract = {How can we explain the predictions of a machine learning model? When the data is structured as a multivariate time series, this question induces additional difficulties such as the necessity for the explanation to embody the time dependency and the large number of inputs. To address these challenges, we propose dynamic masks (Dynamask). This method produces instance-wise importance scores for each feature at each time step by fitting a perturbation mask to the input sequence. In order to incorporate the time dependency of the data, Dynamask studies the effects of dynamic perturbation operators. In order to tackle the large number of inputs, we propose a scheme to make the feature selection parsimonious (to select no more feature than necessary) and legible (a notion that we detail by making a parallel with information theory). With synthetic and real-world data, we demonstrate that the dynamic underpinning of Dynamask, together with its parsimony, offer a neat improvement in the identification of feature importance over time. The modularity of Dynamask makes it ideal as a plug-in to increase the transparency of a wide range of machine learning models in areas such as medicine and finance, where time series are abundant.}
}
@InProceedings{pmlr-v139-cranko21a,
title = {Generalised Lipschitz Regularisation Equals Distributional Robustness},
author = {Cranko, Zac and Shi, Zhan and Zhang, Xinhua and Nock, Richard and Kornblith, Simon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2178--2188},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cranko21a/cranko21a.pdf},
url = {https://proceedings.mlr.press/v139/cranko21a.html},
abstract = {The problem of adversarial examples has highlighted the need for a theory of regularisation that is general enough to apply to exotic function classes, such as universal approximators. In response, we have been able to significantly sharpen existing results regarding the relationship between distributional robustness and regularisation, when defined with a transportation cost uncertainty set. The theory allows us to characterise the conditions under which the distributional robustness equals a Lipschitz-regularised model, and to tightly quantify, for the first time, the slackness under very mild assumptions. As a theoretical application we show a new result explicating the connection between adversarial learning and distributional robustness. We then give new results for how to achieve Lipschitz regularisation of kernel classifiers, which are demonstrated experimentally.}
}
@InProceedings{pmlr-v139-creager21a,
title = {Environment Inference for Invariant Learning},
author = {Creager, Elliot and Jacobsen, Joern-Henrik and Zemel, Richard},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2189--2200},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/creager21a/creager21a.pdf},
url = {https://proceedings.mlr.press/v139/creager21a.html},
abstract = {Learning models that gracefully handle distribution shifts is central to research on domain generalization, robust optimization, and fairness. A promising formulation is domain-invariant learning, which identifies the key issue of learning which features are domain-specific versus domain-invariant. An important assumption in this area is that the training examples are partitioned into “domains” or “environments”. Our focus is on the more common setting where such partitions are not provided. We propose EIIL, a general framework for domain-invariant learning that incorporates Environment Inference to directly infer partitions that are maximally informative for downstream Invariant Learning. We show that EIIL outperforms invariant learning methods on the CMNIST benchmark without using environment labels, and significantly outperforms ERM on worst-group performance in the Waterbirds dataset. Finally, we establish connections between EIIL and algorithmic fairness, which enables EIIL to improve accuracy and calibration in a fair prediction problem.}
}
@InProceedings{pmlr-v139-croce21a,
title = {Mind the Box: $l_1$-APGD for Sparse Adversarial Attacks on Image Classifiers},
author = {Croce, Francesco and Hein, Matthias},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2201--2211},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/croce21a/croce21a.pdf},
url = {https://proceedings.mlr.press/v139/croce21a.html},
abstract = {We show that when taking into account also the image domain $[0,1]^d$, established $l_1$-projected gradient descent (PGD) attacks are suboptimal as they do not consider that the effective threat model is the intersection of the $l_1$-ball and $[0,1]^d$. We study the expected sparsity of the steepest descent step for this effective threat model and show that the exact projection onto this set is computationally feasible and yields better performance. Moreover, we propose an adaptive form of PGD which is highly effective even with a small budget of iterations. Our resulting $l_1$-APGD is a strong white-box attack showing that prior works overestimated their $l_1$-robustness. Using $l_1$-APGD for adversarial training we get a robust classifier with SOTA $l_1$-robustness. Finally, we combine $l_1$-APGD and an adaptation of the Square Attack to $l_1$ into $l_1$-AutoAttack, an ensemble of attacks which reliably assesses adversarial robustness for the threat model of $l_1$-ball intersected with $[0,1]^d$.}
}
@InProceedings{pmlr-v139-cui21a,
title = {Parameterless Transductive Feature Re-representation for Few-Shot Learning},
author = {Cui, Wentao and Guo, Yuhong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2212--2221},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cui21a/cui21a.pdf},
url = {https://proceedings.mlr.press/v139/cui21a.html},
abstract = {Recent literature in few-shot learning (FSL) has shown that transductive methods often outperform their inductive counterparts. However, most transductive solutions, particularly the meta-learning based ones, require inserting trainable parameters on top of some inductive baselines to facilitate transduction. In this paper, we propose a parameterless transductive feature re-representation framework that differs from all existing solutions from the following perspectives. (1) It is widely compatible with existing FSL methods, including meta-learning and fine tuning based models. (2) The framework is simple and introduces no extra training parameters when applied to any architecture. We conduct experiments on three benchmark datasets by applying the framework to both representative meta-learning baselines and state-of-the-art FSL methods. Our framework consistently improves performances in all experiments and refreshes the state-of-the-art FSL results.}
}
@InProceedings{pmlr-v139-cui21b,
title = {Randomized Algorithms for Submodular Function Maximization with a $k$-System Constraint},
author = {Cui, Shuang and Han, Kai and Zhu, Tianshuai and Tang, Jing and Wu, Benwei and Huang, He},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2222--2232},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cui21b/cui21b.pdf},
url = {https://proceedings.mlr.press/v139/cui21b.html},
abstract = {Submodular optimization has numerous applications such as crowdsourcing and viral marketing. In this paper, we study the problem of non-negative submodular function maximization subject to a $k$-system constraint, which generalizes many other important constraints in submodular optimization such as cardinality constraint, matroid constraint, and $k$-extendible system constraint. The existing approaches for this problem are all based on deterministic algorithmic frameworks, and the best approximation ratio achieved by these algorithms (for a general submodular function) is $k+2\sqrt{k+2}+3$. We propose a randomized algorithm with an improved approximation ratio of $(1+\sqrt{k})^2$, while achieving nearly-linear time complexity significantly lower than that of the state-of-the-art algorithm. We also show that our algorithm can be further generalized to address a stochastic case where the elements can be adaptively selected, and propose an approximation ratio of $(1+\sqrt{k+1})^2$ for the adaptive optimization case. The empirical performance of our algorithms is extensively evaluated in several applications related to data mining and social computing, and the experimental results demonstrate the superiorities of our algorithms in terms of both utility and efficiency.}
}
@InProceedings{pmlr-v139-cui21c,
title = {GBHT: Gradient Boosting Histogram Transform for Density Estimation},
author = {Cui, Jingyi and Hang, Hanyuan and Wang, Yisen and Lin, Zhouchen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2233--2243},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cui21c/cui21c.pdf},
url = {https://proceedings.mlr.press/v139/cui21c.html},
abstract = {In this paper, we propose a density estimation algorithm called \textit{Gradient Boosting Histogram Transform} (GBHT), where we adopt the \textit{Negative Log Likelihood} as the loss function to make the boosting procedure available for the unsupervised tasks. From a learning theory viewpoint, we first prove fast convergence rates for GBHT with the smoothness assumption that the underlying density function lies in the space $C^{0,\alpha}$. Then when the target density function lies in spaces $C^{1,\alpha}$, we present an upper bound for GBHT which is smaller than the lower bound of its corresponding base learner, in the sense of convergence rates. To the best of our knowledge, we make the first attempt to theoretically explain why boosting can enhance the performance of its base learners for density estimation problems. In experiments, we not only conduct performance comparisons with the widely used KDE, but also apply GBHT to anomaly detection to showcase a further application of GBHT.}
}
@InProceedings{pmlr-v139-cummins21a,
title = {ProGraML: A Graph-based Program Representation for Data Flow Analysis and Compiler Optimizations},
author = {Cummins, Chris and Fisches, Zacharias V. and Ben-Nun, Tal and Hoefler, Torsten and O'Boyle, Michael F P and Leather, Hugh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2244--2253},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cummins21a/cummins21a.pdf},
url = {https://proceedings.mlr.press/v139/cummins21a.html},
abstract = {Machine learning (ML) is increasingly seen as a viable approach for building compiler optimization heuristics, but many ML methods cannot replicate even the simplest of the data flow analyses that are critical to making good optimization decisions. We posit that if ML cannot do that, then it is insufficiently able to reason about programs. We formulate data flow analyses as supervised learning tasks and introduce a large open dataset of programs and their corresponding labels from several analyses. We use this dataset to benchmark ML methods and show that they struggle on these fundamental program reasoning tasks. We propose ProGraML - Program Graphs for Machine Learning - a language-independent, portable representation of program semantics. ProGraML overcomes the limitations of prior works and yields improved performance on downstream optimization tasks.}
}
@InProceedings{pmlr-v139-curi21a,
title = {Combining Pessimism with Optimism for Robust and Efficient Model-Based Deep Reinforcement Learning},
author = {Curi, Sebastian and Bogunovic, Ilija and Krause, Andreas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2254--2264},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/curi21a/curi21a.pdf},
url = {https://proceedings.mlr.press/v139/curi21a.html},
abstract = {In real-world tasks, reinforcement learning (RL) agents frequently encounter situations that are not present during training time. To ensure reliable performance, the RL agents need to exhibit robustness to such worst-case situations. The robust-RL framework addresses this challenge via a minimax optimization between an agent and an adversary. Previous robust RL algorithms are either sample inefficient, lack robustness guarantees, or do not scale to large problems. We propose the Robust Hallucinated Upper-Confidence RL (RH-UCRL) algorithm to provably solve this problem while attaining near-optimal sample complexity guarantees. RH-UCRL is a model-based reinforcement learning (MBRL) algorithm that effectively distinguishes between epistemic and aleatoric uncertainty and efficiently explores both the agent and the adversary decision spaces during policy learning. We scale RH-UCRL to complex tasks via neural networks ensemble models as well as neural network policies. Experimentally we demonstrate that RH-UCRL outperforms other robust deep RL algorithms in a variety of adversarial environments.}
}
@InProceedings{pmlr-v139-curmei21a,
title = {Quantifying Availability and Discovery in Recommender Systems via Stochastic Reachability},
author = {Curmei, Mihaela and Dean, Sarah and Recht, Benjamin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2265--2275},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/curmei21a/curmei21a.pdf},
url = {https://proceedings.mlr.press/v139/curmei21a.html},
abstract = {In this work, we consider how preference models in interactive recommendation systems determine the availability of content and users’ opportunities for discovery. We propose an evaluation procedure based on stochastic reachability to quantify the maximum probability of recommending a target piece of content to an user for a set of allowable strategic modifications. This framework allows us to compute an upper bound on the likelihood of recommendation with minimal assumptions about user behavior. Stochastic reachability can be used to detect biases in the availability of content and diagnose limitations in the opportunities for discovery granted to users. We show that this metric can be computed efficiently as a convex program for a variety of practical settings, and further argue that reachability is not inherently at odds with accuracy. We demonstrate evaluations of recommendation algorithms trained on large datasets of explicit and implicit ratings. Our results illustrate how preference models, selection rules, and user interventions impact reachability and how these effects can be distributed unevenly.}
}
@InProceedings{pmlr-v139-cutkosky21a,
title = {Dynamic Balancing for Model Selection in Bandits and RL},
author = {Cutkosky, Ashok and Dann, Christoph and Das, Abhimanyu and Gentile, Claudio and Pacchiano, Aldo and Purohit, Manish},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2276--2285},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/cutkosky21a/cutkosky21a.pdf},
url = {https://proceedings.mlr.press/v139/cutkosky21a.html},
abstract = {We propose a framework for model selection by combining base algorithms in stochastic bandits and reinforcement learning. We require a candidate regret bound for each base algorithm that may or may not hold. We select base algorithms to play in each round using a “balancing condition” on the candidate regret bounds. Our approach simultaneously recovers previous worst-case regret bounds, while also obtaining much smaller regret in natural scenarios when some base learners significantly exceed their candidate bounds. Our framework is relevant in many settings, including linear bandits and MDPs with nested function classes, linear bandits with unknown misspecification, and tuning confidence parameters of algorithms such as LinUCB. Moreover, unlike recent efforts in model selection for linear stochastic bandits, our approach can be extended to consider adversarial rather than stochastic contexts.}
}
@InProceedings{pmlr-v139-d-ascoli21a,
title = {ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases},
author = {D'Ascoli, St{\'e}phane and Touvron, Hugo and Leavitt, Matthew L and Morcos, Ari S and Biroli, Giulio and Sagun, Levent},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2286--2296},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/d-ascoli21a/d-ascoli21a.pdf},
url = {https://proceedings.mlr.press/v139/d-ascoli21a.html},
abstract = {Convolutional architectures have proven extremely successful for vision tasks. Their hard inductive biases enable sample-efficient learning, but come at the cost of a potentially lower performance ceiling. Vision Transformers (ViTs) rely on more flexible self-attention layers, and have recently outperformed CNNs for image classification. However, they require costly pre-training on large external datasets or distillation from pre-trained convolutional networks. In this paper, we ask the following question: is it possible to combine the strengths of these two architectures while avoiding their respective limitations? To this end, we introduce gated positional self-attention (GPSA), a form of positional self-attention which can be equipped with a “soft" convolutional inductive bias. We initialise the GPSA layers to mimic the locality of convolutional layers, then give each attention head the freedom to escape locality by adjusting a gating parameter regulating the attention paid to position versus content information. The resulting convolutional-like ViT architecture, ConViT, outperforms the DeiT on ImageNet, while offering a much improved sample efficiency. We further investigate the role of locality in learning by first quantifying how it is encouraged in vanilla self-attention layers, then analysing how it is escaped in GPSA layers. We conclude by presenting various ablations to better understand the success of the ConViT. Our code and models are released publicly at https://github.com/facebookresearch/convit.}
}
@InProceedings{pmlr-v139-d-orsi21a,
title = {Consistent regression when oblivious outliers overwhelm},
author = {D'Orsi, Tommaso and Novikov, Gleb and Steurer, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2297--2306},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/d-orsi21a/d-orsi21a.pdf},
url = {https://proceedings.mlr.press/v139/d-orsi21a.html},
abstract = {We consider a robust linear regression model $y=X\beta^* + \eta$, where an adversary oblivious to the design $X\in \mathbb{R}^{n\times d}$ may choose $\eta$ to corrupt all but an $\alpha$ fraction of the observations $y$ in an arbitrary way. Prior to our work, even for Gaussian $X$, no estimator for $\beta^*$ was known to be consistent in this model except for quadratic sample size $n \gtrsim (d/\alpha)^2$ or for logarithmic inlier fraction $\alpha\ge 1/\log n$. We show that consistent estimation is possible with nearly linear sample size and inverse-polynomial inlier fraction. Concretely, we show that the Huber loss estimator is consistent for every sample size $n= \omega(d/\alpha^2)$ and achieves an error rate of $O(d/\alpha^2n)^{1/2}$ (both bounds are optimal up to constant factors). Our results extend to designs far beyond the Gaussian case and only require the column span of $X$ to not contain approximately sparse vectors (similar to the kind of assumption commonly made about the kernel space for compressed sensing). We provide two technically similar proofs. One proof is phrased in terms of strong convexity, extending work of [Tsakonas et al. ’14], and particularly short. The other proof highlights a connection between the Huber loss estimator and high-dimensional median computations. In the special case of Gaussian designs, this connection leads us to a strikingly simple algorithm based on computing coordinate-wise medians that achieves nearly optimal guarantees in linear time, and that can exploit sparsity of $\beta^*$. The model studied here also captures heavy-tailed noise distributions that may not even have a first moment.}
}
@InProceedings{pmlr-v139-dadashi21a,
title = {Offline Reinforcement Learning with Pseudometric Learning},
author = {Dadashi, Robert and Rezaeifar, Shideh and Vieillard, Nino and Hussenot, L{\'e}onard and Pietquin, Olivier and Geist, Matthieu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2307--2318},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dadashi21a/dadashi21a.pdf},
url = {https://proceedings.mlr.press/v139/dadashi21a.html},
abstract = {Offline Reinforcement Learning methods seek to learn a policy from logged transitions of an environment, without any interaction. In the presence of function approximation, and under the assumption of limited coverage of the state-action space of the environment, it is necessary to enforce the policy to visit state-action pairs close to the support of logged transitions. In this work, we propose an iterative procedure to learn a pseudometric (closely related to bisimulation metrics) from logged transitions, and use it to define this notion of closeness. We show its convergence and extend it to the function approximation setting. We then use this pseudometric to define a new lookup based bonus in an actor-critic algorithm: PLOFF. This bonus encourages the actor to stay close, in terms of the defined pseudometric, to the support of logged transitions. Finally, we evaluate the method on hand manipulation and locomotion tasks.}
}
@InProceedings{pmlr-v139-daghaghi21a,
title = {A Tale of Two Efficient and Informative Negative Sampling Distributions},
author = {Daghaghi, Shabnam and Medini, Tharun and Meisburger, Nicholas and Chen, Beidi and Zhao, Mengnan and Shrivastava, Anshumali},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2319--2329},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/daghaghi21a/daghaghi21a.pdf},
url = {https://proceedings.mlr.press/v139/daghaghi21a.html},
abstract = {Softmax classifiers with a very large number of classes naturally occur in many applications such as natural language processing and information retrieval. The calculation of full softmax is costly from the computational and energy perspective. There have been various sampling approaches to overcome this challenge, popularly known as negative sampling (NS). Ideally, NS should sample negative classes from a distribution that is dependent on the input data, the current parameters, and the correct positive class. Unfortunately, due to the dynamically updated parameters and data samples, there is no sampling scheme that is provably adaptive and samples the negative classes efficiently. Therefore, alternative heuristics like random sampling, static frequency-based sampling, or learning-based biased sampling, which primarily trade either the sampling cost or the adaptivity of samples per iteration are adopted. In this paper, we show two classes of distributions where the sampling scheme is truly adaptive and provably generates negative samples in near-constant time. Our implementation in C++ on CPU is significantly superior, both in terms of wall-clock time and accuracy, compared to the most optimized TensorFlow implementations of other popular negative sampling approaches on powerful NVIDIA V100 GPU.}
}
@InProceedings{pmlr-v139-dahiya21a,
title = {SiameseXML: Siamese Networks meet Extreme Classifiers with 100M Labels},
author = {Dahiya, Kunal and Agarwal, Ananye and Saini, Deepak and K, Gururaj and Jiao, Jian and Singh, Amit and Agarwal, Sumeet and Kar, Purushottam and Varma, Manik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2330--2340},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dahiya21a/dahiya21a.pdf},
url = {https://proceedings.mlr.press/v139/dahiya21a.html},
abstract = {Deep extreme multi-label learning (XML) requires training deep architectures that can tag a data point with its most relevant subset of labels from an extremely large label set. XML applications such as ad and product recommendation involve labels rarely seen during training but which nevertheless hold the key to recommendations that delight users. Effective utilization of label metadata and high quality predictions for rare labels at the scale of millions of labels are thus key challenges in contemporary XML research. To address these, this paper develops the SiameseXML framework based on a novel probabilistic model that naturally motivates a modular approach melding Siamese architectures with high-capacity extreme classifiers, and a training pipeline that effortlessly scales to tasks with 100 million labels. SiameseXML offers predictions 2–13% more accurate than leading XML methods on public benchmark datasets, as well as in live A/B tests on the Bing search engine, it offers significant gains in click-through-rates, coverage, revenue and other online metrics over state-of-the-art techniques currently in production. Code for SiameseXML is available at https://github.com/Extreme-classification/siamesexml}
}
@InProceedings{pmlr-v139-dahiya21b,
title = {Fixed-Parameter and Approximation Algorithms for PCA with Outliers},
author = {Dahiya, Yogesh and Fomin, Fedor and Panolan, Fahad and Simonov, Kirill},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2341--2351},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dahiya21b/dahiya21b.pdf},
url = {https://proceedings.mlr.press/v139/dahiya21b.html},
abstract = {PCA with Outliers is the fundamental problem of identifying an underlying low-dimensional subspace in a data set corrupted with outliers. A large body of work is devoted to the information-theoretic aspects of this problem. However, from the computational perspective, its complexity is still not well-understood. We study this problem from the perspective of parameterized complexity by investigating how parameters like the dimension of the data, the subspace dimension, the number of outliers and their structure, and approximation error, influence the computational complexity of the problem. Our algorithmic methods are based on techniques of randomized linear algebra and algebraic geometry.}
}
@InProceedings{pmlr-v139-dai21a,
title = {Sliced Iterative Normalizing Flows},
author = {Dai, Biwei and Seljak, Uros},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2352--2364},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dai21a/dai21a.pdf},
url = {https://proceedings.mlr.press/v139/dai21a.html},
abstract = {We develop an iterative (greedy) deep learning (DL) algorithm which is able to transform an arbitrary probability distribution function (PDF) into the target PDF. The model is based on iterative Optimal Transport of a series of 1D slices, matching on each slice the marginal PDF to the target. The axes of the orthogonal slices are chosen to maximize the PDF difference using Wasserstein distance at each iteration, which enables the algorithm to scale well to high dimensions. As special cases of this algorithm, we introduce two sliced iterative Normalizing Flow (SINF) models, which map from the data to the latent space (GIS) and vice versa (SIG). We show that SIG is able to generate high quality samples of image datasets, which match the GAN benchmarks, while GIS obtains competitive results on density estimation tasks compared to the density trained NFs, and is more stable, faster, and achieves higher p(x) when trained on small training sets. SINF approach deviates significantly from the current DL paradigm, as it is greedy and does not use concepts such as mini-batching, stochastic gradient descent and gradient back-propagation through deep layers.}
}
@InProceedings{pmlr-v139-dam21a,
title = {Convex Regularization in Monte-Carlo Tree Search},
author = {Dam, Tuan Q and D'Eramo, Carlo and Peters, Jan and Pajarinen, Joni},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2365--2375},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dam21a/dam21a.pdf},
url = {https://proceedings.mlr.press/v139/dam21a.html},
abstract = {Monte-Carlo planning and Reinforcement Learning (RL) are essential to sequential decision making. The recent AlphaGo and AlphaZero algorithms have shown how to successfully combine these two paradigms to solve large-scale sequential decision problems. These methodologies exploit a variant of the well-known UCT algorithm to trade off the exploitation of good actions and the exploration of unvisited states, but their empirical success comes at the cost of poor sample-efficiency and high computation time. In this paper, we overcome these limitations by introducing the use of convex regularization in Monte-Carlo Tree Search (MCTS) to drive exploration efficiently and to improve policy updates. First, we introduce a unifying theory on the use of generic convex regularizers in MCTS, deriving the first regret analysis of regularized MCTS and showing that it guarantees an exponential convergence rate. Second, we exploit our theoretical framework to introduce novel regularized backup operators for MCTS, based on the relative entropy of the policy update and, more importantly, on the Tsallis entropy of the policy, for which we prove superior theoretical guarantees. We empirically verify the consequence of our theoretical results on a toy problem. Finally, we show how our framework can easily be incorporated in AlphaGo and we empirically show the superiority of convex regularization, w.r.t. representative baselines, on well-known RL problems across several Atari games.}
}
@InProceedings{pmlr-v139-dance21a,
title = {Demonstration-Conditioned Reinforcement Learning for Few-Shot Imitation},
author = {Dance, Christopher R. and Perez, Julien and Cachet, Th{\'e}o},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2376--2387},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dance21a/dance21a.pdf},
url = {https://proceedings.mlr.press/v139/dance21a.html},
abstract = {In few-shot imitation, an agent is given a few demonstrations of a previously unseen task, and must then successfully perform that task. We propose a novel approach to learning few-shot-imitation agents that we call demonstration-conditioned reinforcement learning (DCRL). Given a training set consisting of demonstrations, reward functions and transition distributions for multiple tasks, the idea is to work with a policy that takes demonstrations as input, and to train this policy to maximize the average of the cumulative reward over the set of training tasks. Relative to previously proposed few-shot imitation methods that use behaviour cloning or infer reward functions from demonstrations, our method has the disadvantage that it requires reward functions at training time. However, DCRL also has several advantages, such as the ability to improve upon suboptimal demonstrations, to operate given state-only demonstrations, and to cope with a domain shift between the demonstrator and the agent. Moreover, we show that DCRL outperforms methods based on behaviour cloning by a large margin, on navigation tasks and on robotic manipulation tasks from the Meta-World benchmark.}
}
@InProceedings{pmlr-v139-danesh21a,
title = {Re-understanding Finite-State Representations of Recurrent Policy Networks},
author = {Danesh, Mohamad H and Koul, Anurag and Fern, Alan and Khorram, Saeed},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2388--2397},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/danesh21a/danesh21a.pdf},
url = {https://proceedings.mlr.press/v139/danesh21a.html},
abstract = {We introduce an approach for understanding control policies represented as recurrent neural networks. Recent work has approached this problem by transforming such recurrent policy networks into finite-state machines (FSM) and then analyzing the equivalent minimized FSM. While this led to interesting insights, the minimization process can obscure a deeper understanding of a machine’s operation by merging states that are semantically distinct. To address this issue, we introduce an analysis approach that starts with an unminimized FSM and applies more-interpretable reductions that preserve the key decision points of the policy. We also contribute an attention tool to attain a deeper understanding of the role of observations in the decisions. Our case studies on 7 Atari games and 3 control benchmarks demonstrate that the approach can reveal insights that have not been previously noticed.}
}
@InProceedings{pmlr-v139-daneshmand21a,
title = {Newton Method over Networks is Fast up to the Statistical Precision},
author = {Daneshmand, Amir and Scutari, Gesualdo and Dvurechensky, Pavel and Gasnikov, Alexander},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2398--2409},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/daneshmand21a/daneshmand21a.pdf},
url = {https://proceedings.mlr.press/v139/daneshmand21a.html},
abstract = {We propose a distributed cubic regularization of the Newton method for solving (constrained) empirical risk minimization problems over a network of agents, modeled as undirected graph. The algorithm employs an inexact, preconditioned Newton step at each agent’s side: the gradient of the centralized loss is iteratively estimated via a gradient-tracking consensus mechanism and the Hessian is subsampled over the local data sets. No Hessian matrices are exchanged over the network. We derive global complexity bounds for convex and strongly convex losses. Our analysis reveals an interesting interplay between sample and iteration/communication complexity: statistically accurate solutions are achievable in roughly the same number of iterations of the centralized cubic Newton, with a communication cost per iteration of the order of $\widetilde{\mathcal{O}}\big(1/\sqrt{1-\rho}\big)$, where $\rho$ characterizes the connectivity of the network. This represents a significant improvement with respect to existing, statistically oblivious, distributed Newton-based methods over networks.}
}
@InProceedings{pmlr-v139-danks21a,
title = {BasisDeVAE: Interpretable Simultaneous Dimensionality Reduction and Feature-Level Clustering with Derivative-Based Variational Autoencoders},
author = {Danks, Dominic and Yau, Christopher},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2410--2420},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/danks21a/danks21a.pdf},
url = {https://proceedings.mlr.press/v139/danks21a.html},
abstract = {The Variational Autoencoder (VAE) performs effective nonlinear dimensionality reduction in a variety of problem settings. However, the black-box neural network decoder function typically employed limits the ability of the decoder function to be constrained and interpreted, making the use of VAEs problematic in settings where prior knowledge should be embedded within the decoder. We present DeVAE, a novel VAE-based model with a derivative-based forward mapping, allowing for greater control over decoder behaviour via specification of the decoder function in derivative space. Additionally, we show how DeVAE can be paired with a sparse clustering prior to create BasisDeVAE and perform interpretable simultaneous dimensionality reduction and feature-level clustering. We demonstrate the performance and scalability of the DeVAE and BasisDeVAE models on synthetic and real-world data and present how the derivative-based approach allows for expressive yet interpretable forward models which respect prior knowledge.}
}
@InProceedings{pmlr-v139-daras21a,
title = {Intermediate Layer Optimization for Inverse Problems using Deep Generative Models},
author = {Daras, Giannis and Dean, Joseph and Jalal, Ajil and Dimakis, Alex},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2421--2432},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/daras21a/daras21a.pdf},
url = {https://proceedings.mlr.press/v139/daras21a.html},
abstract = {We propose Intermediate Layer Optimization (ILO), a novel optimization algorithm for solving inverse problems with deep generative models. Instead of optimizing only over the initial latent code, we progressively change the input layer obtaining successively more expressive generators. To explore the higher dimensional spaces, our method searches for latent codes that lie within a small l1 ball around the manifold induced by the previous layer. Our theoretical analysis shows that by keeping the radius of the ball relatively small, we can improve the established error bound for compressed sensing with deep generative models. We empirically show that our approach outperforms state-of-the-art methods introduced in StyleGAN2 and PULSE for a wide range of inverse problems including inpainting, denoising, super-resolution and compressed sensing.}
}
@InProceedings{pmlr-v139-darestani21a,
title = {Measuring Robustness in Deep Learning Based Compressive Sensing},
author = {Darestani, Mohammad Zalbagi and Chaudhari, Akshay S and Heckel, Reinhard},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2433--2444},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/darestani21a/darestani21a.pdf},
url = {https://proceedings.mlr.press/v139/darestani21a.html},
abstract = {Deep neural networks give state-of-the-art accuracy for reconstructing images from few and noisy measurements, a problem arising for example in accelerated magnetic resonance imaging (MRI). However, recent works have raised concerns that deep-learning-based image reconstruction methods are sensitive to perturbations and are less robust than traditional methods: Neural networks (i) may be sensitive to small, yet adversarially-selected perturbations, (ii) may perform poorly under distribution shifts, and (iii) may fail to recover small but important features in an image. In order to understand the sensitivity to such perturbations, in this work, we measure the robustness of different approaches for image reconstruction including trained and un-trained neural networks as well as traditional sparsity-based methods. We find, contrary to prior works, that both trained and un-trained methods are vulnerable to adversarial perturbations. Moreover, both trained and un-trained methods tuned for a particular dataset suffer very similarly from distribution shifts. Finally, we demonstrate that an image reconstruction method that achieves higher reconstruction quality, also performs better in terms of accurately recovering fine details. Our results indicate that the state-of-the-art deep-learning-based image reconstruction methods provide improved performance than traditional methods without compromising robustness.}
}
@InProceedings{pmlr-v139-das21a,
title = {SAINT-ACC: Safety-Aware Intelligent Adaptive Cruise Control for Autonomous Vehicles Using Deep Reinforcement Learning},
author = {Das, Lokesh Chandra and Won, Myounggyu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2445--2455},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/das21a/das21a.pdf},
url = {https://proceedings.mlr.press/v139/das21a.html},
abstract = {We present a novel adaptive cruise control (ACC) system namely SAINT-ACC: {S}afety-{A}ware {Int}elligent {ACC} system (SAINT-ACC) that is designed to achieve simultaneous optimization of traffic efficiency, driving safety, and driving comfort through dynamic adaptation of the inter-vehicle gap based on deep reinforcement learning (RL). A novel dual RL agent-based approach is developed to seek and adapt the optimal balance between traffic efficiency and driving safety/comfort by effectively controlling the driving safety model parameters and inter-vehicle gap based on macroscopic and microscopic traffic information collected from dynamically changing and complex traffic environments. Results obtained through over 12,000 simulation runs with varying traffic scenarios and penetration rates demonstrate that SAINT-ACC significantly enhances traffic flow, driving safety and comfort compared with a state-of-the-art approach.}
}
@InProceedings{pmlr-v139-dasoulas21a,
title = {Lipschitz normalization for self-attention layers with application to graph neural networks},
author = {Dasoulas, George and Scaman, Kevin and Virmaux, Aladin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2456--2466},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dasoulas21a/dasoulas21a.pdf},
url = {https://proceedings.mlr.press/v139/dasoulas21a.html},
abstract = {Attention based neural networks are state of the art in a large range of applications. However, their performance tends to degrade when the number of layers increases. In this work, we show that enforcing Lipschitz continuity by normalizing the attention scores can significantly improve the performance of deep attention models. First, we show that, for deep graph attention networks (GAT), gradient explosion appears during training, leading to poor performance of gradient-based training algorithms. To address this issue, we derive a theoretical analysis of the Lipschitz continuity of attention modules and introduce LipschitzNorm, a simple and parameter-free normalization for self-attention mechanisms that enforces the model to be Lipschitz continuous. We then apply LipschitzNorm to GAT and Graph Transformers and show that their performance is substantially improved in the deep setting (10 to 30 layers). More specifically, we show that a deep GAT model with LipschitzNorm achieves state of the art results for node label prediction tasks that exhibit long-range dependencies, while showing consistent improvements over their unnormalized counterparts in benchmark node classification tasks.}
}
@InProceedings{pmlr-v139-dass21a,
title = {Householder Sketch for Accurate and Accelerated Least-Mean-Squares Solvers},
author = {Dass, Jyotikrishna and Mahapatra, Rabi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2467--2477},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dass21a/dass21a.pdf},
url = {https://proceedings.mlr.press/v139/dass21a.html},
abstract = {Least-Mean-Squares (\textsc{LMS}) solvers comprise a class of fundamental optimization problems such as linear regression, and regularized regressions such as Ridge, LASSO, and Elastic-Net. Data summarization techniques for big data generate summaries called coresets and sketches to speed up model learning under streaming and distributed settings. For example, \citep{nips2019} design a fast and accurate Caratheodory set on input data to boost the performance of existing \textsc{LMS} solvers. In retrospect, we explore classical Householder transformation as a candidate for sketching and accurately solving LMS problems. We find it to be a simpler, memory-efficient, and faster alternative that always existed to the above strong baseline. We also present a scalable algorithm based on the construction of distributed Householder sketches to solve \textsc{LMS} problem across multiple worker nodes. We perform thorough empirical analysis with large synthetic and real datasets to evaluate the performance of Householder sketch and compare with \citep{nips2019}. Our results show Householder sketch speeds up existing \textsc{LMS} solvers in the scikit-learn library up to $100$x-$400$x. Also, it is $10$x-$100$x faster than the above baseline with similar numerical stability. The distributed algorithm demonstrates linear scalability with a near-negligible communication overhead.}
}
@InProceedings{pmlr-v139-data21a,
title = {Byzantine-Resilient High-Dimensional SGD with Local Iterations on Heterogeneous Data},
author = {Data, Deepesh and Diggavi, Suhas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2478--2488},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/data21a/data21a.pdf},
url = {https://proceedings.mlr.press/v139/data21a.html},
abstract = {We study stochastic gradient descent (SGD) with local iterations in the presence of Byzantine clients, motivated by the federated learning. The clients, instead of communicating with the server in every iteration, maintain their local models, which they update by taking several SGD iterations based on their own datasets and then communicate the net update with the server, thereby achieving communication-efficiency. Furthermore, only a subset of clients communicates with the server at synchronization times. The Byzantine clients may collude and send arbitrary vectors to the server to disrupt the learning process. To combat the adversary, we employ an efficient high-dimensional robust mean estimation algorithm at the server to filter-out corrupt vectors; and to analyze the outlier-filtering procedure, we develop a novel matrix concentration result that may be of independent interest. We provide convergence analyses for both strongly-convex and non-convex smooth objectives in the heterogeneous data setting. We believe that ours is the first Byzantine-resilient local SGD algorithm and analysis with non-trivial guarantees. We corroborate our theoretical results with preliminary experiments for neural network training.}
}
@InProceedings{pmlr-v139-davis21a,
title = {Catformer: Designing Stable Transformers via Sensitivity Analysis},
author = {Davis, Jared Q and Gu, Albert and Choromanski, Krzysztof and Dao, Tri and Re, Christopher and Finn, Chelsea and Liang, Percy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2489--2499},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/davis21a/davis21a.pdf},
url = {https://proceedings.mlr.press/v139/davis21a.html},
abstract = {Transformer architectures are widely used, but training them is non-trivial, requiring custom learning rate schedules, scaling terms, residual connections, careful placement of submodules such as normalization, and so on. In this paper, we improve upon recent analysis of Transformers and formalize a notion of sensitivity to capture the difficulty of training. Sensitivity characterizes how the variance of activation and gradient norms change in expectation when parameters are randomly perturbed. We analyze the sensitivity of previous Transformer architectures and design a new architecture, the Catformer, which replaces residual connections or RNN-based gating mechanisms with concatenation. We prove that Catformers are less sensitive than other Transformer variants and demonstrate that this leads to more stable training. On DMLab30, a suite of high-dimension reinforcement tasks, Catformer outperforms other transformers, including Gated Transformer-XL—the state-of-the-art architecture designed to address stability—by 13%.}
}
@InProceedings{pmlr-v139-dawkins21a,
title = {Diffusion Source Identification on Networks with Statistical Confidence},
author = {Dawkins, Quinlan E and Li, Tianxi and Xu, Haifeng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2500--2509},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dawkins21a/dawkins21a.pdf},
url = {https://proceedings.mlr.press/v139/dawkins21a.html},
abstract = {Diffusion source identification on networks is a problem of fundamental importance in a broad class of applications, including controlling the spreading of rumors on social media, identifying a computer virus over cyber networks, or identifying the disease center during epidemiology. Though this problem has received significant recent attention, most known approaches are well-studied in only very restrictive settings and lack theoretical guarantees for more realistic networks. We introduce a statistical framework for the study of this problem and develop a confidence set inference approach inspired by hypothesis testing. Our method efficiently produces a small subset of nodes, which provably covers the source node with any pre-specified confidence level without restrictive assumptions on network structures. To our knowledge, this is the first diffusion source identification method with a practically useful theoretical guarantee on general networks. We demonstrate our approach via extensive synthetic experiments on well-known random network models, a large data set of real-world networks as well as a mobility network between cities concerning the COVID-19 spreading in January 2020.}
}
@InProceedings{pmlr-v139-daxberger21a,
title = {Bayesian Deep Learning via Subnetwork Inference},
author = {Daxberger, Erik and Nalisnick, Eric and Allingham, James U and Antoran, Javier and Hernandez-Lobato, Jose Miguel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2510--2521},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/daxberger21a/daxberger21a.pdf},
url = {https://proceedings.mlr.press/v139/daxberger21a.html},
abstract = {The Bayesian paradigm has the potential to solve core issues of deep neural networks such as poor calibration and data inefficiency. Alas, scaling Bayesian inference to large weight spaces often requires restrictive approximations. In this work, we show that it suffices to perform inference over a small subset of model weights in order to obtain accurate predictive posteriors. The other weights are kept as point estimates. This subnetwork inference framework enables us to use expressive, otherwise intractable, posterior approximations over such subsets. In particular, we implement subnetwork linearized Laplace as a simple, scalable Bayesian deep learning method: We first obtain a MAP estimate of all weights and then infer a full-covariance Gaussian posterior over a subnetwork using the linearized Laplace approximation. We propose a subnetwork selection strategy that aims to maximally preserve the model’s predictive uncertainty. Empirically, our approach compares favorably to ensembles and less expressive posterior approximations over full networks.}
}
@InProceedings{pmlr-v139-de-palma21a,
title = {Adversarial Robustness Guarantees for Random Deep Neural Networks},
author = {De Palma, Giacomo and Kiani, Bobak and Lloyd, Seth},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2522--2534},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/de-palma21a/de-palma21a.pdf},
url = {https://proceedings.mlr.press/v139/de-palma21a.html},
abstract = {The reliability of deep learning algorithms is fundamentally challenged by the existence of adversarial examples, which are incorrectly classified inputs that are extremely close to a correctly classified input. We explore the properties of adversarial examples for deep neural networks with random weights and biases, and prove that for any p$\geq$1, the \ell^p distance of any given input from the classification boundary scales as one over the square root of the dimension of the input times the \ell^p norm of the input. The results are based on the recently proved equivalence between Gaussian processes and deep neural networks in the limit of infinite width of the hidden layers, and are validated with experiments on both random deep neural networks and deep neural networks trained on the MNIST and CIFAR10 datasets. The results constitute a fundamental advance in the theoretical understanding of adversarial examples, and open the way to a thorough theoretical characterization of the relation between network architecture and robustness to adversarial perturbations.}
}
@InProceedings{pmlr-v139-de-roos21a,
title = {High-Dimensional Gaussian Process Inference with Derivatives},
author = {de Roos, Filip and Gessner, Alexandra and Hennig, Philipp},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2535--2545},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/de-roos21a/de-roos21a.pdf},
url = {https://proceedings.mlr.press/v139/de-roos21a.html},
abstract = {Although it is widely known that Gaussian processes can be conditioned on observations of the gradient, this functionality is of limited use due to the prohibitive computational cost of $\mathcal{O}(N^3 D^3)$ in data points $N$ and dimension $D$. The dilemma of gradient observations is that a single one of them comes at the same cost as $D$ independent function evaluations, so the latter are often preferred. Careful scrutiny reveals, however, that derivative observations give rise to highly structured kernel Gram matrices for very general classes of kernels (inter alia, stationary kernels). We show that in the \emph{low-data} regime $N < D$, the Gram matrix can be decomposed in a manner that reduces the cost of inference to $\mathcal{O}(N^2D + (N^2)^3)$ (i.e., linear in the number of dimensions) and, in special cases, to $\mathcal{O}(N^2D + N^3)$. This reduction in complexity opens up new use-cases for inference with gradients especially in the high-dimensional regime, where the information-to-cost ratio of gradient observations significantly increases. We demonstrate this potential in a variety of tasks relevant for machine learning, such as optimization and Hamiltonian Monte Carlo with predictive gradients.}
}
@InProceedings{pmlr-v139-deecke21a,
title = {Transfer-Based Semantic Anomaly Detection},
author = {Deecke, Lucas and Ruff, Lukas and Vandermeulen, Robert A. and Bilen, Hakan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2546--2558},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/deecke21a/deecke21a.pdf},
url = {https://proceedings.mlr.press/v139/deecke21a.html},
abstract = {Detecting semantic anomalies is challenging due to the countless ways in which they may appear in real-world data. While enhancing the robustness of networks may be sufficient for modeling simplistic anomalies, there is no good known way of preparing models for all potential and unseen anomalies that can potentially occur, such as the appearance of new object classes. In this paper, we show that a previously overlooked strategy for anomaly detection (AD) is to introduce an explicit inductive bias toward representations transferred over from some large and varied semantic task. We rigorously verify our hypothesis in controlled trials that utilize intervention, and show that it gives rise to surprisingly effective auxiliary objectives that outperform previous AD paradigms.}
}
@InProceedings{pmlr-v139-dehesa21a,
title = {Grid-Functioned Neural Networks},
author = {Dehesa, Javier and Vidler, Andrew and Padget, Julian and Lutteroth, Christof},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2559--2567},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dehesa21a/dehesa21a.pdf},
url = {https://proceedings.mlr.press/v139/dehesa21a.html},
abstract = {We introduce a new neural network architecture that we call "grid-functioned" neural networks. It utilises a grid structure of network parameterisations that can be specialised for different subdomains of the problem, while maintaining smooth, continuous behaviour. The grid gives the user flexibility to prevent gross features from overshadowing important minor ones. We present a full characterisation of its computational and spatial complexity, and demonstrate its potential, compared to a traditional architecture, over a set of synthetic regression problems. We further illustrate the benefits through a real-world 3D skeletal animation case study, where it offers the same visual quality as a state-of-the-art model, but with lower computational complexity and better control accuracy.}
}
@InProceedings{pmlr-v139-demaine21a,
title = {Multidimensional Scaling: Approximation and Complexity},
author = {Demaine, Erik and Hesterberg, Adam and Koehler, Frederic and Lynch, Jayson and Urschel, John},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2568--2578},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/demaine21a/demaine21a.pdf},
url = {https://proceedings.mlr.press/v139/demaine21a.html},
abstract = {Metric Multidimensional scaling (MDS) is a classical method for generating meaningful (non-linear) low-dimensional embeddings of high-dimensional data. MDS has a long history in the statistics, machine learning, and graph drawing communities. In particular, the Kamada-Kawai force-directed graph drawing method is equivalent to MDS and is one of the most popular ways in practice to embed graphs into low dimensions. Despite its ubiquity, our theoretical understanding of MDS remains limited as its objective function is highly non-convex. In this paper, we prove that minimizing the Kamada-Kawai objective is NP-hard and give a provable approximation algorithm for optimizing it, which in particular is a PTAS on low-diameter graphs. We supplement this result with experiments suggesting possible connections between our greedy approximation algorithm and gradient-based methods.}
}
@InProceedings{pmlr-v139-deng21a,
title = {What Does Rotation Prediction Tell Us about Classifier Accuracy under Varying Testing Environments?},
author = {Deng, Weijian and Gould, Stephen and Zheng, Liang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2579--2589},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/deng21a/deng21a.pdf},
url = {https://proceedings.mlr.press/v139/deng21a.html},
abstract = {Understanding classifier decision under novel environments is central to the community, and a common practice is evaluating it on labeled test sets. However, in real-world testing, image annotations are difficult and expensive to obtain, especially when the test environment is changing. A natural question then arises: given a trained classifier, can we evaluate its accuracy on varying unlabeled test sets? In this work, we train semantic classification and rotation prediction in a multi-task way. On a series of datasets, we report an interesting finding, i.e., the semantic classification accuracy exhibits a strong linear relationship with the accuracy of the rotation prediction task (Pearson’s Correlation r > 0.88). This finding allows us to utilize linear regression to estimate classifier performance from the accuracy of rotation prediction which can be obtained on the test set through the freely generated rotation labels.}
}
@InProceedings{pmlr-v139-deng21b,
title = {Toward Better Generalization Bounds with Locally Elastic Stability},
author = {Deng, Zhun and He, Hangfeng and Su, Weijie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2590--2600},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/deng21b/deng21b.pdf},
url = {https://proceedings.mlr.press/v139/deng21b.html},
abstract = {Algorithmic stability is a key characteristic to ensure the generalization ability of a learning algorithm. Among different notions of stability, \emph{uniform stability} is arguably the most popular one, which yields exponential generalization bounds. However, uniform stability only considers the worst-case loss change (or so-called sensitivity) by removing a single data point, which is distribution-independent and therefore undesirable. There are many cases that the worst-case sensitivity of the loss is much larger than the average sensitivity taken over the single data point that is removed, especially in some advanced models such as random feature models or neural networks. Many previous works try to mitigate the distribution independent issue by proposing weaker notions of stability, however, they either only yield polynomial bounds or the bounds derived do not vanish as sample size goes to infinity. Given that, we propose \emph{locally elastic stability} as a weaker and distribution-dependent stability notion, which still yields exponential generalization bounds. We further demonstrate that locally elastic stability implies tighter generalization bounds than those derived based on uniform stability in many situations by revisiting the examples of bounded support vector machines, regularized least square regressions, and stochastic gradient descent.}
}
@InProceedings{pmlr-v139-deng21c,
title = {Revenue-Incentive Tradeoffs in Dynamic Reserve Pricing},
author = {Deng, Yuan and Lahaie, Sebastien and Mirrokni, Vahab and Zuo, Song},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2601--2610},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/deng21c/deng21c.pdf},
url = {https://proceedings.mlr.press/v139/deng21c.html},
abstract = {Online advertisements are primarily sold via repeated auctions with reserve prices. In this paper, we study how to set reserves to boost revenue based on the historical bids of strategic buyers, while controlling the impact of such a policy on the incentive compatibility of the repeated auctions. Adopting an incentive compatibility metric which quantifies the incentives to shade bids, we propose a novel class of reserve pricing policies and provide analytical tradeoffs between their revenue performance and bid-shading incentives. The policies are inspired by the exponential mechanism from the literature on differential privacy, but our study uncovers mechanisms with significantly better revenue-incentive tradeoffs than the exponential mechanism in practice. We further empirically evaluate the tradeoffs on synthetic data as well as real ad auction data from a major ad exchange to verify and support our theoretical findings.}
}
@InProceedings{pmlr-v139-dennis21a,
title = {Heterogeneity for the Win: One-Shot Federated Clustering},
author = {Dennis, Don Kurian and Li, Tian and Smith, Virginia},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2611--2620},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dennis21a/dennis21a.pdf},
url = {https://proceedings.mlr.press/v139/dennis21a.html},
abstract = {In this work, we explore the unique challenges—and opportunities—of unsupervised federated learning (FL). We develop and analyze a one-shot federated clustering scheme, kfed, based on the widely-used Lloyd’s method for $k$-means clustering. In contrast to many supervised problems, we show that the issue of statistical heterogeneity in federated networks can in fact benefit our analysis. We analyse kfed under a center separation assumption and compare it to the best known requirements of its centralized counterpart. Our analysis shows that in heterogeneous regimes where the number of clusters per device $(k’)$ is smaller than the total number of clusters over the network $k$, $(k’\le \sqrt{k})$, we can use heterogeneity to our advantage—significantly weakening the cluster separation requirements for kfed. From a practical viewpoint, kfed also has many desirable properties: it requires only round of communication, can run asynchronously, and can handle partial participation or node/network failures. We motivate our analysis with experiments on common FL benchmarks, and highlight the practical utility of one-shot clustering through use-cases in personalized FL and device sampling.}
}
@InProceedings{pmlr-v139-derakhshani21a,
title = {Kernel Continual Learning},
author = {Derakhshani, Mohammad Mahdi and Zhen, Xiantong and Shao, Ling and Snoek, Cees},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2621--2631},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/derakhshani21a/derakhshani21a.pdf},
url = {https://proceedings.mlr.press/v139/derakhshani21a.html},
abstract = {This paper introduces kernel continual learning, a simple but effective variant of continual learning that leverages the non-parametric nature of kernel methods to tackle catastrophic forgetting. We deploy an episodic memory unit that stores a subset of samples for each task to learn task-specific classifiers based on kernel ridge regression. This does not require memory replay and systematically avoids task interference in the classifiers. We further introduce variational random features to learn a data-driven kernel for each task. To do so, we formulate kernel continual learning as a variational inference problem, where a random Fourier basis is incorporated as the latent variable. The variational posterior distribution over the random Fourier basis is inferred from the coreset of each task. In this way, we are able to generate more informative kernels specific to each task, and, more importantly, the coreset size can be reduced to achieve more compact memory, resulting in more efficient continual learning based on episodic memory. Extensive evaluation on four benchmarks demonstrates the effectiveness and promise of kernels for continual learning.}
}
@InProceedings{pmlr-v139-deshwal21a,
title = {Bayesian Optimization over Hybrid Spaces},
author = {Deshwal, Aryan and Belakaria, Syrine and Doppa, Janardhan Rao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2632--2643},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/deshwal21a/deshwal21a.pdf},
url = {https://proceedings.mlr.press/v139/deshwal21a.html},
abstract = {We consider the problem of optimizing hybrid structures (mixture of discrete and continuous input variables) via expensive black-box function evaluations. This problem arises in many real-world applications. For example, in materials design optimization via lab experiments, discrete and continuous variables correspond to the presence/absence of primitive elements and their relative concentrations respectively. The key challenge is to accurately model the complex interactions between discrete and continuous variables. In this paper, we propose a novel approach referred as Hybrid Bayesian Optimization (HyBO) by utilizing diffusion kernels, which are naturally defined over continuous and discrete variables. We develop a principled approach for constructing diffusion kernels over hybrid spaces by utilizing the additive kernel formulation, which allows additive interactions of all orders in a tractable manner. We theoretically analyze the modeling strength of additive hybrid kernels and prove that it has the universal approximation property. Our experiments on synthetic and six diverse real-world benchmarks show that HyBO significantly outperforms the state-of-the-art methods.}
}
@InProceedings{pmlr-v139-devlin21a,
title = {Navigation Turing Test (NTT): Learning to Evaluate Human-Like Navigation},
author = {Devlin, Sam and Georgescu, Raluca and Momennejad, Ida and Rzepecki, Jaroslaw and Zuniga, Evelyn and Costello, Gavin and Leroy, Guy and Shaw, Ali and Hofmann, Katja},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2644--2653},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/devlin21a/devlin21a.pdf},
url = {https://proceedings.mlr.press/v139/devlin21a.html},
abstract = {A key challenge on the path to developing agents that learn complex human-like behavior is the need to quickly and accurately quantify human-likeness. While human assessments of such behavior can be highly accurate, speed and scalability are limited. We address these limitations through a novel automated Navigation Turing Test (ANTT) that learns to predict human judgments of human-likeness. We demonstrate the effectiveness of our automated NTT on a navigation task in a complex 3D environment. We investigate six classification models to shed light on the types of architectures best suited to this task, and validate them against data collected through a human NTT. Our best models achieve high accuracy when distinguishing true human and agent behavior. At the same time, we show that predicting finer-grained human assessment of agents’ progress towards human-like behavior remains unsolved. Our work takes an important step towards agents that more effectively learn complex human-like behavior.}
}
@InProceedings{pmlr-v139-devos21a,
title = {Versatile Verification of Tree Ensembles},
author = {Devos, Laurens and Meert, Wannes and Davis, Jesse},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2654--2664},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/devos21a/devos21a.pdf},
url = {https://proceedings.mlr.press/v139/devos21a.html},
abstract = {Machine learned models often must abide by certain requirements (e.g., fairness or legal). This has spurred interested in developing approaches that can provably verify whether a model satisfies certain properties. This paper introduces a generic algorithm called Veritas that enables tackling multiple different verification tasks for tree ensemble models like random forests (RFs) and gradient boosted decision trees (GBDTs). This generality contrasts with previous work, which has focused exclusively on either adversarial example generation or robustness checking. Veritas formulates the verification task as a generic optimization problem and introduces a novel search space representation. Veritas offers two key advantages. First, it provides anytime lower and upper bounds when the optimization problem cannot be solved exactly. In contrast, many existing methods have focused on exact solutions and are thus limited by the verification problem being NP-complete. Second, Veritas produces full (bounded suboptimal) solutions that can be used to generate concrete examples. We experimentally show that our method produces state-of-the-art robustness estimates, especially when executed with strict time constraints. This is exceedingly important when checking the robustness of large datasets. Additionally, we show that Veritas enables tackling more real-world verification scenarios.}
}
@InProceedings{pmlr-v139-dhifallah21a,
title = {On the Inherent Regularization Effects of Noise Injection During Training},
author = {Dhifallah, Oussama and Lu, Yue},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2665--2675},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dhifallah21a/dhifallah21a.pdf},
url = {https://proceedings.mlr.press/v139/dhifallah21a.html},
abstract = {Randomly perturbing networks during the training process is a commonly used approach to improving generalization performance. In this paper, we present a theoretical study of one particular way of random perturbation, which corresponds to injecting artificial noise to the training data. We provide a precise asymptotic characterization of the training and generalization errors of such randomly perturbed learning problems on a random feature model. Our analysis shows that Gaussian noise injection in the training process is equivalent to introducing a weighted ridge regularization, when the number of noise injections tends to infinity. The explicit form of the regularization is also given. Numerical results corroborate our asymptotic predictions, showing that they are accurate even in moderate problem dimensions. Our theoretical predictions are based on a new correlated Gaussian equivalence conjecture that generalizes recent results in the study of random feature models.}
}
@InProceedings{pmlr-v139-dhulipala21a,
title = {Hierarchical Agglomerative Graph Clustering in Nearly-Linear Time},
author = {Dhulipala, Laxman and Eisenstat, David and {\L}{\k{a}}cki, Jakub and Mirrokni, Vahab and Shi, Jessica},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2676--2686},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dhulipala21a/dhulipala21a.pdf},
url = {https://proceedings.mlr.press/v139/dhulipala21a.html},
abstract = {We study the widely-used hierarchical agglomerative clustering (HAC) algorithm on edge-weighted graphs. We define an algorithmic framework for hierarchical agglomerative graph clustering that provides the first efficient $\tilde{O}(m)$ time exact algorithms for classic linkage measures, such as complete- and WPGMA-linkage, as well as other measures. Furthermore, for average-linkage, arguably the most popular variant of HAC, we provide an algorithm that runs in $\tilde{O}(n\sqrt{m})$ time. For this variant, this is the first exact algorithm that runs in subquadratic time, as long as $m=n^{2-\epsilon}$ for some constant $\epsilon > 0$. We complement this result with a simple $\epsilon$-close approximation algorithm for average-linkage in our framework that runs in $\tilde{O}(m)$ time. As an application of our algorithms, we consider clustering points in a metric space by first using $k$-NN to generate a graph from the point set, and then running our algorithms on the resulting weighted graph. We validate the performance of our algorithms on publicly available datasets, and show that our approach can speed up clustering of point datasets by a factor of 20.7–76.5x.}
}
@InProceedings{pmlr-v139-diakonikolas21a,
title = {Learning Online Algorithms with Distributional Advice},
author = {Diakonikolas, Ilias and Kontonis, Vasilis and Tzamos, Christos and Vakilian, Ali and Zarifis, Nikos},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2687--2696},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/diakonikolas21a/diakonikolas21a.pdf},
url = {https://proceedings.mlr.press/v139/diakonikolas21a.html},
abstract = {We study the problem of designing online algorithms given advice about the input. While prior work had focused on deterministic advice, we only assume distributional access to the instances of interest, and the goal is to learn a competitive algorithm given access to i.i.d. samples. We aim to be competitive against an adversary with prior knowledge of the distribution, while also performing well against worst-case inputs. We focus on the classical online problems of ski-rental and prophet-inequalities, and provide sample complexity bounds for the underlying learning tasks. First, we point out that for general distributions it is information-theoretically impossible to beat the worst-case competitive-ratio with any finite sample size. As our main contribution, we establish strong positive results for well-behaved distributions. Specifically, for the broad class of log-concave distributions, we show that $\mathrm{poly}(1/\epsilon)$ samples suffice to obtain $(1+\epsilon)$-competitive ratio. Finally, we show that this sample upper bound is close to best possible, even for very simple classes of distributions.}
}
@InProceedings{pmlr-v139-diamandis21a,
title = {A Wasserstein Minimax Framework for Mixed Linear Regression},
author = {Diamandis, Theo and Eldar, Yonina and Fallah, Alireza and Farnia, Farzan and Ozdaglar, Asuman},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2697--2706},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/diamandis21a/diamandis21a.pdf},
url = {https://proceedings.mlr.press/v139/diamandis21a.html},
abstract = {Multi-modal distributions are commonly used to model clustered data in statistical learning tasks. In this paper, we consider the Mixed Linear Regression (MLR) problem. We propose an optimal transport-based framework for MLR problems, Wasserstein Mixed Linear Regression (WMLR), which minimizes the Wasserstein distance between the learned and target mixture regression models. Through a model-based duality analysis, WMLR reduces the underlying MLR task to a nonconvex-concave minimax optimization problem, which can be provably solved to find a minimax stationary point by the Gradient Descent Ascent (GDA) algorithm. In the special case of mixtures of two linear regression models, we show that WMLR enjoys global convergence and generalization guarantees. We prove that WMLR’s sample complexity grows linearly with the dimension of data. Finally, we discuss the application of WMLR to the federated learning task where the training samples are collected by multiple agents in a network. Unlike the Expectation-Maximization algorithm, WMLR directly extends to the distributed, federated learning setting. We support our theoretical results through several numerical experiments, which highlight our framework’s ability to handle the federated learning setting with mixture models.}
}
@InProceedings{pmlr-v139-dickens21a,
title = {Context-Aware Online Collective Inference for Templated Graphical Models},
author = {Dickens, Charles and Pryor, Connor and Augustine, Eriq and Miller, Alexander and Getoor, Lise},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2707--2716},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dickens21a/dickens21a.pdf},
url = {https://proceedings.mlr.press/v139/dickens21a.html},
abstract = {In this work, we examine online collective inference, the problem of maintaining and performing inference over a sequence of evolving graphical models. We utilize templated graphical models (TGM), a general class of graphical models expressed via templates and instantiated with data. A key challenge is minimizing the cost of instantiating the updated model. To address this, we define a class of exact and approximate context-aware methods for updating an existing TGM. These methods avoid a full re-instantiation by using the context of the updates to only add relevant components to the graphical model. Further, we provide stability bounds for the general online inference problem and regret bounds for a proposed approximation. Finally, we implement our approach in probabilistic soft logic, and test it on several online collective inference tasks. Through these experiments we verify the bounds on regret and stability, and show that our approximate online approach consistently runs two to five times faster than the offline alternative while, surprisingly, maintaining the quality of the predictions.}
}
@InProceedings{pmlr-v139-dimitriev21a,
title = {ARMS: Antithetic-REINFORCE-Multi-Sample Gradient for Binary Variables},
author = {Dimitriev, Aleksandar and Zhou, Mingyuan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2717--2727},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dimitriev21a/dimitriev21a.pdf},
url = {https://proceedings.mlr.press/v139/dimitriev21a.html},
abstract = {Estimating the gradients for binary variables is a task that arises frequently in various domains, such as training discrete latent variable models. What has been commonly used is a REINFORCE based Monte Carlo estimation method that uses either independent samples or pairs of negatively correlated samples. To better utilize more than two samples, we propose ARMS, an Antithetic REINFORCE-based Multi-Sample gradient estimator. ARMS uses a copula to generate any number of mutually antithetic samples. It is unbiased, has low variance, and generalizes both DisARM, which we show to be ARMS with two samples, and the leave-one-out REINFORCE (LOORF) estimator, which is ARMS with uncorrelated samples. We evaluate ARMS on several datasets for training generative models, and our experimental results show that it outperforms competing methods. We also develop a version of ARMS for optimizing the multi-sample variational bound, and show that it outperforms both VIMCO and DisARM. The code is publicly available.}
}
@InProceedings{pmlr-v139-ding21a,
title = {XOR-CD: Linearly Convergent Constrained Structure Generation},
author = {Ding, Fan and Ma, Jianzhu and Xu, Jinbo and Xue, Yexiang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2728--2738},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ding21a/ding21a.pdf},
url = {https://proceedings.mlr.press/v139/ding21a.html},
abstract = {We propose XOR-Contrastive Divergence learning (XOR-CD), a provable approach for constrained structure generation, which remains difficult for state-of-the-art neural network and constraint reasoning approaches. XOR-CD harnesses XOR-Sampling to generate samples from the model distribution in CD learning and is guaranteed to generate valid structures. In addition, XOR-CD has a linear convergence rate towards the global maximum of the likelihood function within a vanishing constant in learning exponential family models. Constraint satisfaction enabled by XOR-CD also boosts its learning performance. Our real-world experiments on data-driven experimental design, dispatching route generation, and sequence-based protein homology detection demonstrate the superior performance of XOR-CD compared to baseline approaches in generating valid structures as well as capturing the inductive bias in the training set.}
}
@InProceedings{pmlr-v139-ding21b,
title = {Dual Principal Component Pursuit for Robust Subspace Learning: Theory and Algorithms for a Holistic Approach},
author = {Ding, Tianyu and Zhu, Zhihui and Vidal, Rene and Robinson, Daniel P},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2739--2748},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ding21b/ding21b.pdf},
url = {https://proceedings.mlr.press/v139/ding21b.html},
abstract = {The Dual Principal Component Pursuit (DPCP) method has been proposed to robustly recover a subspace of high-relative dimension from corrupted data. Existing analyses and algorithms of DPCP, however, mainly focus on finding a normal to a single hyperplane that contains the inliers. Although these algorithms can be extended to a subspace of higher co-dimension through a recursive approach that sequentially finds a new basis element of the space orthogonal to the subspace, this procedure is computationally expensive and lacks convergence guarantees. In this paper, we consider a DPCP approach for simultaneously computing the entire basis of the orthogonal complement subspace (we call this a holistic approach) by solving a non-convex non-smooth optimization problem over the Grassmannian. We provide geometric and statistical analyses for the global optimality and prove that it can tolerate as many outliers as the square of the number of inliers, under both noiseless and noisy settings. We then present a Riemannian regularity condition for the problem, which is then used to prove that a Riemannian subgradient method converges linearly to a neighborhood of the orthogonal subspace with error proportional to the noise level.}
}
@InProceedings{pmlr-v139-dinh21a,
title = {Coded-InvNet for Resilient Prediction Serving Systems},
author = {Dinh, Tuan and Lee, Kangwook},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2749--2759},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dinh21a/dinh21a.pdf},
url = {https://proceedings.mlr.press/v139/dinh21a.html},
abstract = {Inspired by a new coded computation algorithm for invertible functions, we propose Coded-InvNet a new approach to design resilient prediction serving systems that can gracefully handle stragglers or node failures. Coded-InvNet leverages recent findings in the deep learning literature such as invertible neural networks, Manifold Mixup, and domain translation algorithms, identifying interesting research directions that span across machine learning and systems. Our experimental results show that Coded-InvNet can outperform existing approaches, especially when the compute resource overhead is as low as 10%. For instance, without knowing which of the ten workers is going to fail, our algorithm can design a backup task so that it can correctly recover the missing prediction result with an accuracy of 85.9%, significantly outperforming the previous SOTA by 32.5%.}
}
@InProceedings{pmlr-v139-divol21a,
title = {Estimation and Quantization of Expected Persistence Diagrams},
author = {Divol, Vincent and Lacombe, Theo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2760--2770},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/divol21a/divol21a.pdf},
url = {https://proceedings.mlr.press/v139/divol21a.html},
abstract = {Persistence diagrams (PDs) are the most common descriptors used to encode the topology of structured data appearing in challenging learning tasks; think e.g. of graphs, time series or point clouds sampled close to a manifold. Given random objects and the corresponding distribution of PDs, one may want to build a statistical summary—such as a mean—of these random PDs, which is however not a trivial task as the natural geometry of the space of PDs is not linear. In this article, we study two such summaries, the Expected Persistence Diagram (EPD), and its quantization. The EPD is a measure supported on $\mathbb{R}^2$, which may be approximated by its empirical counterpart. We prove that this estimator is optimal from a minimax standpoint on a large class of models with a parametric rate of convergence. The empirical EPD is simple and efficient to compute, but possibly has a very large support, hindering its use in practice. To overcome this issue, we propose an algorithm to compute a quantization of the empirical EPD, a measure with small support which is shown to approximate with near-optimal rates a quantization of the theoretical EPD.}
}
@InProceedings{pmlr-v139-domingo-enrich21a,
title = {On Energy-Based Models with Overparametrized Shallow Neural Networks},
author = {Domingo-Enrich, Carles and Bietti, Alberto and Vanden-Eijnden, Eric and Bruna, Joan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2771--2782},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/domingo-enrich21a/domingo-enrich21a.pdf},
url = {https://proceedings.mlr.press/v139/domingo-enrich21a.html},
abstract = {Energy-based models (EBMs) are a simple yet powerful framework for generative modeling. They are based on a trainable energy function which defines an associated Gibbs measure, and they can be trained and sampled from via well-established statistical tools, such as MCMC. Neural networks may be used as energy function approximators, providing both a rich class of expressive models as well as a flexible device to incorporate data structure. In this work we focus on shallow neural networks. Building from the incipient theory of overparametrized neural networks, we show that models trained in the so-called ’active’ regime provide a statistical advantage over their associated ’lazy’ or kernel regime, leading to improved adaptivity to hidden low-dimensional structure in the data distribution, as already observed in supervised learning. Our study covers both the maximum likelihood and Stein Discrepancy estimators, and we validate our theoretical results with numerical experiments on synthetic data.}
}
@InProceedings{pmlr-v139-domingues21a,
title = {Kernel-Based Reinforcement Learning: A Finite-Time Analysis},
author = {Domingues, Omar Darwiche and Menard, Pierre and Pirotta, Matteo and Kaufmann, Emilie and Valko, Michal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2783--2792},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/domingues21a/domingues21a.pdf},
url = {https://proceedings.mlr.press/v139/domingues21a.html},
abstract = {We consider the exploration-exploitation dilemma in finite-horizon reinforcement learning problems whose state-action space is endowed with a metric. We introduce Kernel-UCBVI, a model-based optimistic algorithm that leverages the smoothness of the MDP and a non-parametric kernel estimator of the rewards and transitions to efficiently balance exploration and exploitation. For problems with $K$ episodes and horizon $H$, we provide a regret bound of $\widetilde{O}\left( H^3 K^{\frac{2d}{2d+1}}\right)$, where $d$ is the covering dimension of the joint state-action space. This is the first regret bound for kernel-based RL using smoothing kernels, which requires very weak assumptions on the MDP and applies to a wide range of tasks. We empirically validate our approach in continuous MDPs with sparse rewards.}
}
@InProceedings{pmlr-v139-dong21a,
title = {Attention is not all you need: pure attention loses rank doubly exponentially with depth},
author = {Dong, Yihe and Cordonnier, Jean-Baptiste and Loukas, Andreas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2793--2803},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dong21a/dong21a.pdf},
url = {https://proceedings.mlr.press/v139/dong21a.html},
abstract = {Attention-based architectures have become ubiquitous in machine learning. Yet, our understanding of the reasons for their effectiveness remains limited. This work proposes a new way to understand self-attention networks: we show that their output can be decomposed into a sum of smaller terms—or paths—each involving the operation of a sequence of attention heads across layers. Using this path decomposition, we prove that self-attention possesses a strong inductive bias towards "token uniformity". Specifically, without skip connections or multi-layer perceptrons (MLPs), the output converges doubly exponentially to a rank-1 matrix. On the other hand, skip connections and MLPs stop the output from degeneration. Our experiments verify the convergence results on standard transformer architectures.}
}
@InProceedings{pmlr-v139-donhauser21a,
title = {How rotational invariance of common kernels prevents generalization in high dimensions},
author = {Donhauser, Konstantin and Wu, Mingqi and Yang, Fanny},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2804--2814},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/donhauser21a/donhauser21a.pdf},
url = {https://proceedings.mlr.press/v139/donhauser21a.html},
abstract = {Kernel ridge regression is well-known to achieve minimax optimal rates in low-dimensional settings. However, its behavior in high dimensions is much less understood. Recent work establishes consistency for high-dimensional kernel regression for a number of specific assumptions on the data distribution. In this paper, we show that in high dimensions, the rotational invariance property of commonly studied kernels (such as RBF, inner product kernels and fully-connected NTK of any depth) leads to inconsistent estimation unless the ground truth is a low-degree polynomial. Our lower bound on the generalization error holds for a wide range of distributions and kernels with different eigenvalue decays. This lower bound suggests that consistency results for kernel ridge regression in high dimensions generally require a more refined analysis that depends on the structure of the kernel beyond its eigenvalue decay.}
}
@InProceedings{pmlr-v139-dragomir21a,
title = {Fast Stochastic Bregman Gradient Methods: Sharp Analysis and Variance Reduction},
author = {Dragomir, Radu Alexandru and Even, Mathieu and Hendrikx, Hadrien},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2815--2825},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dragomir21a/dragomir21a.pdf},
url = {https://proceedings.mlr.press/v139/dragomir21a.html},
abstract = {We study the problem of minimizing a relatively-smooth convex function using stochastic Bregman gradient methods. We first prove the convergence of Bregman Stochastic Gradient Descent (BSGD) to a region that depends on the noise (magnitude of the gradients) at the optimum. In particular, BSGD quickly converges to the exact minimizer when this noise is zero (interpolation setting, in which the data is fit perfectly). Otherwise, when the objective has a finite sum structure, we show that variance reduction can be used to counter the effect of noise. In particular, fast convergence to the exact minimizer can be obtained under additional regularity assumptions on the Bregman reference function. We illustrate the effectiveness of our approach on two key applications of relative smoothness: tomographic reconstruction with Poisson noise and statistical preconditioning for distributed optimization.}
}
@InProceedings{pmlr-v139-du21a,
title = {Bilinear Classes: A Structural Framework for Provable Generalization in RL},
author = {Du, Simon and Kakade, Sham and Lee, Jason and Lovett, Shachar and Mahajan, Gaurav and Sun, Wen and Wang, Ruosong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2826--2836},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21a/du21a.pdf},
url = {https://proceedings.mlr.press/v139/du21a.html},
abstract = {This work introduces Bilinear Classes, a new structural framework, which permit generalization in reinforcement learning in a wide variety of settings through the use of function approximation. The framework incorporates nearly all existing models in which a polynomial sample complexity is achievable, and, notably, also includes new models, such as the Linear Q*/V* model in which both the optimal Q-function and the optimal V-function are linear in some known feature space. Our main result provides an RL algorithm which has polynomial sample complexity for Bilinear Classes; notably, this sample complexity is stated in terms of a reduction to the generalization error of an underlying supervised learning sub-problem. These bounds nearly match the best known sample complexity bounds for existing models. Furthermore, this framework also extends to the infinite dimensional (RKHS) setting: for the the Linear Q*/V* model, linear MDPs, and linear mixture MDPs, we provide sample complexities that have no explicit dependence on the explicit feature dimension (which could be infinite), but instead depends only on information theoretic quantities.}
}
@InProceedings{pmlr-v139-du21b,
title = {Improved Contrastive Divergence Training of Energy-Based Models},
author = {Du, Yilun and Li, Shuang and Tenenbaum, Joshua and Mordatch, Igor},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2837--2848},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21b/du21b.pdf},
url = {https://proceedings.mlr.press/v139/du21b.html},
abstract = {Contrastive divergence is a popular method of training energy-based models, but is known to have difficulties with training stability. We propose an adaptation to improve contrastive divergence training by scrutinizing a gradient term that is difficult to calculate and is often left out for convenience. We show that this gradient term is numerically significant and in practice is important to avoid training instabilities, while being tractable to estimate. We further highlight how data augmentation and multi-scale processing can be used to improve model robustness and generation quality. Finally, we empirically evaluate stability of model architectures and show improved performance on a host of benchmarks and use cases, such as image generation, OOD detection, and compositional generation.}
}
@InProceedings{pmlr-v139-du21c,
title = {Order-Agnostic Cross Entropy for Non-Autoregressive Machine Translation},
author = {Du, Cunxiao and Tu, Zhaopeng and Jiang, Jing},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2849--2859},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21c/du21c.pdf},
url = {https://proceedings.mlr.press/v139/du21c.html},
abstract = {We propose a new training objective named order-agnostic cross entropy (OaXE) for fully non-autoregressive translation (NAT) models. OaXE improves the standard cross-entropy loss to ameliorate the effect of word reordering, which is a common source of the critical multimodality problem in NAT. Concretely, OaXE removes the penalty for word order errors, and computes the cross entropy loss based on the best possible alignment between model predictions and target tokens. Since the log loss is very sensitive to invalid references, we leverage cross entropy initialization and loss truncation to ensure the model focuses on a good part of the search space. Extensive experiments on major WMT benchmarks demonstrate that OaXE substantially improves translation performance, setting new state of the art for fully NAT models. Further analyses show that OaXE indeed alleviates the multimodality problem by reducing token repetitions and increasing prediction confidence. Our code, data, and trained models are available at https://github.com/tencent-ailab/ICML21_OAXE.}
}
@InProceedings{pmlr-v139-du21d,
title = {Putting the “Learning" into Learning-Augmented Algorithms for Frequency Estimation},
author = {Du, Elbert and Wang, Franklyn and Mitzenmacher, Michael},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2860--2869},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21d/du21d.pdf},
url = {https://proceedings.mlr.press/v139/du21d.html},
abstract = {In learning-augmented algorithms, algorithms are enhanced using information from a machine learning algorithm. In turn, this suggests that we should tailor our machine-learning approach for the target algorithm. We here consider this synergy in the context of the learned count-min sketch from (Hsu et al., 2019). Learning here is used to predict heavy hitters from a data stream, which are counted explicitly outside the sketch. We show that an approximately sufficient statistic for the performance of the underlying count-min sketch is given by the coverage of the predictor, or the normalized $L^1$ norm of keys that are filtered by the predictor to be explicitly counted. We show that machine learning models which are trained to optimize for coverage lead to large improvements in performance over prior approaches according to the average absolute frequency error. Our source code can be found at https://github.com/franklynwang/putting-the-learning-in-LAA.}
}
@InProceedings{pmlr-v139-du21e,
title = {Estimating $α$-Rank from A Few Entries with Low Rank Matrix Completion},
author = {Du, Yali and Yan, Xue and Chen, Xu and Wang, Jun and Zhang, Haifeng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2870--2879},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21e/du21e.pdf},
url = {https://proceedings.mlr.press/v139/du21e.html},
abstract = {Multi-agent evaluation aims at the assessment of an agent’s strategy on the basis of interaction with others. Typically, existing methods such as $\alpha$-rank and its approximation still require to exhaustively compare all pairs of joint strategies for an accurate ranking, which in practice is computationally expensive. In this paper, we aim to reduce the number of pairwise comparisons in recovering a satisfying ranking for $n$ strategies in two-player meta-games, by exploring the fact that agents with similar skills may achieve similar payoffs against others. Two situations are considered: the first one is when we can obtain the true payoffs; the other one is when we can only access noisy payoff. Based on these formulations, we leverage low-rank matrix completion and design two novel algorithms for noise-free and noisy evaluations respectively. For both of these settings, we theorize that $O(nr \log n)$ ($n$ is the number of agents and $r$ is the rank of the payoff matrix) payoff entries are required to achieve sufficiently well strategy evaluation performance. Empirical results on evaluating the strategies in three synthetic games and twelve real world games demonstrate that strategy evaluation from a few entries can lead to comparable performance to algorithms with full knowledge of the payoff matrix.}
}
@InProceedings{pmlr-v139-du21f,
title = {Learning Diverse-Structured Networks for Adversarial Robustness},
author = {Du, Xuefeng and Zhang, Jingfeng and Han, Bo and Liu, Tongliang and Rong, Yu and Niu, Gang and Huang, Junzhou and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2880--2891},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/du21f/du21f.pdf},
url = {https://proceedings.mlr.press/v139/du21f.html},
abstract = {In adversarial training (AT), the main focus has been the objective and optimizer while the model has been less studied, so that the models being used are still those classic ones in standard training (ST). Classic network architectures (NAs) are generally worse than searched NA in ST, which should be the same in AT. In this paper, we argue that NA and AT cannot be handled independently, since given a dataset, the optimal NA in ST would be no longer optimal in AT. That being said, AT is time-consuming itself; if we directly search NAs in AT over large search spaces, the computation will be practically infeasible. Thus, we propose diverse-structured network (DS-Net), to significantly reduce the size of the search space: instead of low-level operations, we only consider predefined atomic blocks, where an atomic block is a time-tested building block like the residual block. There are only a few atomic blocks and thus we can weight all atomic blocks rather than find the best one in a searched block of DS-Net, which is an essential tradeoff between exploring diverse structures and exploiting the best structures. Empirical results demonstrate the advantages of DS-Net, i.e., weighting the atomic blocks.}
}
@InProceedings{pmlr-v139-duan21a,
title = {Risk Bounds and Rademacher Complexity in Batch Reinforcement Learning},
author = {Duan, Yaqi and Jin, Chi and Li, Zhiyuan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2892--2902},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/duan21a/duan21a.pdf},
url = {https://proceedings.mlr.press/v139/duan21a.html},
abstract = {This paper considers batch Reinforcement Learning (RL) with general value function approximation. Our study investigates the minimal assumptions to reliably estimate/minimize Bellman error, and characterizes the generalization performance by (local) Rademacher complexities of general function classes, which makes initial steps in bridging the gap between statistical learning theory and batch RL. Concretely, we view the Bellman error as a surrogate loss for the optimality gap, and prove the followings: (1) In double sampling regime, the excess risk of Empirical Risk Minimizer (ERM) is bounded by the Rademacher complexity of the function class. (2) In the single sampling regime, sample-efficient risk minimization is not possible without further assumptions, regardless of algorithms. However, with completeness assumptions, the excess risk of FQI and a minimax style algorithm can be again bounded by the Rademacher complexity of the corresponding function classes. (3) Fast statistical rates can be achieved by using tools of local Rademacher complexity. Our analysis covers a wide range of function classes, including finite classes, linear spaces, kernel spaces, sparse linear features, etc.}
}
@InProceedings{pmlr-v139-duan21b,
title = {Sawtooth Factorial Topic Embeddings Guided Gamma Belief Network},
author = {Duan, Zhibin and Wang, Dongsheng and Chen, Bo and Wang, Chaojie and Chen, Wenchao and Li, Yewen and Ren, Jie and Zhou, Mingyuan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2903--2913},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/duan21b/duan21b.pdf},
url = {https://proceedings.mlr.press/v139/duan21b.html},
abstract = {Hierarchical topic models such as the gamma belief network (GBN) have delivered promising results in mining multi-layer document representations and discovering interpretable topic taxonomies. However, they often assume in the prior that the topics at each layer are independently drawn from the Dirichlet distribution, ignoring the dependencies between the topics both at the same layer and across different layers. To relax this assumption, we propose sawtooth factorial topic embedding guided GBN, a deep generative model of documents that captures the dependencies and semantic similarities between the topics in the embedding space. Specifically, both the words and topics are represented as embedding vectors of the same dimension. The topic matrix at a layer is factorized into the product of a factor loading matrix and a topic embedding matrix, the transpose of which is set as the factor loading matrix of the layer above. Repeating this particular type of factorization, which shares components between adjacent layers, leads to a structure referred to as sawtooth factorization. An auto-encoding variational inference network is constructed to optimize the model parameter via stochastic gradient descent. Experiments on big corpora show that our models outperform other neural topic models on extracting deeper interpretable topics and deriving better document representations.}
}
@InProceedings{pmlr-v139-dutt21a,
title = {Exponential Reduction in Sample Complexity with Learning of Ising Model Dynamics},
author = {Dutt, Arkopal and Lokhov, Andrey and Vuffray, Marc D and Misra, Sidhant},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2914--2925},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/dutt21a/dutt21a.pdf},
url = {https://proceedings.mlr.press/v139/dutt21a.html},
abstract = {The usual setting for learning the structure and parameters of a graphical model assumes the availability of independent samples produced from the corresponding multivariate probability distribution. However, for many models the mixing time of the respective Markov chain can be very large and i.i.d. samples may not be obtained. We study the problem of reconstructing binary graphical models from correlated samples produced by a dynamical process, which is natural in many applications. We analyze the sample complexity of two estimators that are based on the interaction screening objective and the conditional likelihood loss. We observe that for samples coming from a dynamical process far from equilibrium, the sample complexity reduces exponentially compared to a dynamical process that mixes quickly.}
}
@InProceedings{pmlr-v139-ecoffet21a,
title = {Reinforcement Learning Under Moral Uncertainty},
author = {Ecoffet, Adrien and Lehman, Joel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2926--2936},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ecoffet21a/ecoffet21a.pdf},
url = {https://proceedings.mlr.press/v139/ecoffet21a.html},
abstract = {An ambitious goal for machine learning is to create agents that behave ethically: The capacity to abide by human moral norms would greatly expand the context in which autonomous agents could be practically and safely deployed, e.g. fully autonomous vehicles will encounter charged moral decisions that complicate their deployment. While ethical agents could be trained by rewarding correct behavior under a specific moral theory (e.g. utilitarianism), there remains widespread disagreement about the nature of morality. Acknowledging such disagreement, recent work in moral philosophy proposes that ethical behavior requires acting under moral uncertainty, i.e. to take into account when acting that one’s credence is split across several plausible ethical theories. This paper translates such insights to the field of reinforcement learning, proposes two training methods that realize different points among competing desiderata, and trains agents in simple environments to act under moral uncertainty. The results illustrate (1) how such uncertainty can help curb extreme behavior from commitment to single theories and (2) several technical complications arising from attempting to ground moral philosophy in RL (e.g. how can a principled trade-off between two competing but incomparable reward functions be reached). The aim is to catalyze progress towards morally-competent agents and highlight the potential of RL to contribute towards the computational grounding of moral philosophy.}
}
@InProceedings{pmlr-v139-efroni21a,
title = {Confidence-Budget Matching for Sequential Budgeted Learning},
author = {Efroni, Yonathan and Merlis, Nadav and Saha, Aadirupa and Mannor, Shie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2937--2947},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/efroni21a/efroni21a.pdf},
url = {https://proceedings.mlr.press/v139/efroni21a.html},
abstract = {A core element in decision-making under uncertainty is the feedback on the quality of the performed actions. However, in many applications, such feedback is restricted. For example, in recommendation systems, repeatedly asking the user to provide feedback on the quality of recommendations will annoy them. In this work, we formalize decision-making problems with querying budget, where there is a (possibly time-dependent) hard limit on the number of reward queries allowed. Specifically, we focus on multi-armed bandits, linear contextual bandits, and reinforcement learning problems. We start by analyzing the performance of ‘greedy’ algorithms that query a reward whenever they can. We show that in fully stochastic settings, doing so performs surprisingly well, but in the presence of any adversity, this might lead to linear regret. To overcome this issue, we propose the Confidence-Budget Matching (CBM) principle that queries rewards when the confidence intervals are wider than the inverse square root of the available budget. We analyze the performance of CBM based algorithms in different settings and show that it performs well in the presence of adversity in the contexts, initial states, and budgets.}
}
@InProceedings{pmlr-v139-eimer21a,
title = {Self-Paced Context Evaluation for Contextual Reinforcement Learning},
author = {Eimer, Theresa and Biedenkapp, Andr{\'e} and Hutter, Frank and Lindauer, Marius},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2948--2958},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/eimer21a/eimer21a.pdf},
url = {https://proceedings.mlr.press/v139/eimer21a.html},
abstract = {Reinforcement learning (RL) has made a lot of advances for solving a single problem in a given environment; but learning policies that generalize to unseen variations of a problem remains challenging. To improve sample efficiency for learning on such instances of a problem domain, we present Self-Paced Context Evaluation (SPaCE). Based on self-paced learning, SPaCE automatically generates instance curricula online with little computational overhead. To this end, SPaCE leverages information contained in state values during training to accelerate and improve training performance as well as generalization capabilities to new \tasks from the same problem domain. Nevertheless, SPaCE is independent of the problem domain at hand and can be applied on top of any RL agent with state-value function approximation. We demonstrate SPaCE’s ability to speed up learning of different value-based RL agents on two environments, showing better generalization capabilities and up to 10x faster learning compared to naive approaches such as round robin or SPDRL, as the closest state-of-the-art approach.}
}
@InProceedings{pmlr-v139-elesedy21a,
title = {Provably Strict Generalisation Benefit for Equivariant Models},
author = {Elesedy, Bryn and Zaidi, Sheheryar},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2959--2969},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/elesedy21a/elesedy21a.pdf},
url = {https://proceedings.mlr.press/v139/elesedy21a.html},
abstract = {It is widely believed that engineering a model to be invariant/equivariant improves generalisation. Despite the growing popularity of this approach, a precise characterisation of the generalisation benefit is lacking. By considering the simplest case of linear models, this paper provides the first provably non-zero improvement in generalisation for invariant/equivariant models when the target distribution is invariant/equivariant with respect to a compact group. Moreover, our work reveals an interesting relationship between generalisation, the number of training examples and properties of the group action. Our results rest on an observation of the structure of function spaces under averaging operators which, along with its consequences for feature averaging, may be of independent interest.}
}
@InProceedings{pmlr-v139-emami21a,
title = {Efficient Iterative Amortized Inference for Learning Symmetric and Disentangled Multi-Object Representations},
author = {Emami, Patrick and He, Pan and Ranka, Sanjay and Rangarajan, Anand},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2970--2981},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/emami21a/emami21a.pdf},
url = {https://proceedings.mlr.press/v139/emami21a.html},
abstract = {Unsupervised multi-object representation learning depends on inductive biases to guide the discovery of object-centric representations that generalize. However, we observe that methods for learning these representations are either impractical due to long training times and large memory consumption or forego key inductive biases. In this work, we introduce EfficientMORL, an efficient framework for the unsupervised learning of object-centric representations. We show that optimization challenges caused by requiring both symmetry and disentanglement can in fact be addressed by high-cost iterative amortized inference by designing the framework to minimize its dependence on it. We take a two-stage approach to inference: first, a hierarchical variational autoencoder extracts symmetric and disentangled representations through bottom-up inference, and second, a lightweight network refines the representations with top-down feedback. The number of refinement steps taken during training is reduced following a curriculum, so that at test time with zero steps the model achieves 99.1% of the refined decomposition performance. We demonstrate strong object decomposition and disentanglement on the standard multi-object benchmark while achieving nearly an order of magnitude faster training and test time inference over the previous state-of-the-art model.}
}
@InProceedings{pmlr-v139-emami21b,
title = {Implicit Bias of Linear RNNs},
author = {Emami, Melikasadat and Sahraee-Ardakan, Mojtaba and Pandit, Parthe and Rangan, Sundeep and Fletcher, Alyson K},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2982--2992},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/emami21b/emami21b.pdf},
url = {https://proceedings.mlr.press/v139/emami21b.html},
abstract = {Contemporary wisdom based on empirical studies suggests that standard recurrent neural networks (RNNs) do not perform well on tasks requiring long-term memory. However, RNNs’ poor ability to capture long-term dependencies has not been fully understood. This paper provides a rigorous explanation of this property in the special case of linear RNNs. Although this work is limited to linear RNNs, even these systems have traditionally been difficult to analyze due to their non-linear parameterization. Using recently-developed kernel regime analysis, our main result shows that as the number of hidden units goes to infinity, linear RNNs learned from random initializations are functionally equivalent to a certain weighted 1D-convolutional network. Importantly, the weightings in the equivalent model cause an implicit bias to elements with smaller time lags in the convolution, and hence shorter memory. The degree of this bias depends on the variance of the transition matrix at initialization and is related to the classic exploding and vanishing gradients problem. The theory is validated with both synthetic and real data experiments.}
}
@InProceedings{pmlr-v139-ergen21a,
title = {Global Optimality Beyond Two Layers: Training Deep ReLU Networks via Convex Programs},
author = {Ergen, Tolga and Pilanci, Mert},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {2993--3003},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ergen21a/ergen21a.pdf},
url = {https://proceedings.mlr.press/v139/ergen21a.html},
abstract = {Understanding the fundamental mechanism behind the success of deep neural networks is one of the key challenges in the modern machine learning literature. Despite numerous attempts, a solid theoretical analysis is yet to be developed. In this paper, we develop a novel unified framework to reveal a hidden regularization mechanism through the lens of convex optimization. We first show that the training of multiple three-layer ReLU sub-networks with weight decay regularization can be equivalently cast as a convex optimization problem in a higher dimensional space, where sparsity is enforced via a group $\ell_1$-norm regularization. Consequently, ReLU networks can be interpreted as high dimensional feature selection methods. More importantly, we then prove that the equivalent convex problem can be globally optimized by a standard convex optimization solver with a polynomial-time complexity with respect to the number of samples and data dimension when the width of the network is fixed. Finally, we numerically validate our theoretical results via experiments involving both synthetic and real datasets.}
}
@InProceedings{pmlr-v139-ergen21b,
title = {Revealing the Structure of Deep Neural Networks via Convex Duality},
author = {Ergen, Tolga and Pilanci, Mert},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3004--3014},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ergen21b/ergen21b.pdf},
url = {https://proceedings.mlr.press/v139/ergen21b.html},
abstract = {We study regularized deep neural networks (DNNs) and introduce a convex analytic framework to characterize the structure of the hidden layers. We show that a set of optimal hidden layer weights for a norm regularized DNN training problem can be explicitly found as the extreme points of a convex set. For the special case of deep linear networks, we prove that each optimal weight matrix aligns with the previous layers via duality. More importantly, we apply the same characterization to deep ReLU networks with whitened data and prove the same weight alignment holds. As a corollary, we also prove that norm regularized deep ReLU networks yield spline interpolation for one-dimensional datasets which was previously known only for two-layer networks. Furthermore, we provide closed-form solutions for the optimal layer weights when data is rank-one or whitened. The same analysis also applies to architectures with batch normalization even for arbitrary data. Therefore, we obtain a complete explanation for a recent empirical observation termed Neural Collapse where class means collapse to the vertices of a simplex equiangular tight frame.}
}
@InProceedings{pmlr-v139-ermolov21a,
title = {Whitening for Self-Supervised Representation Learning},
author = {Ermolov, Aleksandr and Siarohin, Aliaksandr and Sangineto, Enver and Sebe, Nicu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3015--3024},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ermolov21a/ermolov21a.pdf},
url = {https://proceedings.mlr.press/v139/ermolov21a.html},
abstract = {Most of the current self-supervised representation learning (SSL) methods are based on the contrastive loss and the instance-discrimination task, where augmented versions of the same image instance ("positives") are contrasted with instances extracted from other images ("negatives"). For the learning to be effective, many negatives should be compared with a positive pair, which is computationally demanding. In this paper, we propose a different direction and a new loss function for SSL, which is based on the whitening of the latent-space features. The whitening operation has a "scattering" effect on the batch samples, avoiding degenerate solutions where all the sample representations collapse to a single point. Our solution does not require asymmetric networks and it is conceptually simple. Moreover, since negatives are not needed, we can extract multiple positive pairs from the same image instance. The source code of the method and of all the experiments is available at: https://github.com/htdt/self-supervised.}
}
@InProceedings{pmlr-v139-errica21a,
title = {Graph Mixture Density Networks},
author = {Errica, Federico and Bacciu, Davide and Micheli, Alessio},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3025--3035},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/errica21a/errica21a.pdf},
url = {https://proceedings.mlr.press/v139/errica21a.html},
abstract = {We introduce the Graph Mixture Density Networks, a new family of machine learning models that can fit multimodal output distributions conditioned on graphs of arbitrary topology. By combining ideas from mixture models and graph representation learning, we address a broader class of challenging conditional density estimation problems that rely on structured data. In this respect, we evaluate our method on a new benchmark application that leverages random graphs for stochastic epidemic simulations. We show a significant improvement in the likelihood of epidemic outcomes when taking into account both multimodality and structure. The empirical analysis is complemented by two real-world regression tasks showing the effectiveness of our approach in modeling the output prediction uncertainty. Graph Mixture Density Networks open appealing research opportunities in the study of structure-dependent phenomena that exhibit non-trivial conditional output distributions.}
}
@InProceedings{pmlr-v139-esfandiari21a,
title = {Cross-Gradient Aggregation for Decentralized Learning from Non-IID Data},
author = {Esfandiari, Yasaman and Tan, Sin Yong and Jiang, Zhanhong and Balu, Aditya and Herron, Ethan and Hegde, Chinmay and Sarkar, Soumik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3036--3046},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/esfandiari21a/esfandiari21a.pdf},
url = {https://proceedings.mlr.press/v139/esfandiari21a.html},
abstract = {Decentralized learning enables a group of collaborative agents to learn models using a distributed dataset without the need for a central parameter server. Recently, decentralized learning algorithms have demonstrated state-of-the-art results on benchmark data sets, comparable with centralized algorithms. However, the key assumption to achieve competitive performance is that the data is independently and identically distributed (IID) among the agents which, in real-life applications, is often not applicable. Inspired by ideas from continual learning, we propose Cross-Gradient Aggregation (CGA), a novel decentralized learning algorithm where (i) each agent aggregates cross-gradient information, i.e., derivatives of its model with respect to its neighbors’ datasets, and (ii) updates its model using a projected gradient based on quadratic programming (QP). We theoretically analyze the convergence characteristics of CGA and demonstrate its efficiency on non-IID data distributions sampled from the MNIST and CIFAR-10 datasets. Our empirical comparisons show superior learning performance of CGA over existing state-of-the-art decentralized learning algorithms, as well as maintaining the improved performance under information compression to reduce peer-to-peer communication overhead. The code is available here on GitHub.}
}
@InProceedings{pmlr-v139-eustratiadis21a,
title = {Weight-covariance alignment for adversarially robust neural networks},
author = {Eustratiadis, Panagiotis and Gouk, Henry and Li, Da and Hospedales, Timothy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3047--3056},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/eustratiadis21a/eustratiadis21a.pdf},
url = {https://proceedings.mlr.press/v139/eustratiadis21a.html},
abstract = {Stochastic Neural Networks (SNNs) that inject noise into their hidden layers have recently been shown to achieve strong robustness against adversarial attacks. However, existing SNNs are usually heuristically motivated, and often rely on adversarial training, which is computationally costly. We propose a new SNN that achieves state-of-the-art performance without relying on adversarial training, and enjoys solid theoretical justification. Specifically, while existing SNNs inject learned or hand-tuned isotropic noise, our SNN learns an anisotropic noise distribution to optimize a learning-theoretic bound on adversarial robustness. We evaluate our method on a number of popular benchmarks, show that it can be applied to different architectures, and that it provides robustness to a variety of white-box and black-box attacks, while being simple and fast to train compared to existing alternatives.}
}
@InProceedings{pmlr-v139-fabian21a,
title = {Data augmentation for deep learning based accelerated MRI reconstruction with limited data},
author = {Fabian, Zalan and Heckel, Reinhard and Soltanolkotabi, Mahdi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3057--3067},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fabian21a/fabian21a.pdf},
url = {https://proceedings.mlr.press/v139/fabian21a.html},
abstract = {Deep neural networks have emerged as very successful tools for image restoration and reconstruction tasks. These networks are often trained end-to-end to directly reconstruct an image from a noisy or corrupted measurement of that image. To achieve state-of-the-art performance, training on large and diverse sets of images is considered critical. However, it is often difficult and/or expensive to collect large amounts of training images. Inspired by the success of Data Augmentation (DA) for classification problems, in this paper, we propose a pipeline for data augmentation for accelerated MRI reconstruction and study its effectiveness at reducing the required training data in a variety of settings. Our DA pipeline, MRAugment, is specifically designed to utilize the invariances present in medical imaging measurements as naive DA strategies that neglect the physics of the problem fail. Through extensive studies on multiple datasets we demonstrate that in the low-data regime DA prevents overfitting and can match or even surpass the state of the art while using significantly fewer training data, whereas in the high-data regime it has diminishing returns. Furthermore, our findings show that DA improves the robustness of the model against various shifts in the test distribution.}
}
@InProceedings{pmlr-v139-fan21a,
title = {Poisson-Randomised DirBN: Large Mutation is Needed in Dirichlet Belief Networks},
author = {Fan, Xuhui and Li, Bin and Li, Yaqiong and Sisson, Scott A.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3068--3077},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fan21a/fan21a.pdf},
url = {https://proceedings.mlr.press/v139/fan21a.html},
abstract = {The Dirichlet Belief Network (DirBN) was recently proposed as a promising deep generative model to learn interpretable deep latent distributions for objects. However, its current representation capability is limited since its latent distributions across different layers is prone to form similar patterns and can thus hardly use multi-layer structure to form flexible distributions. In this work, we propose Poisson-randomised Dirichlet Belief Networks (Pois-DirBN), which allows large mutations for the latent distributions across layers to enlarge the representation capability. Based on our key idea of inserting Poisson random variables in the layer-wise connection, Pois-DirBN first introduces a component-wise propagation mechanism to enable latent distributions to have large variations across different layers. Then, we develop a layer-wise Gibbs sampling algorithm to infer the latent distributions, leading to a larger number of effective layers compared to DirBN. In addition, we integrate out latent distributions and form a multi-stochastic deep integer network, which provides an alternative view on Pois-DirBN. We apply Pois-DirBN to relational modelling and validate its effectiveness through improved link prediction performance and more interpretable latent distribution visualisations. The code can be downloaded at https://github.com/xuhuifan/Pois_DirBN.}
}
@InProceedings{pmlr-v139-fan21b,
title = {Model-based Reinforcement Learning for Continuous Control with Posterior Sampling},
author = {Fan, Ying and Ming, Yifei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3078--3087},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fan21b/fan21b.pdf},
url = {https://proceedings.mlr.press/v139/fan21b.html},
abstract = {Balancing exploration and exploitation is crucial in reinforcement learning (RL). In this paper, we study model-based posterior sampling for reinforcement learning (PSRL) in continuous state-action spaces theoretically and empirically. First, we show the first regret bound of PSRL in continuous spaces which is polynomial in the episode length to the best of our knowledge. With the assumption that reward and transition functions can be modeled by Bayesian linear regression, we develop a regret bound of $\tilde{O}(H^{3/2}d\sqrt{T})$, where $H$ is the episode length, $d$ is the dimension of the state-action space, and $T$ indicates the total time steps. This result matches the best-known regret bound of non-PSRL methods in linear MDPs. Our bound can be extended to nonlinear cases as well with feature embedding: using linear kernels on the feature representation $\phi$, the regret bound becomes $\tilde{O}(H^{3/2}d_{\phi}\sqrt{T})$, where $d_\phi$ is the dimension of the representation space. Moreover, we present MPC-PSRL, a model-based posterior sampling algorithm with model predictive control for action selection. To capture the uncertainty in models, we use Bayesian linear regression on the penultimate layer (the feature representation layer $\phi$) of neural networks. Empirical results show that our algorithm achieves the state-of-the-art sample efficiency in benchmark continuous control tasks compared to prior model-based algorithms, and matches the asymptotic performance of model-free algorithms.}
}
@InProceedings{pmlr-v139-fan21c,
title = {SECANT: Self-Expert Cloning for Zero-Shot Generalization of Visual Policies},
author = {Fan, Linxi and Wang, Guanzhi and Huang, De-An and Yu, Zhiding and Fei-Fei, Li and Zhu, Yuke and Anandkumar, Animashree},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3088--3099},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fan21c/fan21c.pdf},
url = {https://proceedings.mlr.press/v139/fan21c.html},
abstract = {Generalization has been a long-standing challenge for reinforcement learning (RL). Visual RL, in particular, can be easily distracted by irrelevant factors in high-dimensional observation space. In this work, we consider robust policy learning which targets zero-shot generalization to unseen visual environments with large distributional shift. We propose SECANT, a novel self-expert cloning technique that leverages image augmentation in two stages to *decouple* robust representation learning from policy optimization. Specifically, an expert policy is first trained by RL from scratch with weak augmentations. A student network then learns to mimic the expert policy by supervised learning with strong augmentations, making its representation more robust against visual variations compared to the expert. Extensive experiments demonstrate that SECANT significantly advances the state of the art in zero-shot generalization across 4 challenging domains. Our average reward improvements over prior SOTAs are: DeepMind Control (+26.5%), robotic manipulation (+337.8%), vision-based autonomous driving (+47.7%), and indoor object navigation (+15.8%). Code release and video are available at https://linxifan.github.io/secant-site/.}
}
@InProceedings{pmlr-v139-fang21a,
title = {On Estimation in Latent Variable Models},
author = {Fang, Guanhua and Li, Ping},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3100--3110},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fang21a/fang21a.pdf},
url = {https://proceedings.mlr.press/v139/fang21a.html},
abstract = {Latent variable models have been playing a central role in statistics, econometrics, machine learning with applications to repeated observation study, panel data inference, user behavior analysis, etc. In many modern applications, the inference based on latent variable models involves one or several of the following features: the presence of complex latent structure, the observed and latent variables being continuous or discrete, constraints on parameters, and data size being large. Therefore, solving an estimation problem for general latent variable models is highly non-trivial. In this paper, we consider a gradient based method via using variance reduction technique to accelerate estimation procedure. Theoretically, we show the convergence results for the proposed method under general and mild model assumptions. The algorithm has better computational complexity compared with the classical gradient methods and maintains nice statistical properties. Various numerical results corroborate our theory.}
}
@InProceedings{pmlr-v139-fang21b,
title = {On Variational Inference in Biclustering Models},
author = {Fang, Guanhua and Li, Ping},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3111--3121},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fang21b/fang21b.pdf},
url = {https://proceedings.mlr.press/v139/fang21b.html},
abstract = {Biclustering structures exist ubiquitously in data matrices and the biclustering problem was first formalized by John Hartigan (1972) to cluster rows and columns simultaneously. In this paper, we develop a theory for the estimation of general biclustering models, where the data is assumed to follow certain statistical distribution with underlying biclustering structure. Due to the existence of latent variables, directly computing the maximal likelihood estimator is prohibitively difficult in practice and we instead consider the variational inference (VI) approach to solve the parameter estimation problem. Although variational inference method generally has good empirical performance, there are very few theoretical results around VI. In this paper, we obtain the precise estimation bound of variational estimator and show that it matches the minimax rate in terms of estimation error under mild assumptions in biclustering setting. Furthermore, we study the convergence property of the coordinate ascent variational inference algorithm, where both local and global convergence results have been provided. Numerical results validate our new theories.}
}
@InProceedings{pmlr-v139-fang21c,
title = {Learning Bounds for Open-Set Learning},
author = {Fang, Zhen and Lu, Jie and Liu, Anjin and Liu, Feng and Zhang, Guangquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3122--3132},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fang21c/fang21c.pdf},
url = {https://proceedings.mlr.press/v139/fang21c.html},
abstract = {Traditional supervised learning aims to train a classifier in the closed-set world, where training and test samples share the same label space. In this paper, we target a more challenging and re_x0002_alistic setting: open-set learning (OSL), where there exist test samples from the classes that are unseen during training. Although researchers have designed many methods from the algorith_x0002_mic perspectives, there are few methods that pro_x0002_vide generalization guarantees on their ability to achieve consistent performance on different train_x0002_ing samples drawn from the same distribution. Motivated by the transfer learning and probably approximate correct (PAC) theory, we make a bold attempt to study OSL by proving its general_x0002_ization error-given training samples with size n, the estimation error will get close to order Op(1/$\sqrt{}$n). This is the first study to provide a generalization bound for OSL, which we do by theoretically investigating the risk of the tar_x0002_get classifier on unknown classes. According to our theory, a novel algorithm, called auxiliary open-set risk (AOSR) is proposed to address the OSL problem. Experiments verify the efficacy of AOSR. The code is available at github.com/AnjinLiu/Openset_Learning_AOSR.}
}
@InProceedings{pmlr-v139-fang21d,
title = {Streaming Bayesian Deep Tensor Factorization},
author = {Fang, Shikai and Wang, Zheng and Pan, Zhimeng and Liu, Ji and Zhe, Shandian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3133--3142},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fang21d/fang21d.pdf},
url = {https://proceedings.mlr.press/v139/fang21d.html},
abstract = {Despite the success of existing tensor factorization methods, most of them conduct a multilinear decomposition, and rarely exploit powerful modeling frameworks, like deep neural networks, to capture a variety of complicated interactions in data. More important, for highly expressive, deep factorization, we lack an effective approach to handle streaming data, which are ubiquitous in real-world applications. To address these issues, we propose SBTD, a Streaming Bayesian Deep Tensor factorization method. We first use Bayesian neural networks (NNs) to build a deep tensor factorization model. We assign a spike-and-slab prior over each NN weight to encourage sparsity and to prevent overfitting. We then use multivariate Delta’s method and moment matching to approximate the posterior of the NN output and calculate the running model evidence, based on which we develop an efficient streaming posterior inference algorithm in the assumed-density-filtering and expectation propagation framework. Our algorithm provides responsive incremental updates for the posterior of the latent factors and NN weights upon receiving newly observed tensor entries, and meanwhile identify and inhibit redundant/useless weights. We show the advantages of our approach in four real-world applications.}
}
@InProceedings{pmlr-v139-farahmand21a,
title = {PID Accelerated Value Iteration Algorithm},
author = {Farahmand, Amir-Massoud and Ghavamzadeh, Mohammad},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3143--3153},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/farahmand21a/farahmand21a.pdf},
url = {https://proceedings.mlr.press/v139/farahmand21a.html},
abstract = {The convergence rate of Value Iteration (VI), a fundamental procedure in dynamic programming and reinforcement learning, for solving MDPs can be slow when the discount factor is close to one. We propose modifications to VI in order to potentially accelerate its convergence behaviour. The key insight is the realization that the evolution of the value function approximations $(V_k)_{k \geq 0}$ in the VI procedure can be seen as a dynamical system. This opens up the possibility of using techniques from \emph{control theory} to modify, and potentially accelerate, this dynamics. We present such modifications based on simple controllers, such as PD (Proportional-Derivative), PI (Proportional-Integral), and PID. We present the error dynamics of these variants of VI, and provably (for certain classes of MDPs) and empirically (for more general classes) show that the convergence rate can be significantly improved. We also propose a gain adaptation mechanism in order to automatically select the controller gains, and empirically show the effectiveness of this procedure.}
}
@InProceedings{pmlr-v139-farias21a,
title = {Near-Optimal Entrywise Anomaly Detection for Low-Rank Matrices with Sub-Exponential Noise},
author = {Farias, Vivek and Li, Andrew A and Peng, Tianyi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3154--3163},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/farias21a/farias21a.pdf},
url = {https://proceedings.mlr.press/v139/farias21a.html},
abstract = {We study the problem of identifying anomalies in a low-rank matrix observed with sub-exponential noise, motivated by applications in retail and inventory management. State of the art approaches to anomaly detection in low-rank matrices apparently fall short, since they require that non-anomalous entries be observed with vanishingly small noise (which is not the case in our problem, and indeed in many applications). So motivated, we propose a conceptually simple entrywise approach to anomaly detection in low-rank matrices. Our approach accommodates a general class of probabilistic anomaly models. We extend recent work on entrywise error guarantees for matrix completion, establishing such guarantees for sub-exponential matrices, where in addition to missing entries, a fraction of entries are corrupted by (an also unknown) anomaly model. Viewing the anomaly detection as a classification task, to the best of our knowledge, we are the first to achieve the min-max optimal detection rate (up to log factors). Using data from a massive consumer goods retailer, we show that our approach provides significant improvements over incumbent approaches to anomaly detection.}
}
@InProceedings{pmlr-v139-farina21a,
title = {Connecting Optimal Ex-Ante Collusion in Teams to Extensive-Form Correlation: Faster Algorithms and Positive Complexity Results},
author = {Farina, Gabriele and Celli, Andrea and Gatti, Nicola and Sandholm, Tuomas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3164--3173},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/farina21a/farina21a.pdf},
url = {https://proceedings.mlr.press/v139/farina21a.html},
abstract = {We focus on the problem of finding an optimal strategy for a team of players that faces an opponent in an imperfect-information zero-sum extensive-form game. Team members are not allowed to communicate during play but can coordinate before the game. In this setting, it is known that the best the team can do is sample a profile of potentially randomized strategies (one per player) from a joint (a.k.a. correlated) probability distribution at the beginning of the game. In this paper, we first provide new modeling results about computing such an optimal distribution by drawing a connection to a different literature on extensive-form correlation. Second, we provide an algorithm that allows one for capping the number of profiles employed in the solution. This begets an anytime algorithm by increasing the cap. We find that often a handful of well-chosen such profiles suffices to reach optimal utility for the team. This enables team members to reach coordination through a simple and understandable plan. Finally, inspired by this observation and leveraging theoretical concepts that we introduce, we develop an efficient column-generation algorithm for finding an optimal distribution for the team. We evaluate it on a suite of common benchmark games. It is three orders of magnitude faster than the prior state of the art on games that the latter can solve and it can also solve several games that were previously unsolvable.}
}
@InProceedings{pmlr-v139-farnia21a,
title = {Train simultaneously, generalize better: Stability of gradient-based minimax learners},
author = {Farnia, Farzan and Ozdaglar, Asuman},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3174--3185},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/farnia21a/farnia21a.pdf},
url = {https://proceedings.mlr.press/v139/farnia21a.html},
abstract = {The success of minimax learning problems of generative adversarial networks (GANs) has been observed to depend on the minimax optimization algorithm used for their training. This dependence is commonly attributed to the convergence speed and robustness properties of the underlying optimization algorithm. In this paper, we show that the optimization algorithm also plays a key role in the generalization performance of the trained minimax model. To this end, we analyze the generalization properties of standard gradient descent ascent (GDA) and proximal point method (PPM) algorithms through the lens of algorithmic stability as defined by Bousquet & Elisseeff, 2002 under both convex-concave and nonconvex-nonconcave minimax settings. While the GDA algorithm is not guaranteed to have a vanishing excess risk in convex-concave problems, we show the PPM algorithm enjoys a bounded excess risk in the same setup. For nonconvex-nonconcave problems, we compare the generalization performance of stochastic GDA and GDmax algorithms where the latter fully solves the maximization subproblem at every iteration. Our generalization analysis suggests the superiority of GDA provided that the minimization and maximization subproblems are solved simultaneously with similar learning rates. We discuss several numerical results indicating the role of optimization algorithms in the generalization of learned minimax models.}
}
@InProceedings{pmlr-v139-fatras21a,
title = {Unbalanced minibatch Optimal Transport; applications to Domain Adaptation},
author = {Fatras, Kilian and Sejourne, Thibault and Flamary, R{\'e}mi and Courty, Nicolas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3186--3197},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fatras21a/fatras21a.pdf},
url = {https://proceedings.mlr.press/v139/fatras21a.html},
abstract = {Optimal transport distances have found many applications in machine learning for their capacity to compare non-parametric probability distributions. Yet their algorithmic complexity generally prevents their direct use on large scale datasets. Among the possible strategies to alleviate this issue, practitioners can rely on computing estimates of these distances over subsets of data, i.e. minibatches. While computationally appealing, we highlight in this paper some limits of this strategy, arguing it can lead to undesirable smoothing effects. As an alternative, we suggest that the same minibatch strategy coupled with unbalanced optimal transport can yield more robust behaviors. We discuss the associated theoretical properties, such as unbiased estimators, existence of gradients and concentration bounds. Our experimental study shows that in challenging problems associated to domain adaptation, the use of unbalanced optimal transport leads to significantly better results, competing with or surpassing recent baselines.}
}
@InProceedings{pmlr-v139-fei21a,
title = {Risk-Sensitive Reinforcement Learning with Function Approximation: A Debiasing Approach},
author = {Fei, Yingjie and Yang, Zhuoran and Wang, Zhaoran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3198--3207},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fei21a/fei21a.pdf},
url = {https://proceedings.mlr.press/v139/fei21a.html},
abstract = {We study function approximation for episodic reinforcement learning with entropic risk measure. We first propose an algorithm with linear function approximation. Compared to existing algorithms, which suffer from improper regularization and regression biases, this algorithm features debiasing transformations in backward induction and regression procedures. We further propose an algorithm with general function approximation, which features implicit debiasing transformations. We prove that both algorithms achieve a sublinear regret and demonstrate a trade-off between generality and efficiency. Our analysis provides a unified framework for function approximation in risk-sensitive reinforcement learning, which leads to the first sublinear regret bounds in the setting.}
}
@InProceedings{pmlr-v139-feldman21a,
title = {Lossless Compression of Efficient Private Local Randomizers},
author = {Feldman, Vitaly and Talwar, Kunal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3208--3219},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feldman21a/feldman21a.pdf},
url = {https://proceedings.mlr.press/v139/feldman21a.html},
abstract = {Locally Differentially Private (LDP) Reports are commonly used for collection of statistics and machine learning in the federated setting. In many cases the best known LDP algorithms require sending prohibitively large messages from the client device to the server (such as when constructing histograms over a large domain or learning a high-dimensional model). Here we demonstrate a general approach that, under standard cryptographic assumptions, compresses every efficient LDP algorithm with negligible loss in privacy and utility guarantees. The practical implication of our result is that in typical applications every message can be compressed to the size of the server’s pseudo-random generator seed. From this general approach we derive low-communication algorithms for the problems of frequency estimation and high-dimensional mean estimation. Our algorithms are simpler and more accurate than existing low-communication LDP algorithms for these well-studied problems.}
}
@InProceedings{pmlr-v139-feng21a,
title = {Dimensionality Reduction for the Sum-of-Distances Metric},
author = {Feng, Zhili and Kacham, Praneeth and Woodruff, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3220--3229},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21a/feng21a.pdf},
url = {https://proceedings.mlr.press/v139/feng21a.html},
abstract = {We give a dimensionality reduction procedure to approximate the sum of distances of a given set of $n$ points in $R^d$ to any “shape” that lies in a $k$-dimensional subspace. Here, by “shape” we mean any set of points in $R^d$. Our algorithm takes an input in the form of an $n \times d$ matrix $A$, where each row of $A$ denotes a data point, and outputs a subspace $P$ of dimension $O(k^{3}/\epsilon^6)$ such that the projections of each of the $n$ points onto the subspace $P$ and the distances of each of the points to the subspace $P$ are sufficient to obtain an $\epsilon$-approximation to the sum of distances to any arbitrary shape that lies in a $k$-dimensional subspace of $R^d$. These include important problems such as $k$-median, $k$-subspace approximation, and $(j,l)$ subspace clustering with $j \cdot l \leq k$. Dimensionality reduction reduces the data storage requirement to $(n+d)k^{3}/\epsilon^6$ from nnz$(A)$. Here nnz$(A)$ could potentially be as large as $nd$. Our algorithm runs in time nnz$(A)/\epsilon^2 + (n+d)$poly$(k/\epsilon)$, up to logarithmic factors. For dense matrices, where nnz$(A) \approx nd$, we give a faster algorithm, that runs in time $nd + (n+d)$poly$(k/\epsilon)$ up to logarithmic factors. Our dimensionality reduction algorithm can also be used to obtain poly$(k/\epsilon)$ size coresets for $k$-median and $(k,1)$-subspace approximation problems in polynomial time.}
}
@InProceedings{pmlr-v139-feng21b,
title = {Reserve Price Optimization for First Price Auctions in Display Advertising},
author = {Feng, Zhe and Lahaie, Sebastien and Schneider, Jon and Ye, Jinchao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3230--3239},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21b/feng21b.pdf},
url = {https://proceedings.mlr.press/v139/feng21b.html},
abstract = {The display advertising industry has recently transitioned from second- to first-price auctions as its primary mechanism for ad allocation and pricing. In light of this, publishers need to re-evaluate and optimize their auction parameters, notably reserve prices. In this paper, we propose a gradient-based algorithm to adaptively update and optimize reserve prices based on estimates of bidders’ responsiveness to experimental shocks in reserves. Our key innovation is to draw on the inherent structure of the revenue objective in order to reduce the variance of gradient estimates and improve convergence rates in both theory and practice. We show that revenue in a first-price auction can be usefully decomposed into a \emph{demand} component and a \emph{bidding} component, and introduce techniques to reduce the variance of each component. We characterize the bias-variance trade-offs of these techniques and validate the performance of our proposed algorithm through experiments on synthetic data and real display ad auctions data from a major ad exchange.}
}
@InProceedings{pmlr-v139-feng21c,
title = {Uncertainty Principles of Encoding GANs},
author = {Feng, Ruili and Lin, Zhouchen and Zhu, Jiapeng and Zhao, Deli and Zhou, Jingren and Zha, Zheng-Jun},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3240--3251},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21c/feng21c.pdf},
url = {https://proceedings.mlr.press/v139/feng21c.html},
abstract = {The compelling synthesis results of Generative Adversarial Networks (GANs) demonstrate rich semantic knowledge in their latent codes. To obtain this knowledge for downstream applications, encoding GANs has been proposed to learn encoders, such that real world data can be encoded to latent codes, which can be fed to generators to reconstruct those data. However, despite the theoretical guarantees of precise reconstruction in previous works, current algorithms generally reconstruct inputs with non-negligible deviations from inputs. In this paper we study this predicament of encoding GANs, which is indispensable research for the GAN community. We prove three uncertainty principles of encoding GANs in practice: a) the ‘perfect’ encoder and generator cannot be continuous at the same time, which implies that current framework of encoding GANs is ill-posed and needs rethinking; b) neural networks cannot approximate the underlying encoder and generator precisely at the same time, which explains why we cannot get ‘perfect’ encoders and generators as promised in previous theories; c) neural networks cannot be stable and accurate at the same time, which demonstrates the difficulty of training and trade-off between fidelity and disentanglement encountered in previous works. Our work may eliminate gaps between previous theories and empirical results, promote the understanding of GANs, and guide network designs for follow-up works.}
}
@InProceedings{pmlr-v139-feng21d,
title = {Pointwise Binary Classification with Pairwise Confidence Comparisons},
author = {Feng, Lei and Shu, Senlin and Lu, Nan and Han, Bo and Xu, Miao and Niu, Gang and An, Bo and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3252--3262},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21d/feng21d.pdf},
url = {https://proceedings.mlr.press/v139/feng21d.html},
abstract = {To alleviate the data requirement for training effective binary classifiers in binary classification, many weakly supervised learning settings have been proposed. Among them, some consider using pairwise but not pointwise labels, when pointwise labels are not accessible due to privacy, confidentiality, or security reasons. However, as a pairwise label denotes whether or not two data points share a pointwise label, it cannot be easily collected if either point is equally likely to be positive or negative. Thus, in this paper, we propose a novel setting called pairwise comparison (Pcomp) classification, where we have only pairs of unlabeled data that we know one is more likely to be positive than the other. Firstly, we give a Pcomp data generation process, derive an unbiased risk estimator (URE) with theoretical guarantee, and further improve URE using correction functions. Secondly, we link Pcomp classification to noisy-label learning to develop a progressive URE and improve it by imposing consistency regularization. Finally, we demonstrate by experiments the effectiveness of our methods, which suggests Pcomp is a valuable and practically useful type of pairwise supervision besides the pairwise label.}
}
@InProceedings{pmlr-v139-feng21e,
title = {Provably Correct Optimization and Exploration with Non-linear Policies},
author = {Feng, Fei and Yin, Wotao and Agarwal, Alekh and Yang, Lin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3263--3273},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21e/feng21e.pdf},
url = {https://proceedings.mlr.press/v139/feng21e.html},
abstract = {Policy optimization methods remain a powerful workhorse in empirical Reinforcement Learning (RL), with a focus on neural policies that can easily reason over complex and continuous state and/or action spaces. Theoretical understanding of strategic exploration in policy-based methods with non-linear function approximation, however, is largely missing. In this paper, we address this question by designing ENIAC, an actor-critic method that allows non-linear function approximation in the critic. We show that under certain assumptions, e.g., a bounded eluder dimension $d$ for the critic class, the learner finds to a near-optimal policy in $\widetilde{O}(\mathrm{poly}(d))$ exploration rounds. The method is robust to model misspecification and strictly extends existing works on linear function approximation. We also develop some computational optimizations of our approach with slightly worse statistical guarantees, and an empirical adaptation building on existing deep RL tools. We empirically evaluate this adaptation, and show that it outperforms prior heuristics inspired by linear methods, establishing the value in correctly reasoning about the agent’s uncertainty under non-linear function approximation.}
}
@InProceedings{pmlr-v139-feng21f,
title = {KD3A: Unsupervised Multi-Source Decentralized Domain Adaptation via Knowledge Distillation},
author = {Feng, Haozhe and You, Zhaoyang and Chen, Minghao and Zhang, Tianye and Zhu, Minfeng and Wu, Fei and Wu, Chao and Chen, Wei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3274--3283},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21f/feng21f.pdf},
url = {https://proceedings.mlr.press/v139/feng21f.html},
abstract = {Conventional unsupervised multi-source domain adaptation (UMDA) methods assume all source domains can be accessed directly. However, this assumption neglects the privacy-preserving policy, where all the data and computations must be kept decentralized. There exist three challenges in this scenario: (1) Minimizing the domain distance requires the pairwise calculation of the data from the source and target domains, while the data on the source domain is not available. (2) The communication cost and privacy security limit the application of existing UMDA methods, such as the domain adversarial training. (3) Since users cannot govern the data quality, the irrelevant or malicious source domains are more likely to appear, which causes negative transfer. To address the above problems, we propose a privacy-preserving UMDA paradigm named Knowledge Distillation based Decentralized Domain Adaptation (KD3A), which performs domain adaptation through the knowledge distillation on models from different source domains. The extensive experiments show that KD3A significantly outperforms state-of-the-art UMDA approaches. Moreover, the KD3A is robust to the negative transfer and brings a 100x reduction of communication cost compared with other decentralized UMDA methods.}
}
@InProceedings{pmlr-v139-feng21g,
title = {Understanding Noise Injection in GANs},
author = {Feng, Ruili and Zhao, Deli and Zha, Zheng-Jun},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3284--3293},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/feng21g/feng21g.pdf},
url = {https://proceedings.mlr.press/v139/feng21g.html},
abstract = {Noise injection is an effective way of circumventing overfitting and enhancing generalization in machine learning, the rationale of which has been validated in deep learning as well. Recently, noise injection exhibits surprising effectiveness when generating high-fidelity images in Generative Adversarial Networks (GANs) (e.g. StyleGAN). Despite its successful applications in GANs, the mechanism of its validity is still unclear. In this paper, we propose a geometric framework to theoretically analyze the role of noise injection in GANs. First, we point out the existence of the adversarial dimension trap inherent in GANs, which leads to the difficulty of learning a proper generator. Second, we successfully model the noise injection framework with exponential maps based on Riemannian geometry. Guided by our theories, we propose a general geometric realization for noise injection. Under our novel framework, the simple noise injection used in StyleGAN reduces to the Euclidean case. The goal of our work is to make theoretical steps towards understanding the underlying mechanism of state-of-the-art GAN algorithms. Experiments on image generation and GAN inversion validate our theory in practice.}
}
@InProceedings{pmlr-v139-fey21a,
title = {GNNAutoScale: Scalable and Expressive Graph Neural Networks via Historical Embeddings},
author = {Fey, Matthias and Lenssen, Jan E. and Weichert, Frank and Leskovec, Jure},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3294--3304},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fey21a/fey21a.pdf},
url = {https://proceedings.mlr.press/v139/fey21a.html},
abstract = {We present GNNAutoScale (GAS), a framework for scaling arbitrary message-passing GNNs to large graphs. GAS prunes entire sub-trees of the computation graph by utilizing historical embeddings from prior training iterations, leading to constant GPU memory consumption in respect to input node size without dropping any data. While existing solutions weaken the expressive power of message passing due to sub-sampling of edges or non-trainable propagations, our approach is provably able to maintain the expressive power of the original GNN. We achieve this by providing approximation error bounds of historical embeddings and show how to tighten them in practice. Empirically, we show that the practical realization of our framework, PyGAS, an easy-to-use extension for PyTorch Geometric, is both fast and memory-efficient, learns expressive node representations, closely resembles the performance of their non-scaling counterparts, and reaches state-of-the-art performance on large-scale graphs.}
}
@InProceedings{pmlr-v139-filos21a,
title = {PsiPhi-Learning: Reinforcement Learning with Demonstrations using Successor Features and Inverse Temporal Difference Learning},
author = {Filos, Angelos and Lyle, Clare and Gal, Yarin and Levine, Sergey and Jaques, Natasha and Farquhar, Gregory},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3305--3317},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/filos21a/filos21a.pdf},
url = {https://proceedings.mlr.press/v139/filos21a.html},
abstract = {We study reinforcement learning (RL) with no-reward demonstrations, a setting in which an RL agent has access to additional data from the interaction of other agents with the same environment. However, it has no access to the rewards or goals of these agents, and their objectives and levels of expertise may vary widely. These assumptions are common in multi-agent settings, such as autonomous driving. To effectively use this data, we turn to the framework of successor features. This allows us to disentangle shared features and dynamics of the environment from agent-specific rewards and policies. We propose a multi-task inverse reinforcement learning (IRL) algorithm, called \emph{inverse temporal difference learning} (ITD), that learns shared state features, alongside per-agent successor features and preference vectors, purely from demonstrations without reward labels. We further show how to seamlessly integrate ITD with learning from online environment interactions, arriving at a novel algorithm for reinforcement learning with demonstrations, called $\Psi \Phi$-learning (pronounced ‘Sci-Fi’). We provide empirical evidence for the effectiveness of $\Psi \Phi$-learning as a method for improving RL, IRL, imitation, and few-shot transfer, and derive worst-case bounds for its performance in zero-shot transfer to new tasks.}
}
@InProceedings{pmlr-v139-finzi21a,
title = {A Practical Method for Constructing Equivariant Multilayer Perceptrons for Arbitrary Matrix Groups},
author = {Finzi, Marc and Welling, Max and Wilson, Andrew Gordon Gordon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3318--3328},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/finzi21a/finzi21a.pdf},
url = {https://proceedings.mlr.press/v139/finzi21a.html},
abstract = {Symmetries and equivariance are fundamental to the generalization of neural networks on domains such as images, graphs, and point clouds. Existing work has primarily focused on a small number of groups, such as the translation, rotation, and permutation groups. In this work we provide a completely general algorithm for solving for the equivariant layers of matrix groups. In addition to recovering solutions from other works as special cases, we construct multilayer perceptrons equivariant to multiple groups that have never been tackled before, including $\mathrm{O}(1,3)$, $\mathrm{O}(5)$, $\mathrm{Sp}(n)$, and the Rubik’s cube group. Our approach outperforms non-equivariant baselines, with applications to particle physics and modeling dynamical systems. We release our software library to enable researchers to construct equivariant layers for arbitrary}
}
@InProceedings{pmlr-v139-fisch21a,
title = {Few-Shot Conformal Prediction with Auxiliary Tasks},
author = {Fisch, Adam and Schuster, Tal and Jaakkola, Tommi and Barzilay, Dr.Regina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3329--3339},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fisch21a/fisch21a.pdf},
url = {https://proceedings.mlr.press/v139/fisch21a.html},
abstract = {We develop a novel approach to conformal prediction when the target task has limited data available for training. Conformal prediction identifies a small set of promising output candidates in place of a single prediction, with guarantees that the set contains the correct answer with high probability. When training data is limited, however, the predicted set can easily become unusably large. In this work, we obtain substantially tighter prediction sets while maintaining desirable marginal guarantees by casting conformal prediction as a meta-learning paradigm over exchangeable collections of auxiliary tasks. Our conformalization algorithm is simple, fast, and agnostic to the choice of underlying model, learning algorithm, or dataset. We demonstrate the effectiveness of this approach across a number of few-shot classification and regression tasks in natural language processing, computer vision, and computational chemistry for drug discovery.}
}
@InProceedings{pmlr-v139-fischer21a,
title = {Scalable Certified Segmentation via Randomized Smoothing},
author = {Fischer, Marc and Baader, Maximilian and Vechev, Martin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3340--3351},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fischer21a/fischer21a.pdf},
url = {https://proceedings.mlr.press/v139/fischer21a.html},
abstract = {We present a new certification method for image and point cloud segmentation based on randomized smoothing. The method leverages a novel scalable algorithm for prediction and certification that correctly accounts for multiple testing, necessary for ensuring statistical guarantees. The key to our approach is reliance on established multiple-testing correction mechanisms as well as the ability to abstain from classifying single pixels or points while still robustly segmenting the overall input. Our experimental evaluation on synthetic data and challenging datasets, such as Pascal Context, Cityscapes, and ShapeNet, shows that our algorithm can achieve, for the first time, competitive accuracy and certification guarantees on real-world segmentation tasks. We provide an implementation at https://github.com/eth-sri/segmentation-smoothing.}
}
@InProceedings{pmlr-v139-fischer21b,
title = {What’s in the Box? Exploring the Inner Life of Neural Networks with Robust Rules},
author = {Fischer, Jonas and Olah, Anna and Vreeken, Jilles},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3352--3362},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fischer21b/fischer21b.pdf},
url = {https://proceedings.mlr.press/v139/fischer21b.html},
abstract = {We propose a novel method for exploring how neurons within neural networks interact. In particular, we consider activation values of a network for given data, and propose to mine noise-robust rules of the form X {\rightarrow} Y , where X and Y are sets of neurons in different layers. We identify the best set of rules by the Minimum Description Length Principle as the rules that together are most descriptive of the activation data. To learn good rule sets in practice, we propose the unsupervised ExplaiNN algorithm. Extensive evaluation shows that the patterns it discovers give clear insight in how networks perceive the world: they identify shared, respectively class-specific traits, compositionality within the network, as well as locality in convolutional layers. Moreover, these patterns are not only easily interpretable, but also supercharge prototyping as they identify which groups of neurons to consider in unison.}
}
@InProceedings{pmlr-v139-flaspohler21a,
title = {Online Learning with Optimism and Delay},
author = {Flaspohler, Genevieve E and Orabona, Francesco and Cohen, Judah and Mouatadid, Soukayna and Oprescu, Miruna and Orenstein, Paulo and Mackey, Lester},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3363--3373},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/flaspohler21a/flaspohler21a.pdf},
url = {https://proceedings.mlr.press/v139/flaspohler21a.html},
abstract = {Inspired by the demands of real-time climate and weather forecasting, we develop optimistic online learning algorithms that require no parameter tuning and have optimal regret guarantees under delayed feedback. Our algorithms—DORM, DORM+, and AdaHedgeD—arise from a novel reduction of delayed online learning to optimistic online learning that reveals how optimistic hints can mitigate the regret penalty caused by delay. We pair this delay-as-optimism perspective with a new analysis of optimistic learning that exposes its robustness to hinting errors and a new meta-algorithm for learning effective hinting strategies in the presence of delay. We conclude by benchmarking our algorithms on four subseasonal climate forecasting tasks, demonstrating low regret relative to state-of-the-art forecasting models.}
}
@InProceedings{pmlr-v139-fontaine21a,
title = {Online A-Optimal Design and Active Linear Regression},
author = {Fontaine, Xavier and Perrault, Pierre and Valko, Michal and Perchet, Vianney},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3374--3383},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fontaine21a/fontaine21a.pdf},
url = {https://proceedings.mlr.press/v139/fontaine21a.html},
abstract = {We consider in this paper the problem of optimal experiment design where a decision maker can choose which points to sample to obtain an estimate $\hat{\beta}$ of the hidden parameter $\beta^{\star}$ of an underlying linear model. The key challenge of this work lies in the heteroscedasticity assumption that we make, meaning that each covariate has a different and unknown variance. The goal of the decision maker is then to figure out on the fly the optimal way to allocate the total budget of $T$ samples between covariates, as sampling several times a specific one will reduce the variance of the estimated model around it (but at the cost of a possible higher variance elsewhere). By trying to minimize the $\ell^2$-loss $\mathbb{E} [\lVert\hat{\beta}-\beta^{\star}\rVert^2]$ the decision maker is actually minimizing the trace of the covariance matrix of the problem, which corresponds then to online A-optimal design. Combining techniques from bandit and convex optimization we propose a new active sampling algorithm and we compare it with existing ones. We provide theoretical guarantees of this algorithm in different settings, including a $\mathcal{O}(T^{-2})$ regret bound in the case where the covariates form a basis of the feature space, generalizing and improving existing results. Numerical experiments validate our theoretical findings.}
}
@InProceedings{pmlr-v139-foster21a,
title = {Deep Adaptive Design: Amortizing Sequential Bayesian Experimental Design},
author = {Foster, Adam and Ivanova, Desi R and Malik, Ilyas and Rainforth, Tom},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3384--3395},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/foster21a/foster21a.pdf},
url = {https://proceedings.mlr.press/v139/foster21a.html},
abstract = {We introduce Deep Adaptive Design (DAD), a method for amortizing the cost of adaptive Bayesian experimental design that allows experiments to be run in real-time. Traditional sequential Bayesian optimal experimental design approaches require substantial computation at each stage of the experiment. This makes them unsuitable for most real-world applications, where decisions must typically be made quickly. DAD addresses this restriction by learning an amortized design network upfront and then using this to rapidly run (multiple) adaptive experiments at deployment time. This network represents a design policy which takes as input the data from previous steps, and outputs the next design using a single forward pass; these design decisions can be made in milliseconds during the live experiment. To train the network, we introduce contrastive information bounds that are suitable objectives for the sequential setting, and propose a customized network architecture that exploits key symmetries. We demonstrate that DAD successfully amortizes the process of experimental design, outperforming alternative strategies on a number of problems.}
}
@InProceedings{pmlr-v139-fotakis21a,
title = {Efficient Online Learning for Dynamic k-Clustering},
author = {Fotakis, Dimitris and Piliouras, Georgios and Skoulakis, Stratis},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3396--3406},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fotakis21a/fotakis21a.pdf},
url = {https://proceedings.mlr.press/v139/fotakis21a.html},
abstract = {In this work, we study dynamic clustering problems from the perspective of online learning. We consider an online learning problem, called \textit{Dynamic $k$-Clustering}, in which $k$ centers are maintained in a metric space over time (centers may change positions) such as a dynamically changing set of $r$ clients is served in the best possible way. The connection cost at round $t$ is given by the \textit{$p$-norm} of the vector formed by the distance of each client to its closest center at round $t$, for some $p\geq 1$. We design a \textit{$\Theta\left( \min(k,r) \right)$-regret} polynomial-time online learning algorithm, while we show that, under some well-established computational complexity conjectures, \textit{constant-regret} cannot be achieved in polynomial-time. In addition to the efficient solution of Dynamic $k$-Clustering, our work contributes to the long line of research of combinatorial online learning.}
}
@InProceedings{pmlr-v139-fraboni21a,
title = {Clustered Sampling: Low-Variance and Improved Representativity for Clients Selection in Federated Learning},
author = {Fraboni, Yann and Vidal, Richard and Kameni, Laetitia and Lorenzi, Marco},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3407--3416},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fraboni21a/fraboni21a.pdf},
url = {https://proceedings.mlr.press/v139/fraboni21a.html},
abstract = {This work addresses the problem of optimizing communications between server and clients in federated learning (FL). Current sampling approaches in FL are either biased, or non optimal in terms of server-clients communications and training stability. To overcome this issue, we introduce clustered sampling for clients selection. We prove that clustered sampling leads to better clients representatitivity and to reduced variance of the clients stochastic aggregation weights in FL. Compatibly with our theory, we provide two different clustering approaches enabling clients aggregation based on 1) sample size, and 2) models similarity. Through a series of experiments in non-iid and unbalanced scenarios, we demonstrate that model aggregation through clustered sampling consistently leads to better training convergence and variability when compared to standard sampling approaches. Our approach does not require any additional operation on the clients side, and can be seamlessly integrated in standard FL implementations. Finally, clustered sampling is compatible with existing methods and technologies for privacy enhancement, and for communication reduction through model compression.}
}
@InProceedings{pmlr-v139-frei21a,
title = {Agnostic Learning of Halfspaces with Gradient Descent via Soft Margins},
author = {Frei, Spencer and Cao, Yuan and Gu, Quanquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3417--3426},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/frei21a/frei21a.pdf},
url = {https://proceedings.mlr.press/v139/frei21a.html},
abstract = {We analyze the properties of gradient descent on convex surrogates for the zero-one loss for the agnostic learning of halfspaces. We show that when a quantity we refer to as the \textit{soft margin} is well-behaved—a condition satisfied by log-concave isotropic distributions among others—minimizers of convex surrogates for the zero-one loss are approximate minimizers for the zero-one loss itself. As standard convex optimization arguments lead to efficient guarantees for minimizing convex surrogates of the zero-one loss, our methods allow for the first positive guarantees for the classification error of halfspaces learned by gradient descent using the binary cross-entropy or hinge loss in the presence of agnostic label noise.}
}
@InProceedings{pmlr-v139-frei21b,
title = {Provable Generalization of SGD-trained Neural Networks of Any Width in the Presence of Adversarial Label Noise},
author = {Frei, Spencer and Cao, Yuan and Gu, Quanquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3427--3438},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/frei21b/frei21b.pdf},
url = {https://proceedings.mlr.press/v139/frei21b.html},
abstract = {We consider a one-hidden-layer leaky ReLU network of arbitrary width trained by stochastic gradient descent (SGD) following an arbitrary initialization. We prove that SGD produces neural networks that have classification accuracy competitive with that of the best halfspace over the distribution for a broad class of distributions that includes log-concave isotropic and hard margin distributions. Equivalently, such networks can generalize when the data distribution is linearly separable but corrupted with adversarial label noise, despite the capacity to overfit. To the best of our knowledge, this is the first work to show that overparameterized neural networks trained by SGD can generalize when the data is corrupted with adversarial label noise.}
}
@InProceedings{pmlr-v139-freidling21a,
title = {Post-selection inference with HSIC-Lasso},
author = {Freidling, Tobias and Poignard, Benjamin and Climente-Gonz{\'a}lez, H{\'e}ctor and Yamada, Makoto},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3439--3448},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/freidling21a/freidling21a.pdf},
url = {https://proceedings.mlr.press/v139/freidling21a.html},
abstract = {Detecting influential features in non-linear and/or high-dimensional data is a challenging and increasingly important task in machine learning. Variable selection methods have thus been gaining much attention as well as post-selection inference. Indeed, the selected features can be significantly flawed when the selection procedure is not accounted for. We propose a selective inference procedure using the so-called model-free "HSIC-Lasso" based on the framework of truncated Gaussians combined with the polyhedral lemma. We then develop an algorithm, which allows for low computational costs and provides a selection of the regularisation parameter. The performance of our method is illustrated by both artificial and real-world data based experiments, which emphasise a tight control of the type-I error, even for small sample sizes.}
}
@InProceedings{pmlr-v139-frerix21a,
title = {Variational Data Assimilation with a Learned Inverse Observation Operator},
author = {Frerix, Thomas and Kochkov, Dmitrii and Smith, Jamie and Cremers, Daniel and Brenner, Michael and Hoyer, Stephan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3449--3458},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/frerix21a/frerix21a.pdf},
url = {https://proceedings.mlr.press/v139/frerix21a.html},
abstract = {Variational data assimilation optimizes for an initial state of a dynamical system such that its evolution fits observational data. The physical model can subsequently be evolved into the future to make predictions. This principle is a cornerstone of large scale forecasting applications such as numerical weather prediction. As such, it is implemented in current operational systems of weather forecasting agencies across the globe. However, finding a good initial state poses a difficult optimization problem in part due to the non-invertible relationship between physical states and their corresponding observations. We learn a mapping from observational data to physical states and show how it can be used to improve optimizability. We employ this mapping in two ways: to better initialize the non-convex optimization problem, and to reformulate the objective function in better behaved physics space instead of observation space. Our experimental results for the Lorenz96 model and a two-dimensional turbulent fluid flow demonstrate that this procedure significantly improves forecast quality for chaotic systems.}
}
@InProceedings{pmlr-v139-frohlich21a,
title = {Bayesian Quadrature on Riemannian Data Manifolds},
author = {Fr{\"o}hlich, Christian and Gessner, Alexandra and Hennig, Philipp and Sch{\"o}lkopf, Bernhard and Arvanitidis, Georgios},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3459--3468},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/frohlich21a/frohlich21a.pdf},
url = {https://proceedings.mlr.press/v139/frohlich21a.html},
abstract = {Riemannian manifolds provide a principled way to model nonlinear geometric structure inherent in data. A Riemannian metric on said manifolds determines geometry-aware shortest paths and provides the means to define statistical models accordingly. However, these operations are typically computationally demanding. To ease this computational burden, we advocate probabilistic numerical methods for Riemannian statistics. In particular, we focus on Bayesian quadrature (BQ) to numerically compute integrals over normal laws on Riemannian manifolds learned from data. In this task, each function evaluation relies on the solution of an expensive initial value problem. We show that by leveraging both prior knowledge and an active exploration scheme, BQ significantly reduces the number of required evaluations and thus outperforms Monte Carlo methods on a wide range of integration problems. As a concrete application, we highlight the merits of adopting Riemannian geometry with our proposed framework on a nonlinear dataset from molecular dynamics.}
}
@InProceedings{pmlr-v139-fu21a,
title = {Learn-to-Share: A Hardware-friendly Transfer Learning Framework Exploiting Computation and Parameter Sharing},
author = {Fu, Cheng and Huang, Hanxian and Chen, Xinyun and Tian, Yuandong and Zhao, Jishen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3469--3479},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fu21a/fu21a.pdf},
url = {https://proceedings.mlr.press/v139/fu21a.html},
abstract = {Task-specific fine-tuning on pre-trained transformers has achieved performance breakthroughs in multiple NLP tasks. Yet, as both computation and parameter size grows linearly with the number of sub-tasks, it is increasingly difficult to adopt such methods to the real world due to unrealistic memory and computation overhead on computing devices. Previous works on fine-tuning focus on reducing the growing parameter size to save storage cost by parameter sharing. However, compared to storage, the constraint of computation is a more critical issue with the fine-tuning models in modern computing environments. In this work, we propose LeTS, a framework that leverages both computation and parameter sharing across multiple tasks. Compared to traditional fine-tuning, LeTS proposes a novel neural architecture that contains a fixed pre-trained transformer model, plus learnable additive components for sub-tasks. The learnable components reuse the intermediate activations in the fixed pre-trained model, decoupling computation dependency. Differentiable neural architecture search is used to determine a task-specific computation sharing scheme, and a novel early stage pruning is applied to additive components for sparsity to achieve parameter sharing. Extensive experiments show that with 1.4% of extra parameters per task, LeTS reduces the computation by 49.5% on GLUE benchmarks with only 0.2% accuracy loss compared to full fine-tuning.}
}
@InProceedings{pmlr-v139-fu21b,
title = {Learning Task Informed Abstractions},
author = {Fu, Xiang and Yang, Ge and Agrawal, Pulkit and Jaakkola, Tommi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3480--3491},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fu21b/fu21b.pdf},
url = {https://proceedings.mlr.press/v139/fu21b.html},
abstract = {Current model-based reinforcement learning methods struggle when operating from complex visual scenes due to their inability to prioritize task-relevant features. To mitigate this problem, we propose learning Task Informed Abstractions (TIA) that explicitly separates reward-correlated visual features from distractors. For learning TIA, we introduce the formalism of Task Informed MDP (TiMDP) that is realized by training two models that learn visual features via cooperative reconstruction, but one model is adversarially dissociated from the reward signal. Empirical evaluation shows that TIA leads to significant performance gains over state-of-the-art methods on many visual control tasks where natural and unconstrained visual distractions pose a formidable challenge. Project page: https://xiangfu.co/tia}
}
@InProceedings{pmlr-v139-fu21c,
title = {Double-Win Quant: Aggressively Winning Robustness of Quantized Deep Neural Networks via Random Precision Training and Inference},
author = {Fu, Yonggan and Yu, Qixuan and Li, Meng and Chandra, Vikas and Lin, Yingyan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3492--3504},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fu21c/fu21c.pdf},
url = {https://proceedings.mlr.press/v139/fu21c.html},
abstract = {Quantization is promising in enabling powerful yet complex deep neural networks (DNNs) to be deployed into resource constrained platforms. However, quantized DNNs are vulnerable to adversarial attacks unless being equipped with sophisticated techniques, leading to a dilemma of struggling between DNNs’ efficiency and robustness. In this work, we demonstrate a new perspective regarding quantization’s role in DNNs’ robustness, advocating that quantization can be leveraged to largely boost DNNs’ robustness, and propose a framework dubbed Double-Win Quant that can boost the robustness of quantized DNNs over their full precision counterparts by a large margin. Specifically, we for the first time identify that when an adversarially trained model is quantized to different precisions in a post-training manner, the associated adversarial attacks transfer poorly between different precisions. Leveraging this intriguing observation, we further develop Double-Win Quant integrating random precision inference and training to further reduce and utilize the poor adversarial transferability, enabling an aggressive “win-win" in terms of DNNs’ robustness and efficiency. Extensive experiments and ablation studies consistently validate Double-Win Quant’s effectiveness and advantages over state-of-the-art (SOTA) adversarial training methods across various attacks/models/datasets. Our codes are available at: https://github.com/RICE-EIC/Double-Win-Quant.}
}
@InProceedings{pmlr-v139-fu21d,
title = {Auto-NBA: Efficient and Effective Search Over the Joint Space of Networks, Bitwidths, and Accelerators},
author = {Fu, Yonggan and Zhang, Yongan and Zhang, Yang and Cox, David and Lin, Yingyan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3505--3517},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fu21d/fu21d.pdf},
url = {https://proceedings.mlr.press/v139/fu21d.html},
abstract = {While maximizing deep neural networks’ (DNNs’) acceleration efficiency requires a joint search/design of three different yet highly coupled aspects, including the networks, bitwidths, and accelerators, the challenges associated with such a joint search have not yet been fully understood and addressed. The key challenges include (1) the dilemma of whether to explode the memory consumption due to the huge joint space or achieve sub-optimal designs, (2) the discrete nature of the accelerator design space that is coupled yet different from that of the networks and bitwidths, and (3) the chicken and egg problem associated with network-accelerator co-search, i.e., co-search requires operation-wise hardware cost, which is lacking during search as the optimal accelerator depending on the whole network is still unknown during search. To tackle these daunting challenges towards optimal and fast development of DNN accelerators, we propose a framework dubbed Auto-NBA to enable jointly searching for the Networks, Bitwidths, and Accelerators, by efficiently localizing the optimal design within the huge joint design space for each target dataset and acceleration specification. Our Auto-NBA integrates a heterogeneous sampling strategy to achieve unbiased search with constant memory consumption, and a novel joint-search pipeline equipped with a generic differentiable accelerator search engine. Extensive experiments and ablation studies validate that both Auto-NBA generated networks and accelerators consistently outperform state-of-the-art designs (including co-search/exploration techniques, hardware-aware NAS methods, and DNN accelerators), in terms of search time, task accuracy, and accelerator efficiency. Our codes are available at: https://github.com/RICE-EIC/Auto-NBA.}
}
@InProceedings{pmlr-v139-fujimoto21a,
title = {A Deep Reinforcement Learning Approach to Marginalized Importance Sampling with the Successor Representation},
author = {Fujimoto, Scott and Meger, David and Precup, Doina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3518--3529},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fujimoto21a/fujimoto21a.pdf},
url = {https://proceedings.mlr.press/v139/fujimoto21a.html},
abstract = {Marginalized importance sampling (MIS), which measures the density ratio between the state-action occupancy of a target policy and that of a sampling distribution, is a promising approach for off-policy evaluation. However, current state-of-the-art MIS methods rely on complex optimization tricks and succeed mostly on simple toy problems. We bridge the gap between MIS and deep reinforcement learning by observing that the density ratio can be computed from the successor representation of the target policy. The successor representation can be trained through deep reinforcement learning methodology and decouples the reward optimization from the dynamics of the environment, making the resulting algorithm stable and applicable to high-dimensional domains. We evaluate the empirical performance of our approach on a variety of challenging Atari and MuJoCo environments.}
}
@InProceedings{pmlr-v139-fumero21a,
title = {Learning disentangled representations via product manifold projection},
author = {Fumero, Marco and Cosmo, Luca and Melzi, Simone and Rodola, Emanuele},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3530--3540},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/fumero21a/fumero21a.pdf},
url = {https://proceedings.mlr.press/v139/fumero21a.html},
abstract = {We propose a novel approach to disentangle the generative factors of variation underlying a given set of observations. Our method builds upon the idea that the (unknown) low-dimensional manifold underlying the data space can be explicitly modeled as a product of submanifolds. This definition of disentanglement gives rise to a novel weakly-supervised algorithm for recovering the unknown explanatory factors behind the data. At training time, our algorithm only requires pairs of non i.i.d. data samples whose elements share at least one, possibly multidimensional, generative factor of variation. We require no knowledge on the nature of these transformations, and do not make any limiting assumption on the properties of each subspace. Our approach is easy to implement, and can be successfully applied to different kinds of data (from images to 3D surfaces) undergoing arbitrary transformations. In addition to standard synthetic benchmarks, we showcase our method in challenging real-world applications, where we compare favorably with the state of the art.}
}
@InProceedings{pmlr-v139-furuta21a,
title = {Policy Information Capacity: Information-Theoretic Measure for Task Complexity in Deep Reinforcement Learning},
author = {Furuta, Hiroki and Matsushima, Tatsuya and Kozuno, Tadashi and Matsuo, Yutaka and Levine, Sergey and Nachum, Ofir and Gu, Shixiang Shane},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3541--3552},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/furuta21a/furuta21a.pdf},
url = {https://proceedings.mlr.press/v139/furuta21a.html},
abstract = {Progress in deep reinforcement learning (RL) research is largely enabled by benchmark task environments. However, analyzing the nature of those environments is often overlooked. In particular, we still do not have agreeable ways to measure the difficulty or solvability of a task, given that each has fundamentally different actions, observations, dynamics, rewards, and can be tackled with diverse RL algorithms. In this work, we propose policy information capacity (PIC) – the mutual information between policy parameters and episodic return – and policy-optimal information capacity (POIC) – between policy parameters and episodic optimality – as two environment-agnostic, algorithm-agnostic quantitative metrics for task difficulty. Evaluating our metrics across toy environments as well as continuous control benchmark tasks from OpenAI Gym and DeepMind Control Suite, we empirically demonstrate that these information-theoretic metrics have higher correlations with normalized task solvability scores than a variety of alternatives. Lastly, we show that these metrics can also be used for fast and compute-efficient optimizations of key design parameters such as reward shaping, policy architectures, and MDP properties for better solvability by RL algorithms without ever running full RL experiments.}
}
@InProceedings{pmlr-v139-gao21a,
title = {An Information-Geometric Distance on the Space of Tasks},
author = {Gao, Yansong and Chaudhari, Pratik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3553--3563},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gao21a/gao21a.pdf},
url = {https://proceedings.mlr.press/v139/gao21a.html},
abstract = {This paper prescribes a distance between learning tasks modeled as joint distributions on data and labels. Using tools in information geometry, the distance is defined to be the length of the shortest weight trajectory on a Riemannian manifold as a classifier is fitted on an interpolated task. The interpolated task evolves from the source to the target task using an optimal transport formulation. This distance, which we call the "coupled transfer distance" can be compared across different classifier architectures. We develop an algorithm to compute the distance which iteratively transports the marginal on the data of the source task to that of the target task while updating the weights of the classifier to track this evolving data distribution. We develop theory to show that our distance captures the intuitive idea that a good transfer trajectory is the one that keeps the generalization gap small during transfer, in particular at the end on the target task. We perform thorough empirical validation and analysis across diverse image classification datasets to show that the coupled transfer distance correlates strongly with the difficulty of fine-tuning.}
}
@InProceedings{pmlr-v139-gao21b,
title = {Maximum Mean Discrepancy Test is Aware of Adversarial Attacks},
author = {Gao, Ruize and Liu, Feng and Zhang, Jingfeng and Han, Bo and Liu, Tongliang and Niu, Gang and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3564--3575},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gao21b/gao21b.pdf},
url = {https://proceedings.mlr.press/v139/gao21b.html},
abstract = {The maximum mean discrepancy (MMD) test could in principle detect any distributional discrepancy between two datasets. However, it has been shown that the MMD test is unaware of adversarial attacks–the MMD test failed to detect the discrepancy between natural data and adversarial data. Given this phenomenon, we raise a question: are natural and adversarial data really from different distributions? The answer is affirmative–the previous use of the MMD test on the purpose missed three key factors, and accordingly, we propose three components. Firstly, the Gaussian kernel has limited representation power, and we replace it with an effective deep kernel. Secondly, the test power of the MMD test was neglected, and we maximize it following asymptotic statistics. Finally, adversarial data may be non-independent, and we overcome this issue with the help of wild bootstrap. By taking care of the three factors, we verify that the MMD test is aware of adversarial attacks, which lights up a novel road for adversarial data detection based on two-sample tests.}
}
@InProceedings{pmlr-v139-gao21c,
title = {Unsupervised Co-part Segmentation through Assembly},
author = {Gao, Qingzhe and Wang, Bin and Liu, Libin and Chen, Baoquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3576--3586},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gao21c/gao21c.pdf},
url = {https://proceedings.mlr.press/v139/gao21c.html},
abstract = {Co-part segmentation is an important problem in computer vision for its rich applications. We propose an unsupervised learning approach for co-part segmentation from images. For the training stage, we leverage motion information embedded in videos and explicitly extract latent representations to segment meaningful object parts. More importantly, we introduce a dual procedure of part-assembly to form a closed loop with part-segmentation, enabling an effective self-supervision. We demonstrate the effectiveness of our approach with a host of extensive experiments, ranging from human bodies, hands, quadruped, and robot arms. We show that our approach can achieve meaningful and compact part segmentation, outperforming state-of-the-art approaches on diverse benchmarks.}
}
@InProceedings{pmlr-v139-gao21d,
title = {Discriminative Complementary-Label Learning with Weighted Loss},
author = {Gao, Yi and Zhang, Min-Ling},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3587--3597},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gao21d/gao21d.pdf},
url = {https://proceedings.mlr.press/v139/gao21d.html},
abstract = {Complementary-label learning (CLL) deals with the weak supervision scenario where each training instance is associated with one \emph{complementary} label, which specifies the class label that the instance does \emph{not} belong to. Given the training instance ${\bm x}$, existing CLL approaches aim at modeling the \emph{generative} relationship between the complementary label $\bar y$, i.e. $P(\bar y\mid {\bm x})$, and the ground-truth label $y$, i.e. $P(y\mid {\bm x})$. Nonetheless, as the ground-truth label is not directly accessible for complementarily labeled training instance, strong generative assumptions may not hold for real-world CLL tasks. In this paper, we derive a simple and theoretically-sound \emph{discriminative} model towards $P(\bar y\mid {\bm x})$, which naturally leads to a risk estimator with estimation error bound at $\mathcal{O}(1/\sqrt{n})$ convergence rate. Accordingly, a practical CLL approach is proposed by further introducing weighted loss to the empirical risk to maximize the predictive gap between potential ground-truth label and complementary label. Extensive experiments clearly validate the effectiveness of the proposed discriminative complementary-label learning approach.}
}
@InProceedings{pmlr-v139-garg21a,
title = {RATT: Leveraging Unlabeled Data to Guarantee Generalization},
author = {Garg, Saurabh and Balakrishnan, Sivaraman and Kolter, Zico and Lipton, Zachary},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3598--3609},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/garg21a/garg21a.pdf},
url = {https://proceedings.mlr.press/v139/garg21a.html},
abstract = {To assess generalization, machine learning scientists typically either (i) bound the generalization gap and then (after training) plug in the empirical risk to obtain a bound on the true risk; or (ii) validate empirically on holdout data. However, (i) typically yields vacuous guarantees for overparameterized models; and (ii) shrinks the training set and its guarantee erodes with each re-use of the holdout set. In this paper, we leverage unlabeled data to produce generalization bounds. After augmenting our (labeled) training set with randomly labeled data, we train in the standard fashion. Whenever classifiers achieve low error on the clean data but high error on the random data, our bound ensures that the true risk is low. We prove that our bound is valid for 0-1 empirical risk minimization and with linear classifiers trained by gradient descent. Our approach is especially useful in conjunction with deep learning due to the early learning phenomenon whereby networks fit true labels before noisy labels but requires one intuitive assumption. Empirically, on canonical computer vision and NLP tasks, our bound provides non-vacuous generalization guarantees that track actual performance closely. This work enables practitioners to certify generalization even when (labeled) holdout data is unavailable and provides insights into the relationship between random label noise and generalization.}
}
@InProceedings{pmlr-v139-garg21b,
title = {On Proximal Policy Optimization’s Heavy-tailed Gradients},
author = {Garg, Saurabh and Zhanson, Joshua and Parisotto, Emilio and Prasad, Adarsh and Kolter, Zico and Lipton, Zachary and Balakrishnan, Sivaraman and Salakhutdinov, Ruslan and Ravikumar, Pradeep},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3610--3619},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/garg21b/garg21b.pdf},
url = {https://proceedings.mlr.press/v139/garg21b.html},
abstract = {Modern policy gradient algorithms such as Proximal Policy Optimization (PPO) rely on an arsenal of heuristics, including loss clipping and gradient clipping, to ensure successful learning. These heuristics are reminiscent of techniques from robust statistics, commonly used for estimation in outlier-rich ("heavy-tailed") regimes. In this paper, we present a detailed empirical study to characterize the heavy-tailed nature of the gradients of the PPO surrogate reward function. We demonstrate that the gradients, especially for the actor network, exhibit pronounced heavy-tailedness and that it increases as the agent’s policy diverges from the behavioral policy (i.e., as the agent goes further off policy). Further examination implicates the likelihood ratios and advantages in the surrogate reward as the main sources of the observed heavy-tailedness. We then highlight issues arising due to the heavy-tailed nature of the gradients. In this light, we study the effects of the standard PPO clipping heuristics, demonstrating that these tricks primarily serve to offset heavy-tailedness in gradients. Thus motivated, we propose incorporating GMOM, a high-dimensional robust estimator, into PPO as a substitute for three clipping tricks. Despite requiring less hyperparameter tuning, our method matches the performance of PPO (with all heuristics enabled) on a battery of MuJoCo continuous control tasks.}
}
@InProceedings{pmlr-v139-garreau21a,
title = {What does LIME really see in images?},
author = {Garreau, Damien and Mardaoui, Dina},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3620--3629},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/garreau21a/garreau21a.pdf},
url = {https://proceedings.mlr.press/v139/garreau21a.html},
abstract = {The performance of modern algorithms on certain computer vision tasks such as object recognition is now close to that of humans. This success was achieved at the price of complicated architectures depending on millions of parameters and it has become quite challenging to understand how particular predictions are made. Interpretability methods propose to give us this understanding. In this paper, we study LIME, perhaps one of the most popular. On the theoretical side, we show that when the number of generated examples is large, LIME explanations are concentrated around a limit explanation for which we give an explicit expression. We further this study for elementary shape detectors and linear models. As a consequence of this analysis, we uncover a connection between LIME and integrated gradients, another explanation method. More precisely, the LIME explanations are similar to the sum of integrated gradients over the superpixels used in the preprocessing step of LIME.}
}
@InProceedings{pmlr-v139-gauthier21a,
title = {Parametric Graph for Unimodal Ranking Bandit},
author = {Gauthier, Camille-Sovanneary and Gaudel, Romaric and Fromont, Elisa and Lompo, Boammani Aser},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3630--3639},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gauthier21a/gauthier21a.pdf},
url = {https://proceedings.mlr.press/v139/gauthier21a.html},
abstract = {We tackle the online ranking problem of assigning $L$ items to $K$ positions on a web page in order to maximize the number of user clicks. We propose an original algorithm, easy to implement and with strong theoretical guarantees to tackle this problem in the Position-Based Model (PBM) setting, well suited for applications where items are displayed on a grid. Besides learning to rank, our algorithm, GRAB (for parametric Graph for unimodal RAnking Bandit), also learns the parameter of a compact graph over permutations of $K$ items among $L$. The logarithmic regret bound of this algorithm is a direct consequence of the unimodality property of the bandit setting with respect to the learned graph. Experiments against state-of-the-art learning algorithms which also tackle the PBM setting, show that our method is more efficient while giving regret performance on par with the best known algorithms on simulated and real life datasets.}
}
@InProceedings{pmlr-v139-geerts21a,
title = {Let’s Agree to Degree: Comparing Graph Convolutional Networks in the Message-Passing Framework},
author = {Geerts, Floris and Mazowiecki, Filip and Perez, Guillermo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3640--3649},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/geerts21a/geerts21a.pdf},
url = {https://proceedings.mlr.press/v139/geerts21a.html},
abstract = {In this paper we cast neural networks defined on graphs as message-passing neural networks (MPNNs) to study the distinguishing power of different classes of such models. We are interested in when certain architectures are able to tell vertices apart based on the feature labels given as input with the graph. We consider two variants of MPNNS: anonymous MPNNs whose message functions depend only on the labels of vertices involved; and degree-aware MPNNs whose message functions can additionally use information regarding the degree of vertices. The former class covers popular graph neural network (GNN) formalisms for which the distinguished power is known. The latter covers graph convolutional networks (GCNs), introduced by Kipf and Welling, for which the distinguishing power was unknown. We obtain lower and upper bounds on the distinguishing power of (anonymous and degree-aware) MPNNs in terms of the distinguishing power of the Weisfeiler-Lehman (WL) algorithm. Our main results imply that (i) the distinguishing power of GCNs is bounded by the WL algorithm, but they may be one step ahead; (ii) the WL algorithm cannot be simulated by “plain vanilla” GCNs but the addition of a trade-off parameter between features of the vertex and those of its neighbours (as proposed by Kipf and Welling) resolves this problem.}
}
@InProceedings{pmlr-v139-geffner21a,
title = {On the difficulty of unbiased alpha divergence minimization},
author = {Geffner, Tomas and Domke, Justin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3650--3659},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/geffner21a/geffner21a.pdf},
url = {https://proceedings.mlr.press/v139/geffner21a.html},
abstract = {Several approximate inference algorithms have been proposed to minimize an alpha-divergence between an approximating distribution and a target distribution. Many of these algorithms introduce bias, the magnitude of which becomes problematic in high dimensions. Other algorithms are unbiased. These often seem to suffer from high variance, but little is rigorously known. In this work we study unbiased methods for alpha-divergence minimization through the Signal-to-Noise Ratio (SNR) of the gradient estimator. We study several representative scenarios where strong analytical results are possible, such as fully-factorized or Gaussian distributions. We find that when alpha is not zero, the SNR worsens exponentially in the dimensionality of the problem. This casts doubt on the practicality of these methods. We empirically confirm these theoretical results.}
}
@InProceedings{pmlr-v139-gentzel21a,
title = {How and Why to Use Experimental Data to Evaluate Methods for Observational Causal Inference},
author = {Gentzel, Amanda M and Pruthi, Purva and Jensen, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3660--3671},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gentzel21a/gentzel21a.pdf},
url = {https://proceedings.mlr.press/v139/gentzel21a.html},
abstract = {Methods that infer causal dependence from observational data are central to many areas of science, including medicine, economics, and the social sciences. A variety of theoretical properties of these methods have been proven, but empirical evaluation remains a challenge, largely due to the lack of observational data sets for which treatment effect is known. We describe and analyze observational sampling from randomized controlled trials (OSRCT), a method for evaluating causal inference methods using data from randomized controlled trials (RCTs). This method can be used to create constructed observational data sets with corresponding unbiased estimates of treatment effect, substantially increasing the number of data sets available for evaluating causal inference methods. We show that, in expectation, OSRCT creates data sets that are equivalent to those produced by randomly sampling from empirical data sets in which all potential outcomes are available. We then perform a large-scale evaluation of seven causal inference methods over 37 data sets, drawn from RCTs, as well as simulators, real-world computational systems, and observational data sets augmented with a synthetic response variable. We find notable performance differences when comparing across data from different sources, demonstrating the importance of using data from a variety of sources when evaluating any causal inference method.}
}
@InProceedings{pmlr-v139-ghalme21a,
title = {Strategic Classification in the Dark},
author = {Ghalme, Ganesh and Nair, Vineet and Eilat, Itay and Talgam-Cohen, Inbal and Rosenfeld, Nir},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3672--3681},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ghalme21a/ghalme21a.pdf},
url = {https://proceedings.mlr.press/v139/ghalme21a.html},
abstract = {Strategic classification studies the interaction between a classification rule and the strategic agents it governs. Agents respond by manipulating their features, under the assumption that the classifier is known. However, in many real-life scenarios of high-stake classification (e.g., credit scoring), the classifier is not revealed to the agents, which leads agents to attempt to learn the classifier and game it too. In this paper we generalize the strategic classification model to such scenarios and analyze the effect of an unknown classifier. We define the ”price of opacity” as the difference between the prediction error under the opaque and transparent policies, characterize it, and give a sufficient condition for it to be strictly positive, in which case transparency is the recommended policy. Our experiments show how Hardt et al.’s robust classifier is affected by keeping agents in the dark.}
}
@InProceedings{pmlr-v139-ghasemipour21a,
title = {EMaQ: Expected-Max Q-Learning Operator for Simple Yet Effective Offline and Online RL},
author = {Ghasemipour, Seyed Kamyar Seyed and Schuurmans, Dale and Gu, Shixiang Shane},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3682--3691},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ghasemipour21a/ghasemipour21a.pdf},
url = {https://proceedings.mlr.press/v139/ghasemipour21a.html},
abstract = {Off-policy reinforcement learning (RL) holds the promise of sample-efficient learning of decision-making policies by leveraging past experience. However, in the offline RL setting – where a fixed collection of interactions are provided and no further interactions are allowed – it has been shown that standard off-policy RL methods can significantly underperform. In this work, we closely investigate an important simplification of BCQ (Fujimoto et al., 2018) – a prior approach for offline RL – removing a heuristic design choice. Importantly, in contrast to their original theoretical considerations, we derive this simplified algorithm through the introduction of a novel backup operator, Expected-Max Q-Learning (EMaQ), which is more closely related to the resulting practical algorithm. Specifically, in addition to the distribution support, EMaQ explicitly considers the number of samples and the proposal distribution, allowing us to derive new sub-optimality bounds. In the offline RL setting – the main focus of this work – EMaQ matches and outperforms prior state-of-the-art in the D4RL benchmarks (Fu et al., 2020). In the online RL setting, we demonstrate that EMaQ is competitive with Soft Actor Critic (SAC). The key contributions of our empirical findings are demonstrating the importance of careful generative model design for estimating behavior policies, and an intuitive notion of complexity for offline RL problems. With its simple interpretation and fewer moving parts, such as no explicit function approximator representing the policy, EMaQ serves as a strong yet easy to implement baseline for future work.}
}
@InProceedings{pmlr-v139-ghazi21a,
title = {Differentially Private Aggregation in the Shuffle Model: Almost Central Accuracy in Almost a Single Message},
author = {Ghazi, Badih and Kumar, Ravi and Manurangsi, Pasin and Pagh, Rasmus and Sinha, Amer},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3692--3701},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ghazi21a/ghazi21a.pdf},
url = {https://proceedings.mlr.press/v139/ghazi21a.html},
abstract = {The shuffle model of differential privacy has attracted attention in the literature due to it being a middle ground between the well-studied central and local models. In this work, we study the problem of summing (aggregating) real numbers or integers, a basic primitive in numerous machine learning tasks, in the shuffle model. We give a protocol achieving error arbitrarily close to that of the (Discrete) Laplace mechanism in central differential privacy, while each user only sends 1 + o(1) short messages in expectation.}
}
@InProceedings{pmlr-v139-ghuge21a,
title = {The Power of Adaptivity for Stochastic Submodular Cover},
author = {Ghuge, Rohan and Gupta, Anupam and Nagarajan, Viswanath},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3702--3712},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ghuge21a/ghuge21a.pdf},
url = {https://proceedings.mlr.press/v139/ghuge21a.html},
abstract = {In the stochastic submodular cover problem, the goal is to select a subset of stochastic items of minimum expected cost to cover a submodular function. Solutions in this setting correspond to a sequential decision process that selects items one by one “adaptively” (depending on prior observations). While such adaptive solutions achieve the best objective, the inherently sequential nature makes them undesirable in many applications. We ask: \emph{how well can solutions with only a few adaptive rounds approximate fully-adaptive solutions?} We consider both cases where the stochastic items are independent, and where they are correlated. For both situations, we obtain nearly tight answers, establishing smooth tradeoffs between the number of adaptive rounds and the solution quality, relative to fully adaptive solutions. Experiments on synthetic and real datasets validate the practical performance of our algorithms, showing qualitative improvements in the solutions as we allow more rounds of adaptivity; in practice, solutions using just a few rounds of adaptivity are nearly as good as fully adaptive solutions.}
}
@InProceedings{pmlr-v139-gillenwater21a,
title = {Differentially Private Quantiles},
author = {Gillenwater, Jennifer and Joseph, Matthew and Kulesza, Alex},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3713--3722},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gillenwater21a/gillenwater21a.pdf},
url = {https://proceedings.mlr.press/v139/gillenwater21a.html},
abstract = {Quantiles are often used for summarizing and understanding data. If that data is sensitive, it may be necessary to compute quantiles in a way that is differentially private, providing theoretical guarantees that the result does not reveal private information. However, when multiple quantiles are needed, existing differentially private algorithms fare poorly: they either compute quantiles individually, splitting the privacy budget, or summarize the entire distribution, wasting effort. In either case the result is reduced accuracy. In this work we propose an instance of the exponential mechanism that simultaneously estimates exactly $m$ quantiles from $n$ data points while guaranteeing differential privacy. The utility function is carefully structured to allow for an efficient implementation that returns estimates of all $m$ quantiles in time $O(mn\log(n) + m^2n)$. Experiments show that our method significantly outperforms the current state of the art on both real and synthetic data while remaining efficient enough to be practical.}
}
@InProceedings{pmlr-v139-gluch21a,
title = {Query Complexity of Adversarial Attacks},
author = {Gluch, Grzegorz and Urbanke, R{\"u}diger},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3723--3733},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gluch21a/gluch21a.pdf},
url = {https://proceedings.mlr.press/v139/gluch21a.html},
abstract = {There are two main attack models considered in the adversarial robustness literature: black-box and white-box. We consider these threat models as two ends of a fine-grained spectrum, indexed by the number of queries the adversary can ask. Using this point of view we investigate how many queries the adversary needs to make to design an attack that is comparable to the best possible attack in the white-box model. We give a lower bound on that number of queries in terms of entropy of decision boundaries of the classifier. Using this result we analyze two classical learning algorithms on two synthetic tasks for which we prove meaningful security guarantees. The obtained bounds suggest that some learning algorithms are inherently more robust against query-bounded adversaries than others.}
}
@InProceedings{pmlr-v139-gogianu21a,
title = {Spectral Normalisation for Deep Reinforcement Learning: An Optimisation Perspective},
author = {Gogianu, Florin and Berariu, Tudor and Rosca, Mihaela C and Clopath, Claudia and Busoniu, Lucian and Pascanu, Razvan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3734--3744},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gogianu21a/gogianu21a.pdf},
url = {https://proceedings.mlr.press/v139/gogianu21a.html},
abstract = {Most of the recent deep reinforcement learning advances take an RL-centric perspective and focus on refinements of the training objective. We diverge from this view and show we can recover the performance of these developments not by changing the objective, but by regularising the value-function estimator. Constraining the Lipschitz constant of a single layer using spectral normalisation is sufficient to elevate the performance of a Categorical-DQN agent to that of a more elaborated agent on the challenging Atari domain. We conduct ablation studies to disentangle the various effects normalisation has on the learning dynamics and show that is sufficient to modulate the parameter updates to recover most of the performance of spectral normalisation. These findings hint towards the need to also focus on the neural component and its learning dynamics to tackle the peculiarities of Deep Reinforcement Learning.}
}
@InProceedings{pmlr-v139-golany21a,
title = {12-Lead ECG Reconstruction via Koopman Operators},
author = {Golany, Tomer and Radinsky, Kira and Freedman, Daniel and Minha, Saar},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3745--3754},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/golany21a/golany21a.pdf},
url = {https://proceedings.mlr.press/v139/golany21a.html},
abstract = {32% of all global deaths in the world are caused by cardiovascular diseases. Early detection, especially for patients with ischemia or cardiac arrhythmia, is crucial. To reduce the time between symptoms onset and treatment, wearable ECG sensors were developed to allow for the recording of the full 12-lead ECG signal at home. However, if even a single lead is not correctly positioned on the body that lead becomes corrupted, making automatic diagnosis on the basis of the full signal impossible. In this work, we present a methodology to reconstruct missing or noisy leads using the theory of Koopman Operators. Given a dataset consisting of full 12-lead ECGs, we learn a dynamical system describing the evolution of the 12 individual signals together in time. The Koopman theory indicates that there exists a high-dimensional embedding space in which the operator which propagates from one time instant to the next is linear. We therefore learn both the mapping to this embedding space, as well as the corresponding linear operator. Armed with this representation, we are able to impute missing leads by solving a least squares system in the embedding space, which can be achieved efficiently due to the sparse structure of the system. We perform an empirical evaluation using 12-lead ECG signals from thousands of patients, and show that we are able to reconstruct the signals in such way that enables accurate clinical diagnosis.}
}
@InProceedings{pmlr-v139-gondal21a,
title = {Function Contrastive Learning of Transferable Meta-Representations},
author = {Gondal, Muhammad Waleed and Joshi, Shruti and Rahaman, Nasim and Bauer, Stefan and Wuthrich, Manuel and Sch{\"o}lkopf, Bernhard},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3755--3765},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gondal21a/gondal21a.pdf},
url = {https://proceedings.mlr.press/v139/gondal21a.html},
abstract = {Meta-learning algorithms adapt quickly to new tasks that are drawn from the same task distribution as the training tasks. The mechanism leading to fast adaptation is the conditioning of a downstream predictive model on the inferred representation of the task’s underlying data generative process, or \emph{function}. This \emph{meta-representation}, which is computed from a few observed examples of the underlying function, is learned jointly with the predictive model. In this work, we study the implications of this joint training on the transferability of the meta-representations. Our goal is to learn meta-representations that are robust to noise in the data and facilitate solving a wide range of downstream tasks that share the same underlying functions. To this end, we propose a decoupled encoder-decoder approach to supervised meta-learning, where the encoder is trained with a contrastive objective to find a good representation of the underlying function. In particular, our training scheme is driven by the self-supervision signal indicating whether two sets of examples stem from the same function. Our experiments on a number of synthetic and real-world datasets show that the representations we obtain outperform strong baselines in terms of downstream performance and noise robustness, even when these baselines are trained in an end-to-end manner.}
}
@InProceedings{pmlr-v139-gong21a,
title = {Active Slices for Sliced Stein Discrepancy},
author = {Gong, Wenbo and Zhang, Kaibo and Li, Yingzhen and Hernandez-Lobato, Jose Miguel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3766--3776},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gong21a/gong21a.pdf},
url = {https://proceedings.mlr.press/v139/gong21a.html},
abstract = {Sliced Stein discrepancy (SSD) and its kernelized variants have demonstrated promising successes in goodness-of-fit tests and model learning in high dimensions. Despite the theoretical elegance, their empirical performance depends crucially on the search of the optimal slicing directions to discriminate between two distributions. Unfortunately, previous gradient-based optimisation approach returns sub-optimal results for the slicing directions: it is computationally expensive, sensitive to initialization, and it lacks theoretical guarantee for convergence. We address these issues in two steps. First, we show in theory that the requirement of using optimal slicing directions in the kernelized version of SSD can be relaxed, validating the resulting discrepancy with finite random slicing directions. Second, given that good slicing directions are crucial for practical performance, we propose a fast algorithm for finding good slicing directions based on ideas of active sub-space construction and spectral decomposition. Experiments in goodness-of-fit tests and model learning show that our approach achieves both the best performance and the fastest convergence. Especially, we demonstrate 14-80x speed-up in goodness-of-fit tests when compared with the gradient-based approach.}
}
@InProceedings{pmlr-v139-gorantla21a,
title = {On the Problem of Underranking in Group-Fair Ranking},
author = {Gorantla, Sruthi and Deshpande, Amit and Louis, Anand},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3777--3787},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gorantla21a/gorantla21a.pdf},
url = {https://proceedings.mlr.press/v139/gorantla21a.html},
abstract = {Bias in ranking systems, especially among the top ranks, can worsen social and economic inequalities, polarize opinions, and reinforce stereotypes. On the other hand, a bias correction for minority groups can cause more harm if perceived as favoring group-fair outcomes over meritocracy. Most group-fair ranking algorithms post-process a given ranking and output a group-fair ranking. In this paper, we formulate the problem of underranking in group-fair rankings based on how close the group-fair rank of each item is to its original rank, and prove a lower bound on the trade-off achievable for simultaneous underranking and group fairness in ranking. We give a fair ranking algorithm that takes any given ranking and outputs another ranking with simultaneous underranking and group fairness guarantees comparable to the lower bound we prove. Our experimental results confirm the theoretical trade-off between underranking and group fairness, and also show that our algorithm achieves the best of both when compared to the state-of-the-art baselines.}
}
@InProceedings{pmlr-v139-gorbunov21a,
title = {MARINA: Faster Non-Convex Distributed Learning with Compression},
author = {Gorbunov, Eduard and Burlachenko, Konstantin P. and Li, Zhize and Richtarik, Peter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3788--3798},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gorbunov21a/gorbunov21a.pdf},
url = {https://proceedings.mlr.press/v139/gorbunov21a.html},
abstract = {We develop and analyze MARINA: a new communication efficient method for non-convex distributed learning over heterogeneous datasets. MARINA employs a novel communication compression strategy based on the compression of gradient differences that is reminiscent of but different from the strategy employed in the DIANA method of Mishchenko et al. (2019). Unlike virtually all competing distributed first-order methods, including DIANA, ours is based on a carefully designed biased gradient estimator, which is the key to its superior theoretical and practical performance. The communication complexity bounds we prove for MARINA are evidently better than those of all previous first-order methods. Further, we develop and analyze two variants of MARINA: VR-MARINA and PP-MARINA. The first method is designed for the case when the local loss functions owned by clients are either of a finite sum or of an expectation form, and the second method allows for a partial participation of clients {–} a feature important in federated learning. All our methods are superior to previous state-of-the-art methods in terms of oracle/communication complexity. Finally, we provide a convergence analysis of all methods for problems satisfying the Polyak-{Ł}ojasiewicz condition.}
}
@InProceedings{pmlr-v139-gosgens21a,
title = {Systematic Analysis of Cluster Similarity Indices: How to Validate Validation Measures},
author = {G{\"o}sgens, Martijn M and Tikhonov, Alexey and Prokhorenkova, Liudmila},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3799--3808},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gosgens21a/gosgens21a.pdf},
url = {https://proceedings.mlr.press/v139/gosgens21a.html},
abstract = {Many cluster similarity indices are used to evaluate clustering algorithms, and choosing the best one for a particular task remains an open problem. We demonstrate that this problem is crucial: there are many disagreements among the indices, these disagreements do affect which algorithms are preferred in applications, and this can lead to degraded performance in real-world systems. We propose a theoretical framework to tackle this problem: we develop a list of desirable properties and conduct an extensive theoretical analysis to verify which indices satisfy them. This allows for making an informed choice: given a particular application, one can first select properties that are desirable for the task and then identify indices satisfying these. Our work unifies and considerably extends existing attempts at analyzing cluster similarity indices: we introduce new properties, formalize existing ones, and mathematically prove or disprove each property for an extensive list of validation indices. This broader and more rigorous approach leads to recommendations that considerably differ from how validation indices are currently being chosen by practitioners. Some of the most popular indices are even shown to be dominated by previously overlooked ones.}
}
@InProceedings{pmlr-v139-goyal21a,
title = {Revisiting Point Cloud Shape Classification with a Simple and Effective Baseline},
author = {Goyal, Ankit and Law, Hei and Liu, Bowei and Newell, Alejandro and Deng, Jia},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3809--3820},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/goyal21a/goyal21a.pdf},
url = {https://proceedings.mlr.press/v139/goyal21a.html},
abstract = {Processing point cloud data is an important component of many real-world systems. As such, a wide variety of point-based approaches have been proposed, reporting steady benchmark improvements over time. We study the key ingredients of this progress and uncover two critical results. First, we find that auxiliary factors like different evaluation schemes, data augmentation strategies, and loss functions, which are independent of the model architecture, make a large difference in performance. The differences are large enough that they obscure the effect of architecture. When these factors are controlled for, PointNet++, a relatively older network, performs competitively with recent methods. Second, a very simple projection-based method, which we refer to as SimpleView, performs surprisingly well. It achieves on par or better results than sophisticated state-of-the-art methods on ModelNet40 while being half the size of PointNet++. It also outperforms state-of-the-art methods on ScanObjectNN, a real-world point cloud benchmark, and demonstrates better cross-dataset generalization. Code is available at https://github.com/princeton-vl/SimpleView.}
}
@InProceedings{pmlr-v139-graf21a,
title = {Dissecting Supervised Contrastive Learning},
author = {Graf, Florian and Hofer, Christoph and Niethammer, Marc and Kwitt, Roland},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3821--3830},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/graf21a/graf21a.pdf},
url = {https://proceedings.mlr.press/v139/graf21a.html},
abstract = {Minimizing cross-entropy over the softmax scores of a linear map composed with a high-capacity encoder is arguably the most popular choice for training neural networks on supervised learning tasks. However, recent works show that one can directly optimize the encoder instead, to obtain equally (or even more) discriminative representations via a supervised variant of a contrastive objective. In this work, we address the question whether there are fundamental differences in the sought-for representation geometry in the output space of the encoder at minimal loss. Specifically, we prove, under mild assumptions, that both losses attain their minimum once the representations of each class collapse to the vertices of a regular simplex, inscribed in a hypersphere. We provide empirical evidence that this configuration is attained in practice and that reaching a close-to-optimal state typically indicates good generalization performance. Yet, the two losses show remarkably different optimization behavior. The number of iterations required to perfectly fit to data scales superlinearly with the amount of randomly flipped labels for the supervised contrastive loss. This is in contrast to the approximately linear scaling previously reported for networks trained with cross-entropy.}
}
@InProceedings{pmlr-v139-grathwohl21a,
title = {Oops I Took A Gradient: Scalable Sampling for Discrete Distributions},
author = {Grathwohl, Will and Swersky, Kevin and Hashemi, Milad and Duvenaud, David and Maddison, Chris},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3831--3841},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/grathwohl21a/grathwohl21a.pdf},
url = {https://proceedings.mlr.press/v139/grathwohl21a.html},
abstract = {We propose a general and scalable approximate sampling strategy for probabilistic models with discrete variables. Our approach uses gradients of the likelihood function with respect to its discrete inputs to propose updates in a Metropolis-Hastings sampler. We show empirically that this approach outperforms generic samplers in a number of difficult settings including Ising models, Potts models, restricted Boltzmann machines, and factorial hidden Markov models. We also demonstrate our improved sampler for training deep energy-based models on high dimensional discrete image data. This approach outperforms variational auto-encoders and existing energy-based models. Finally, we give bounds showing that our approach is near-optimal in the class of samplers which propose local updates.}
}
@InProceedings{pmlr-v139-greenberg21a,
title = {Detecting Rewards Deterioration in Episodic Reinforcement Learning},
author = {Greenberg, Ido and Mannor, Shie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3842--3853},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/greenberg21a/greenberg21a.pdf},
url = {https://proceedings.mlr.press/v139/greenberg21a.html},
abstract = {In many RL applications, once training ends, it is vital to detect any deterioration in the agent performance as soon as possible. Furthermore, it often has to be done without modifying the policy and under minimal assumptions regarding the environment. In this paper, we address this problem by focusing directly on the rewards and testing for degradation. We consider an episodic framework, where the rewards within each episode are not independent, nor identically-distributed, nor Markov. We present this problem as a multivariate mean-shift detection problem with possibly partial observations. We define the mean-shift in a way corresponding to deterioration of a temporal signal (such as the rewards), and derive a test for this problem with optimal statistical power. Empirically, on deteriorated rewards in control problems (generated using various environment modifications), the test is demonstrated to be more powerful than standard tests - often by orders of magnitude. We also suggest a novel Bootstrap mechanism for False Alarm Rate control (BFAR), applicable to episodic (non-i.i.d) signal and allowing our test to run sequentially in an online manner. Our method does not rely on a learned model of the environment, is entirely external to the agent, and in fact can be applied to detect changes or drifts in any episodic signal.}
}
@InProceedings{pmlr-v139-gu21a,
title = {Crystallization Learning with the Delaunay Triangulation},
author = {Gu, Jiaqi and Yin, Guosheng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3854--3863},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gu21a/gu21a.pdf},
url = {https://proceedings.mlr.press/v139/gu21a.html},
abstract = {Based on the Delaunay triangulation, we propose the crystallization learning to estimate the conditional expectation function in the framework of nonparametric regression. By conducting the crystallization search for the Delaunay simplices closest to the target point in a hierarchical way, the crystallization learning estimates the conditional expectation of the response by fitting a local linear model to the data points of the constructed Delaunay simplices. Instead of conducting the Delaunay triangulation for the entire feature space which would encounter enormous computational difficulty, our approach focuses only on the neighborhood of the target point and thus greatly expedites the estimation for high-dimensional cases. Because the volumes of Delaunay simplices are adaptive to the density of feature data points, our method selects neighbor data points uniformly in all directions and thus is more robust to the local geometric structure of the data than existing nonparametric regression methods. We develop the asymptotic properties of the crystallization learning and conduct numerical experiments on both synthetic and real data to demonstrate the advantages of our method in estimation of the conditional expectation function and prediction of the response.}
}
@InProceedings{pmlr-v139-guan21a,
title = {AutoAttend: Automated Attention Representation Search},
author = {Guan, Chaoyu and Wang, Xin and Zhu, Wenwu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3864--3874},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/guan21a/guan21a.pdf},
url = {https://proceedings.mlr.press/v139/guan21a.html},
abstract = {Self-attention mechanisms have been widely adopted in many machine learning areas, including Natural Language Processing (NLP) and Graph Representation Learning (GRL), etc. However, existing works heavily rely on hand-crafted design to obtain customized attention mechanisms. In this paper, we automate Key, Query and Value representation design, which is one of the most important steps to obtain effective self-attentions. We propose an automated self-attention representation model, AutoAttend, which can automatically search powerful attention representations for downstream tasks leveraging Neural Architecture Search (NAS). In particular, we design a tailored search space for attention representation automation, which is flexible to produce effective attention representation designs. Based on the design prior obtained from attention representations in previous works, we further regularize our search space to reduce the space complexity without the loss of expressivity. Moreover, we propose a novel context-aware parameter sharing mechanism considering special characteristics of each sub-architecture to provide more accurate architecture estimations when conducting parameter sharing in our tailored search space. Experiments show the superiority of our proposed AutoAttend model over previous state-of-the-arts on eight text classification tasks in NLP and four node classification tasks in GRL.}
}
@InProceedings{pmlr-v139-gultchin21a,
title = {Operationalizing Complex Causes: A Pragmatic View of Mediation},
author = {Gultchin, Limor and Watson, David and Kusner, Matt and Silva, Ricardo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3875--3885},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gultchin21a/gultchin21a.pdf},
url = {https://proceedings.mlr.press/v139/gultchin21a.html},
abstract = {We examine the problem of causal response estimation for complex objects (e.g., text, images, genomics). In this setting, classical \emph{atomic} interventions are often not available (e.g., changes to characters, pixels, DNA base-pairs). Instead, we only have access to indirect or \emph{crude} interventions (e.g., enrolling in a writing program, modifying a scene, applying a gene therapy). In this work, we formalize this problem and provide an initial solution. Given a collection of candidate mediators, we propose (a) a two-step method for predicting the causal responses of crude interventions; and (b) a testing procedure to identify mediators of crude interventions. We demonstrate, on a range of simulated and real-world-inspired examples, that our approach allows us to efficiently estimate the effect of crude interventions with limited data from new treatment regimes.}
}
@InProceedings{pmlr-v139-guminov21a,
title = {On a Combination of Alternating Minimization and Nesterov’s Momentum},
author = {Guminov, Sergey and Dvurechensky, Pavel and Tupitsa, Nazarii and Gasnikov, Alexander},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3886--3898},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/guminov21a/guminov21a.pdf},
url = {https://proceedings.mlr.press/v139/guminov21a.html},
abstract = {Alternating minimization (AM) procedures are practically efficient in many applications for solving convex and non-convex optimization problems. On the other hand, Nesterov’s accelerated gradient is theoretically optimal first-order method for convex optimization. In this paper we combine AM and Nesterov’s acceleration to propose an accelerated alternating minimization algorithm. We prove $1/k^2$ convergence rate in terms of the objective for convex problems and $1/k$ in terms of the squared gradient norm for non-convex problems, where $k$ is the iteration counter. Our method does not require any knowledge of neither convexity of the problem nor function parameters such as Lipschitz constant of the gradient, i.e. it is adaptive to convexity and smoothness and is uniformly optimal for smooth convex and non-convex problems. Further, we develop its primal-dual modification for strongly convex problems with linear constraints and prove the same $1/k^2$ for the primal objective residual and constraints feasibility.}
}
@InProceedings{pmlr-v139-guo21a,
title = {Decentralized Single-Timescale Actor-Critic on Zero-Sum Two-Player Stochastic Games},
author = {Guo, Hongyi and Fu, Zuyue and Yang, Zhuoran and Wang, Zhaoran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3899--3909},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/guo21a/guo21a.pdf},
url = {https://proceedings.mlr.press/v139/guo21a.html},
abstract = {We study the global convergence and global optimality of the actor-critic algorithm applied for the zero-sum two-player stochastic games in a decentralized manner. We focus on the single-timescale setting where the critic is updated by applying the Bellman operator only once and the actor is updated by policy gradient with the information from the critic. Our algorithm is in a decentralized manner, as we assume that each player has no access to the actions of the other one, which, in a way, protects the privacy of both players. Moreover, we consider linear function approximations for both actor and critic, and we prove that the sequence of joint policy generated by our decentralized linear algorithm converges to the minimax equilibrium at a sublinear rate \(\cO(\sqrt{K})\), where \(K\){is} the number of iterations. To the best of our knowledge, we establish the global optimality and convergence of decentralized actor-critic algorithm on zero-sum two-player stochastic games with linear function approximations for the first time.}
}
@InProceedings{pmlr-v139-guo21b,
title = {Adversarial Policy Learning in Two-player Competitive Games},
author = {Guo, Wenbo and Wu, Xian and Huang, Sui and Xing, Xinyu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3910--3919},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/guo21b/guo21b.pdf},
url = {https://proceedings.mlr.press/v139/guo21b.html},
abstract = {In a two-player deep reinforcement learning task, recent work shows an attacker could learn an adversarial policy that triggers a target agent to perform poorly and even react in an undesired way. However, its efficacy heavily relies upon the zero-sum assumption made in the two-player game. In this work, we propose a new adversarial learning algorithm. It addresses the problem by resetting the optimization goal in the learning process and designing a new surrogate optimization function. Our experiments show that our method significantly improves adversarial agents’ exploitability compared with the state-of-art attack. Besides, we also discover that our method could augment an agent with the ability to abuse the target game’s unfairness. Finally, we show that agents adversarially re-trained against our adversarial agents could obtain stronger adversary-resistance.}
}
@InProceedings{pmlr-v139-guo21c,
title = {Soft then Hard: Rethinking the Quantization in Neural Image Compression},
author = {Guo, Zongyu and Zhang, Zhizheng and Feng, Runsen and Chen, Zhibo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3920--3929},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/guo21c/guo21c.pdf},
url = {https://proceedings.mlr.press/v139/guo21c.html},
abstract = {Quantization is one of the core components in lossy image compression. For neural image compression, end-to-end optimization requires differentiable approximations of quantization, which can generally be grouped into three categories: additive uniform noise, straight-through estimator and soft-to-hard annealing. Training with additive uniform noise approximates the quantization error variationally but suffers from the train-test mismatch. The other two methods do not encounter this mismatch but, as shown in this paper, hurt the rate-distortion performance since the latent representation ability is weakened. We thus propose a novel soft-then-hard quantization strategy for neural image compression that first learns an expressive latent space softly, then closes the train-test mismatch with hard quantization. In addition, beyond the fixed integer-quantization, we apply scaled additive uniform noise to adaptively control the quantization granularity by deriving a new variational upper bound on actual rate. Experiments demonstrate that our proposed methods are easy to adopt, stable to train, and highly effective especially on complex compression models.}
}
@InProceedings{pmlr-v139-gupta21a,
title = {UneVEn: Universal Value Exploration for Multi-Agent Reinforcement Learning},
author = {Gupta, Tarun and Mahajan, Anuj and Peng, Bei and Boehmer, Wendelin and Whiteson, Shimon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3930--3941},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gupta21a/gupta21a.pdf},
url = {https://proceedings.mlr.press/v139/gupta21a.html},
abstract = {VDN and QMIX are two popular value-based algorithms for cooperative MARL that learn a centralized action value function as a monotonic mixing of per-agent utilities. While this enables easy decentralization of the learned policy, the restricted joint action value function can prevent them from solving tasks that require significant coordination between agents at a given timestep. We show that this problem can be overcome by improving the joint exploration of all agents during training. Specifically, we propose a novel MARL approach called Universal Value Exploration (UneVEn) that learns a set of related tasks simultaneously with a linear decomposition of universal successor features. With the policies of already solved related tasks, the joint exploration process of all agents can be improved to help them achieve better coordination. Empirical results on a set of exploration games, challenging cooperative predator-prey tasks requiring significant coordination among agents, and StarCraft II micromanagement benchmarks show that UneVEn can solve tasks where other state-of-the-art MARL methods fail.}
}
@InProceedings{pmlr-v139-gupta21b,
title = {Distribution-Free Calibration Guarantees for Histogram Binning without Sample Splitting},
author = {Gupta, Chirag and Ramdas, Aaditya},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3942--3952},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gupta21b/gupta21b.pdf},
url = {https://proceedings.mlr.press/v139/gupta21b.html},
abstract = {We prove calibration guarantees for the popular histogram binning (also called uniform-mass binning) method of Zadrozny and Elkan (2001). Histogram binning has displayed strong practical performance, but theoretical guarantees have only been shown for sample split versions that avoid ’double dipping’ the data. We demonstrate that the statistical cost of sample splitting is practically significant on a credit default dataset. We then prove calibration guarantees for the original method that double dips the data, using a certain Markov property of order statistics. Based on our results, we make practical recommendations for choosing the number of bins in histogram binning. In our illustrative simulations, we propose a new tool for assessing calibration—validity plots—which provide more information than an ECE estimate.}
}
@InProceedings{pmlr-v139-gupta21c,
title = {Correcting Exposure Bias for Link Recommendation},
author = {Gupta, Shantanu and Wang, Hao and Lipton, Zachary and Wang, Yuyang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3953--3963},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gupta21c/gupta21c.pdf},
url = {https://proceedings.mlr.press/v139/gupta21c.html},
abstract = {Link prediction methods are frequently applied in recommender systems, e.g., to suggest citations for academic papers or friends in social networks. However, exposure bias can arise when users are systematically underexposed to certain relevant items. For example, in citation networks, authors might be more likely to encounter papers from their own field and thus cite them preferentially. This bias can propagate through naively trained link predictors, leading to both biased evaluation and high generalization error (as assessed by true relevance). Moreover, this bias can be exacerbated by feedback loops. We propose estimators that leverage known exposure probabilities to mitigate this bias and consequent feedback loops. Next, we provide a loss function for learning the exposure probabilities from data. Finally, experiments on semi-synthetic data based on real-world citation networks, show that our methods reliably identify (truly) relevant citations. Additionally, our methods lead to greater diversity in the recommended papers’ fields of study. The code is available at github.com/shantanu95/exposure-bias-link-rec.}
}
@InProceedings{pmlr-v139-gurbuzbalaban21a,
title = {The Heavy-Tail Phenomenon in SGD},
author = {Gurbuzbalaban, Mert and Simsekli, Umut and Zhu, Lingjiong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3964--3975},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gurbuzbalaban21a/gurbuzbalaban21a.pdf},
url = {https://proceedings.mlr.press/v139/gurbuzbalaban21a.html},
abstract = {In recent years, various notions of capacity and complexity have been proposed for characterizing the generalization properties of stochastic gradient descent (SGD) in deep learning. Some of the popular notions that correlate well with the performance on unseen data are (i) the ‘flatness’ of the local minimum found by SGD, which is related to the eigenvalues of the Hessian, (ii) the ratio of the stepsize $\eta$ to the batch-size $b$, which essentially controls the magnitude of the stochastic gradient noise, and (iii) the ‘tail-index’, which measures the heaviness of the tails of the network weights at convergence. In this paper, we argue that these three seemingly unrelated perspectives for generalization are deeply linked to each other. We claim that depending on the structure of the Hessian of the loss at the minimum, and the choices of the algorithm parameters $\eta$ and $b$, the SGD iterates will converge to a \emph{heavy-tailed} stationary distribution. We rigorously prove this claim in the setting of quadratic optimization: we show that even in a simple linear regression problem with independent and identically distributed data whose distribution has finite moments of all order, the iterates can be heavy-tailed with infinite variance. We further characterize the behavior of the tails with respect to algorithm parameters, the dimension, and the curvature. We then translate our results into insights about the behavior of SGD in deep learning. We support our theory with experiments conducted on synthetic data, fully connected, and convolutional neural networks.}
}
@InProceedings{pmlr-v139-gurel21a,
title = {Knowledge Enhanced Machine Learning Pipeline against Diverse Adversarial Attacks},
author = {G{\"u}rel, Nezihe Merve and Qi, Xiangyu and Rimanic, Luka and Zhang, Ce and Li, Bo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3976--3987},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gurel21a/gurel21a.pdf},
url = {https://proceedings.mlr.press/v139/gurel21a.html},
abstract = {Despite the great successes achieved by deep neural networks (DNNs), recent studies show that they are vulnerable against adversarial examples, which aim to mislead DNNs by adding small adversarial perturbations. Several defenses have been proposed against such attacks, while many of them have been adaptively attacked. In this work, we aim to enhance the ML robustness from a different perspective by leveraging domain knowledge: We propose a Knowledge Enhanced Machine Learning Pipeline (KEMLP) to integrate domain knowledge (i.e., logic relationships among different predictions) into a probabilistic graphical model via first-order logic rules. In particular, we develop KEMLP by integrating a diverse set of weak auxiliary models based on their logical relationships to the main DNN model that performs the target task. Theoretically, we provide convergence results and prove that, under mild conditions, the prediction of KEMLP is more robust than that of the main DNN model. Empirically, we take road sign recognition as an example and leverage the relationships between road signs and their shapes and contents as domain knowledge. We show that compared with adversarial training and other baselines, KEMLP achieves higher robustness against physical attacks, $\mathcal{L}_p$ bounded attacks, unforeseen attacks, and natural corruptions under both whitebox and blackbox settings, while still maintaining high clean accuracy.}
}
@InProceedings{pmlr-v139-gyorgy21a,
title = {Adapting to Delays and Data in Adversarial Multi-Armed Bandits},
author = {Gyorgy, Andras and Joulani, Pooria},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3988--3997},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gyorgy21a/gyorgy21a.pdf},
url = {https://proceedings.mlr.press/v139/gyorgy21a.html},
abstract = {We consider the adversarial multi-armed bandit problem under delayed feedback. We analyze variants of the Exp3 algorithm that tune their step size using only information (about the losses and delays) available at the time of the decisions, and obtain regret guarantees that adapt to the observed (rather than the worst-case) sequences of delays and/or losses. First, through a remarkably simple proof technique, we show that with proper tuning of the step size, the algorithm achieves an optimal (up to logarithmic factors) regret of order $\sqrt{\log(K)(TK + D)}$ both in expectation and in high probability, where $K$ is the number of arms, $T$ is the time horizon, and $D$ is the cumulative delay. The high-probability version of the bound, which is the first high-probability delay-adaptive bound in the literature, crucially depends on the use of implicit exploration in estimating the losses. Then, following Zimmert and Seldin (2019), we extend these results so that the algorithm can “skip” rounds with large delays, resulting in regret bounds of order $\sqrt{TK\log(K)} + |R| + \sqrt{D_{\bar{R}}\log(K)}$, where $R$ is an arbitrary set of rounds (which are skipped) and $D_{\bar{R}}$ is the cumulative delay of the feedback for other rounds. Finally, we present another, data-adaptive (AdaGrad-style) version of the algorithm for which the regret adapts to the observed (delayed) losses instead of only adapting to the cumulative delay (this algorithm requires an a priori upper bound on the maximum delay, or the advance knowledge of the delay for each decision when it is made). The resulting bound can be orders of magnitude smaller on benign problems, and it can be shown that the delay only affects the regret through the loss of the best arm.}
}
@InProceedings{pmlr-v139-hafez-kolahi21a,
title = {Rate-Distortion Analysis of Minimum Excess Risk in Bayesian Learning},
author = {Hafez-Kolahi, Hassan and Moniri, Behrad and Kasaei, Shohreh and Baghshah, Mahdieh Soleymani},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {3998--4007},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hafez-kolahi21a/hafez-kolahi21a.pdf},
url = {https://proceedings.mlr.press/v139/hafez-kolahi21a.html},
abstract = {In parametric Bayesian learning, a prior is assumed on the parameter $W$ which determines the distribution of samples. In this setting, Minimum Excess Risk (MER) is defined as the difference between the minimum expected loss achievable when learning from data and the minimum expected loss that could be achieved if $W$ was observed. In this paper, we build upon and extend the recent results of (Xu & Raginsky, 2020) to analyze the MER in Bayesian learning and derive information-theoretic bounds on it. We formulate the problem as a (constrained) rate-distortion optimization and show how the solution can be bounded above and below by two other rate-distortion functions that are easier to study. The lower bound represents the minimum possible excess risk achievable by \emph{any} process using $R$ bits of information from the parameter $W$. For the upper bound, the optimization is further constrained to use $R$ bits from the training set, a setting which relates MER to information-theoretic bounds on the generalization gap in frequentist learning. We derive information-theoretic bounds on the difference between these upper and lower bounds and show that they can provide order-wise tight rates for MER under certain conditions. This analysis gives more insight into the information-theoretic nature of Bayesian learning as well as providing novel bounds.}
}
@InProceedings{pmlr-v139-hallak21a,
title = {Regret Minimization in Stochastic Non-Convex Learning via a Proximal-Gradient Approach},
author = {Hallak, Nadav and Mertikopoulos, Panayotis and Cevher, Volkan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4008--4017},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hallak21a/hallak21a.pdf},
url = {https://proceedings.mlr.press/v139/hallak21a.html},
abstract = {This paper develops a methodology for regret minimization with stochastic first-order oracle feedback in online, constrained, non-smooth, non-convex problems. In this setting, the minimization of external regret is beyond reach for first-order methods, and there are no gradient-based algorithmic frameworks capable of providing a solution. On that account, we propose a conceptual approach that leverages non-convex optimality measures, leading to a suitable generalization of the learner’s local regret. We focus on a local regret measure defined via a proximal-gradient mapping, that also encompasses the original notion proposed by Hazan et al. (2017). To achieve no local regret in this setting, we develop a proximal-gradient method based on stochastic first-order feedback, and a simpler method for when access to a perfect first-order oracle is possible. Both methods are order-optimal (in the min-max sense), and we also establish a bound on the number of proximal-gradient queries these methods require. As an important application of our results, we also obtain a link between online and offline non-convex stochastic optimization manifested as a new proximal-gradient scheme with complexity guarantees matching those obtained via variance reduction techniques.}
}
@InProceedings{pmlr-v139-han21a,
title = {Diversity Actor-Critic: Sample-Aware Entropy Regularization for Sample-Efficient Exploration},
author = {Han, Seungyul and Sung, Youngchul},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4018--4029},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/han21a/han21a.pdf},
url = {https://proceedings.mlr.press/v139/han21a.html},
abstract = {In this paper, sample-aware policy entropy regularization is proposed to enhance the conventional policy entropy regularization for better exploration. Exploiting the sample distribution obtainable from the replay buffer, the proposed sample-aware entropy regularization maximizes the entropy of the weighted sum of the policy action distribution and the sample action distribution from the replay buffer for sample-efficient exploration. A practical algorithm named diversity actor-critic (DAC) is developed by applying policy iteration to the objective function with the proposed sample-aware entropy regularization. Numerical results show that DAC significantly outperforms existing recent algorithms for reinforcement learning.}
}
@InProceedings{pmlr-v139-han21b,
title = {Adversarial Combinatorial Bandits with General Non-linear Reward Functions},
author = {Han, Yanjun and Wang, Yining and Chen, Xi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4030--4039},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/han21b/han21b.pdf},
url = {https://proceedings.mlr.press/v139/han21b.html},
abstract = {In this paper we study the adversarial combinatorial bandit with a known non-linear reward function, extending existing work on adversarial linear combinatorial bandit. {The adversarial combinatorial bandit with general non-linear reward is an important open problem in bandit literature, and it is still unclear whether there is a significant gap from the case of linear reward, stochastic bandit, or semi-bandit feedback.} We show that, with $N$ arms and subsets of $K$ arms being chosen at each of $T$ time periods, the minimax optimal regret is $\widetilde\Theta_{d}(\sqrt{N^d T})$ if the reward function is a $d$-degree polynomial with $d< K$, and $\Theta_K(\sqrt{N^K T})$ if the reward function is not a low-degree polynomial. {Both bounds are significantly different from the bound $O(\sqrt{\mathrm{poly}(N,K)T})$ for the linear case, which suggests that there is a fundamental gap between the linear and non-linear reward structures.} Our result also finds applications to adversarial assortment optimization problem in online recommendation. We show that in the worst-case of adversarial assortment problem, the optimal algorithm must treat each individual $\binom{N}{K}$ assortment as independent.}
}
@InProceedings{pmlr-v139-hang21a,
title = {A Collective Learning Framework to Boost GNN Expressiveness for Node Classification},
author = {Hang, Mengyue and Neville, Jennifer and Ribeiro, Bruno},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4040--4050},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hang21a/hang21a.pdf},
url = {https://proceedings.mlr.press/v139/hang21a.html},
abstract = {Collective Inference (CI) is a procedure designed to boost weak relational classifiers, specially for node classification tasks. Graph Neural Networks (GNNs) are strong classifiers that have been used with great success. Unfortunately, most existing practical GNNs are not most-expressive (universal). Thus, it is an open question whether one can improve strong relational node classifiers, such as GNNs, with CI. In this work, we investigate this question and propose {\em collective learning} for GNNs —a general collective classification approach for node representation learning that increases their representation power. We show that previous attempts to incorporate CI into GNNs fail to boost their expressiveness because they do not adapt CI’s Monte Carlo sampling to representation learning. We evaluate our proposed framework with a variety of state-of-the-art GNNs. Our experiments show a consistent, significant boost in node classification accuracy —regardless of the choice of underlying GNN— for inductive node classification in partially-labeled graphs, across five real-world network datasets.}
}
@InProceedings{pmlr-v139-hanjie21a,
title = {Grounding Language to Entities and Dynamics for Generalization in Reinforcement Learning},
author = {Hanjie, Austin W. and Zhong, Victor Y and Narasimhan, Karthik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4051--4062},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hanjie21a/hanjie21a.pdf},
url = {https://proceedings.mlr.press/v139/hanjie21a.html},
abstract = {We investigate the use of natural language to drive the generalization of control policies and introduce the new multi-task environment Messenger with free-form text manuals describing the environment dynamics. Unlike previous work, Messenger does not assume prior knowledge connecting text and state observations {—} the control policy must simultaneously ground the game manual to entity symbols and dynamics in the environment. We develop a new model, EMMA (Entity Mapper with Multi-modal Attention) which uses an entity-conditioned attention module that allows for selective focus over relevant descriptions in the manual for each entity in the environment. EMMA is end-to-end differentiable and learns a latent grounding of entities and dynamics from text to observations using only environment rewards. EMMA achieves successful zero-shot generalization to unseen games with new dynamics, obtaining a 40% higher win rate compared to multiple baselines. However, win rate on the hardest stage of Messenger remains low (10%), demonstrating the need for additional work in this direction.}
}
@InProceedings{pmlr-v139-hao21a,
title = {Sparse Feature Selection Makes Batch Reinforcement Learning More Sample Efficient},
author = {Hao, Botao and Duan, Yaqi and Lattimore, Tor and Szepesvari, Csaba and Wang, Mengdi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4063--4073},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hao21a/hao21a.pdf},
url = {https://proceedings.mlr.press/v139/hao21a.html},
abstract = {This paper provides a statistical analysis of high-dimensional batch reinforcement learning (RL) using sparse linear function approximation. When there is a large number of candidate features, our result sheds light on the fact that sparsity-aware methods can make batch RL more sample efficient. We first consider the off-policy policy evaluation problem. To evaluate a new target policy, we analyze a Lasso fitted Q-evaluation method and establish a finite-sample error bound that has no polynomial dependence on the ambient dimension. To reduce the Lasso bias, we further propose a post model-selection estimator that applies fitted Q-evaluation to the features selected via group Lasso. Under an additional signal strength assumption, we derive a sharper instance-dependent error bound that depends on a divergence function measuring the distribution mismatch between the data distribution and occupancy measure of the target policy. Further, we study the Lasso fitted Q-iteration for batch policy optimization and establish a finite-sample error bound depending on the ratio between the number of relevant features and restricted minimal eigenvalue of the data’s covariance. In the end, we complement the results with minimax lower bounds for batch-data policy evaluation/optimization that nearly match our upper bounds. The results suggest that having well-conditioned data is crucial for sparse batch policy learning.}
}
@InProceedings{pmlr-v139-hao21b,
title = {Bootstrapping Fitted Q-Evaluation for Off-Policy Inference},
author = {Hao, Botao and Ji, Xiang and Duan, Yaqi and Lu, Hao and Szepesvari, Csaba and Wang, Mengdi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4074--4084},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hao21b/hao21b.pdf},
url = {https://proceedings.mlr.press/v139/hao21b.html},
abstract = {Bootstrapping provides a flexible and effective approach for assessing the quality of batch reinforcement learning, yet its theoretical properties are poorly understood. In this paper, we study the use of bootstrapping in off-policy evaluation (OPE), and in particular, we focus on the fitted Q-evaluation (FQE) that is known to be minimax-optimal in the tabular and linear-model cases. We propose a bootstrapping FQE method for inferring the distribution of the policy evaluation error and show that this method is asymptotically efficient and distributionally consistent for off-policy statistical inference. To overcome the computation limit of bootstrapping, we further adapt a subsampling procedure that improves the runtime by an order of magnitude. We numerically evaluate the bootrapping method in classical RL environments for confidence interval estimation, estimating the variance of off-policy evaluator, and estimating the correlation between multiple off-policy evaluators.}
}
@InProceedings{pmlr-v139-hao21c,
title = {Compressed Maximum Likelihood},
author = {Hao, Yi and Orlitsky, Alon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4085--4095},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hao21c/hao21c.pdf},
url = {https://proceedings.mlr.press/v139/hao21c.html},
abstract = {Maximum likelihood (ML) is one of the most fundamental and general statistical estimation techniques. Inspired by recent advances in estimating distribution functionals, we propose $\textit{compressed maximum likelihood}$ (CML) that applies ML to the compressed samples. We then show that CML is sample-efficient for several essential learning tasks over both discrete and continuous domains, including learning densities with structures, estimating probability multisets, and inferring symmetric distribution functionals.}
}
@InProceedings{pmlr-v139-hartford21a,
title = {Valid Causal Inference with (Some) Invalid Instruments},
author = {Hartford, Jason S and Veitch, Victor and Sridhar, Dhanya and Leyton-Brown, Kevin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4096--4106},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hartford21a/hartford21a.pdf},
url = {https://proceedings.mlr.press/v139/hartford21a.html},
abstract = {Instrumental variable methods provide a powerful approach to estimating causal effects in the presence of unobserved confounding. But a key challenge when applying them is the reliance on untestable "exclusion" assumptions that rule out any relationship between the instrument variable and the response that is not mediated by the treatment. In this paper, we show how to perform consistent IV estimation despite violations of the exclusion assumption. In particular, we show that when one has multiple candidate instruments, only a majority of these candidates—or, more generally, the modal candidate-response relationship—needs to be valid to estimate the causal effect. Our approach uses an estimate of the modal prediction from an ensemble of instrumental variable estimators. The technique is simple to apply and is "black-box" in the sense that it may be used with any instrumental variable estimator as long as the treatment effect is identified for each valid instrument independently. As such, it is compatible with recent machine-learning based estimators that allow for the estimation of conditional average treatment effects (CATE) on complex, high dimensional data. Experimentally, we achieve accurate estimates of conditional average treatment effects using an ensemble of deep network-based estimators, including on a challenging simulated Mendelian Randomization problem.}
}
@InProceedings{pmlr-v139-hashimoto21a,
title = {Model Performance Scaling with Multiple Data Sources},
author = {Hashimoto, Tatsunori},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4107--4116},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hashimoto21a/hashimoto21a.pdf},
url = {https://proceedings.mlr.press/v139/hashimoto21a.html},
abstract = {Real-world machine learning systems are often trained using a mix of data sources with varying cost and quality. Understanding how the size and composition of a training dataset affect model performance is critical for advancing our understanding of generalization, as well as designing more effective data collection policies. We show that there is a simple scaling law that predicts the loss incurred by a model even under varying dataset composition. Our work expands recent observations of scaling laws for log-linear generalization error in the i.i.d setting and uses this to cast model performance prediction as a learning problem. Using the theory of optimal experimental design, we derive a simple rational function approximation to generalization error that can be fitted using a few model training runs. Our approach can achieve highly accurate ($r^2\approx .9$) predictions of model performance under substantial extrapolation in two different standard supervised learning tasks and is accurate ($r^2 \approx .83$) on more challenging machine translation and question answering tasks where many baselines achieve worse-than-random performance.}
}
@InProceedings{pmlr-v139-havtorn21a,
title = {Hierarchical VAEs Know What They Don’t Know},
author = {Havtorn, Jakob D. and Frellsen, Jes and Hauberg, S{\o}ren and Maal{\o}e, Lars},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4117--4128},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/havtorn21a/havtorn21a.pdf},
url = {https://proceedings.mlr.press/v139/havtorn21a.html},
abstract = {Deep generative models have been demonstrated as state-of-the-art density estimators. Yet, recent work has found that they often assign a higher likelihood to data from outside the training distribution. This seemingly paradoxical behavior has caused concerns over the quality of the attained density estimates. In the context of hierarchical variational autoencoders, we provide evidence to explain this behavior by out-of-distribution data having in-distribution low-level features. We argue that this is both expected and desirable behavior. With this insight in hand, we develop a fast, scalable and fully unsupervised likelihood-ratio score for OOD detection that requires data to be in-distribution across all feature-levels. We benchmark the method on a vast set of data and model combinations and achieve state-of-the-art results on out-of-distribution detection.}
}
@InProceedings{pmlr-v139-hayase21a,
title = {SPECTRE: defending against backdoor attacks using robust statistics},
author = {Hayase, Jonathan and Kong, Weihao and Somani, Raghav and Oh, Sewoong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4129--4139},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hayase21a/hayase21a.pdf},
url = {https://proceedings.mlr.press/v139/hayase21a.html},
abstract = {Modern machine learning increasingly requires training on a large collection of data from multiple sources, not all of which can be trusted. A particularly frightening scenario is when a small fraction of corrupted data changes the behavior of the trained model when triggered by an attacker-specified watermark. Such a compromised model will be deployed unnoticed as the model is accurate otherwise. There has been promising attempts to use the intermediate representations of such a model to separate corrupted examples from clean ones. However, these methods require a significant fraction of the data to be corrupted, in order to have strong enough signal for detection. We propose a novel defense algorithm using robust covariance estimation to amplify the spectral signature of corrupted data. This defense is able to completely remove backdoors whenever the benchmark backdoor attacks are successful, even in regimes where previous methods have no hope for detecting poisoned examples.}
}
@InProceedings{pmlr-v139-hazan21a,
title = {Boosting for Online Convex Optimization},
author = {Hazan, Elad and Singh, Karan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4140--4149},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hazan21a/hazan21a.pdf},
url = {https://proceedings.mlr.press/v139/hazan21a.html},
abstract = {We consider the decision-making framework of online convex optimization with a very large number of experts. This setting is ubiquitous in contextual and reinforcement learning problems, where the size of the policy class renders enumeration and search within the policy class infeasible. Instead, we consider generalizing the methodology of online boosting. We define a weak learning algorithm as a mechanism that guarantees multiplicatively approximate regret against a base class of experts. In this access model, we give an efficient boosting algorithm that guarantees near-optimal regret against the convex hull of the base class. We consider both full and partial (a.k.a. bandit) information feedback models. We also give an analogous efficient boosting algorithm for the i.i.d. statistical setting. Our results simultaneously generalize online boosting and gradient boosting guarantees to contextual learning model, online convex optimization and bandit linear optimization settings.}
}
@InProceedings{pmlr-v139-he21a,
title = {PipeTransformer: Automated Elastic Pipelining for Distributed Training of Large-scale Models},
author = {He, Chaoyang and Li, Shen and Soltanolkotabi, Mahdi and Avestimehr, Salman},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4150--4159},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/he21a/he21a.pdf},
url = {https://proceedings.mlr.press/v139/he21a.html},
abstract = {The size of Transformer models is growing at an unprecedented rate. It has taken less than one year to reach trillion-level parameters since the release of GPT-3 (175B). Training such models requires both substantial engineering efforts and enormous computing resources, which are luxuries most research teams cannot afford. In this paper, we propose PipeTransformer, which leverages automated elastic pipelining for efficient distributed training of Transformer models. In PipeTransformer, we design an adaptive on the fly freeze algorithm that can identify and freeze some layers gradually during training, and an elastic pipelining system that can dynamically allocate resources to train the remaining active layers. More specifically, PipeTransformer automatically excludes frozen layers from the pipeline, packs active layers into fewer GPUs, and forks more replicas to increase data-parallel width. We evaluate PipeTransformer using Vision Transformer (ViT) on ImageNet and BERT on SQuAD and GLUE datasets. Our results show that compared to the state-of-the-art baseline, PipeTransformer attains up to 2.83-fold speedup without losing accuracy. We also provide various performance analyses for a more comprehensive understanding of our algorithmic and system-wise design. Finally, we have modularized our training system with flexible APIs and made the source code publicly available at https://DistML.ai.}
}
@InProceedings{pmlr-v139-he21b,
title = {SoundDet: Polyphonic Moving Sound Event Detection and Localization from Raw Waveform},
author = {He, Yuhang and Trigoni, Niki and Markham, Andrew},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4160--4170},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/he21b/he21b.pdf},
url = {https://proceedings.mlr.press/v139/he21b.html},
abstract = {We present a new framework SoundDet, which is an end-to-end trainable and light-weight framework, for polyphonic moving sound event detection and localization. Prior methods typically approach this problem by preprocessing raw waveform into time-frequency representations, which is more amenable to process with well-established image processing pipelines. Prior methods also detect in segment-wise manner, leading to incomplete and partial detections. SoundDet takes a novel approach and directly consumes the raw, multichannel waveform and treats the spatio-temporal sound event as a complete “sound-object" to be detected. Specifically, SoundDet consists of a backbone neural network and two parallel heads for temporal detection and spatial localization, respectively. Given the large sampling rate of raw waveform, the backbone network first learns a set of phase-sensitive and frequency-selective bank of filters to explicitly retain direction-of-arrival information, whilst being highly computationally and parametrically efficient than standard 1D/2D convolution. A dense sound event proposal map is then constructed to handle the challenges of predicting events with large varying temporal duration. Accompanying the dense proposal map are a temporal overlapness map and a motion smoothness map that measure a proposal’s confidence to be an event from temporal detection accuracy and movement consistency perspective. Involving the two maps guarantees SoundDet to be trained in a spatio-temporally unified manner. Experimental results on the public DCASE dataset show the advantage of SoundDet on both segment-based evaluation and our newly proposed event-based evaluation system.}
}
@InProceedings{pmlr-v139-he21c,
title = {Logarithmic Regret for Reinforcement Learning with Linear Function Approximation},
author = {He, Jiafan and Zhou, Dongruo and Gu, Quanquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4171--4180},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/he21c/he21c.pdf},
url = {https://proceedings.mlr.press/v139/he21c.html},
abstract = {Reinforcement learning (RL) with linear function approximation has received increasing attention recently. However, existing work has focused on obtaining $\sqrt{T}$-type regret bound, where $T$ is the number of interactions with the MDP. In this paper, we show that logarithmic regret is attainable under two recently proposed linear MDP assumptions provided that there exists a positive sub-optimality gap for the optimal action-value function. More specifically, under the linear MDP assumption (Jin et al., 2020), the LSVI-UCB algorithm can achieve $\tilde{O}(d^{3}H^5/\text{gap}_{\text{min}}\cdot \log(T))$regret; and under the linear mixture MDP assumption (Ayoub et al., 2020), the UCRL-VTR algorithm can achieve $\tilde{O}(d^{2}H^5/\text{gap}_{\text{min}}\cdot \log^3(T))$ regret, where $d$ is the dimension of feature mapping, $H$ is the length of episode, $\text{gap}_{\text{min}}$ is the minimal sub-optimality gap, and $\tilde O$ hides all logarithmic terms except $\log(T)$. To the best of our knowledge, these are the first logarithmic regret bounds for RL with linear function approximation. We also establish gap-dependent lower bounds for the two linear MDP models.}
}
@InProceedings{pmlr-v139-heidari21a,
title = {Finding Relevant Information via a Discrete Fourier Expansion},
author = {Heidari, Mohsen and Sreedharan, Jithin and Shamir, Gil I and Szpankowski, Wojciech},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4181--4191},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/heidari21a/heidari21a.pdf},
url = {https://proceedings.mlr.press/v139/heidari21a.html},
abstract = {A fundamental obstacle in learning information from data is the presence of nonlinear redundancies and dependencies in it. To address this, we propose a Fourier-based approach to extract relevant information in the supervised setting. We first develop a novel Fourier expansion for functions of correlated binary random variables. This expansion is a generalization of the standard Fourier analysis on the Boolean cube beyond product probability spaces. We further extend our Fourier analysis to stochastic mappings. As an important application of this analysis, we investigate learning with feature subset selection. We reformulate this problem in the Fourier domain and introduce a computationally efficient measure for selecting features. Bridging the Bayesian error rate with the Fourier coefficients, we demonstrate that the Fourier expansion provides a powerful tool to characterize nonlinear dependencies in the features-label relation. Via theoretical analysis, we show that our proposed measure finds provably asymptotically optimal feature subsets. Lastly, we present an algorithm based on our measure and verify our findings via numerical experiments on various datasets.}
}
@InProceedings{pmlr-v139-heliou21a,
title = {Zeroth-Order Non-Convex Learning via Hierarchical Dual Averaging},
author = {H{\'e}liou, Am{\'e}lie and Martin, Matthieu and Mertikopoulos, Panayotis and Rahier, Thibaud},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4192--4202},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/heliou21a/heliou21a.pdf},
url = {https://proceedings.mlr.press/v139/heliou21a.html},
abstract = {We propose a hierarchical version of dual averaging for zeroth-order online non-convex optimization {–} i.e., learning processes where, at each stage, the optimizer is facing an unknown non-convex loss function and only receives the incurred loss as feedback. The proposed class of policies relies on the construction of an online model that aggregates loss information as it arrives, and it consists of two principal components: (a) a regularizer adapted to the Fisher information metric (as opposed to the metric norm of the ambient space); and (b) a principled exploration of the problem’s state space based on an adapted hierarchical schedule. This construction enables sharper control of the model’s bias and variance, and allows us to derive tight bounds for both the learner’s static and dynamic regret {–} i.e., the regret incurred against the best dynamic policy in hindsight over the horizon of play.}
}
@InProceedings{pmlr-v139-henderson21a,
title = {Improving Molecular Graph Neural Network Explainability with Orthonormalization and Induced Sparsity},
author = {Henderson, Ryan and Clevert, Djork-Arn{\'e} and Montanari, Floriane},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4203--4213},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/henderson21a/henderson21a.pdf},
url = {https://proceedings.mlr.press/v139/henderson21a.html},
abstract = {Rationalizing which parts of a molecule drive the predictions of a molecular graph convolutional neural network (GCNN) can be difficult. To help, we propose two simple regularization techniques to apply during the training of GCNNs: Batch Representation Orthonormalization (BRO) and Gini regularization. BRO, inspired by molecular orbital theory, encourages graph convolution operations to generate orthonormal node embeddings. Gini regularization is applied to the weights of the output layer and constrains the number of dimensions the model can use to make predictions. We show that Gini and BRO regularization can improve the accuracy of state-of-the-art GCNN attribution methods on artificial benchmark datasets. In a real-world setting, we demonstrate that medicinal chemists significantly prefer explanations extracted from regularized models. While we only study these regularizers in the context of GCNNs, both can be applied to other types of neural networks.}
}
@InProceedings{pmlr-v139-hessel21a,
title = {Muesli: Combining Improvements in Policy Optimization},
author = {Hessel, Matteo and Danihelka, Ivo and Viola, Fabio and Guez, Arthur and Schmitt, Simon and Sifre, Laurent and Weber, Theophane and Silver, David and Van Hasselt, Hado},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4214--4226},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hessel21a/hessel21a.pdf},
url = {https://proceedings.mlr.press/v139/hessel21a.html},
abstract = {We propose a novel policy update that combines regularized policy optimization with model learning as an auxiliary loss. The update (henceforth Muesli) matches MuZero’s state-of-the-art performance on Atari. Notably, Muesli does so without using deep search: it acts directly with a policy network and has computation speed comparable to model-free baselines. The Atari results are complemented by extensive ablations, and by additional results on continuous control and 9x9 Go.}
}
@InProceedings{pmlr-v139-hilgard21a,
title = {Learning Representations by Humans, for Humans},
author = {Hilgard, Sophie and Rosenfeld, Nir and Banaji, Mahzarin R and Cao, Jack and Parkes, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4227--4238},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hilgard21a/hilgard21a.pdf},
url = {https://proceedings.mlr.press/v139/hilgard21a.html},
abstract = {When machine predictors can achieve higher performance than the human decision-makers they support, improving the performance of human decision-makers is often conflated with improving machine accuracy. Here we propose a framework to directly support human decision-making, in which the role of machines is to reframe problems rather than to prescribe actions through prediction. Inspired by the success of representation learning in improving performance of machine predictors, our framework learns human-facing representations optimized for human performance. This “Mind Composed with Machine” framework incorporates a human decision-making model directly into the representation learning paradigm and is trained with a novel human-in-the-loop training procedure. We empirically demonstrate the successful application of the framework to various tasks and representational forms.}
}
@InProceedings{pmlr-v139-hiranandani21a,
title = {Optimizing Black-box Metrics with Iterative Example Weighting},
author = {Hiranandani, Gaurush and Mathur, Jatin and Narasimhan, Harikrishna and Fard, Mahdi Milani and Koyejo, Sanmi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4239--4249},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hiranandani21a/hiranandani21a.pdf},
url = {https://proceedings.mlr.press/v139/hiranandani21a.html},
abstract = {We consider learning to optimize a classification metric defined by a black-box function of the confusion matrix. Such black-box learning settings are ubiquitous, for example, when the learner only has query access to the metric of interest, or in noisy-label and domain adaptation applications where the learner must evaluate the metric via performance evaluation using a small validation sample. Our approach is to adaptively learn example weights on the training dataset such that the resulting weighted objective best approximates the metric on the validation sample. We show how to model and estimate the example weights and use them to iteratively post-shift a pre-trained class probability estimator to construct a classifier. We also analyze the resulting procedure’s statistical properties. Experiments on various label noise, domain shift, and fair classification setups confirm that our proposal compares favorably to the state-of-the-art baselines for each application.}
}
@InProceedings{pmlr-v139-hirsch21a,
title = {Trees with Attention for Set Prediction Tasks},
author = {Hirsch, Roy and Gilad-Bachrach, Ran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4250--4261},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hirsch21a/hirsch21a.pdf},
url = {https://proceedings.mlr.press/v139/hirsch21a.html},
abstract = {In many machine learning applications, each record represents a set of items. For example, when making predictions from medical records, the medications prescribed to a patient are a set whose size is not fixed and whose order is arbitrary. However, most machine learning algorithms are not designed to handle set structures and are limited to processing records of fixed size. Set-Tree, presented in this work, extends the support for sets to tree-based models, such as Random-Forest and Gradient-Boosting, by introducing an attention mechanism and set-compatible split criteria. We evaluate the new method empirically on a wide range of problems ranging from making predictions on sub-atomic particle jets to estimating the redshift of galaxies. The new method outperforms existing tree-based methods consistently and significantly. Moreover, it is competitive and often outperforms Deep Learning. We also discuss the theoretical properties of Set-Trees and explain how they enable item-level explainability.}
}
@InProceedings{pmlr-v139-hodgkinson21a,
title = {Multiplicative Noise and Heavy Tails in Stochastic Optimization},
author = {Hodgkinson, Liam and Mahoney, Michael},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4262--4274},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hodgkinson21a/hodgkinson21a.pdf},
url = {https://proceedings.mlr.press/v139/hodgkinson21a.html},
abstract = {Although stochastic optimization is central to modern machine learning, the precise mechanisms underlying its success, and in particular, the precise role of the stochasticity, still remain unclear. Modeling stochastic optimization algorithms as discrete random recurrence relations, we show that multiplicative noise, as it commonly arises due to variance in local rates of convergence, results in heavy-tailed stationary behaviour in the parameters. Theoretical results are obtained characterizing this for a large class of (non-linear and even non-convex) models and optimizers (including momentum, Adam, and stochastic Newton), demonstrating that this phenomenon holds generally. We describe dependence on key factors, including step size, batch size, and data variability, all of which exhibit similar qualitative behavior to recent empirical results on state-of-the-art neural network models. Furthermore, we empirically illustrate how multiplicative noise and heavy-tailed structure improve capacity for basin hopping and exploration of non-convex loss surfaces, over commonly-considered stochastic dynamics with only additive noise and light-tailed structure.}
}
@InProceedings{pmlr-v139-hoedt21a,
title = {MC-LSTM: Mass-Conserving LSTM},
author = {Hoedt, Pieter-Jan and Kratzert, Frederik and Klotz, Daniel and Halmich, Christina and Holzleitner, Markus and Nearing, Grey S and Hochreiter, Sepp and Klambauer, Guenter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4275--4286},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hoedt21a/hoedt21a.pdf},
url = {https://proceedings.mlr.press/v139/hoedt21a.html},
abstract = {The success of Convolutional Neural Networks (CNNs) in computer vision is mainly driven by their strong inductive bias, which is strong enough to allow CNNs to solve vision-related tasks with random weights, meaning without learning. Similarly, Long Short-Term Memory (LSTM) has a strong inductive bias towards storing information over time. However, many real-world systems are governed by conservation laws, which lead to the redistribution of particular quantities {—} e.g.in physical and economical systems. Our novel Mass-Conserving LSTM (MC-LSTM) adheres to these conservation laws by extending the inductive bias of LSTM to model the redistribution of those stored quantities. MC-LSTMs set a new state-of-the-art for neural arithmetic units at learning arithmetic operations, such as addition tasks,which have a strong conservation law, as the sum is constant over time. Further, MC-LSTM is applied to traffic forecasting, modeling a pendulum, and a large benchmark dataset in hydrology, where it sets a new state-of-the-art for predicting peak flows. In the hydrology example, we show that MC-LSTM states correlate with real world processes and are therefore interpretable.}
}
@InProceedings{pmlr-v139-hoiem21a,
title = {Learning Curves for Analysis of Deep Networks},
author = {Hoiem, Derek and Gupta, Tanmay and Li, Zhizhong and Shlapentokh-Rothman, Michal},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4287--4296},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hoiem21a/hoiem21a.pdf},
url = {https://proceedings.mlr.press/v139/hoiem21a.html},
abstract = {Learning curves model a classifier’s test error as a function of the number of training samples. Prior works show that learning curves can be used to select model parameters and extrapolate performance. We investigate how to use learning curves to evaluate design choices, such as pretraining, architecture, and data augmentation. We propose a method to robustly estimate learning curves, abstract their parameters into error and data-reliance, and evaluate the effectiveness of different parameterizations. Our experiments exemplify use of learning curves for analysis and yield several interesting observations.}
}
@InProceedings{pmlr-v139-holderrieth21a,
title = {Equivariant Learning of Stochastic Fields: Gaussian Processes and Steerable Conditional Neural Processes},
author = {Holderrieth, Peter and Hutchinson, Michael J and Teh, Yee Whye},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4297--4307},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/holderrieth21a/holderrieth21a.pdf},
url = {https://proceedings.mlr.press/v139/holderrieth21a.html},
abstract = {Motivated by objects such as electric fields or fluid streams, we study the problem of learning stochastic fields, i.e. stochastic processes whose samples are fields like those occurring in physics and engineering. Considering general transformations such as rotations and reflections, we show that spatial invariance of stochastic fields requires an inference model to be equivariant. Leveraging recent advances from the equivariance literature, we study equivariance in two classes of models. Firstly, we fully characterise equivariant Gaussian processes. Secondly, we introduce Steerable Conditional Neural Processes (SteerCNPs), a new, fully equivariant member of the Neural Process family. In experiments with Gaussian process vector fields, images, and real-world weather data, we observe that SteerCNPs significantly improve the performance of previous models and equivariance leads to improvements in transfer learning tasks.}
}
@InProceedings{pmlr-v139-hong21a,
title = {Latent Programmer: Discrete Latent Codes for Program Synthesis},
author = {Hong, Joey and Dohan, David and Singh, Rishabh and Sutton, Charles and Zaheer, Manzil},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4308--4318},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hong21a/hong21a.pdf},
url = {https://proceedings.mlr.press/v139/hong21a.html},
abstract = {A key problem in program synthesis is searching over the large space of possible programs. Human programmers might decide the high-level structure of the desired program before thinking about the details; motivated by this intuition, we consider two-level search for program synthesis, in which the synthesizer first generates a plan, a sequence of symbols that describes the desired program at a high level, before generating the program. We propose to learn representations of programs that can act as plans to organize such a two-level search. Discrete latent codes are appealing for this purpose, and can be learned by applying recent work on discrete autoencoders. Based on these insights, we introduce the Latent Programmer (LP), a program synthesis method that first predicts a discrete latent code from input/output examples, and then generates the program in the target language. We evaluate the LP on two domains, demonstrating that it yields an improvement in accuracy, especially on longer programs for which search is most difficult.}
}
@InProceedings{pmlr-v139-hong21b,
title = {Chebyshev Polynomial Codes: Task Entanglement-based Coding for Distributed Matrix Multiplication},
author = {Hong, Sangwoo and Yang, Heecheol and Yoon, Youngseok and Cho, Taehyun and Lee, Jungwoo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4319--4327},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hong21b/hong21b.pdf},
url = {https://proceedings.mlr.press/v139/hong21b.html},
abstract = {Distributed computing has been a prominent solution to efficiently process massive datasets in parallel. However, the existence of stragglers is one of the major concerns that slows down the overall speed of distributed computing. To deal with this problem, we consider a distributed matrix multiplication scenario where a master assigns multiple tasks to each worker to exploit stragglers’ computing ability (which is typically wasted in conventional distributed computing). We propose Chebyshev polynomial codes, which can achieve order-wise improvement in encoding complexity at the master and communication load in distributed matrix multiplication using task entanglement. The key idea of task entanglement is to reduce the number of encoded matrices for multiple tasks assigned to each worker by intertwining encoded matrices. We experimentally demonstrate that, in cloud environments, Chebyshev polynomial codes can provide significant reduction in overall processing time in distributed computing for matrix multiplication, which is a key computational component in modern deep learning.}
}
@InProceedings{pmlr-v139-hosseini21a,
title = {Federated Learning of User Verification Models Without Sharing Embeddings},
author = {Hosseini, Hossein and Park, Hyunsin and Yun, Sungrack and Louizos, Christos and Soriaga, Joseph and Welling, Max},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4328--4336},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hosseini21a/hosseini21a.pdf},
url = {https://proceedings.mlr.press/v139/hosseini21a.html},
abstract = {We consider the problem of training User Verification (UV) models in federated setup, where each user has access to the data of only one class and user embeddings cannot be shared with the server or other users. To address this problem, we propose Federated User Verification (FedUV), a framework in which users jointly learn a set of vectors and maximize the correlation of their instance embeddings with a secret linear combination of those vectors. We show that choosing the linear combinations from the codewords of an error-correcting code allows users to collaboratively train the model without revealing their embedding vectors. We present the experimental results for user verification with voice, face, and handwriting data and show that FedUV is on par with existing approaches, while not sharing the embeddings with other users or the server.}
}
@InProceedings{pmlr-v139-hsieh21a,
title = {The Limits of Min-Max Optimization Algorithms: Convergence to Spurious Non-Critical Sets},
author = {Hsieh, Ya-Ping and Mertikopoulos, Panayotis and Cevher, Volkan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4337--4348},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hsieh21a/hsieh21a.pdf},
url = {https://proceedings.mlr.press/v139/hsieh21a.html},
abstract = {Compared to minimization, the min-max optimization in machine learning applications is considerably more convoluted because of the existence of cycles and similar phenomena. Such oscillatory behaviors are well-understood in the convex-concave regime, and many algorithms are known to overcome them. In this paper, we go beyond this basic setting and characterize the convergence properties of many popular methods in solving non-convex/non-concave problems. In particular, we show that a wide class of state-of-the-art schemes and heuristics may converge with arbitrarily high probability to attractors that are in no way min-max optimal or even stationary. Our work thus points out a potential pitfall among many existing theoretical frameworks, and we corroborate our theoretical claims by explicitly showcasing spurious attractors in simple two-dimensional problems.}
}
@InProceedings{pmlr-v139-hu21a,
title = {Near-Optimal Representation Learning for Linear Bandits and Linear RL},
author = {Hu, Jiachen and Chen, Xiaoyu and Jin, Chi and Li, Lihong and Wang, Liwei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4349--4358},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hu21a/hu21a.pdf},
url = {https://proceedings.mlr.press/v139/hu21a.html},
abstract = {This paper studies representation learning for multi-task linear bandits and multi-task episodic RL with linear value function approximation. We first consider the setting where we play $M$ linear bandits with dimension $d$ concurrently, and these bandits share a common $k$-dimensional linear representation so that $k\ll d$ and $k \ll M$. We propose a sample-efficient algorithm, MTLR-OFUL, which leverages the shared representation to achieve $\tilde{O}(M\sqrt{dkT} + d\sqrt{kMT} )$ regret, with $T$ being the number of total steps. Our regret significantly improves upon the baseline $\tilde{O}(Md\sqrt{T})$ achieved by solving each task independently. We further develop a lower bound that shows our regret is near-optimal when $d > M$. Furthermore, we extend the algorithm and analysis to multi-task episodic RL with linear value function approximation under low inherent Bellman error (Zanette et al., 2020a). To the best of our knowledge, this is the first theoretical result that characterize the benefits of multi-task representation learning for exploration in RL with function approximation.}
}
@InProceedings{pmlr-v139-hu21b,
title = {On the Random Conjugate Kernel and Neural Tangent Kernel},
author = {Hu, Zhengmian and Huang, Heng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4359--4368},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hu21b/hu21b.pdf},
url = {https://proceedings.mlr.press/v139/hu21b.html},
abstract = {We investigate the distributions of Conjugate Kernel (CK) and Neural Tangent Kernel (NTK) for ReLU networks with random initialization. We derive the precise distributions and moments of the diagonal elements of these kernels. For a feedforward network, these values converge in law to a log-normal distribution when the network depth $d$ and width $n$ simultaneously tend to infinity and the variance of log diagonal elements is proportional to ${d}/{n}$. For the residual network, in the limit that number of branches $m$ increases to infinity and the width $n$ remains fixed, the diagonal elements of Conjugate Kernel converge in law to a log-normal distribution where the variance of log value is proportional to ${1}/{n}$, and the diagonal elements of NTK converge in law to a log-normal distributed variable times the conjugate kernel of one feedforward network. Our new theoretical analysis results suggest that residual network remains trainable in the limit of infinite branches and fixed network width. The numerical experiments are conducted and all results validate the soundness of our theoretical analysis.}
}
@InProceedings{pmlr-v139-hu21c,
title = {Off-Belief Learning},
author = {Hu, Hengyuan and Lerer, Adam and Cui, Brandon and Pineda, Luis and Brown, Noam and Foerster, Jakob},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4369--4379},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hu21c/hu21c.pdf},
url = {https://proceedings.mlr.press/v139/hu21c.html},
abstract = {The standard problem setting in Dec-POMDPs is self-play, where the goal is to find a set of policies that play optimally together. Policies learned through self-play may adopt arbitrary conventions and implicitly rely on multi-step reasoning based on fragile assumptions about other agents’ actions and thus fail when paired with humans or independently trained agents at test time. To address this, we present off-belief learning (OBL). At each timestep OBL agents follow a policy $\pi_1$ that is optimized assuming past actions were taken by a given, fixed policy ($\pi_0$), but assuming that future actions will be taken by $\pi_1$. When $\pi_0$ is uniform random, OBL converges to an optimal policy that does not rely on inferences based on other agents’ behavior (an optimal grounded policy). OBL can be iterated in a hierarchy, where the optimal policy from one level becomes the input to the next, thereby introducing multi-level cognitive reasoning in a controlled manner. Unlike existing approaches, which may converge to any equilibrium policy, OBL converges to a unique policy, making it suitable for zero-shot coordination (ZSC). OBL can be scaled to high-dimensional settings with a fictitious transition mechanism and shows strong performance in both a toy-setting and the benchmark human-AI & ZSC problem Hanabi.}
}
@InProceedings{pmlr-v139-hu21d,
title = {Generalizable Episodic Memory for Deep Reinforcement Learning},
author = {Hu, Hao and Ye, Jianing and Zhu, Guangxiang and Ren, Zhizhou and Zhang, Chongjie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4380--4390},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hu21d/hu21d.pdf},
url = {https://proceedings.mlr.press/v139/hu21d.html},
abstract = {Episodic memory-based methods can rapidly latch onto past successful strategies by a non-parametric memory and improve sample efficiency of traditional reinforcement learning. However, little effort is put into the continuous domain, where a state is never visited twice, and previous episodic methods fail to efficiently aggregate experience across trajectories. To address this problem, we propose Generalizable Episodic Memory (GEM), which effectively organizes the state-action values of episodic memory in a generalizable manner and supports implicit planning on memorized trajectories. GEM utilizes a double estimator to reduce the overestimation bias induced by value propagation in the planning process. Empirical evaluation shows that our method significantly outperforms existing trajectory-based methods on various MuJoCo continuous control tasks. To further show the general applicability, we evaluate our method on Atari games with discrete action space, which also shows a significant improvement over baseline algorithms.}
}
@InProceedings{pmlr-v139-hua21a,
title = {A Scalable Deterministic Global Optimization Algorithm for Clustering Problems},
author = {Hua, Kaixun and Shi, Mingfei and Cao, Yankai},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4391--4401},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hua21a/hua21a.pdf},
url = {https://proceedings.mlr.press/v139/hua21a.html},
abstract = {The minimum sum-of-squares clustering (MSSC) task, which can be treated as a Mixed Integer Second Order Cone Programming (MISOCP) problem, is rarely investigated in the literature through deterministic optimization to find its global optimal value. In this paper, we modelled the MSSC task as a two-stage optimization problem and proposed a tailed reduced-space branch and bound (BB) algorithm. We designed several approaches to construct lower and upper bounds at each node in the BB scheme, including a scenario grouping based Lagrangian decomposition approach. One key advantage of this reduced-space algorithm is that it only needs to perform branching on the centers of clusters to guarantee convergence, and the size of centers is independent of the number of data samples. Moreover, the lower bounds can be computed by solving small-scale sample subproblems, and upper bounds can be obtained trivially. These two properties enable our algorithm easy to be paralleled and can be scalable to the dataset with up to 200,000 samples for finding a global $\epsilon$-optimal solution of the MSSC task. We performed numerical experiments on both synthetic and real-world datasets and compared our proposed algorithms with the off-the-shelf global optimal solvers and classical local optimal algorithms. The results reveal a strong performance and scalability of our algorithm.}
}
@InProceedings{pmlr-v139-huang21a,
title = {On Recovering from Modeling Errors Using Testing Bayesian Networks},
author = {Huang, Haiying and Darwiche, Adnan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4402--4411},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21a/huang21a.pdf},
url = {https://proceedings.mlr.press/v139/huang21a.html},
abstract = {We consider the problem of supervised learning with Bayesian Networks when the used dependency structure is incomplete due to missing edges or missing variable states. These modeling errors induce independence constraints on the learned model that may not hold in the true, data-generating distribution. We provide a unified treatment of these modeling errors as instances of state-space abstractions. We then identify a class of Bayesian Networks and queries which allow one to fully recover from such modeling errors if one can choose Conditional Probability Tables (CPTs) dynamically based on evidence. We show theoretically that the recently proposed Testing Bayesian Networks (TBNs), which can be trained by compiling them into Testing Arithmetic Circuits (TACs), provide a promising construct for emulating this CPT selection mechanism. Finally, we present empirical results that illustrate the promise of TBNs as a tool for recovering from certain modeling errors in the context of supervised learning.}
}
@InProceedings{pmlr-v139-huang21b,
title = {A Novel Sequential Coreset Method for Gradient Descent Algorithms},
author = {Huang, Jiawei and Huang, Ruomin and Liu, Wenjie and Freris, Nikolaos and Ding, Hu},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4412--4422},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21b/huang21b.pdf},
url = {https://proceedings.mlr.press/v139/huang21b.html},
abstract = {A wide range of optimization problems arising in machine learning can be solved by gradient descent algorithms, and a central question in this area is how to efficiently compress a large-scale dataset so as to reduce the computational complexity. Coreset is a popular data compression technique that has been extensively studied before. However, most of existing coreset methods are problem-dependent and cannot be used as a general tool for a broader range of applications. A key obstacle is that they often rely on the pseudo-dimension and total sensitivity bound that can be very high or hard to obtain. In this paper, based on the “locality” property of gradient descent algorithms, we propose a new framework, termed “sequential coreset”, which effectively avoids these obstacles. Moreover, our method is particularly suitable for sparse optimization whence the coreset size can be further reduced to be only poly-logarithmically dependent on the dimension. In practice, the experimental results suggest that our method can save a large amount of running time compared with the baseline algorithms.}
}
@InProceedings{pmlr-v139-huang21c,
title = {FL-NTK: A Neural Tangent Kernel-based Framework for Federated Learning Analysis},
author = {Huang, Baihe and Li, Xiaoxiao and Song, Zhao and Yang, Xin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4423--4434},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21c/huang21c.pdf},
url = {https://proceedings.mlr.press/v139/huang21c.html},
abstract = {Federated Learning (FL) is an emerging learning scheme that allows different distributed clients to train deep neural networks together without data sharing. Neural networks have become popular due to their unprecedented success. To the best of our knowledge, the theoretical guarantees of FL concerning neural networks with explicit forms and multi-step updates are unexplored. Nevertheless, training analysis of neural networks in FL is non-trivial for two reasons: first, the objective loss function we are optimizing is non-smooth and non-convex, and second, we are even not updating in the gradient direction. Existing convergence results for gradient descent-based methods heavily rely on the fact that the gradient direction is used for updating. The current paper presents a new class of convergence analysis for FL, Federated Neural Tangent Kernel (FL-NTK), which corresponds to overparamterized ReLU neural networks trained by gradient descent in FL and is inspired by the analysis in Neural Tangent Kernel (NTK). Theoretically, FL-NTK converges to a global-optimal solution at a linear rate with properly tuned learning parameters. Furthermore, with proper distributional assumptions, FL-NTK can also achieve good generalization. The proposed theoretical analysis scheme can be generalized to more complex neural networks.}
}
@InProceedings{pmlr-v139-huang21d,
title = {STRODE: Stochastic Boundary Ordinary Differential Equation},
author = {Huang, Hengguan and Liu, Hongfu and Wang, Hao and Xiao, Chang and Wang, Ye},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4435--4445},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21d/huang21d.pdf},
url = {https://proceedings.mlr.press/v139/huang21d.html},
abstract = {Perception of time from sequentially acquired sensory inputs is rooted in everyday behaviors of individual organisms. Yet, most algorithms for time-series modeling fail to learn dynamics of random event timings directly from visual or audio inputs, requiring timing annotations during training that are usually unavailable for real-world applications. For instance, neuroscience perspectives on postdiction imply that there exist variable temporal ranges within which the incoming sensory inputs can affect the earlier perception, but such temporal ranges are mostly unannotated for real applications such as automatic speech recognition (ASR). In this paper, we present a probabilistic ordinary differential equation (ODE), called STochastic boundaRy ODE (STRODE), that learns both the timings and the dynamics of time series data without requiring any timing annotations during training. STRODE allows the usage of differential equations to sample from the posterior point processes, efficiently and analytically. We further provide theoretical guarantees on the learning of STRODE. Our empirical results show that our approach successfully infers event timings of time series data. Our method achieves competitive or superior performances compared to existing state-of-the-art methods for both synthetic and real-world datasets.}
}
@InProceedings{pmlr-v139-huang21e,
title = {A Riemannian Block Coordinate Descent Method for Computing the Projection Robust Wasserstein Distance},
author = {Huang, Minhui and Ma, Shiqian and Lai, Lifeng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4446--4455},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21e/huang21e.pdf},
url = {https://proceedings.mlr.press/v139/huang21e.html},
abstract = {The Wasserstein distance has become increasingly important in machine learning and deep learning. Despite its popularity, the Wasserstein distance is hard to approximate because of the curse of dimensionality. A recently proposed approach to alleviate the curse of dimensionality is to project the sampled data from the high dimensional probability distribution onto a lower-dimensional subspace, and then compute the Wasserstein distance between the projected data. However, this approach requires to solve a max-min problem over the Stiefel manifold, which is very challenging in practice. In this paper, we propose a Riemannian block coordinate descent (RBCD) method to solve this problem, which is based on a novel reformulation of the regularized max-min problem over the Stiefel manifold. We show that the complexity of arithmetic operations for RBCD to obtain an $\epsilon$-stationary point is $O(\epsilon^{-3})$, which is significantly better than the complexity of existing methods. Numerical results on both synthetic and real datasets demonstrate that our method is more efficient than existing methods, especially when the number of sampled data is very large.}
}
@InProceedings{pmlr-v139-huang21f,
title = {Projection Robust Wasserstein Barycenters},
author = {Huang, Minhui and Ma, Shiqian and Lai, Lifeng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4456--4465},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huang21f/huang21f.pdf},
url = {https://proceedings.mlr.press/v139/huang21f.html},
abstract = {Collecting and aggregating information from several probability measures or histograms is a fundamental task in machine learning. One of the popular solution methods for this task is to compute the barycenter of the probability measures under the Wasserstein metric. However, approximating the Wasserstein barycenter is numerically challenging because of the curse of dimensionality. This paper proposes the projection robust Wasserstein barycenter (PRWB) that has the potential to mitigate the curse of dimensionality, and a relaxed PRWB (RPRWB) model that is computationally more tractable. By combining the iterative Bregman projection algorithm and Riemannian optimization, we propose two algorithms for computing the RPRWB, which is a max-min problem over the Stiefel manifold. The complexity of arithmetic operations of the proposed algorithms for obtaining an $\epsilon$-stationary solution is analyzed. We incorporate the RPRWB into a discrete distribution clustering algorithm, and the numerical results on real text datasets confirm that our RPRWB model helps improve the clustering performance significantly.}
}
@InProceedings{pmlr-v139-hubara21a,
title = {Accurate Post Training Quantization With Small Calibration Sets},
author = {Hubara, Itay and Nahshan, Yury and Hanani, Yair and Banner, Ron and Soudry, Daniel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4466--4475},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hubara21a/hubara21a.pdf},
url = {https://proceedings.mlr.press/v139/hubara21a.html},
abstract = {Lately, post-training quantization methods have gained considerable attention, as they are simple to use, and require only a small unlabeled calibration set. This small dataset cannot be used to fine-tune the model without significant over-fitting. Instead, these methods only use the calibration set to set the activations’ dynamic ranges. However, such methods always resulted in significant accuracy degradation, when used below 8-bits (except on small datasets). Here we aim to break the 8-bit barrier. To this end, we minimize the quantization errors of each layer or block separately by optimizing its parameters over the calibration set. We empirically demonstrate that this approach is: (1) much less susceptible to over-fitting than the standard fine-tuning approaches, and can be used even on a very small calibration set; and (2) more powerful than previous methods, which only set the activations’ dynamic ranges. We suggest two flavors for our method, parallel and sequential aim for a fixed and flexible bit-width allocation. For the latter, we demonstrate how to optimally allocate the bit-widths for each layer, while constraining accuracy degradation or model compression by proposing a novel integer programming formulation. Finally, we suggest model global statistics tuning, to correct biases introduced during quantization. Together, these methods yield state-of-the-art results for both vision and text models. For instance, on ResNet50, we obtain less than 1% accuracy degradation — with 4-bit weights and activations in all layers, but first and last. The suggested methods are two orders of magnitude faster than the traditional Quantize Aware Training approach used for lower than 8-bit quantization. We open-sourced our code \textit{https://github.com/papers-submission/CalibTIP}.}
}
@InProceedings{pmlr-v139-hubert21a,
title = {Learning and Planning in Complex Action Spaces},
author = {Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Barekatain, Mohammadamin and Schmitt, Simon and Silver, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4476--4486},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hubert21a/hubert21a.pdf},
url = {https://proceedings.mlr.press/v139/hubert21a.html},
abstract = {Many important real-world problems have action spaces that are high-dimensional, continuous or both, making full enumeration of all possible actions infeasible. Instead, only small subsets of actions can be sampled for the purpose of policy evaluation and improvement. In this paper, we propose a general framework to reason in a principled way about policy evaluation and improvement over such sampled action subsets. This sample-based policy iteration framework can in principle be applied to any reinforcement learning algorithm based upon policy iteration. Concretely, we propose Sampled MuZero, an extension of the MuZero algorithm that is able to learn in domains with arbitrarily complex action spaces by planning over sampled actions. We demonstrate this approach on the classical board game of Go and on two continuous control benchmark domains: DeepMind Control Suite and Real-World RL Suite.}
}
@InProceedings{pmlr-v139-hudson21a,
title = {Generative Adversarial Transformers},
author = {Hudson, Drew A and Zitnick, Larry},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4487--4499},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hudson21a/hudson21a.pdf},
url = {https://proceedings.mlr.press/v139/hudson21a.html},
abstract = {We introduce the GANsformer, a novel and efficient type of transformer, and explore it for the task of visual generative modeling. The network employs a bipartite structure that enables long-range interactions across the image, while maintaining computation of linear efficiency, that can readily scale to high-resolution synthesis. It iteratively propagates information from a set of latent variables to the evolving visual features and vice versa, to support the refinement of each in light of the other and encourage the emergence of compositional representations of objects and scenes. In contrast to the classic transformer architecture, it utilizes multiplicative integration that allows flexible region-based modulation, and can thus be seen as a generalization of the successful StyleGAN network. We demonstrate the model’s strength and robustness through a careful evaluation over a range of datasets, from simulated multi-object environments to rich real-world indoor and outdoor scenes, showing it achieves state-of-the-art results in terms of image quality and diversity, while enjoying fast learning and better data-efficiency. Further qualitative and quantitative experiments offer us an insight into the model’s inner workings, revealing improved interpretability and stronger disentanglement, and illustrating the benefits and efficacy of our approach. An implementation of the model is available at https://github.com/dorarad/gansformer.}
}
@InProceedings{pmlr-v139-hussain21a,
title = {Neural Pharmacodynamic State Space Modeling},
author = {Hussain, Zeshan M and Krishnan, Rahul G. and Sontag, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4500--4510},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hussain21a/hussain21a.pdf},
url = {https://proceedings.mlr.press/v139/hussain21a.html},
abstract = {Modeling the time-series of high-dimensional, longitudinal data is important for predicting patient disease progression. However, existing neural network based approaches that learn representations of patient state, while very flexible, are susceptible to overfitting. We propose a deep generative model that makes use of a novel attention-based neural architecture inspired by the physics of how treatments affect disease state. The result is a scalable and accurate model of high-dimensional patient biomarkers as they vary over time. Our proposed model yields significant improvements in generalization and, on real-world clinical data, provides interpretable insights into the dynamics of cancer progression.}
}
@InProceedings{pmlr-v139-hussenot21a,
title = {Hyperparameter Selection for Imitation Learning},
author = {Hussenot, L{\'e}onard and Andrychowicz, Marcin and Vincent, Damien and Dadashi, Robert and Raichuk, Anton and Ramos, Sabela and Momchev, Nikola and Girgin, Sertan and Marinier, Raphael and Stafiniak, Lukasz and Orsini, Manu and Bachem, Olivier and Geist, Matthieu and Pietquin, Olivier},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4511--4522},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hussenot21a/hussenot21a.pdf},
url = {https://proceedings.mlr.press/v139/hussenot21a.html},
abstract = {We address the issue of tuning hyperparameters (HPs) for imitation learning algorithms in the context of continuous-control, when the underlying reward function of the demonstrating expert cannot be observed at any time. The vast literature in imitation learning mostly considers this reward function to be available for HP selection, but this is not a realistic setting. Indeed, would this reward function be available, it could then directly be used for policy training and imitation would not be necessary. To tackle this mostly ignored problem, we propose a number of possible proxies to the external reward. We evaluate them in an extensive empirical study (more than 10’000 agents across 9 environments) and make practical recommendations for selecting HPs. Our results show that while imitation learning algorithms are sensitive to HP choices, it is often possible to select good enough HPs through a proxy to the reward function.}
}
@InProceedings{pmlr-v139-huster21a,
title = {Pareto GAN: Extending the Representational Power of GANs to Heavy-Tailed Distributions},
author = {Huster, Todd and Cohen, Jeremy and Lin, Zinan and Chan, Kevin and Kamhoua, Charles and Leslie, Nandi O. and Chiang, Cho-Yu Jason and Sekar, Vyas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4523--4532},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/huster21a/huster21a.pdf},
url = {https://proceedings.mlr.press/v139/huster21a.html},
abstract = {Generative adversarial networks (GANs) are often billed as "universal distribution learners", but precisely what distributions they can represent and learn is still an open question. Heavy-tailed distributions are prevalent in many different domains such as financial risk-assessment, physics, and epidemiology. We observe that existing GAN architectures do a poor job of matching the asymptotic behavior of heavy-tailed distributions, a problem that we show stems from their construction. Additionally, common loss functions produce unstable or near-zero gradients when faced with the infinite moments and large distances between outlier points characteristic of heavy-tailed distributions. We address these problems with the Pareto GAN. A Pareto GAN leverages extreme value theory and the functional properties of neural networks to learn a distribution that matches the asymptotic behavior of the marginal distributions of the features. We identify issues with standard loss functions and propose the use of alternative metric spaces that enable stable and efficient learning. Finally, we evaluate our proposed approach on a variety of heavy-tailed datasets.}
}
@InProceedings{pmlr-v139-hutchinson21a,
title = {LieTransformer: Equivariant Self-Attention for Lie Groups},
author = {Hutchinson, Michael J and Lan, Charline Le and Zaidi, Sheheryar and Dupont, Emilien and Teh, Yee Whye and Kim, Hyunjik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4533--4543},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hutchinson21a/hutchinson21a.pdf},
url = {https://proceedings.mlr.press/v139/hutchinson21a.html},
abstract = {Group equivariant neural networks are used as building blocks of group invariant neural networks, which have been shown to improve generalisation performance and data efficiency through principled parameter sharing. Such works have mostly focused on group equivariant convolutions, building on the result that group equivariant linear maps are necessarily convolutions. In this work, we extend the scope of the literature to self-attention, that is emerging as a prominent building block of deep learning models. We propose the LieTransformer, an architecture composed of LieSelfAttention layers that are equivariant to arbitrary Lie groups and their discrete subgroups. We demonstrate the generality of our approach by showing experimental results that are competitive to baseline methods on a wide range of tasks: shape counting on point clouds, molecular property regression and modelling particle trajectories under Hamiltonian dynamics.}
}
@InProceedings{pmlr-v139-ibrahim21a,
title = {Crowdsourcing via Annotator Co-occurrence Imputation and Provable Symmetric Nonnegative Matrix Factorization},
author = {Ibrahim, Shahana and Fu, Xiao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4544--4554},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ibrahim21a/ibrahim21a.pdf},
url = {https://proceedings.mlr.press/v139/ibrahim21a.html},
abstract = {Unsupervised learning of the Dawid-Skene (D&S) model from noisy, incomplete and crowdsourced annotations has been a long-standing challenge, and is a critical step towards reliably labeling massive data. A recent work takes a coupled nonnegative matrix factorization (CNMF) perspective, and shows appealing features: It ensures the identifiability of the D&S model and enjoys low sample complexity, as only the estimates of the co-occurrences of annotator labels are involved. However, the identifiability holds only when certain somewhat restrictive conditions are met in the context of crowdsourcing. Optimizing the CNMF criterion is also costly—and convergence assurances are elusive. This work recasts the pairwise co-occurrence based D&S model learning problem as a symmetric NMF (SymNMF) problem—which offers enhanced identifiability relative to CNMF. In practice, the SymNMF model is often (largely) incomplete, due to the lack of co-labeled items by some annotators. Two lightweight algorithms are proposed for co-occurrence imputation. Then, a low-complexity shifted rectified linear unit (ReLU)-empowered SymNMF algorithm is proposed to identify the D&S model. Various performance characterizations (e.g., missing co-occurrence recoverability, stability, and convergence) and evaluations are also presented.}
}
@InProceedings{pmlr-v139-ilse21a,
title = {Selecting Data Augmentation for Simulating Interventions},
author = {Ilse, Maximilian and Tomczak, Jakub M and Forr{\'e}, Patrick},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4555--4562},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ilse21a/ilse21a.pdf},
url = {https://proceedings.mlr.press/v139/ilse21a.html},
abstract = {Machine learning models trained with purely observational data and the principle of empirical risk minimization (Vapnik 1992) can fail to generalize to unseen domains. In this paper, we focus on the case where the problem arises through spurious correlation between the observed domains and the actual task labels. We find that many domain generalization methods do not explicitly take this spurious correlation into account. Instead, especially in more application-oriented research areas like medical imaging or robotics, data augmentation techniques that are based on heuristics are used to learn domain invariant features. To bridge the gap between theory and practice, we develop a causal perspective on the problem of domain generalization. We argue that causal concepts can be used to explain the success of data augmentation by describing how they can weaken the spurious correlation between the observed domains and the task labels. We demonstrate that data augmentation can serve as a tool for simulating interventional data. We use these theoretical insights to derive a simple algorithm that is able to select data augmentation techniques that will lead to better domain generalization.}
}
@InProceedings{pmlr-v139-immer21a,
title = {Scalable Marginal Likelihood Estimation for Model Selection in Deep Learning},
author = {Immer, Alexander and Bauer, Matthias and Fortuin, Vincent and R{\"a}tsch, Gunnar and Emtiyaz, Khan Mohammad},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4563--4573},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/immer21a/immer21a.pdf},
url = {https://proceedings.mlr.press/v139/immer21a.html},
abstract = {Marginal-likelihood based model-selection, even though promising, is rarely used in deep learning due to estimation difficulties. Instead, most approaches rely on validation data, which may not be readily available. In this work, we present a scalable marginal-likelihood estimation method to select both hyperparameters and network architectures, based on the training data alone. Some hyperparameters can be estimated online during training, simplifying the procedure. Our marginal-likelihood estimate is based on Laplace’s method and Gauss-Newton approximations to the Hessian, and it outperforms cross-validation and manual tuning on standard regression and image classification datasets, especially in terms of calibration and out-of-distribution detection. Our work shows that marginal likelihoods can improve generalization and be useful when validation data is unavailable (e.g., in nonstationary settings).}
}
@InProceedings{pmlr-v139-inatsu21a,
title = {Active Learning for Distributionally Robust Level-Set Estimation},
author = {Inatsu, Yu and Iwazaki, Shogo and Takeuchi, Ichiro},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4574--4584},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/inatsu21a/inatsu21a.pdf},
url = {https://proceedings.mlr.press/v139/inatsu21a.html},
abstract = {Many cases exist in which a black-box function $f$ with high evaluation cost depends on two types of variables $\bm x$ and $\bm w$, where $\bm x$ is a controllable \emph{design} variable and $\bm w$ are uncontrollable \emph{environmental} variables that have random variation following a certain distribution $P$. In such cases, an important task is to find the range of design variables $\bm x$ such that the function $f(\bm x, \bm w)$ has the desired properties by incorporating the random variation of the environmental variables $\bm w$. A natural measure of robustness is the probability that $f(\bm x, \bm w)$ exceeds a given threshold $h$, which is known as the \emph{probability threshold robustness} (PTR) measure in the literature on robust optimization. However, this robustness measure cannot be correctly evaluated when the distribution $P$ is unknown. In this study, we addressed this problem by considering the \textit{distributionally robust PTR} (DRPTR) measure, which considers the worst-case PTR within given candidate distributions. Specifically, we studied the problem of efficiently identifying a reliable set $H$, which is defined as a region in which the DRPTR measure exceeds a certain desired probability $\alpha$, which can be interpreted as a level set estimation (LSE) problem for DRPTR. We propose a theoretically grounded and computationally efficient active learning method for this problem. We show that the proposed method has theoretical guarantees on convergence and accuracy, and confirmed through numerical experiments that the proposed method outperforms existing methods.}
}
@InProceedings{pmlr-v139-indelman21a,
title = {Learning Randomly Perturbed Structured Predictors for Direct Loss Minimization},
author = {Indelman, Hedda Cohen and Hazan, Tamir},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4585--4595},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/indelman21a/indelman21a.pdf},
url = {https://proceedings.mlr.press/v139/indelman21a.html},
abstract = {Direct loss minimization is a popular approach for learning predictors over structured label spaces. This approach is computationally appealing as it replaces integration with optimization and allows to propagate gradients in a deep net using loss-perturbed prediction. Recently, this technique was extended to generative models, by introducing a randomized predictor that samples a structure from a randomly perturbed score function. In this work, we interpolate between these techniques by learning the variance of randomized structured predictors as well as their mean, in order to balance between the learned score function and the randomized noise. We demonstrate empirically the effectiveness of learning this balance in structured discrete spaces.}
}
@InProceedings{pmlr-v139-iqbal21a,
title = {Randomized Entity-wise Factorization for Multi-Agent Reinforcement Learning},
author = {Iqbal, Shariq and De Witt, Christian A Schroeder and Peng, Bei and Boehmer, Wendelin and Whiteson, Shimon and Sha, Fei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4596--4606},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/iqbal21a/iqbal21a.pdf},
url = {https://proceedings.mlr.press/v139/iqbal21a.html},
abstract = {Multi-agent settings in the real world often involve tasks with varying types and quantities of agents and non-agent entities; however, common patterns of behavior often emerge among these agents/entities. Our method aims to leverage these commonalities by asking the question: “What is the expected utility of each agent when only considering a randomly selected sub-group of its observed entities?” By posing this counterfactual question, we can recognize state-action trajectories within sub-groups of entities that we may have encountered in another task and use what we learned in that task to inform our prediction in the current one. We then reconstruct a prediction of the full returns as a combination of factors considering these disjoint groups of entities and train this “randomly factorized" value function as an auxiliary objective for value-based multi-agent reinforcement learning. By doing so, our model can recognize and leverage similarities across tasks to improve learning efficiency in a multi-task setting. Our approach, Randomized Entity-wise Factorization for Imagined Learning (REFIL), outperforms all strong baselines by a significant margin in challenging multi-task StarCraft micromanagement settings.}
}
@InProceedings{pmlr-v139-ishfaq21a,
title = {Randomized Exploration in Reinforcement Learning with General Value Function Approximation},
author = {Ishfaq, Haque and Cui, Qiwen and Nguyen, Viet and Ayoub, Alex and Yang, Zhuoran and Wang, Zhaoran and Precup, Doina and Yang, Lin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4607--4616},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ishfaq21a/ishfaq21a.pdf},
url = {https://proceedings.mlr.press/v139/ishfaq21a.html},
abstract = {We propose a model-free reinforcement learning algorithm inspired by the popular randomized least squares value iteration (RLSVI) algorithm as well as the optimism principle. Unlike existing upper-confidence-bound (UCB) based approaches, which are often computationally intractable, our algorithm drives exploration by simply perturbing the training data with judiciously chosen i.i.d. scalar noises. To attain optimistic value function estimation without resorting to a UCB-style bonus, we introduce an optimistic reward sampling procedure. When the value functions can be represented by a function class $\mathcal{F}$, our algorithm achieves a worst-case regret bound of $\tilde{O}(\mathrm{poly}(d_EH)\sqrt{T})$ where $T$ is the time elapsed, $H$ is the planning horizon and $d_E$ is the \emph{eluder dimension} of $\mathcal{F}$. In the linear setting, our algorithm reduces to LSVI-PHE, a variant of RLSVI, that enjoys an $\tilde{\mathcal{O}}(\sqrt{d^3H^3T})$ regret. We complement the theory with an empirical evaluation across known difficult exploration tasks.}
}
@InProceedings{pmlr-v139-islamov21a,
title = {Distributed Second Order Methods with Fast Rates and Compressed Communication},
author = {Islamov, Rustem and Qian, Xun and Richtarik, Peter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4617--4628},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/islamov21a/islamov21a.pdf},
url = {https://proceedings.mlr.press/v139/islamov21a.html},
abstract = {We develop several new communication-efficient second-order methods for distributed optimization. Our first method, NEWTON-STAR, is a variant of Newton’s method from which it inherits its fast local quadratic rate. However, unlike Newton’s method, NEWTON-STAR enjoys the same per iteration communication cost as gradient descent. While this method is impractical as it relies on the use of certain unknown parameters characterizing the Hessian of the objective function at the optimum, it serves as the starting point which enables us to design practical variants thereof with strong theoretical guarantees. In particular, we design a stochastic sparsification strategy for learning the unknown parameters in an iterative fashion in a communication efficient manner. Applying this strategy to NEWTON-STAR leads to our next method, NEWTON-LEARN, for which we prove local linear and superlinear rates independent of the condition number. When applicable, this method can have dramatically superior convergence behavior when compared to state-of-the-art methods. Finally, we develop a globalization strategy using cubic regularization which leads to our next method, CUBIC-NEWTON-LEARN, for which we prove global sublinear and linear convergence rates, and a fast superlinear rate. Our results are supported with experimental results on real datasets, and show several orders of magnitude improvement on baseline and state-of-the-art methods in terms of communication complexity.}
}
@InProceedings{pmlr-v139-izmailov21a,
title = {What Are Bayesian Neural Network Posteriors Really Like?},
author = {Izmailov, Pavel and Vikram, Sharad and Hoffman, Matthew D and Wilson, Andrew Gordon Gordon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4629--4640},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/izmailov21a/izmailov21a.pdf},
url = {https://proceedings.mlr.press/v139/izmailov21a.html},
abstract = {The posterior over Bayesian neural network (BNN) parameters is extremely high-dimensional and non-convex. For computational reasons, researchers approximate this posterior using inexpensive mini-batch methods such as mean-field variational inference or stochastic-gradient Markov chain Monte Carlo (SGMCMC). To investigate foundational questions in Bayesian deep learning, we instead use full batch Hamiltonian Monte Carlo (HMC) on modern architectures. We show that (1) BNNs can achieve significant performance gains over standard training and deep ensembles; (2) a single long HMC chain can provide a comparable representation of the posterior to multiple shorter chains; (3) in contrast to recent studies, we find posterior tempering is not needed for near-optimal performance, with little evidence for a “cold posterior” effect, which we show is largely an artifact of data augmentation; (4) BMA performance is robust to the choice of prior scale, and relatively similar for diagonal Gaussian, mixture of Gaussian, and logistic priors; (5) Bayesian neural networks show surprisingly poor generalization under domain shift; (6) while cheaper alternatives such as deep ensembles and SGMCMC can provide good generalization, their predictive distributions are distinct from HMC. Notably, deep ensemble predictive distributions are similarly close to HMC as standard SGLD, and closer than standard variational inference.}
}
@InProceedings{pmlr-v139-izzo21a,
title = {How to Learn when Data Reacts to Your Model: Performative Gradient Descent},
author = {Izzo, Zachary and Ying, Lexing and Zou, James},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4641--4650},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/izzo21a/izzo21a.pdf},
url = {https://proceedings.mlr.press/v139/izzo21a.html},
abstract = {Performative distribution shift captures the setting where the choice of which ML model is deployed changes the data distribution. For example, a bank which uses the number of open credit lines to determine a customer’s risk of default on a loan may induce customers to open more credit lines in order to improve their chances of being approved. Because of the interactions between the model and data distribution, finding the optimal model parameters is challenging. Works in this area have focused on finding stable points, which can be far from optimal. Here we introduce \emph{performative gradient descent} (PerfGD), an algorithm for computing performatively optimal points. Under regularity assumptions on the performative loss, PerfGD is the first algorithm which provably converges to an optimal point. PerfGD explicitly captures how changes in the model affects the data distribution and is simple to use. We support our findings with theory and experiments.}
}
@InProceedings{pmlr-v139-jaegle21a,
title = {Perceiver: General Perception with Iterative Attention},
author = {Jaegle, Andrew and Gimeno, Felix and Brock, Andy and Vinyals, Oriol and Zisserman, Andrew and Carreira, Joao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4651--4664},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jaegle21a/jaegle21a.pdf},
url = {https://proceedings.mlr.press/v139/jaegle21a.html},
abstract = {Biological systems understand the world by simultaneously processing high-dimensional inputs from modalities as diverse as vision, audition, touch, proprioception, etc. The perception models used in deep learning on the other hand are designed for individual modalities, often relying on domain-specific assumptions such as the local grid structures exploited by virtually all existing vision models. These priors introduce helpful inductive biases, but also lock models to individual modalities. In this paper we introduce the Perceiver {–} a model that builds upon Transformers and hence makes few architectural assumptions about the relationship between its inputs, but that also scales to hundreds of thousands of inputs, like ConvNets. The model leverages an asymmetric attention mechanism to iteratively distill inputs into a tight latent bottleneck, allowing it to scale to handle very large inputs. We show that this architecture is competitive with or outperforms strong, specialized models on classification tasks across various modalities: images, point clouds, audio, video and video+audio. The Perceiver obtains performance comparable to ResNet-50 and ViT on ImageNet without 2D convolutions by directly attending to 50,000 pixels. It is also competitive in all modalities in AudioSet.}
}
@InProceedings{pmlr-v139-jaegle21b,
title = {Imitation by Predicting Observations},
author = {Jaegle, Andrew and Sulsky, Yury and Ahuja, Arun and Bruce, Jake and Fergus, Rob and Wayne, Greg},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4665--4676},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jaegle21b/jaegle21b.pdf},
url = {https://proceedings.mlr.press/v139/jaegle21b.html},
abstract = {Imitation learning enables agents to reuse and adapt the hard-won expertise of others, offering a solution to several key challenges in learning behavior. Although it is easy to observe behavior in the real-world, the underlying actions may not be accessible. We present a new method for imitation solely from observations that achieves comparable performance to experts on challenging continuous control tasks while also exhibiting robustness in the presence of observations unrelated to the task. Our method, which we call FORM (for "Future Observation Reward Model") is derived from an inverse RL objective and imitates using a model of expert behavior learned by generative modelling of the expert’s observations, without needing ground truth actions. We show that FORM performs comparably to a strong baseline IRL method (GAIL) on the DeepMind Control Suite benchmark, while outperforming GAIL in the presence of task-irrelevant features.}
}
@InProceedings{pmlr-v139-jafarov21a,
title = {Local Correlation Clustering with Asymmetric Classification Errors},
author = {Jafarov, Jafar and Kalhan, Sanchit and Makarychev, Konstantin and Makarychev, Yury},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4677--4686},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jafarov21a/jafarov21a.pdf},
url = {https://proceedings.mlr.press/v139/jafarov21a.html},
abstract = {In the Correlation Clustering problem, we are given a complete weighted graph $G$ with its edges labeled as “similar" and “dissimilar" by a noisy binary classifier. For a clustering $\mathcal{C}$ of graph $G$, a similar edge is in disagreement with $\mathcal{C}$, if its endpoints belong to distinct clusters; and a dissimilar edge is in disagreement with $\mathcal{C}$ if its endpoints belong to the same cluster. The disagreements vector, $\disagree$, is a vector indexed by the vertices of $G$ such that the $v$-th coordinate $\disagree_v$ equals the weight of all disagreeing edges incident on $v$. The goal is to produce a clustering that minimizes the $\ell_p$ norm of the disagreements vector for $p\geq 1$. We study the $\ell_p$ objective in Correlation Clustering under the following assumption: Every similar edge has weight in $[\alpha\mathbf{w},\mathbf{w}]$ and every dissimilar edge has weight at least $\alpha\mathbf{w}$ (where $\alpha \leq 1$ and $\mathbf{w}>0$ is a scaling parameter). We give an $O\left((\nicefrac{1}{\alpha})^{\nicefrac{1}{2}-\nicefrac{1}{2p}}\cdot \log\nicefrac{1}{\alpha}\right)$ approximation algorithm for this problem. Furthermore, we show an almost matching convex programming integrality gap.}
}
@InProceedings{pmlr-v139-jagadeesan21a,
title = {Alternative Microfoundations for Strategic Classification},
author = {Jagadeesan, Meena and Mendler-D{\"u}nner, Celestine and Hardt, Moritz},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4687--4697},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jagadeesan21a/jagadeesan21a.pdf},
url = {https://proceedings.mlr.press/v139/jagadeesan21a.html},
abstract = {When reasoning about strategic behavior in a machine learning context it is tempting to combine standard microfoundations of rational agents with the statistical decision theory underlying classification. In this work, we argue that a direct combination of these ingredients leads to brittle solution concepts of limited descriptive and prescriptive value. First, we show that rational agents with perfect information produce discontinuities in the aggregate response to a decision rule that we often do not observe empirically. Second, when any positive fraction of agents is not perfectly strategic, desirable stable points—where the classifier is optimal for the data it entails—no longer exist. Third, optimal decision rules under standard microfoundations maximize a measure of negative externality known as social burden within a broad class of assumptions about agent behavior. Recognizing these limitations we explore alternatives to standard microfoundations for binary classification. We describe desiderata that help navigate the space of possible assumptions about agent responses, and we then propose the noisy response model. Inspired by smoothed analysis and empirical observations, noisy response incorporates imperfection in the agent responses, which we show mitigates the limitations of standard microfoundations. Our model retains analytical tractability, leads to more robust insights about stable points, and imposes a lower social burden at optimality.}
}
@InProceedings{pmlr-v139-jain21a,
title = {Robust Density Estimation from Batches: The Best Things in Life are (Nearly) Free},
author = {Jain, Ayush and Orlitsky, Alon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4698--4708},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jain21a/jain21a.pdf},
url = {https://proceedings.mlr.press/v139/jain21a.html},
abstract = {In many applications data are collected in batches, some potentially biased, corrupt, or even adversarial. Learning algorithms for this setting have therefore garnered considerable recent attention. In particular, a sequence of works has shown that all approximately piecewise polynomial distributions—and in particular all Gaussian, Gaussian-mixture, log-concave, low-modal, and monotone-hazard distributions—can be learned robustly in polynomial time. However, these results left open the question, stated explicitly in \cite{chen2020learning}, about the best possible sample complexity of such algorithms. We answer this question, showing that, perhaps surprisingly, up to logarithmic factors, the optimal sample complexity is the same as for genuine, non-adversarial, data! To establish the result, we reduce robust learning of approximately piecewise polynomial distributions to robust learning of the probability of all subsets of size at most $k$ of a larger discrete domain, and learn these probabilities in optimal sample complexity linear in $k$ regardless of the domain size. In simulations, the algorithm runs very quickly and estimates distributions to essentially the accuracy achieved when all adversarial batches are removed. The results also imply the first polynomial-time sample-optimal algorithm for robust interval-based classification based on batched data.}
}
@InProceedings{pmlr-v139-jalal21a,
title = {Instance-Optimal Compressed Sensing via Posterior Sampling},
author = {Jalal, Ajil and Karmalkar, Sushrut and Dimakis, Alex and Price, Eric},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4709--4720},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jalal21a/jalal21a.pdf},
url = {https://proceedings.mlr.press/v139/jalal21a.html},
abstract = {We characterize the measurement complexity of compressed sensing of signals drawn from a known prior distribution, even when the support of the prior is the entire space (rather than, say, sparse vectors). We show for Gaussian measurements and \emph{any} prior distribution on the signal, that the posterior sampling estimator achieves near-optimal recovery guarantees. Moreover, this result is robust to model mismatch, as long as the distribution estimate (e.g., from an invertible generative model) is close to the true distribution in Wasserstein distance. We implement the posterior sampling estimator for deep generative priors using Langevin dynamics, and empirically find that it produces accurate estimates with more diversity than MAP.}
}
@InProceedings{pmlr-v139-jalal21b,
title = {Fairness for Image Generation with Uncertain Sensitive Attributes},
author = {Jalal, Ajil and Karmalkar, Sushrut and Hoffmann, Jessica and Dimakis, Alex and Price, Eric},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4721--4732},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jalal21b/jalal21b.pdf},
url = {https://proceedings.mlr.press/v139/jalal21b.html},
abstract = {This work tackles the issue of fairness in the context of generative procedures, such as image super-resolution, which entail different definitions from the standard classification setting. Moreover, while traditional group fairness definitions are typically defined with respect to specified protected groups – camouflaging the fact that these groupings are artificial and carry historical and political motivations – we emphasize that there are no ground truth identities. For instance, should South and East Asians be viewed as a single group or separate groups? Should we consider one race as a whole or further split by gender? Choosing which groups are valid and who belongs in them is an impossible dilemma and being “fair” with respect to Asians may require being “unfair” with respect to South Asians. This motivates the introduction of definitions that allow algorithms to be \emph{oblivious} to the relevant groupings. We define several intuitive notions of group fairness and study their incompatibilities and trade-offs. We show that the natural extension of demographic parity is strongly dependent on the grouping, and \emph{impossible} to achieve obliviously. On the other hand, the conceptually new definition we introduce, Conditional Proportional Representation, can be achieved obliviously through Posterior Sampling. Our experiments validate our theoretical results and achieve fair image reconstruction using state-of-the-art generative models.}
}
@InProceedings{pmlr-v139-jalalzai21a,
title = {Feature Clustering for Support Identification in Extreme Regions},
author = {Jalalzai, Hamid and Leluc, R{\'e}mi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4733--4743},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jalalzai21a/jalalzai21a.pdf},
url = {https://proceedings.mlr.press/v139/jalalzai21a.html},
abstract = {Understanding the complex structure of multivariate extremes is a major challenge in various fields from portfolio monitoring and environmental risk management to insurance. In the framework of multivariate Extreme Value Theory, a common characterization of extremes’ dependence structure is the angular measure. It is a suitable measure to work in extreme regions as it provides meaningful insights concerning the subregions where extremes tend to concentrate their mass. The present paper develops a novel optimization-based approach to assess the dependence structure of extremes. This support identification scheme rewrites as estimating clusters of features which best capture the support of extremes. The dimension reduction technique we provide is applied to statistical learning tasks such as feature clustering and anomaly detection. Numerical experiments provide strong empirical evidence of the relevance of our approach.}
}
@InProceedings{pmlr-v139-jang21a,
title = {Improved Regret Bounds of Bilinear Bandits using Action Space Analysis},
author = {Jang, Kyoungseok and Jun, Kwang-Sung and Yun, Se-Young and Kang, Wanmo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4744--4754},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jang21a/jang21a.pdf},
url = {https://proceedings.mlr.press/v139/jang21a.html},
abstract = {We consider the bilinear bandit problem where the learner chooses a pair of arms, each from two different action spaces of dimension $d_1$ and $d_2$, respectively. The learner then receives a reward whose expectation is a bilinear function of the two chosen arms with an unknown matrix parameter $\Theta^*\in\mathbb{R}^{d_1 \times d_2}$ with rank $r$. Despite abundant applications such as drug discovery, the optimal regret rate is unknown for this problem, though it was conjectured to be $\tilde O(\sqrt{d_1d_2(d_1+d_2)r T})$ by Jun et al. (2019) where $\tilde O$ ignores polylogarithmic factors in $T$. In this paper, we make progress towards closing the gap between the upper and lower bound on the optimal regret. First, we reject the conjecture above by proposing algorithms that achieve the regret $\tilde O(\sqrt{d_1 d_2 (d_1+d_2) T})$ using the fact that the action space dimension $O(d_1+d_2)$ is significantly lower than the matrix parameter dimension $O(d_1 d_2)$. Second, we additionally devise an algorithm with better empirical performance than previous algorithms.}
}
@InProceedings{pmlr-v139-jarrett21a,
title = {Inverse Decision Modeling: Learning Interpretable Representations of Behavior},
author = {Jarrett, Daniel and H{\"u}y{\"u}k, Alihan and Van Der Schaar, Mihaela},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4755--4771},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jarrett21a/jarrett21a.pdf},
url = {https://proceedings.mlr.press/v139/jarrett21a.html},
abstract = {Decision analysis deals with modeling and enhancing decision processes. A principal challenge in improving behavior is in obtaining a transparent *description* of existing behavior in the first place. In this paper, we develop an expressive, unifying perspective on *inverse decision modeling*: a framework for learning parameterized representations of sequential decision behavior. First, we formalize the *forward* problem (as a normative standard), subsuming common classes of control behavior. Second, we use this to formalize the *inverse* problem (as a descriptive model), generalizing existing work on imitation/reward learning—while opening up a much broader class of research problems in behavior representation. Finally, we instantiate this approach with an example (*inverse bounded rational control*), illustrating how this structure enables learning (interpretable) representations of (bounded) rationality—while naturally capturing intuitive notions of suboptimal actions, biased beliefs, and imperfect knowledge of environments.}
}
@InProceedings{pmlr-v139-jastrzebski21a,
title = {Catastrophic Fisher Explosion: Early Phase Fisher Matrix Impacts Generalization},
author = {Jastrzebski, Stanislaw and Arpit, Devansh and Astrand, Oliver and Kerg, Giancarlo B and Wang, Huan and Xiong, Caiming and Socher, Richard and Cho, Kyunghyun and Geras, Krzysztof J},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4772--4784},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jastrzebski21a/jastrzebski21a.pdf},
url = {https://proceedings.mlr.press/v139/jastrzebski21a.html},
abstract = {The early phase of training a deep neural network has a dramatic effect on the local curvature of the loss function. For instance, using a small learning rate does not guarantee stable optimization because the optimization trajectory has a tendency to steer towards regions of the loss surface with increasing local curvature. We ask whether this tendency is connected to the widely observed phenomenon that the choice of the learning rate strongly influences generalization. We first show that stochastic gradient descent (SGD) implicitly penalizes the trace of the Fisher Information Matrix (FIM), a measure of the local curvature, from the start of training. We argue it is an implicit regularizer in SGD by showing that explicitly penalizing the trace of the FIM can significantly improve generalization. We highlight that poor final generalization coincides with the trace of the FIM attaining a large value early in training, to which we refer as catastrophic Fisher explosion. Finally, to gain insight into the regularization effect of penalizing the trace of the FIM, we show that it limits memorization by reducing the learning speed of examples with noisy labels more than that of the examples with clean labels.}
}
@InProceedings{pmlr-v139-javed21a,
title = {Policy Gradient Bayesian Robust Optimization for Imitation Learning},
author = {Javed, Zaynah and Brown, Daniel S and Sharma, Satvik and Zhu, Jerry and Balakrishna, Ashwin and Petrik, Marek and Dragan, Anca and Goldberg, Ken},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4785--4796},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/javed21a/javed21a.pdf},
url = {https://proceedings.mlr.press/v139/javed21a.html},
abstract = {The difficulty in specifying rewards for many real-world problems has led to an increased focus on learning rewards from human feedback, such as demonstrations. However, there are often many different reward functions that explain the human feedback, leaving agents with uncertainty over what the true reward function is. While most policy optimization approaches handle this uncertainty by optimizing for expected performance, many applications demand risk-averse behavior. We derive a novel policy gradient-style robust optimization approach, PG-BROIL, that optimizes a soft-robust objective that balances expected performance and risk. To the best of our knowledge, PG-BROIL is the first policy optimization algorithm robust to a distribution of reward hypotheses which can scale to continuous MDPs. Results suggest that PG-BROIL can produce a family of behaviors ranging from risk-neutral to risk-averse and outperforms state-of-the-art imitation learning algorithms when learning from ambiguous demonstrations by hedging against uncertainty, rather than seeking to uniquely identify the demonstrator’s reward function.}
}
@InProceedings{pmlr-v139-jayaram21a,
title = {In-Database Regression in Input Sparsity Time},
author = {Jayaram, Rajesh and Samadian, Alireza and Woodruff, David and Ye, Peng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4797--4806},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jayaram21a/jayaram21a.pdf},
url = {https://proceedings.mlr.press/v139/jayaram21a.html},
abstract = {Sketching is a powerful dimensionality reduction technique for accelerating algorithms for data analysis. A crucial step in sketching methods is to compute a subspace embedding (SE) for a large matrix $A \in \mathbb{R}^{N \times d}$. SE’s are the primary tool for obtaining extremely efficient solutions for many linear-algebraic tasks, such as least squares regression and low rank approximation. Computing an SE often requires an explicit representation of $A$ and running time proportional to the size of $A$. However, if $A= T_1 \Join T_2 \Join …\Join T_m$ is the result of a database join query on several smaller tables $T_i \in \mathbb{R}^{n_i \times d_i}$, then this running time can be prohibitive, as $A$ itself can have as many as $O(n_1 n_2 \cdots n_m)$ rows. In this work, we design subspace embeddings for database joins which can be computed significantly faster than computing the join. For the case of a two table join $A = T_1 \Join T_2$ we give input-sparsity algorithms for computing subspace embeddings, with running time bounded by the number of non-zero entries in $T_1,T_2$. This results in input-sparsity time algorithms for high accuracy regression, significantly improving upon the running time of prior FAQ-based methods for regression. We extend our results to arbitrary joins for the ridge regression problem, also considerably improving the running time of prior methods. Empirically, we apply our method to real datasets and show that it is significantly faster than existing algorithms.}
}
@InProceedings{pmlr-v139-jayaram21b,
title = {Parallel and Flexible Sampling from Autoregressive Models via Langevin Dynamics},
author = {Jayaram, Vivek and Thickstun, John},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4807--4818},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jayaram21b/jayaram21b.pdf},
url = {https://proceedings.mlr.press/v139/jayaram21b.html},
abstract = {This paper introduces an alternative approach to sampling from autoregressive models. Autoregressive models are typically sampled sequentially, according to the transition dynamics defined by the model. Instead, we propose a sampling procedure that initializes a sequence with white noise and follows a Markov chain defined by Langevin dynamics on the global log-likelihood of the sequence. This approach parallelizes the sampling process and generalizes to conditional sampling. Using an autoregressive model as a Bayesian prior, we can steer the output of a generative model using a conditional likelihood or constraints. We apply these techniques to autoregressive models in the visual and audio domains, with competitive results for audio source separation, super-resolution, and inpainting.}
}
@InProceedings{pmlr-v139-jeong21a,
title = {Objective Bound Conditional Gaussian Process for Bayesian Optimization},
author = {Jeong, Taewon and Kim, Heeyoung},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4819--4828},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jeong21a/jeong21a.pdf},
url = {https://proceedings.mlr.press/v139/jeong21a.html},
abstract = {A Gaussian process is a standard surrogate model for an unknown objective function in Bayesian optimization. In this paper, we propose a new surrogate model, called the objective bound conditional Gaussian process (OBCGP), to condition a Gaussian process on a bound on the optimal function value. The bound is obtained and updated as the best observed value during the sequential optimization procedure. Unlike the standard Gaussian process, the OBCGP explicitly incorporates the existence of a point that improves the best known bound. We treat the location of such a point as a model parameter and estimate it jointly with other parameters by maximizing the likelihood using variational inference. Within the standard Bayesian optimization framework, the OBCGP can be combined with various acquisition functions to select the next query point. In particular, we derive cumulative regret bounds for the OBCGP combined with the upper confidence bound acquisition algorithm. Furthermore, the OBCGP can inherently incorporate a new type of prior knowledge, i.e., the bounds on the optimum, if it is available. The incorporation of this type of prior knowledge into a surrogate model has not been studied previously. We demonstrate the effectiveness of the OBCGP through its application to Bayesian optimization tasks, such as the sequential design of experiments and hyperparameter optimization in neural networks.}
}
@InProceedings{pmlr-v139-jesson21a,
title = {Quantifying Ignorance in Individual-Level Causal-Effect Estimates under Hidden Confounding},
author = {Jesson, Andrew and Mindermann, S{\"o}ren and Gal, Yarin and Shalit, Uri},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4829--4838},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jesson21a/jesson21a.pdf},
url = {https://proceedings.mlr.press/v139/jesson21a.html},
abstract = {We study the problem of learning conditional average treatment effects (CATE) from high-dimensional, observational data with unobserved confounders. Unobserved confounders introduce ignorance—a level of unidentifiability—about an individual’s response to treatment by inducing bias in CATE estimates. We present a new parametric interval estimator suited for high-dimensional data, that estimates a range of possible CATE values when given a predefined bound on the level of hidden confounding. Further, previous interval estimators do not account for ignorance about the CATE associated with samples that may be underrepresented in the original study, or samples that violate the overlap assumption. Our interval estimator also incorporates model uncertainty so that practitioners can be made aware of such out-of-distribution data. We prove that our estimator converges to tight bounds on CATE when there may be unobserved confounding and assess it using semi-synthetic, high-dimensional datasets.}
}
@InProceedings{pmlr-v139-jha21a,
title = {DeepReDuce: ReLU Reduction for Fast Private Inference},
author = {Jha, Nandan Kumar and Ghodsi, Zahra and Garg, Siddharth and Reagen, Brandon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4839--4849},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jha21a/jha21a.pdf},
url = {https://proceedings.mlr.press/v139/jha21a.html},
abstract = {The recent rise of privacy concerns has led researchers to devise methods for private neural inference—where inferences are made directly on encrypted data, never seeing inputs. The primary challenge facing private inference is that computing on encrypted data levies an impractically-high latency penalty, stemming mostly from non-linear operators like ReLU. Enabling practical and private inference requires new optimization methods that minimize network ReLU counts while preserving accuracy. This paper proposes DeepReDuce: a set of optimizations for the judicious removal of ReLUs to reduce private inference latency. The key insight is that not all ReLUs contribute equally to accuracy. We leverage this insight to drop, or remove, ReLUs from classic networks to significantly reduce inference latency and maintain high accuracy. Given a network architecture, DeepReDuce outputs a Pareto frontier of networks that tradeoff the number of ReLUs and accuracy. Compared to the state-of-the-art for private inference DeepReDuce improves accuracy and reduces ReLU count by up to 3.5% (iso-ReLU count) and 3.5x (iso-accuracy), respectively.}
}
@InProceedings{pmlr-v139-jha21b,
title = {Factor-analytic inverse regression for high-dimension, small-sample dimensionality reduction},
author = {Jha, Aditi and Morais, Michael J. and Pillow, Jonathan W},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4850--4859},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jha21b/jha21b.pdf},
url = {https://proceedings.mlr.press/v139/jha21b.html},
abstract = {Sufficient dimension reduction (SDR) methods are a family of supervised methods for dimensionality reduction that seek to reduce dimensionality while preserving information about a target variable of interest. However, existing SDR methods typically require more observations than the number of dimensions ($N > p$). To overcome this limitation, we propose Class-conditional Factor Analytic Dimensions (CFAD), a model-based dimensionality reduction method for high-dimensional, small-sample data. We show that CFAD substantially outperforms existing SDR methods in the small-sample regime, and can be extended to incorporate prior information such as smoothness in the projection axes. We demonstrate the effectiveness of CFAD with an application to functional magnetic resonance imaging (fMRI) measurements during visual object recognition and working memory tasks, where it outperforms existing SDR and a variety of other dimensionality-reduction methods.}
}
@InProceedings{pmlr-v139-ji21a,
title = {Fast margin maximization via dual acceleration},
author = {Ji, Ziwei and Srebro, Nathan and Telgarsky, Matus},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4860--4869},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ji21a/ji21a.pdf},
url = {https://proceedings.mlr.press/v139/ji21a.html},
abstract = {We present and analyze a momentum-based gradient method for training linear classifiers with an exponentially-tailed loss (e.g., the exponential or logistic loss), which maximizes the classification margin on separable data at a rate of O(1/t^2). This contrasts with a rate of O(1/log(t)) for standard gradient descent, and O(1/t) for normalized gradient descent. The momentum-based method is derived via the convex dual of the maximum-margin problem, and specifically by applying Nesterov acceleration to this dual, which manages to result in a simple and intuitive method in the primal. This dual view can also be used to derive a stochastic variant, which performs adaptive non-uniform sampling via the dual variables.}
}
@InProceedings{pmlr-v139-ji21b,
title = {Marginalized Stochastic Natural Gradients for Black-Box Variational Inference},
author = {Ji, Geng and Sujono, Debora and Sudderth, Erik B},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4870--4881},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ji21b/ji21b.pdf},
url = {https://proceedings.mlr.press/v139/ji21b.html},
abstract = {Black-box variational inference algorithms use stochastic sampling to analyze diverse statistical models, like those expressed in probabilistic programming languages, without model-specific derivations. While the popular score-function estimator computes unbiased gradient estimates, its variance is often unacceptably large, especially in models with discrete latent variables. We propose a stochastic natural gradient estimator that is as broadly applicable and unbiased, but improves efficiency by exploiting the curvature of the variational bound, and provably reduces variance by marginalizing discrete latent variables. Our marginalized stochastic natural gradients have intriguing connections to classic coordinate ascent variational inference, but allow parallel updates of variational parameters, and provide superior convergence guarantees relative to naive Monte Carlo approximations. We integrate our method with the probabilistic programming language Pyro and evaluate real-world models of documents, images, networks, and crowd-sourcing. Compared to score-function estimators, we require far fewer Monte Carlo samples and consistently convergence orders of magnitude faster.}
}
@InProceedings{pmlr-v139-ji21c,
title = {Bilevel Optimization: Convergence Analysis and Enhanced Design},
author = {Ji, Kaiyi and Yang, Junjie and Liang, Yingbin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4882--4892},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ji21c/ji21c.pdf},
url = {https://proceedings.mlr.press/v139/ji21c.html},
abstract = {Bilevel optimization has arisen as a powerful tool for many machine learning problems such as meta-learning, hyperparameter optimization, and reinforcement learning. In this paper, we investigate the nonconvex-strongly-convex bilevel optimization problem. For deterministic bilevel optimization, we provide a comprehensive convergence rate analysis for two popular algorithms respectively based on approximate implicit differentiation (AID) and iterative differentiation (ITD). For the AID-based method, we orderwisely improve the previous convergence rate analysis due to a more practical parameter selection as well as a warm start strategy, and for the ITD-based method we establish the first theoretical convergence rate. Our analysis also provides a quantitative comparison between ITD and AID based approaches. For stochastic bilevel optimization, we propose a novel algorithm named stocBiO, which features a sample-efficient hypergradient estimator using efficient Jacobian- and Hessian-vector product computations. We provide the convergence rate guarantee for stocBiO, and show that stocBiO outperforms the best known computational complexities orderwisely with respect to the condition number $\kappa$ and the target accuracy $\epsilon$. We further validate our theoretical results and demonstrate the efficiency of bilevel optimization algorithms by the experiments on meta-learning and hyperparameter optimization.}
}
@InProceedings{pmlr-v139-jia21a,
title = {Efficient Statistical Tests: A Neural Tangent Kernel Approach},
author = {Jia, Sheng and Nezhadarya, Ehsan and Wu, Yuhuai and Ba, Jimmy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4893--4903},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jia21a/jia21a.pdf},
url = {https://proceedings.mlr.press/v139/jia21a.html},
abstract = {For machine learning models to make reliable predictions in deployment, one needs to ensure the previously unknown test samples need to be sufficiently similar to the training data. The commonly used shift-invariant kernels do not have the compositionality and fail to capture invariances in high-dimensional data in computer vision. We propose a shift-invariant convolutional neural tangent kernel (SCNTK) based outlier detector and two-sample tests with maximum mean discrepancy (MMD) that is O(n) in the number of samples due to using the random feature approximation. On MNIST and CIFAR10 with various types of dataset shifts, we empirically show that statistical tests with such compositional kernels, inherited from infinitely wide neural networks, achieve higher detection accuracy than existing non-parametric methods. Our method also provides a competitive alternative to adapted kernel methods that require a training phase.}
}
@InProceedings{pmlr-v139-jia21b,
title = {Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision},
author = {Jia, Chao and Yang, Yinfei and Xia, Ye and Chen, Yi-Ting and Parekh, Zarana and Pham, Hieu and Le, Quoc and Sung, Yun-Hsuan and Li, Zhen and Duerig, Tom},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4904--4916},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jia21b/jia21b.pdf},
url = {https://proceedings.mlr.press/v139/jia21b.html},
abstract = {Pre-trained representations are becoming crucial for many NLP and perception tasks. While representation learning in NLP has transitioned to training on raw text without human annotations, visual and vision-language representations still rely heavily on curated training datasets that are expensive or require expert knowledge. For vision applications, representations are mostly learned using datasets with explicit class labels such as ImageNet or OpenImages. For vision-language, popular datasets like Conceptual Captions, MSCOCO, or CLIP all involve a non-trivial data collection (and cleaning) process. This costly curation process limits the size of datasets and hence hinders the scaling of trained models. In this paper, we leverage a noisy dataset of over one billion image alt-text pairs, obtained without expensive filtering or post-processing steps in the Conceptual Captions dataset. A simple dual-encoder architecture learns to align visual and language representations of the image and text pairs using a contrastive loss. We show that the scale of our corpus can make up for its noise and leads to state-of-the-art representations even with such a simple learning scheme. Our visual representation achieves strong performance when transferred to classification tasks such as ImageNet and VTAB. The aligned visual and language representations enables zero-shot image classification and also set new state-of-the-art results on Flickr30K and MSCOCO image-text retrieval benchmarks, even when compared with more sophisticated cross-attention models. The representations also enable cross-modality search with complex text and text + image queries.}
}
@InProceedings{pmlr-v139-jia21c,
title = {Multi-Dimensional Classification via Sparse Label Encoding},
author = {Jia, Bin-Bin and Zhang, Min-Ling},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4917--4926},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jia21c/jia21c.pdf},
url = {https://proceedings.mlr.press/v139/jia21c.html},
abstract = {In multi-dimensional classification (MDC), there are multiple class variables in the output space with each of them corresponding to one heterogeneous class space. Due to the heterogeneity of class spaces, it is quite challenging to consider the dependencies among class variables when learning from MDC examples. In this paper, we propose a novel MDC approach named SLEM which learns the predictive model in an encoded label space instead of the original heterogeneous one. Specifically, SLEM works in an encoding-training-decoding framework. In the encoding phase, each class vector is mapped into a real-valued one via three cascaded operations including pairwise grouping, one-hot conversion and sparse linear encoding. In the training phase, a multi-output regression model is learned within the encoded label space. In the decoding phase, the predicted class vector is obtained by adapting orthogonal matching pursuit over outputs of the learned multi-output regression model. Experimental results clearly validate the superiority of SLEM against state-of-the-art MDC approaches.}
}
@InProceedings{pmlr-v139-jiang21a,
title = {Self-Damaging Contrastive Learning},
author = {Jiang, Ziyu and Chen, Tianlong and Mortazavi, Bobak J and Wang, Zhangyang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4927--4939},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21a/jiang21a.pdf},
url = {https://proceedings.mlr.press/v139/jiang21a.html},
abstract = {The recent breakthrough achieved by contrastive learning accelerates the pace for deploying unsupervised training on real-world data applications. However, unlabeled data in reality is commonly imbalanced and shows a long-tail distribution, and it is unclear how robustly the latest contrastive learning methods could perform in the practical scenario. This paper proposes to explicitly tackle this challenge, via a principled framework called Self-Damaging Contrastive Learning (SDCLR), to automatically balance the representation learning without knowing the classes. Our main inspiration is drawn from the recent finding that deep models have difficult-to-memorize samples, and those may be exposed through network pruning. It is further natural to hypothesize that long-tail samples are also tougher for the model to learn well due to insufficient examples. Hence, the key innovation in SDCLR is to create a dynamic self-competitor model to contrast with the target model, which is a pruned version of the latter. During training, contrasting the two models will lead to adaptive online mining of the most easily forgotten samples for the current target model, and implicitly emphasize them more in the contrastive loss. Extensive experiments across multiple datasets and imbalance settings show that SDCLR significantly improves not only overall accuracies but also balancedness, in terms of linear evaluation on the full-shot and few-shot settings. Our code is available at https://github.com/VITA-Group/SDCLR.}
}
@InProceedings{pmlr-v139-jiang21b,
title = {Prioritized Level Replay},
author = {Jiang, Minqi and Grefenstette, Edward and Rockt{\"a}schel, Tim},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4940--4950},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21b/jiang21b.pdf},
url = {https://proceedings.mlr.press/v139/jiang21b.html},
abstract = {Environments with procedurally generated content serve as important benchmarks for testing systematic generalization in deep reinforcement learning. In this setting, each level is an algorithmically created environment instance with a unique configuration of its factors of variation. Training on a prespecified subset of levels allows for testing generalization to unseen levels. What can be learned from a level depends on the current policy, yet prior work defaults to uniform sampling of training levels independently of the policy. We introduce Prioritized Level Replay (PLR), a general framework for selectively sampling the next training level by prioritizing those with higher estimated learning potential when revisited in the future. We show TD-errors effectively estimate a level’s future learning potential and, when used to guide the sampling procedure, induce an emergent curriculum of increasingly difficult levels. By adapting the sampling of training levels, PLR significantly improves sample-efficiency and generalization on Procgen Benchmark—matching the previous state-of-the-art in test return—and readily combines with other methods. Combined with the previous leading method, PLR raises the state-of-the-art to over 76% improvement in test return relative to standard RL baselines.}
}
@InProceedings{pmlr-v139-jiang21c,
title = {Monotonic Robust Policy Optimization with Model Discrepancy},
author = {Jiang, Yuankun and Li, Chenglin and Dai, Wenrui and Zou, Junni and Xiong, Hongkai},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4951--4960},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21c/jiang21c.pdf},
url = {https://proceedings.mlr.press/v139/jiang21c.html},
abstract = {State-of-the-art deep reinforcement learning (DRL) algorithms tend to overfit due to the model discrepancy between source and target environments. Though applying domain randomization during training can improve the average performance by randomly generating a sufficient diversity of environments in simulator, the worst-case environment is still neglected without any performance guarantee. Since the average and worst-case performance are both important for generalization in RL, in this paper, we propose a policy optimization approach for concurrently improving the policy’s performance in the average and worst-case environment. We theoretically derive a lower bound for the worst-case performance of a given policy by relating it to the expected performance. Guided by this lower bound, we formulate an optimization problem to jointly optimize the policy and sampling distribution, and prove that by iteratively solving it the worst-case performance is monotonically improved. We then develop a practical algorithm, named monotonic robust policy optimization (MRPO). Experimental evaluations in several robot control tasks demonstrate that MRPO can generally improve both the average and worst-case performance in the source environments for training, and facilitate in all cases the learned policy with a better generalization capability in some unseen testing environments.}
}
@InProceedings{pmlr-v139-jiang21d,
title = {Approximation Theory of Convolutional Architectures for Time Series Modelling},
author = {Jiang, Haotian and Li, Zhong and Li, Qianxiao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4961--4970},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21d/jiang21d.pdf},
url = {https://proceedings.mlr.press/v139/jiang21d.html},
abstract = {We study the approximation properties of convolutional architectures applied to time series modelling, which can be formulated mathematically as a functional approximation problem. In the recurrent setting, recent results reveal an intricate connection between approximation efficiency and memory structures in the data generation process. In this paper, we derive parallel results for convolutional architectures, with WaveNet being a prime example. Our results reveal that in this new setting, approximation efficiency is not only characterised by memory, but also additional fine structures in the target relationship. This leads to a novel definition of spectrum-based regularity that measures the complexity of temporal relationships under the convolutional approximation scheme. These analyses provide a foundation to understand the differences between architectural choices for time series modelling and can give theoretically grounded guidance for practical applications.}
}
@InProceedings{pmlr-v139-jiang21e,
title = {Streaming and Distributed Algorithms for Robust Column Subset Selection},
author = {Jiang, Shuli and Li, Dennis and Li, Irene Mengze and Mahankali, Arvind V and Woodruff, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4971--4981},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21e/jiang21e.pdf},
url = {https://proceedings.mlr.press/v139/jiang21e.html},
abstract = {We give the first single-pass streaming algorithm for Column Subset Selection with respect to the entrywise $\ell_p$-norm with $1 \leq p < 2$. We study the $\ell_p$ norm loss since it is often considered more robust to noise than the standard Frobenius norm. Given an input matrix $A \in \mathbb{R}^{d \times n}$ ($n \gg d$), our algorithm achieves a multiplicative $k^{\frac{1}{p} - \frac{1}{2}}\poly(\log nd)$-approximation to the error with respect to the \textit{best possible column subset} of size $k$. Furthermore, the space complexity of the streaming algorithm is optimal up to a logarithmic factor. Our streaming algorithm also extends naturally to a 1-round distributed protocol with nearly optimal communication cost. A key ingredient in our algorithms is a reduction to column subset selection in the $\ell_{p,2}$-norm, which corresponds to the $p$-norm of the vector of Euclidean norms of each of the columns of $A$. This enables us to leverage strong coreset constructions for the Euclidean norm, which previously had not been applied in this context. We also give the first provable guarantees for greedy column subset selection in the $\ell_{1, 2}$ norm, which can be used as an alternative, practical subroutine in our algorithms. Finally, we show that our algorithms give significant practical advantages on real-world data analysis tasks.}
}
@InProceedings{pmlr-v139-jiang21f,
title = {Single Pass Entrywise-Transformed Low Rank Approximation},
author = {Jiang, Yifei and Li, Yi and Sun, Yiming and Wang, Jiaxin and Woodruff, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4982--4991},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21f/jiang21f.pdf},
url = {https://proceedings.mlr.press/v139/jiang21f.html},
abstract = {In applications such as natural language processing or computer vision, one is given a large $n \times n$ matrix $A = (a_{i,j})$ and would like to compute a matrix decomposition, e.g., a low rank approximation, of a function $f(A) = (f(a_{i,j}))$ applied entrywise to $A$. A very important special case is the likelihood function $f\left( A \right ) = \log{\left( \left| a_{ij}\right| +1\right)}$. A natural way to do this would be to simply apply $f$ to each entry of $A$, and then compute the matrix decomposition, but this requires storing all of $A$ as well as multiple passes over its entries. Recent work of Liang et al. shows how to find a rank-$k$ factorization to $f(A)$ using only $n \cdot \poly(\eps^{-1}k\log n)$ words of memory, with overall error $10\|f(A)-[f(A)]_k\|_F^2 + \poly(\epsilon/k) \|f(A)\|_{1,2}^2$, where $[f(A)]_k$ is the best rank-$k$ approximation to $f(A)$ and $\|f(A)\|_{1,2}^2$ is the square of the sum of Euclidean lengths of rows of $f(A)$. Their algorithm uses $3$ passes over the entries of $A$. The authors pose the open question of obtaining an algorithm with $n \cdot \poly(\eps^{-1}k\log n)$ words of memory using only a single pass over the entries of $A$. In this paper we resolve this open question, obtaining the first single-pass algorithm for this problem and for the same class of functions $f$ studied by Liang et al. Moreover, our error is $\|f(A)-[f(A)]_k\|_F^2 + \poly(\epsilon/k) \|f(A)\|_F^2$, where $\|f(A)\|_F^2$ is the sum of squares of Euclidean lengths of rows of $f(A)$. Thus our error is significantly smaller, as it removes the factor of $10$ and also $\|f(A)\|_F^2 \leq \|f(A)\|_{1,2}^2$.}
}
@InProceedings{pmlr-v139-jiang21g,
title = {The Emergence of Individuality},
author = {Jiang, Jiechuan and Lu, Zongqing},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4992--5001},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21g/jiang21g.pdf},
url = {https://proceedings.mlr.press/v139/jiang21g.html},
abstract = {Individuality is essential in human society. It induces the division of labor and thus improves the efficiency and productivity. Similarly, it should also be a key to multi-agent cooperation. Inspired by that individuality is of being an individual separate from others, we propose a simple yet efficient method for the emergence of individuality (EOI) in multi-agent reinforcement learning (MARL). EOI learns a probabilistic classifier that predicts a probability distribution over agents given their observation and gives each agent an intrinsic reward of being correctly predicted by the classifier. The intrinsic reward encourages the agents to visit their own familiar observations, and learning the classifier by such observations makes the intrinsic reward signals stronger and in turn makes the agents more identifiable. To further enhance the intrinsic reward and promote the emergence of individuality, two regularizers are proposed to increase the discriminability of the classifier. We implement EOI on top of popular MARL algorithms. Empirically, we show that EOI outperforms existing methods in a variety of multi-agent cooperative scenarios.}
}
@InProceedings{pmlr-v139-jiang21h,
title = {Online Selection Problems against Constrained Adversary},
author = {Jiang, Zhihao and Lu, Pinyan and Tang, Zhihao Gavin and Zhang, Yuhao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5002--5012},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21h/jiang21h.pdf},
url = {https://proceedings.mlr.press/v139/jiang21h.html},
abstract = {Inspired by a recent line of work in online algorithms with predictions, we study the constrained adversary model that utilizes predictions from a different perspective. Prior works mostly focused on designing simultaneously robust and consistent algorithms, without making assumptions on the quality of the predictions. In contrary, our model assumes the adversarial instance is consistent with the predictions and aim to design algorithms that have best worst-case performance against all such instances. We revisit classical online selection problems under the constrained adversary model. For the single item selection problem, we design an optimal algorithm in the adversarial arrival model and an improved algorithm in the random arrival model (a.k.a., the secretary problem). For the online edge-weighted bipartite matching problem, we extend the classical Water-filling and Ranking algorithms and achieve improved competitive ratios.}
}
@InProceedings{pmlr-v139-jiang21i,
title = {Active Covering},
author = {Jiang, Heinrich and Rostamizadeh, Afshin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5013--5022},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21i/jiang21i.pdf},
url = {https://proceedings.mlr.press/v139/jiang21i.html},
abstract = {We analyze the problem of active covering, where the learner is given an unlabeled dataset and can sequentially label query examples. The objective is to label query all of the positive examples in the fewest number of total label queries. We show under standard non-parametric assumptions that a classical support estimator can be repurposed as an offline algorithm attaining an excess query cost of $\widetilde{\Theta}(n^{D/(D+1)})$ compared to the optimal learner, where $n$ is the number of datapoints and $D$ is the dimension. We then provide a simple active learning method that attains an improved excess query cost of $\widetilde{O}(n^{(D-1)/D})$. Furthermore, the proposed algorithms only require access to the positive labeled examples, which in certain settings provides additional computational and privacy benefits. Finally, we show that the active learning method consistently outperforms offline methods as well as a variety of baselines on a wide range of benchmark image-based datasets.}
}
@InProceedings{pmlr-v139-jiang21j,
title = {Emphatic Algorithms for Deep Reinforcement Learning},
author = {Jiang, Ray and Zahavy, Tom and Xu, Zhongwen and White, Adam and Hessel, Matteo and Blundell, Charles and Van Hasselt, Hado},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5023--5033},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21j/jiang21j.pdf},
url = {https://proceedings.mlr.press/v139/jiang21j.html},
abstract = {Off-policy learning allows us to learn about possible policies of behavior from experience generated by a different behavior policy. Temporal difference (TD) learning algorithms can become unstable when combined with function approximation and off-policy sampling—this is known as the “deadly triad”. Emphatic temporal difference (ETD($\lambda$)) algorithm ensures convergence in the linear case by appropriately weighting the TD($\lambda$) updates. In this paper, we extend the use of emphatic methods to deep reinforcement learning agents. We show that naively adapting ETD($\lambda$) to popular deep reinforcement learning algorithms, which use forward view multi-step returns, results in poor performance. We then derive new emphatic algorithms for use in the context of such algorithms, and we demonstrate that they provide noticeable benefits in small problems designed to highlight the instability of TD methods. Finally, we observed improved performance when applying these algorithms at scale on classic Atari games from the Arcade Learning Environment.}
}
@InProceedings{pmlr-v139-jiang21k,
title = {Characterizing Structural Regularities of Labeled Data in Overparameterized Models},
author = {Jiang, Ziheng and Zhang, Chiyuan and Talwar, Kunal and Mozer, Michael C},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5034--5044},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jiang21k/jiang21k.pdf},
url = {https://proceedings.mlr.press/v139/jiang21k.html},
abstract = {Humans are accustomed to environments that contain both regularities and exceptions. For example, at most gas stations, one pays prior to pumping, but the occasional rural station does not accept payment in advance. Likewise, deep neural networks can generalize across instances that share common patterns or structures, yet have the capacity to memorize rare or irregular forms. We analyze how individual instances are treated by a model via a consistency score. The score characterizes the expected accuracy for a held-out instance given training sets of varying size sampled from the data distribution. We obtain empirical estimates of this score for individual instances in multiple data sets, and we show that the score identifies out-of-distribution and mislabeled examples at one end of the continuum and strongly regular examples at the other end. We identify computationally inexpensive proxies to the consistency score using statistics collected during training. We apply the score toward understanding the dynamics of representation learning and to filter outliers during training.}
}
@InProceedings{pmlr-v139-jin21a,
title = {Optimal Streaming Algorithms for Multi-Armed Bandits},
author = {Jin, Tianyuan and Huang, Keke and Tang, Jing and Xiao, Xiaokui},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5045--5054},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jin21a/jin21a.pdf},
url = {https://proceedings.mlr.press/v139/jin21a.html},
abstract = {This paper studies two variants of the best arm identification (BAI) problem under the streaming model, where we have a stream of n arms with reward distributions supported on [0,1] with unknown means. The arms in the stream are arriving one by one, and the algorithm cannot access an arm unless it is stored in a limited size memory. We first study the streaming \epslion-topk-arms identification problem, which asks for k arms whose reward means are lower than that of the k-th best arm by at most \epsilon with probability at least 1-\delta. For general \epsilon \in (0,1), the existing solution for this problem assumes k = 1 and achieves the optimal sample complexity O(\frac{n}{\epsilon^2} \log \frac{1}{\delta}) using O(\log^*(n)) memory and a single pass of the stream. We propose an algorithm that works for any k and achieves the optimal sample complexity O(\frac{n}{\epsilon^2} \log\frac{k}{\delta}) using a single-arm memory and a single pass of the stream. Second, we study the streaming BAI problem, where the objective is to identify the arm with the maximum reward mean with at least 1-\delta probability, using a single-arm memory and as few passes of the input stream as possible. We present a single-arm-memory algorithm that achieves a near instance-dependent optimal sample complexity within O(\log \Delta_2^{-1}) passes, where \Delta_2 is the gap between the mean of the best arm and that of the second best arm.}
}
@InProceedings{pmlr-v139-jin21b,
title = {Towards Tight Bounds on the Sample Complexity of Average-reward MDPs},
author = {Jin, Yujia and Sidford, Aaron},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5055--5064},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jin21b/jin21b.pdf},
url = {https://proceedings.mlr.press/v139/jin21b.html},
abstract = {We prove new upper and lower bounds for sample complexity of finding an $\epsilon$-optimal policy of an infinite-horizon average-reward Markov decision process (MDP) given access to a generative model. When the mixing time of the probability transition matrix of all policies is at most $t_\mathrm{mix}$, we provide an algorithm that solves the problem using $\widetilde{O}(t_\mathrm{mix} \epsilon^{-3})$ (oblivious) samples per state-action pair. Further, we provide a lower bound showing that a linear dependence on $t_\mathrm{mix}$ is necessary in the worst case for any algorithm which computes oblivious samples. We obtain our results by establishing connections between infinite-horizon average-reward MDPs and discounted MDPs of possible further utility.}
}
@InProceedings{pmlr-v139-jin21c,
title = {Almost Optimal Anytime Algorithm for Batched Multi-Armed Bandits},
author = {Jin, Tianyuan and Tang, Jing and Xu, Pan and Huang, Keke and Xiao, Xiaokui and Gu, Quanquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5065--5073},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jin21c/jin21c.pdf},
url = {https://proceedings.mlr.press/v139/jin21c.html},
abstract = {In batched multi-armed bandit problems, the learner can adaptively pull arms and adjust strategy in batches. In many real applications, not only the regret but also the batch complexity need to be optimized. Existing batched bandit algorithms usually assume that the time horizon T is known in advance. However, many applications involve an unpredictable stopping time. In this paper, we study the anytime batched multi-armed bandit problem. We propose an anytime algorithm that achieves the asymptotically optimal regret for exponential families of reward distributions with $O(\log \log T \ilog^{\alpha} (T))$ \footnote{Notation \ilog^{\alpha} (T) is the result of iteratively applying the logarithm function on T for \alpha times, e.g., \ilog^{3} (T)=\log\log\log T.} batches, where $\alpha\in O_{T}(1)$. Moreover, we prove that for any constant c>0, no algorithm can achieve the asymptotically optimal regret within c\log\log T batches.}
}
@InProceedings{pmlr-v139-jin21d,
title = {MOTS: Minimax Optimal Thompson Sampling},
author = {Jin, Tianyuan and Xu, Pan and Shi, Jieming and Xiao, Xiaokui and Gu, Quanquan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5074--5083},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jin21d/jin21d.pdf},
url = {https://proceedings.mlr.press/v139/jin21d.html},
abstract = {Thompson sampling is one of the most widely used algorithms in many online decision problems due to its simplicity for implementation and superior empirical performance over other state-of-the-art methods. Despite its popularity and empirical success, it has remained an open problem whether Thompson sampling can achieve the minimax optimal regret O(\sqrt{TK}) for K-armed bandit problems, where T is the total time horizon. In this paper we fill this long open gap by proposing a new Thompson sampling algorithm called MOTS that adaptively truncates the sampling result of the chosen arm at each time step. We prove that this simple variant of Thompson sampling achieves the minimax optimal regret bound O(\sqrt{TK}) for finite time horizon T and also the asymptotic optimal regret bound when $T$ grows to infinity as well. This is the first time that the minimax optimality of multi-armed bandit problems has been attained by Thompson sampling type of algorithms.}
}
@InProceedings{pmlr-v139-jin21e,
title = {Is Pessimism Provably Efficient for Offline RL?},
author = {Jin, Ying and Yang, Zhuoran and Wang, Zhaoran},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5084--5096},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jin21e/jin21e.pdf},
url = {https://proceedings.mlr.press/v139/jin21e.html},
abstract = {We study offline reinforcement learning (RL), which aims to learn an optimal policy based on a dataset collected a priori. Due to the lack of further interactions with the environment, offline RL suffers from the insufficient coverage of the dataset, which eludes most existing theoretical analysis. In this paper, we propose a pessimistic variant of the value iteration algorithm (PEVI), which incorporates an uncertainty quantifier as the penalty function. Such a penalty function simply flips the sign of the bonus function for promoting exploration in online RL, which makes it easily implementable and compatible with general function approximators. Without assuming the sufficient coverage of the dataset, we establish a data-dependent upper bound on the suboptimality of PEVI for general Markov decision processes (MDPs). When specialized to linear MDPs, it matches the information-theoretic lower bound up to multiplicative factors of the dimension and horizon. In other words, pessimism is not only provably efficient but also minimax optimal. In particular, given the dataset, the learned policy serves as the “best effort” among all policies, as no other policies can do better. Our theoretical analysis identifies the critical role of pessimism in eliminating a notion of spurious correlation, which emerges from the “irrelevant” trajectories that are less covered by the dataset and not informative for the optimal policy.}
}
@InProceedings{pmlr-v139-jing21a,
title = {Adversarial Option-Aware Hierarchical Imitation Learning},
author = {Jing, Mingxuan and Huang, Wenbing and Sun, Fuchun and Ma, Xiaojian and Kong, Tao and Gan, Chuang and Li, Lei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5097--5106},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jing21a/jing21a.pdf},
url = {https://proceedings.mlr.press/v139/jing21a.html},
abstract = {It has been a challenge to learning skills for an agent from long-horizon unannotated demonstrations. Existing approaches like Hierarchical Imitation Learning(HIL) are prone to compounding errors or suboptimal solutions. In this paper, we propose Option-GAIL, a novel method to learn skills at long horizon. The key idea of Option-GAIL is modeling the task hierarchy by options and train the policy via generative adversarial optimization. In particular, we propose an Expectation-Maximization(EM)-style algorithm: an E-step that samples the options of expert conditioned on the current learned policy, and an M-step that updates the low- and high-level policies of agent simultaneously to minimize the newly proposed option-occupancy measurement between the expert and the agent. We theoretically prove the convergence of the proposed algorithm. Experiments show that Option-GAIL outperforms other counterparts consistently across a variety of tasks.}
}
@InProceedings{pmlr-v139-jo21a,
title = {Discrete-Valued Latent Preference Matrix Estimation with Graph Side Information},
author = {Jo, Changhun and Lee, Kangwook},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5107--5117},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jo21a/jo21a.pdf},
url = {https://proceedings.mlr.press/v139/jo21a.html},
abstract = {Incorporating graph side information into recommender systems has been widely used to better predict ratings, but relatively few works have focused on theoretical guarantees. Ahn et al. (2018) firstly characterized the optimal sample complexity in the presence of graph side information, but the results are limited due to strict, unrealistic assumptions made on the unknown latent preference matrix and the structure of user clusters. In this work, we propose a new model in which 1) the unknown latent preference matrix can have any discrete values, and 2) users can be clustered into multiple clusters, thereby relaxing the assumptions made in prior work. Under this new model, we fully characterize the optimal sample complexity and develop a computationally-efficient algorithm that matches the optimal sample complexity. Our algorithm is robust to model errors and outperforms the existing algorithms in terms of prediction performance on both synthetic and real data.}
}
@InProceedings{pmlr-v139-jordan21a,
title = {Provable Lipschitz Certification for Generative Models},
author = {Jordan, Matt and Dimakis, Alex},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5118--5126},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jordan21a/jordan21a.pdf},
url = {https://proceedings.mlr.press/v139/jordan21a.html},
abstract = {We present a scalable technique for upper bounding the Lipschitz constant of generative models. We relate this quantity to the maximal norm over the set of attainable vector-Jacobian products of a given generative model. We approximate this set by layerwise convex approximations using zonotopes. Our approach generalizes and improves upon prior work using zonotope transformers and we extend to Lipschitz estimation of neural networks with large output dimension. This provides efficient and tight bounds on small networks and can scale to generative models on VAE and DCGAN architectures.}
}
@InProceedings{pmlr-v139-jorgensen21a,
title = {Isometric Gaussian Process Latent Variable Model for Dissimilarity Data},
author = {J{\o}rgensen, Martin and Hauberg, Soren},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5127--5136},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jorgensen21a/jorgensen21a.pdf},
url = {https://proceedings.mlr.press/v139/jorgensen21a.html},
abstract = {We present a probabilistic model where the latent variable respects both the distances and the topology of the modeled data. The model leverages the Riemannian geometry of the generated manifold to endow the latent space with a well-defined stochastic distance measure, which is modeled locally as Nakagami distributions. These stochastic distances are sought to be as similar as possible to observed distances along a neighborhood graph through a censoring process. The model is inferred by variational inference based on observations of pairwise distances. We demonstrate how the new model can encode invariances in the learned manifolds.}
}
@InProceedings{pmlr-v139-ju21a,
title = {On the Generalization Power of Overfitted Two-Layer Neural Tangent Kernel Models},
author = {Ju, Peizhong and Lin, Xiaojun and Shroff, Ness},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5137--5147},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/ju21a/ju21a.pdf},
url = {https://proceedings.mlr.press/v139/ju21a.html},
abstract = {In this paper, we study the generalization performance of min $\ell_2$-norm overfitting solutions for the neural tangent kernel (NTK) model of a two-layer neural network with ReLU activation that has no bias term. We show that, depending on the ground-truth function, the test error of overfitted NTK models exhibits characteristics that are different from the "double-descent" of other overparameterized linear models with simple Fourier or Gaussian features. Specifically, for a class of learnable functions, we provide a new upper bound of the generalization error that approaches a small limiting value, even when the number of neurons $p$ approaches infinity. This limiting value further decreases with the number of training samples $n$. For functions outside of this class, we provide a lower bound on the generalization error that does not diminish to zero even when $n$ and $p$ are both large.}
}
@InProceedings{pmlr-v139-jun21a,
title = {Improved Confidence Bounds for the Linear Logistic Model and Applications to Bandits},
author = {Jun, Kwang-Sung and Jain, Lalit and Mason, Blake and Nassif, Houssam},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5148--5157},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jun21a/jun21a.pdf},
url = {https://proceedings.mlr.press/v139/jun21a.html},
abstract = {We propose improved fixed-design confidence bounds for the linear logistic model. Our bounds significantly improve upon the state-of-the-art bound by Li et al. (2017) via recent developments of the self-concordant analysis of the logistic loss (Faury et al., 2020). Specifically, our confidence bound avoids a direct dependence on $1/\kappa$, where $\kappa$ is the minimal variance over all arms’ reward distributions. In general, $1/\kappa$ scales exponentially with the norm of the unknown linear parameter $\theta^*$. Instead of relying on this worst case quantity, our confidence bound for the reward of any given arm depends directly on the variance of that arm’s reward distribution. We present two applications of our novel bounds to pure exploration and regret minimization logistic bandits improving upon state-of-the-art performance guarantees. For pure exploration we also provide a lower bound highlighting a dependence on $1/\kappa$ for a family of instances.}
}
@InProceedings{pmlr-v139-jung21a,
title = {Detection of Signal in the Spiked Rectangular Models},
author = {Jung, Ji Hyung and Chung, Hye Won and Lee, Ji Oon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5158--5167},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jung21a/jung21a.pdf},
url = {https://proceedings.mlr.press/v139/jung21a.html},
abstract = {We consider the problem of detecting signals in the rank-one signal-plus-noise data matrix models that generalize the spiked Wishart matrices. We show that the principal component analysis can be improved by pre-transforming the matrix entries if the noise is non-Gaussian. As an intermediate step, we prove a sharp phase transition of the largest eigenvalues of spiked rectangular matrices, which extends the Baik–Ben Arous–Péché (BBP) transition. We also propose a hypothesis test to detect the presence of signal with low computational complexity, based on the linear spectral statistics, which minimizes the sum of the Type-I and Type-II errors when the noise is Gaussian.}
}
@InProceedings{pmlr-v139-jung21b,
title = {Estimating Identifiable Causal Effects on Markov Equivalence Class through Double Machine Learning},
author = {Jung, Yonghan and Tian, Jin and Bareinboim, Elias},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5168--5179},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/jung21b/jung21b.pdf},
url = {https://proceedings.mlr.press/v139/jung21b.html},
abstract = {General methods have been developed for estimating causal effects from observational data under causal assumptions encoded in the form of a causal graph. Most of this literature assumes that the underlying causal graph is completely specified. However, only observational data is available in most practical settings, which means that one can learn at most a Markov equivalence class (MEC) of the underlying causal graph. In this paper, we study the problem of causal estimation from a MEC represented by a partial ancestral graph (PAG), which is learnable from observational data. We develop a general estimator for any identifiable causal effects in a PAG. The result fills a gap for an end-to-end solution to causal inference from observational data to effects estimation. Specifically, we develop a complete identification algorithm that derives an influence function for any identifiable causal effects from PAGs. We then construct a double/debiased machine learning (DML) estimator that is robust to model misspecification and biases in nuisance function estimation, permitting the use of modern machine learning techniques. Simulation results corroborate with the theory.}
}
@InProceedings{pmlr-v139-kaba21a,
title = {A Nullspace Property for Subspace-Preserving Recovery},
author = {Kaba, Mustafa D and You, Chong and Robinson, Daniel P and Mallada, Enrique and Vidal, Rene},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5180--5188},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kaba21a/kaba21a.pdf},
url = {https://proceedings.mlr.press/v139/kaba21a.html},
abstract = {Much of the theory for classical sparse recovery is based on conditions on the dictionary that are both necessary and sufficient (e.g., nullspace property) or only sufficient (e.g., incoherence and restricted isometry). In contrast, much of the theory for subspace-preserving recovery, the theoretical underpinnings for sparse subspace classification and clustering methods, is based on conditions on the subspaces and the data that are only sufficient (e.g., subspace incoherence and data inner-radius). This paper derives a necessary and sufficient condition for subspace-preserving recovery that is inspired by the classical nullspace property.Based on this novel condition, called here the subspace nullspace property, we derive equivalent characterizations that either admit a clear geometric interpretation that relates data distribution and subspace separation to the recovery success, or can be verified using a finite set of extreme points of a properly defined set. We further exploit these characterizations to derive new sufficient conditions, based on inner-radius and outer-radius measures and dual bounds, that generalize existing conditions and preserve the geometric interpretations. These results fill an important gap in the subspace-preserving recovery literature.}
}
@InProceedings{pmlr-v139-kag21a,
title = {Training Recurrent Neural Networks via Forward Propagation Through Time},
author = {Kag, Anil and Saligrama, Venkatesh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5189--5200},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kag21a/kag21a.pdf},
url = {https://proceedings.mlr.press/v139/kag21a.html},
abstract = {Back-propagation through time (BPTT) has been widely used for training Recurrent Neural Networks (RNNs). BPTT updates RNN parameters on an instance by back-propagating the error in time over the entire sequence length, and as a result, leads to poor trainability due to the well-known gradient explosion/decay phenomena. While a number of prior works have proposed to mitigate vanishing/explosion effect through careful RNN architecture design, these RNN variants still train with BPTT. We propose a novel forward-propagation algorithm, FPTT, where at each time, for an instance, we update RNN parameters by optimizing an instantaneous risk function. Our proposed risk is a regularization penalty at time $t$ that evolves dynamically based on previously observed losses, and allows for RNN parameter updates to converge to a stationary solution of the empirical RNN objective. We consider both sequence-to-sequence as well as terminal loss problems. Empirically FPTT outperforms BPTT on a number of well-known benchmark tasks, thus enabling architectures like LSTMs to solve long range dependencies problems.}
}
@InProceedings{pmlr-v139-kairouz21a,
title = {The Distributed Discrete Gaussian Mechanism for Federated Learning with Secure Aggregation},
author = {Kairouz, Peter and Liu, Ziyu and Steinke, Thomas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5201--5212},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kairouz21a/kairouz21a.pdf},
url = {https://proceedings.mlr.press/v139/kairouz21a.html},
abstract = {We consider training models on private data that are distributed across user devices. To ensure privacy, we add on-device noise and use secure aggregation so that only the noisy sum is revealed to the server. We present a comprehensive end-to-end system, which appropriately discretizes the data and adds discrete Gaussian noise before performing secure aggregation. We provide a novel privacy analysis for sums of discrete Gaussians and carefully analyze the effects of data quantization and modular summation arithmetic. Our theoretical guarantees highlight the complex tension between communication, privacy, and accuracy. Our extensive experimental results demonstrate that our solution is essentially able to match the accuracy to central differential privacy with less than 16 bits of precision per value.}
}
@InProceedings{pmlr-v139-kairouz21b,
title = {Practical and Private (Deep) Learning Without Sampling or Shuffling},
author = {Kairouz, Peter and Mcmahan, Brendan and Song, Shuang and Thakkar, Om and Thakurta, Abhradeep and Xu, Zheng},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5213--5225},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kairouz21b/kairouz21b.pdf},
url = {https://proceedings.mlr.press/v139/kairouz21b.html},
abstract = {We consider training models with differential privacy (DP) using mini-batch gradients. The existing state-of-the-art, Differentially Private Stochastic Gradient Descent (DP-SGD), requires \emph{privacy amplification by sampling or shuffling} to obtain the best privacy/accuracy/computation trade-offs. Unfortunately, the precise requirements on exact sampling and shuffling can be hard to obtain in important practical scenarios, particularly federated learning (FL). We design and analyze a DP variant of Follow-The-Regularized-Leader (DP-FTRL) that compares favorably (both theoretically and empirically) to amplified DP-SGD, while allowing for much more flexible data access patterns. DP-FTRL does not use any form of privacy amplification.}
}
@InProceedings{pmlr-v139-kajino21a,
title = {A Differentiable Point Process with Its Application to Spiking Neural Networks},
author = {Kajino, Hiroshi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5226--5235},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kajino21a/kajino21a.pdf},
url = {https://proceedings.mlr.press/v139/kajino21a.html},
abstract = {This paper is concerned about a learning algorithm for a probabilistic model of spiking neural networks (SNNs). Jimenez Rezende & Gerstner (2014) proposed a stochastic variational inference algorithm to train SNNs with hidden neurons. The algorithm updates the variational distribution using the score function gradient estimator, whose high variance often impedes the whole learning algorithm. This paper presents an alternative gradient estimator for SNNs based on the path-wise gradient estimator. The main technical difficulty is a lack of a general method to differentiate a realization of an arbitrary point process, which is necessary to derive the path-wise gradient estimator. We develop a differentiable point process, which is the technical highlight of this paper, and apply it to derive the path-wise gradient estimator for SNNs. We investigate the effectiveness of our gradient estimator through numerical simulation.}
}
@InProceedings{pmlr-v139-kalantzis21a,
title = {Projection techniques to update the truncated SVD of evolving matrices with applications},
author = {Kalantzis, Vasileios and Kollias, Georgios and Ubaru, Shashanka and Nikolakopoulos, Athanasios N. and Horesh, Lior and Clarkson, Kenneth},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5236--5246},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kalantzis21a/kalantzis21a.pdf},
url = {https://proceedings.mlr.press/v139/kalantzis21a.html},
abstract = {This submission considers the problem of updating the rank-$k$ truncated Singular Value Decomposition (SVD) of matrices subject to the addition of new rows and/or columns over time. Such matrix problems represent an important computational kernel in applications such as Latent Semantic Indexing and Recommender Systems. Nonetheless, the proposed framework is purely algebraic and targets general updating problems. The algorithm presented in this paper undertakes a projection viewpoint and focuses on building a pair of subspaces which approximate the linear span of the sought singular vectors of the updated matrix. We discuss and analyze two different choices to form the projection subspaces. Results on matrices from real applications suggest that the proposed algorithm can lead to higher accuracy, especially for the singular triplets associated with the largest modulus singular values. Several practical details and key differences with other approaches are also discussed.}
}
@InProceedings{pmlr-v139-kallus21a,
title = {Optimal Off-Policy Evaluation from Multiple Logging Policies},
author = {Kallus, Nathan and Saito, Yuta and Uehara, Masatoshi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5247--5256},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kallus21a/kallus21a.pdf},
url = {https://proceedings.mlr.press/v139/kallus21a.html},
abstract = {We study off-policy evaluation (OPE) from multiple logging policies, each generating a dataset of fixed size, i.e., stratified sampling. Previous work noted that in this setting the ordering of the variances of different importance sampling estimators is instance-dependent, which brings up a dilemma as to which importance sampling weights to use. In this paper, we resolve this dilemma by finding the OPE estimator for multiple loggers with minimum variance for any instance, i.e., the efficient one. In particular, we establish the efficiency bound under stratified sampling and propose an estimator achieving this bound when given consistent $q$-estimates. To guard against misspecification of $q$-functions, we also provide a way to choose the control variate in a hypothesis class to minimize variance. Extensive experiments demonstrate the benefits of our methods’ efficiently leveraging of the stratified sampling of off-policy data from multiple loggers.}
}
@InProceedings{pmlr-v139-kamoutsi21a,
title = {Efficient Performance Bounds for Primal-Dual Reinforcement Learning from Demonstrations},
author = {Kamoutsi, Angeliki and Banjac, Goran and Lygeros, John},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5257--5268},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kamoutsi21a/kamoutsi21a.pdf},
url = {https://proceedings.mlr.press/v139/kamoutsi21a.html},
abstract = {We consider large-scale Markov decision processes with an unknown cost function and address the problem of learning a policy from a finite set of expert demonstrations. We assume that the learner is not allowed to interact with the expert and has no access to reinforcement signal of any kind. Existing inverse reinforcement learning methods come with strong theoretical guarantees, but are computationally expensive, while state-of-the-art policy optimization algorithms achieve significant empirical success, but are hampered by limited theoretical understanding. To bridge the gap between theory and practice, we introduce a novel bilinear saddle-point framework using Lagrangian duality. The proposed primal-dual viewpoint allows us to develop a model-free provably efficient algorithm through the lens of stochastic convex optimization. The method enjoys the advantages of simplicity of implementation, low memory requirements, and computational and sample complexities independent of the number of states. We further present an equivalent no-regret online-learning interpretation.}
}
@InProceedings{pmlr-v139-kandiros21a,
title = {Statistical Estimation from Dependent Data},
author = {Kandiros, Vardis and Dagan, Yuval and Dikkala, Nishanth and Goel, Surbhi and Daskalakis, Constantinos},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5269--5278},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kandiros21a/kandiros21a.pdf},
url = {https://proceedings.mlr.press/v139/kandiros21a.html},
abstract = {We consider a general statistical estimation problem wherein binary labels across different observations are not independent conditioning on their feature vectors, but dependent, capturing settings where e.g. these observations are collected on a spatial domain, a temporal domain, or a social network, which induce dependencies. We model these dependencies in the language of Markov Random Fields and, importantly, allow these dependencies to be substantial, i.e. do not assume that the Markov Random Field capturing these dependencies is in high temperature. As our main contribution we provide algorithms and statistically efficient estimation rates for this model, giving several instantiations of our bounds in logistic regression, sparse logistic regression, and neural network regression settings with dependent data. Our estimation guarantees follow from novel results for estimating the parameters (i.e. external fields and interaction strengths) of Ising models from a single sample.}
}
@InProceedings{pmlr-v139-kapoor21a,
title = {SKIing on Simplices: Kernel Interpolation on the Permutohedral Lattice for Scalable Gaussian Processes},
author = {Kapoor, Sanyam and Finzi, Marc and Wang, Ke Alexander and Wilson, Andrew Gordon Gordon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5279--5289},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kapoor21a/kapoor21a.pdf},
url = {https://proceedings.mlr.press/v139/kapoor21a.html},
abstract = {State-of-the-art methods for scalable Gaussian processes use iterative algorithms, requiring fast matrix vector multiplies (MVMs) with the co-variance kernel. The Structured Kernel Interpolation (SKI) framework accelerates these MVMs by performing efficient MVMs on a grid and interpolating back to the original space. In this work, we develop a connection between SKI and the permutohedral lattice used for high-dimensional fast bilateral filtering. Using a sparse simplicial grid instead of a dense rectangular one, we can perform GP inference exponentially faster in the dimension than SKI. Our approach, Simplex-GP, enables scaling SKI to high dimensions, while maintaining strong predictive performance. We additionally provide a CUDA implementation of Simplex-GP, which enables significant GPU acceleration of MVM based inference.}
}
@InProceedings{pmlr-v139-kapoor21b,
title = {Variational Auto-Regressive Gaussian Processes for Continual Learning},
author = {Kapoor, Sanyam and Karaletsos, Theofanis and Bui, Thang D},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5290--5300},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kapoor21b/kapoor21b.pdf},
url = {https://proceedings.mlr.press/v139/kapoor21b.html},
abstract = {Through sequential construction of posteriors on observing data online, Bayes’ theorem provides a natural framework for continual learning. We develop Variational Auto-Regressive Gaussian Processes (VAR-GPs), a principled posterior updating mechanism to solve sequential tasks in continual learning. By relying on sparse inducing point approximations for scalable posteriors, we propose a novel auto-regressive variational distribution which reveals two fruitful connections to existing results in Bayesian inference, expectation propagation and orthogonal inducing points. Mean predictive entropy estimates show VAR-GPs prevent catastrophic forgetting, which is empirically supported by strong performance on modern continual learning benchmarks against competitive baselines. A thorough ablation study demonstrates the efficacy of our modeling choices.}
}
@InProceedings{pmlr-v139-karampatziakis21a,
title = {Off-Policy Confidence Sequences},
author = {Karampatziakis, Nikos and Mineiro, Paul and Ramdas, Aaditya},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5301--5310},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/karampatziakis21a/karampatziakis21a.pdf},
url = {https://proceedings.mlr.press/v139/karampatziakis21a.html},
abstract = {We develop confidence bounds that hold uniformly over time for off-policy evaluation in the contextual bandit setting. These confidence sequences are based on recent ideas from martingale analysis and are non-asymptotic, non-parametric, and valid at arbitrary stopping times. We provide algorithms for computing these confidence sequences that strike a good balance between computational and statistical efficiency. We empirically demonstrate the tightness of our approach in terms of failure probability and width and apply it to the “gated deployment” problem of safely upgrading a production contextual bandit system.}
}
@InProceedings{pmlr-v139-karimireddy21a,
title = {Learning from History for Byzantine Robust Optimization},
author = {Karimireddy, Sai Praneeth and He, Lie and Jaggi, Martin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5311--5319},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/karimireddy21a/karimireddy21a.pdf},
url = {https://proceedings.mlr.press/v139/karimireddy21a.html},
abstract = {Byzantine robustness has received significant attention recently given its importance for distributed and federated learning. In spite of this, we identify severe flaws in existing algorithms even when the data across the participants is identically distributed. First, we show realistic examples where current state of the art robust aggregation rules fail to converge even in the absence of any Byzantine attackers. Secondly, we prove that even if the aggregation rules may succeed in limiting the influence of the attackers in a single round, the attackers can couple their attacks across time eventually leading to divergence. To address these issues, we present two surprisingly simple strategies: a new robust iterative clipping procedure, and incorporating worker momentum to overcome time-coupled attacks. This is the first provably robust method for the standard stochastic optimization setting.}
}
@InProceedings{pmlr-v139-kato21a,
title = {Non-Negative Bregman Divergence Minimization for Deep Direct Density Ratio Estimation},
author = {Kato, Masahiro and Teshima, Takeshi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5320--5333},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kato21a/kato21a.pdf},
url = {https://proceedings.mlr.press/v139/kato21a.html},
abstract = {Density ratio estimation (DRE) is at the core of various machine learning tasks such as anomaly detection and domain adaptation. In the DRE literature, existing studies have extensively studied methods based on Bregman divergence (BD) minimization. However, when we apply the BD minimization with highly flexible models, such as deep neural networks, it tends to suffer from what we call train-loss hacking, which is a source of over-fitting caused by a typical characteristic of empirical BD estimators. In this paper, to mitigate train-loss hacking, we propose non-negative correction for empirical BD estimators. Theoretically, we confirm the soundness of the proposed method through a generalization error bound. In our experiments, the proposed methods show favorable performances in inlier-based outlier detection.}
}
@InProceedings{pmlr-v139-katz-samuels21a,
title = {Improved Algorithms for Agnostic Pool-based Active Classification},
author = {Katz-Samuels, Julian and Zhang, Jifan and Jain, Lalit and Jamieson, Kevin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5334--5344},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/katz-samuels21a/katz-samuels21a.pdf},
url = {https://proceedings.mlr.press/v139/katz-samuels21a.html},
abstract = {We consider active learning for binary classification in the agnostic pool-based setting. The vast majority of works in active learning in the agnostic setting are inspired by the CAL algorithm where each query is uniformly sampled from the disagreement region of the current version space. The sample complexity of such algorithms is described by a quantity known as the disagreement coefficient which captures both the geometry of the hypothesis space as well as the underlying probability space. To date, the disagreement coefficient has been justified by minimax lower bounds only, leaving the door open for superior instance dependent sample complexities. In this work we propose an algorithm that, in contrast to uniform sampling over the disagreement region, solves an experimental design problem to determine a distribution over examples from which to request labels. We show that the new approach achieves sample complexity bounds that are never worse than the best disagreement coefficient-based bounds, but in specific cases can be dramatically smaller. From a practical perspective, the proposed algorithm requires no hyperparameters to tune (e.g., to control the aggressiveness of sampling), and is computationally efficient by means of assuming access to an empirical risk minimization oracle (without any constraints). Empirically, we demonstrate that our algorithm is superior to state of the art agnostic active learning algorithms on image classification datasets.}
}
@InProceedings{pmlr-v139-kaya21a,
title = {When Does Data Augmentation Help With Membership Inference Attacks?},
author = {Kaya, Yigitcan and Dumitras, Tudor},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5345--5355},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kaya21a/kaya21a.pdf},
url = {https://proceedings.mlr.press/v139/kaya21a.html},
abstract = {Deep learning models often raise privacy concerns as they leak information about their training data. This leakage enables membership inference attacks (MIA) that can identify whether a data point was in a model’s training set. Research shows that some ’data augmentation’ mechanisms may reduce the risk by combatting a key factor increasing the leakage, overfitting. While many mechanisms exist, their effectiveness against MIAs and privacy properties have not been studied systematically. Employing two recent MIAs, we explore the lower bound on the risk in the absence of formal upper bounds. First, we evaluate 7 mechanisms and differential privacy, on three image classification tasks. We find that applying augmentation to increase the model’s utility does not mitigate the risk and protection comes with a utility penalty. Further, we also investigate why popular label smoothing mechanism consistently amplifies the risk. Finally, we propose ’loss-rank-correlation’ (LRC) metric to assess how similar the effects of different mechanisms are. This, for example, reveals the similarity of applying high-intensity augmentation against MIAs to simply reducing the training time. Our findings emphasize the utility-privacy trade-off and provide practical guidelines on using augmentation to manage the trade-off.}
}
@InProceedings{pmlr-v139-kazemi21a,
title = {Regularized Submodular Maximization at Scale},
author = {Kazemi, Ehsan and Minaee, Shervin and Feldman, Moran and Karbasi, Amin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5356--5366},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kazemi21a/kazemi21a.pdf},
url = {https://proceedings.mlr.press/v139/kazemi21a.html},
abstract = {In this paper, we propose scalable methods for maximizing a regularized submodular function $f \triangleq g-\ell$ expressed as the difference between a monotone submodular function $g$ and a modular function $\ell$. Submodularity is inherently related to the notions of diversity, coverage, and representativeness. In particular, finding the mode (i.e., the most likely configuration) of many popular probabilistic models of diversity, such as determinantal point processes and strongly log-concave distributions, involves maximization of (regularized) submodular functions. Since a regularized function $f$ can potentially take on negative values, the classic theory of submodular maximization, which heavily relies on the non-negativity assumption of submodular functions, is not applicable. To circumvent this challenge, we develop the first one-pass streaming algorithm for maximizing a regularized submodular function subject to a $k$-cardinality constraint. Furthermore, we develop the first distributed algorithm that returns a solution $S$ in $O(1/ \epsilon)$ rounds of MapReduce computation. We highlight that our result, even for the unregularized case where the modular term $\ell$ is zero, improves the memory and communication complexity of the state-of-the-art by a factor of $O(1/ \epsilon)$ while arguably provides a simpler distributed algorithm and a unifying analysis. We empirically study the performance of our scalable methods on a set of real-life applications, including finding the mode of negatively correlated distributions, vertex cover of social networks, and several data summarization tasks.}
}
@InProceedings{pmlr-v139-kelkar21a,
title = {Prior Image-Constrained Reconstruction using Style-Based Generative Models},
author = {Kelkar, Varun A and Anastasio, Mark},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5367--5377},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kelkar21a/kelkar21a.pdf},
url = {https://proceedings.mlr.press/v139/kelkar21a.html},
abstract = {Obtaining a useful estimate of an object from highly incomplete imaging measurements remains a holy grail of imaging science. Deep learning methods have shown promise in learning object priors or constraints to improve the conditioning of an ill-posed imaging inverse problem. In this study, a framework for estimating an object of interest that is semantically related to a known prior image, is proposed. An optimization problem is formulated in the disentangled latent space of a style-based generative model, and semantically meaningful constraints are imposed using the disentangled latent representation of the prior image. Stable recovery from incomplete measurements with the help of a prior image is theoretically analyzed. Numerical experiments demonstrating the superior performance of our approach as compared to related methods are presented.}
}
@InProceedings{pmlr-v139-keller21a,
title = {Self Normalizing Flows},
author = {Keller, Thomas A and Peters, Jorn W.T. and Jaini, Priyank and Hoogeboom, Emiel and Forr{\'e}, Patrick and Welling, Max},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5378--5387},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/keller21a/keller21a.pdf},
url = {https://proceedings.mlr.press/v139/keller21a.html},
abstract = {Efficient gradient computation of the Jacobian determinant term is a core problem in many machine learning settings, and especially so in the normalizing flow framework. Most proposed flow models therefore either restrict to a function class with easy evaluation of the Jacobian determinant, or an efficient estimator thereof. However, these restrictions limit the performance of such density models, frequently requiring significant depth to reach desired performance levels. In this work, we propose \emph{Self Normalizing Flows}, a flexible framework for training normalizing flows by replacing expensive terms in the gradient by learned approximate inverses at each layer. This reduces the computational complexity of each layer’s exact update from $\mathcal{O}(D^3)$ to $\mathcal{O}(D^2)$, allowing for the training of flow architectures which were otherwise computationally infeasible, while also providing efficient sampling. We show experimentally that such models are remarkably stable and optimize to similar data likelihood values as their exact gradient counterparts, while training more quickly and surpassing the performance of functionally constrained counterparts.}
}
@InProceedings{pmlr-v139-kenlay21a,
title = {Interpretable Stability Bounds for Spectral Graph Filters},
author = {Kenlay, Henry and Thanou, Dorina and Dong, Xiaowen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5388--5397},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kenlay21a/kenlay21a.pdf},
url = {https://proceedings.mlr.press/v139/kenlay21a.html},
abstract = {Graph-structured data arise in a variety of real-world context ranging from sensor and transportation to biological and social networks. As a ubiquitous tool to process graph-structured data, spectral graph filters have been used to solve common tasks such as denoising and anomaly detection, as well as design deep learning architectures such as graph neural networks. Despite being an important tool, there is a lack of theoretical understanding of the stability properties of spectral graph filters, which are important for designing robust machine learning models. In this paper, we study filter stability and provide a novel and interpretable upper bound on the change of filter output, where the bound is expressed in terms of the endpoint degrees of the deleted and newly added edges, as well as the spatial proximity of those edges. This upper bound allows us to reason, in terms of structural properties of the graph, when a spectral graph filter will be stable. We further perform extensive experiments to verify intuition that can be gained from the bound.}
}
@InProceedings{pmlr-v139-kerdreux21a,
title = {Affine Invariant Analysis of Frank-Wolfe on Strongly Convex Sets},
author = {Kerdreux, Thomas and Liu, Lewis and Lacoste-Julien, Simon and Scieur, Damien},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5398--5408},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kerdreux21a/kerdreux21a.pdf},
url = {https://proceedings.mlr.press/v139/kerdreux21a.html},
abstract = {It is known that the Frank-Wolfe (FW) algorithm, which is affine covariant, enjoys faster convergence rates than $\mathcal{O}\left(1/K\right)$ when the constraint set is strongly convex. However, these results rely on norm-dependent assumptions, usually incurring non-affine invariant bounds, in contradiction with FW’s affine covariant property. In this work, we introduce new structural assumptions on the problem (such as the directional smoothness) and derive an affine invariant, norm-independent analysis of Frank-Wolfe. We show that our rates are better than any other known convergence rates of FW in this setting. Based on our analysis, we propose an affine invariant backtracking line-search. Interestingly, we show that typical backtracking line-searches using smoothness of the objective function present similar performances than its affine invariant counterpart, despite using affine dependent norms in the step size’s computation.}
}
@InProceedings{pmlr-v139-khachaturov21a,
title = {Markpainting: Adversarial Machine Learning meets Inpainting},
author = {Khachaturov, David and Shumailov, Ilia and Zhao, Yiren and Papernot, Nicolas and Anderson, Ross},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5409--5419},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/khachaturov21a/khachaturov21a.pdf},
url = {https://proceedings.mlr.press/v139/khachaturov21a.html},
abstract = {Inpainting is a learned interpolation technique that is based on generative modeling and used to populate masked or missing pieces in an image; it has wide applications in picture editing and retouching. Recently, inpainting started being used for watermark removal, raising concerns. In this paper we study how to manipulate it using our markpainting technique. First, we show how an image owner with access to an inpainting model can augment their image in such a way that any attempt to edit it using that model will add arbitrary visible information. We find that we can target multiple different models simultaneously with our technique. This can be designed to reconstitute a watermark if the editor had been trying to remove it. Second, we show that our markpainting technique is transferable to models that have different architectures or were trained on different datasets, so watermarks created using it are difficult for adversaries to remove. Markpainting is novel and can be used as a manipulation alarm that becomes visible in the event of inpainting. Source code is available at: https://github.com/iliaishacked/markpainting.}
}
@InProceedings{pmlr-v139-khodadadian21a,
title = {Finite-Sample Analysis of Off-Policy Natural Actor-Critic Algorithm},
author = {Khodadadian, Sajad and Chen, Zaiwei and Maguluri, Siva Theja},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5420--5431},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/khodadadian21a/khodadadian21a.pdf},
url = {https://proceedings.mlr.press/v139/khodadadian21a.html},
abstract = {In this paper, we provide finite-sample convergence guarantees for an off-policy variant of the natural actor-critic (NAC) algorithm based on Importance Sampling. In particular, we show that the algorithm converges to a global optimal policy with a sample complexity of $\mathcal{O}(\epsilon^{-3}\log^2(1/\epsilon))$ under an appropriate choice of stepsizes. In order to overcome the issue of large variance due to Importance Sampling, we propose the $Q$-trace algorithm for the critic, which is inspired by the V-trace algorithm (Espeholt et al., 2018). This enables us to explicitly control the bias and variance, and characterize the trade-off between them. As an advantage of off-policy sampling, a major feature of our result is that we do not need any additional assumptions, beyond the ergodicity of the Markov chain induced by the behavior policy.}
}
@InProceedings{pmlr-v139-khrulkov21a,
title = {Functional Space Analysis of Local GAN Convergence},
author = {Khrulkov, Valentin and Babenko, Artem and Oseledets, Ivan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5432--5442},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/khrulkov21a/khrulkov21a.pdf},
url = {https://proceedings.mlr.press/v139/khrulkov21a.html},
abstract = {Recent work demonstrated the benefits of studying continuous-time dynamics governing the GAN training. However, this dynamics is analyzed in the model parameter space, which results in finite-dimensional dynamical systems. We propose a novel perspective where we study the local dynamics of adversarial training in the general functional space and show how it can be represented as a system of partial differential equations. Thus, the convergence properties can be inferred from the eigenvalues of the resulting differential operator. We show that these eigenvalues can be efficiently estimated from the target dataset before training. Our perspective reveals several insights on the practical tricks commonly used to stabilize GANs, such as gradient penalty, data augmentation, and advanced integration schemes. As an immediate practical benefit, we demonstrate how one can a priori select an optimal data augmentation strategy for a particular generation task.}
}
@InProceedings{pmlr-v139-kidger21a,
title = {"Hey, that’s not an ODE": Faster ODE Adjoints via Seminorms},
author = {Kidger, Patrick and Chen, Ricky T. Q. and Lyons, Terry J},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5443--5452},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kidger21a/kidger21a.pdf},
url = {https://proceedings.mlr.press/v139/kidger21a.html},
abstract = {Neural differential equations may be trained by backpropagating gradients via the adjoint method, which is another differential equation typically solved using an adaptive-step-size numerical differential equation solver. A proposed step is accepted if its error, \emph{relative to some norm}, is sufficiently small; else it is rejected, the step is shrunk, and the process is repeated. Here, we demonstrate that the particular structure of the adjoint equations makes the usual choices of norm (such as $L^2$) unnecessarily stringent. By replacing it with a more appropriate (semi)norm, fewer steps are unnecessarily rejected and the backpropagation is made faster. This requires only minor code modifications. Experiments on a wide range of tasks—including time series, generative modeling, and physical control—demonstrate a median improvement of 40% fewer function evaluations. On some problems we see as much as 62% fewer function evaluations, so that the overall training time is roughly halved.}
}
@InProceedings{pmlr-v139-kidger21b,
title = {Neural SDEs as Infinite-Dimensional GANs},
author = {Kidger, Patrick and Foster, James and Li, Xuechen and Lyons, Terry J},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5453--5463},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kidger21b/kidger21b.pdf},
url = {https://proceedings.mlr.press/v139/kidger21b.html},
abstract = {Stochastic differential equations (SDEs) are a staple of mathematical modelling of temporal dynamics. However, a fundamental limitation has been that such models have typically been relatively inflexible, which recent work introducing Neural SDEs has sought to solve. Here, we show that the current classical approach to fitting SDEs may be approached as a special case of (Wasserstein) GANs, and in doing so the neural and classical regimes may be brought together. The input noise is Brownian motion, the output samples are time-evolving paths produced by a numerical solver, and by parameterising a discriminator as a Neural Controlled Differential Equation (CDE), we obtain Neural SDEs as (in modern machine learning parlance) continuous-time generative time series models. Unlike previous work on this problem, this is a direct extension of the classical approach without reference to either prespecified statistics or density functions. Arbitrary drift and diffusions are admissible, so as the Wasserstein loss has a unique global minima, in the infinite data limit \textit{any} SDE may be learnt.}
}
@InProceedings{pmlr-v139-killamsetty21a,
title = {GRAD-MATCH: Gradient Matching based Data Subset Selection for Efficient Deep Model Training},
author = {Killamsetty, Krishnateja and S, Durga and Ramakrishnan, Ganesh and De, Abir and Iyer, Rishabh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5464--5474},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/killamsetty21a/killamsetty21a.pdf},
url = {https://proceedings.mlr.press/v139/killamsetty21a.html},
abstract = {The great success of modern machine learning models on large datasets is contingent on extensive computational resources with high financial and environmental costs. One way to address this is by extracting subsets that generalize on par with the full data. In this work, we propose a general framework, GRAD-MATCH, which finds subsets that closely match the gradient of the \emph{training or validation} set. We find such subsets effectively using an orthogonal matching pursuit algorithm. We show rigorous theoretical and convergence guarantees of the proposed algorithm and, through our extensive experiments on real-world datasets, show the effectiveness of our proposed framework. We show that GRAD-MATCH significantly and consistently outperforms several recent data-selection algorithms and achieves the best accuracy-efficiency trade-off. GRAD-MATCH is available as a part of the CORDS toolkit: \url{https://github.com/decile-team/cords}.}
}
@InProceedings{pmlr-v139-kim21a,
title = {Improving Predictors via Combination Across Diverse Task Categories},
author = {Kim, Kwang In},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5475--5485},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21a/kim21a.pdf},
url = {https://proceedings.mlr.press/v139/kim21a.html},
abstract = {Predictor combination is the problem of improving a task predictor using predictors of other tasks when the forms of individual predictors are unknown. Previous work approached this problem by nonparametrically assessing predictor relationships based on their joint evaluations on a shared sample. This limits their application to cases where all predictors are defined on the same task category, e.g. all predictors estimate attributes of shoes. We present a new predictor combination algorithm that overcomes this limitation. Our algorithm aligns the heterogeneous domains of different predictors in a shared latent space to facilitate comparisons of predictors independently of the domains on which they are originally defined. We facilitate this by a new data alignment scheme that matches data distributions across task categories. Based on visual attribute ranking experiments on datasets that span diverse task categories (e.g. shoes and animals), we demonstrate that our approach often significantly improves the performances of the initial predictors.}
}
@InProceedings{pmlr-v139-kim21b,
title = {Self-Improved Retrosynthetic Planning},
author = {Kim, Junsu and Ahn, Sungsoo and Lee, Hankook and Shin, Jinwoo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5486--5495},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21b/kim21b.pdf},
url = {https://proceedings.mlr.press/v139/kim21b.html},
abstract = {Retrosynthetic planning is a fundamental problem in chemistry for finding a pathway of reactions to synthesize a target molecule. Recently, search algorithms have shown promising results for solving this problem by using deep neural networks (DNNs) to expand their candidate solutions, i.e., adding new reactions to reaction pathways. However, the existing works on this line are suboptimal; the retrosynthetic planning problem requires the reaction pathways to be (a) represented by real-world reactions and (b) executable using “building block” molecules, yet the DNNs expand reaction pathways without fully incorporating such requirements. Motivated by this, we propose an end-to-end framework for directly training the DNNs towards generating reaction pathways with the desirable properties. Our main idea is based on a self-improving procedure that trains the model to imitate successful trajectories found by itself. We also propose a novel reaction augmentation scheme based on a forward reaction model. Our experiments demonstrate that our scheme significantly improves the success rate of solving the retrosynthetic problem from 86.84% to 96.32% while maintaining the performance of DNN for predicting valid reactions.}
}
@InProceedings{pmlr-v139-kim21c,
title = {Reward Identification in Inverse Reinforcement Learning},
author = {Kim, Kuno and Garg, Shivam and Shiragur, Kirankumar and Ermon, Stefano},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5496--5505},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21c/kim21c.pdf},
url = {https://proceedings.mlr.press/v139/kim21c.html},
abstract = {We study the problem of reward identifiability in the context of Inverse Reinforcement Learning (IRL). The reward identifiability question is critical to answer when reasoning about the effectiveness of using Markov Decision Processes (MDPs) as computational models of real world decision makers in order to understand complex decision making behavior and perform counterfactual reasoning. While identifiability has been acknowledged as a fundamental theoretical question in IRL, little is known about the types of MDPs for which rewards are identifiable, or even if there exist such MDPs. In this work, we formalize the reward identification problem in IRL and study how identifiability relates to properties of the MDP model. For deterministic MDP models with the MaxEntRL objective, we prove necessary and sufficient conditions for identifiability. Building on these results, we present efficient algorithms for testing whether or not an MDP model is identifiable.}
}
@InProceedings{pmlr-v139-kim21d,
title = {I-BERT: Integer-only BERT Quantization},
author = {Kim, Sehoon and Gholami, Amir and Yao, Zhewei and Mahoney, Michael W. and Keutzer, Kurt},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5506--5518},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21d/kim21d.pdf},
url = {https://proceedings.mlr.press/v139/kim21d.html},
abstract = {Transformer based models, like BERT and RoBERTa, have achieved state-of-the-art results in many Natural Language Processing tasks. However, their memory footprint, inference latency, and power consumption are prohibitive efficient inference at the edge, and even at the data center. While quantization can be a viable solution for this, previous work on quantizing Transformer based models use floating-point arithmetic during inference, which cannot efficiently utilize integer-only logical units such as the recent Turing Tensor Cores, or traditional integer-only ARM processors. In this work, we propose I-BERT, a novel quantization scheme for Transformer based models that quantizes the entire inference with integer-only arithmetic. Based on lightweight integer-only approximation methods for nonlinear operations, e.g., GELU, Softmax, and Layer Normalization, I-BERT performs an end-to-end integer-only BERT inference without any floating point calculation. We evaluate our approach on GLUE downstream tasks using RoBERTa-Base/Large. We show that for both cases, I-BERT achieves similar (and slightly higher) accuracy as compared to the full-precision baseline. Furthermore, our preliminary implementation of I-BERT shows a speedup of 2.4- 4.0x for INT8 inference on a T4 GPU system as compared to FP32 inference. The framework has been developed in PyTorch and has been open-sourced.}
}
@InProceedings{pmlr-v139-kim21e,
title = {Message Passing Adaptive Resonance Theory for Online Active Semi-supervised Learning},
author = {Kim, Taehyeong and Hwang, Injune and Lee, Hyundo and Kim, Hyunseo and Choi, Won-Seok and Lim, Joseph J and Zhang, Byoung-Tak},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5519--5529},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21e/kim21e.pdf},
url = {https://proceedings.mlr.press/v139/kim21e.html},
abstract = {Active learning is widely used to reduce labeling effort and training time by repeatedly querying only the most beneficial samples from unlabeled data. In real-world problems where data cannot be stored indefinitely due to limited storage or privacy issues, the query selection and the model update should be performed as soon as a new data sample is observed. Various online active learning methods have been studied to deal with these challenges; however, there are difficulties in selecting representative query samples and updating the model efficiently without forgetting. In this study, we propose Message Passing Adaptive Resonance Theory (MPART) that learns the distribution and topology of input data online. Through message passing on the topological graph, MPART actively queries informative and representative samples, and continuously improves the classification performance using both labeled and unlabeled data. We evaluate our model in stream-based selective sampling scenarios with comparable query selection strategies, showing that MPART significantly outperforms competitive models.}
}
@InProceedings{pmlr-v139-kim21f,
title = {Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech},
author = {Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5530--5540},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21f/kim21f.pdf},
url = {https://proceedings.mlr.press/v139/kim21f.html},
abstract = {Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel sampling have been proposed, but their sample quality does not match that of two-stage TTS systems. In this work, we present a parallel end-to-end TTS method that generates more natural sounding audio than current two-stage models. Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS) on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth.}
}
@InProceedings{pmlr-v139-kim21g,
title = {A Policy Gradient Algorithm for Learning to Learn in Multiagent Reinforcement Learning},
author = {Kim, Dong Ki and Liu, Miao and Riemer, Matthew D and Sun, Chuangchuang and Abdulhai, Marwa and Habibi, Golnaz and Lopez-Cot, Sebastian and Tesauro, Gerald and How, Jonathan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5541--5550},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21g/kim21g.pdf},
url = {https://proceedings.mlr.press/v139/kim21g.html},
abstract = {A fundamental challenge in multiagent reinforcement learning is to learn beneficial behaviors in a shared environment with other simultaneously learning agents. In particular, each agent perceives the environment as effectively non-stationary due to the changing policies of other agents. Moreover, each agent is itself constantly learning, leading to natural non-stationarity in the distribution of experiences encountered. In this paper, we propose a novel meta-multiagent policy gradient theorem that directly accounts for the non-stationary policy dynamics inherent to multiagent learning settings. This is achieved by modeling our gradient updates to consider both an agent’s own non-stationary policy dynamics and the non-stationary policy dynamics of other agents in the environment. We show that our theoretically grounded approach provides a general solution to the multiagent learning problem, which inherently comprises all key aspects of previous state of the art approaches on this topic. We test our method on a diverse suite of multiagent benchmarks and demonstrate a more efficient ability to adapt to new agents as they learn than baseline methods across the full spectrum of mixed incentive, competitive, and cooperative domains.}
}
@InProceedings{pmlr-v139-kim21h,
title = {Inferring Latent Dynamics Underlying Neural Population Activity via Neural Differential Equations},
author = {Kim, Timothy D. and Luo, Thomas Z. and Pillow, Jonathan W. and Brody, Carlos D.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5551--5561},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21h/kim21h.pdf},
url = {https://proceedings.mlr.press/v139/kim21h.html},
abstract = {An important problem in systems neuroscience is to identify the latent dynamics underlying neural population activity. Here we address this problem by introducing a low-dimensional nonlinear model for latent neural population dynamics using neural ordinary differential equations (neural ODEs), with noisy sensory inputs and Poisson spike train outputs. We refer to this as the Poisson Latent Neural Differential Equations (PLNDE) model. We apply the PLNDE framework to a variety of synthetic datasets, and show that it accurately infers the phase portraits and fixed points of nonlinear systems augmented to produce spike train data, including the FitzHugh-Nagumo oscillator, a 3-dimensional nonlinear spiral, and a nonlinear sensory decision-making model with attractor dynamics. Our model significantly outperforms existing methods at inferring single-trial neural firing rates and the corresponding latent trajectories that generated them, especially in the regime where the spike counts and number of trials are low. We then apply our model to multi-region neural population recordings from medial frontal cortex of rats performing an auditory decision-making task. Our model provides a general, interpretable framework for investigating the neural mechanisms of decision-making and other cognitive computations through the lens of dynamical systems.}
}
@InProceedings{pmlr-v139-kim21i,
title = {The Lipschitz Constant of Self-Attention},
author = {Kim, Hyunjik and Papamakarios, George and Mnih, Andriy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5562--5571},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21i/kim21i.pdf},
url = {https://proceedings.mlr.press/v139/kim21i.html},
abstract = {Lipschitz constants of neural networks have been explored in various contexts in deep learning, such as provable adversarial robustness, estimating Wasserstein distance, stabilising training of GANs, and formulating invertible neural networks. Such works have focused on bounding the Lipschitz constant of fully connected or convolutional networks, composed of linear maps and pointwise non-linearities. In this paper, we investigate the Lipschitz constant of self-attention, a non-linear neural network module widely used in sequence modelling. We prove that the standard dot-product self-attention is not Lipschitz for unbounded input domain, and propose an alternative L2 self-attention that is Lipschitz. We derive an upper bound on the Lipschitz constant of L2 self-attention and provide empirical evidence for its asymptotic tightness. To demonstrate the practical relevance of our theoretical work, we formulate invertible self-attention and use it in a Transformer-based architecture for a character-level language modelling task.}
}
@InProceedings{pmlr-v139-kim21j,
title = {Unsupervised Skill Discovery with Bottleneck Option Learning},
author = {Kim, Jaekyeom and Park, Seohong and Kim, Gunhee},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5572--5582},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21j/kim21j.pdf},
url = {https://proceedings.mlr.press/v139/kim21j.html},
abstract = {Having the ability to acquire inherent skills from environments without any external rewards or supervision like humans is an important problem. We propose a novel unsupervised skill discovery method named Information Bottleneck Option Learning (IBOL). On top of the linearization of environments that promotes more various and distant state transitions, IBOL enables the discovery of diverse skills. It provides the abstraction of the skills learned with the information bottleneck framework for the options with improved stability and encouraged disentanglement. We empirically demonstrate that IBOL outperforms multiple state-of-the-art unsupervised skill discovery methods on the information-theoretic evaluations and downstream tasks in MuJoCo environments, including Ant, HalfCheetah, Hopper and D’Kitty. Our code is available at https://vision.snu.ac.kr/projects/ibol.}
}
@InProceedings{pmlr-v139-kim21k,
title = {ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision},
author = {Kim, Wonjae and Son, Bokyung and Kim, Ildoo},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5583--5594},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kim21k/kim21k.pdf},
url = {https://proceedings.mlr.press/v139/kim21k.html},
abstract = {Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance. Our code and pre-trained weights are available at https://github.com/dandelin/vilt.}
}
@InProceedings{pmlr-v139-kirschner21a,
title = {Bias-Robust Bayesian Optimization via Dueling Bandits},
author = {Kirschner, Johannes and Krause, Andreas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5595--5605},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kirschner21a/kirschner21a.pdf},
url = {https://proceedings.mlr.press/v139/kirschner21a.html},
abstract = {We consider Bayesian optimization in settings where observations can be adversarially biased, for example by an uncontrolled hidden confounder. Our first contribution is a reduction of the confounded setting to the dueling bandit model. Then we propose a novel approach for dueling bandits based on information-directed sampling (IDS). Thereby, we obtain the first efficient kernelized algorithm for dueling bandits that comes with cumulative regret guarantees. Our analysis further generalizes a previously proposed semi-parametric linear bandit model to non-linear reward functions, and uncovers interesting links to doubly-robust estimation.}
}
@InProceedings{pmlr-v139-kiyasseh21a,
title = {CLOCS: Contrastive Learning of Cardiac Signals Across Space, Time, and Patients},
author = {Kiyasseh, Dani and Zhu, Tingting and Clifton, David A},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5606--5615},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kiyasseh21a/kiyasseh21a.pdf},
url = {https://proceedings.mlr.press/v139/kiyasseh21a.html},
abstract = {The healthcare industry generates troves of unlabelled physiological data. This data can be exploited via contrastive learning, a self-supervised pre-training method that encourages representations of instances to be similar to one another. We propose a family of contrastive learning methods, CLOCS, that encourages representations across space, time, \textit{and} patients to be similar to one another. We show that CLOCS consistently outperforms the state-of-the-art methods, BYOL and SimCLR, when performing a linear evaluation of, and fine-tuning on, downstream tasks. We also show that CLOCS achieves strong generalization performance with only 25% of labelled training data. Furthermore, our training procedure naturally generates patient-specific representations that can be used to quantify patient-similarity.}
}
@InProceedings{pmlr-v139-gasteiger21a,
title = {Scalable Optimal Transport in High Dimensions for Graph Distances, Embedding Alignment, and More},
author = {Gasteiger, Johannes and Lienen, Marten and G{\"u}nnemann, Stephan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5616--5627},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/gasteiger21a/gasteiger21a.pdf},
url = {https://proceedings.mlr.press/v139/gasteiger21a.html},
abstract = {The current best practice for computing optimal transport (OT) is via entropy regularization and Sinkhorn iterations. This algorithm runs in quadratic time as it requires the full pairwise cost matrix, which is prohibitively expensive for large sets of objects. In this work we propose two effective log-linear time approximations of the cost matrix: First, a sparse approximation based on locality sensitive hashing (LSH) and, second, a Nystr{ö}m approximation with LSH-based sparse corrections, which we call locally corrected Nystr{ö}m (LCN). These approximations enable general log-linear time algorithms for entropy-regularized OT that perform well even for the complex, high-dimensional spaces common in deep learning. We analyse these approximations theoretically and evaluate them experimentally both directly and end-to-end as a component for real-world applications. Using our approximations for unsupervised word embedding alignment enables us to speed up a state-of-the-art method by a factor of 3 while also improving the accuracy by 3.1 percentage points without any additional model changes. For graph distance regression we propose the graph transport network (GTN), which combines graph neural networks (GNNs) with enhanced Sinkhorn. GTN outcompetes previous models by 48% and still scales log-linearly in the number of nodes.}
}
@InProceedings{pmlr-v139-koehler21a,
title = {Representational aspects of depth and conditioning in normalizing flows},
author = {Koehler, Frederic and Mehta, Viraj and Risteski, Andrej},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5628--5636},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/koehler21a/koehler21a.pdf},
url = {https://proceedings.mlr.press/v139/koehler21a.html},
abstract = {Normalizing flows are among the most popular paradigms in generative modeling, especially for images, primarily because we can efficiently evaluate the likelihood of a data point. This is desirable both for evaluating the fit of a model, and for ease of training, as maximizing the likelihood can be done by gradient descent. However, training normalizing flows comes with difficulties as well: models which produce good samples typically need to be extremely deep – which comes with accompanying vanishing/exploding gradient problems. A very related problem is that they are often poorly \emph{conditioned}: since they are parametrized as invertible maps from $\mathbb{R}^d \to \mathbb{R}^d$, and typical training data like images intuitively is lower-dimensional, the learned maps often have Jacobians that are close to being singular. In our paper, we tackle representational aspects around depth and conditioning of normalizing flows: both for general invertible architectures, and for a particular common architecture, affine couplings. We prove that $\Theta(1)$ affine coupling layers suffice to exactly represent a permutation or $1 \times 1$ convolution, as used in GLOW, showing that representationally the choice of partition is not a bottleneck for depth. We also show that shallow affine coupling networks are universal approximators in Wasserstein distance if ill-conditioning is allowed, and experimentally investigate related phenomena involving padding. Finally, we show a depth lower bound for general flow architectures with few neurons per layer and bounded Lipschitz constant.}
}
@InProceedings{pmlr-v139-koh21a,
title = {WILDS: A Benchmark of in-the-Wild Distribution Shifts},
author = {Koh, Pang Wei and Sagawa, Shiori and Marklund, Henrik and Xie, Sang Michael and Zhang, Marvin and Balsubramani, Akshay and Hu, Weihua and Yasunaga, Michihiro and Phillips, Richard Lanas and Gao, Irena and Lee, Tony and David, Etienne and Stavness, Ian and Guo, Wei and Earnshaw, Berton and Haque, Imran and Beery, Sara M and Leskovec, Jure and Kundaje, Anshul and Pierson, Emma and Levine, Sergey and Finn, Chelsea and Liang, Percy},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5637--5664},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/koh21a/koh21a.pdf},
url = {https://proceedings.mlr.press/v139/koh21a.html},
abstract = {Distribution shifts—where the training distribution differs from the test distribution—can substantially degrade the accuracy of machine learning (ML) systems deployed in the wild. Despite their ubiquity in the real-world deployments, these distribution shifts are under-represented in the datasets widely used in the ML community today. To address this gap, we present WILDS, a curated benchmark of 10 datasets reflecting a diverse range of distribution shifts that naturally arise in real-world applications, such as shifts across hospitals for tumor identification; across camera traps for wildlife monitoring; and across time and location in satellite imaging and poverty mapping. On each dataset, we show that standard training yields substantially lower out-of-distribution than in-distribution performance. This gap remains even with models trained by existing methods for tackling distribution shifts, underscoring the need for new methods for training models that are more robust to the types of distribution shifts that arise in practice. To facilitate method development, we provide an open-source package that automates dataset loading, contains default model architectures and hyperparameters, and standardizes evaluations. The full paper, code, and leaderboards are available at https://wilds.stanford.edu.}
}
@InProceedings{pmlr-v139-kolmogorov21a,
title = {One-sided Frank-Wolfe algorithms for saddle problems},
author = {Kolmogorov, Vladimir and Pock, Thomas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5665--5675},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kolmogorov21a/kolmogorov21a.pdf},
url = {https://proceedings.mlr.press/v139/kolmogorov21a.html},
abstract = {We study a class of convex-concave saddle-point problems of the form $\min_x\max_y ⟨Kx,y⟩+f_{\cal P}(x)-h^*(y)$ where $K$ is a linear operator, $f_{\cal P}$ is the sum of a convex function $f$ with a Lipschitz-continuous gradient and the indicator function of a bounded convex polytope ${\cal P}$, and $h^\ast$ is a convex (possibly nonsmooth) function. Such problem arises, for example, as a Lagrangian relaxation of various discrete optimization problems. Our main assumptions are the existence of an efficient {\em linear minimization oracle} ($lmo$) for $f_{\cal P}$ and an efficient {\em proximal map} ($prox$) for $h^*$ which motivate the solution via a blend of proximal primal-dual algorithms and Frank-Wolfe algorithms. In case $h^*$ is the indicator function of a linear constraint and function $f$ is quadratic, we show a $O(1/n^2)$ convergence rate on the dual objective, requiring $O(n \log n)$ calls of $lmo$. If the problem comes from the constrained optimization problem $\min_{x\in\mathbb R^d}\{f_{\cal P}(x)\:|\:Ax-b=0\}$ then we additionally get bound $O(1/n^2)$ both on the primal gap and on the infeasibility gap. In the most general case, we show a $O(1/n)$ convergence rate of the primal-dual gap again requiring $O(n\log n)$ calls of $lmo$. To the best of our knowledge, this improves on the known convergence rates for the considered class of saddle-point problems. We show applications to labeling problems frequently appearing in machine learning and computer vision.}
}
@InProceedings{pmlr-v139-komanduru21a,
title = {A Lower Bound for the Sample Complexity of Inverse Reinforcement Learning},
author = {Komanduru, Abi and Honorio, Jean},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5676--5685},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/komanduru21a/komanduru21a.pdf},
url = {https://proceedings.mlr.press/v139/komanduru21a.html},
abstract = {Inverse reinforcement learning (IRL) is the task of finding a reward function that generates a desired optimal policy for a given Markov Decision Process (MDP). This paper develops an information-theoretic lower bound for the sample complexity of the finite state, finite action IRL problem. A geometric construction of $\beta$-strict separable IRL problems using spherical codes is considered. Properties of the ensemble size as well as the Kullback-Leibler divergence between the generated trajectories are derived. The resulting ensemble is then used along with Fano’s inequality to derive a sample complexity lower bound of $O(n \log n)$, where $n$ is the number of states in the MDP.}
}
@InProceedings{pmlr-v139-kong21a,
title = {Consensus Control for Decentralized Deep Learning},
author = {Kong, Lingjing and Lin, Tao and Koloskova, Anastasia and Jaggi, Martin and Stich, Sebastian},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5686--5696},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kong21a/kong21a.pdf},
url = {https://proceedings.mlr.press/v139/kong21a.html},
abstract = {Decentralized training of deep learning models enables on-device learning over networks, as well as efficient scaling to large compute clusters. Experiments in earlier works reveal that, even in a data-center setup, decentralized training often suffers from the degradation in the quality of the model: the training and test performance of models trained in a decentralized fashion is in general worse than that of models trained in a centralized fashion, and this performance drop is impacted by parameters such as network size, communication topology and data partitioning. We identify the changing consensus distance between devices as a key parameter to explain the gap between centralized and decentralized training. We show in theory that when the training consensus distance is lower than a critical quantity, decentralized training converges as fast as the centralized counterpart. We empirically validate that the relation between generalization performance and consensus distance is consistent with this theoretical observation. Our empirical insights allow the principled design of better decentralized training schemes that mitigate the performance drop. To this end, we provide practical training guidelines and exemplify its effectiveness on the data-center setup as the important first step.}
}
@InProceedings{pmlr-v139-konobeev21a,
title = {A Distribution-dependent Analysis of Meta Learning},
author = {Konobeev, Mikhail and Kuzborskij, Ilja and Szepesvari, Csaba},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5697--5706},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/konobeev21a/konobeev21a.pdf},
url = {https://proceedings.mlr.press/v139/konobeev21a.html},
abstract = {A key problem in the theory of meta-learning is to understand how the task distributions influence transfer risk, the expected error of a meta-learner on a new task drawn from the unknown task distribution. In this paper, focusing on fixed design linear regression with Gaussian noise and a Gaussian task (or parameter) distribution, we give distribution-dependent lower bounds on the transfer risk of any algorithm, while we also show that a novel, weighted version of the so-called biased regularized regression method is able to match these lower bounds up to a fixed constant factor. Notably, the weighting is derived from the covariance of the Gaussian task distribution. Altogether, our results provide a precise characterization of the difficulty of meta-learning in this Gaussian setting. While this problem setting may appear simple, we show that it is rich enough to unify the “parameter sharing” and “representation learning” streams of meta-learning; in particular, representation learning is obtained as the special case when the covariance matrix of the task distribution is unknown. For this case we propose to adopt the EM method, which is shown to enjoy efficient updates in our case. The paper is completed by an empirical study of EM. In particular, our experimental results show that the EM algorithm can attain the lower bound as the number of tasks grows, while the algorithm is also successful in competing with its alternatives when used in a representation learning context.}
}
@InProceedings{pmlr-v139-kopetzki21a,
title = {Evaluating Robustness of Predictive Uncertainty Estimation: Are Dirichlet-based Models Reliable?},
author = {Kopetzki, Anna-Kathrin and Charpentier, Bertrand and Z{\"u}gner, Daniel and Giri, Sandhya and G{\"u}nnemann, Stephan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5707--5718},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kopetzki21a/kopetzki21a.pdf},
url = {https://proceedings.mlr.press/v139/kopetzki21a.html},
abstract = {Dirichlet-based uncertainty (DBU) models are a recent and promising class of uncertainty-aware models. DBU models predict the parameters of a Dirichlet distribution to provide fast, high-quality uncertainty estimates alongside with class predictions. In this work, we present the first large-scale, in-depth study of the robustness of DBU models under adversarial attacks. Our results suggest that uncertainty estimates of DBU models are not robust w.r.t. three important tasks: (1) indicating correctly and wrongly classified samples; (2) detecting adversarial examples; and (3) distinguishing between in-distribution (ID) and out-of-distribution (OOD) data. Additionally, we explore the first approaches to make DBU mod- els more robust. While adversarial training has a minor effect, our median smoothing based ap- proach significantly increases robustness of DBU models.}
}
@InProceedings{pmlr-v139-korba21a,
title = {Kernel Stein Discrepancy Descent},
author = {Korba, Anna and Aubin-Frankowski, Pierre-Cyril and Majewski, Szymon and Ablin, Pierre},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5719--5730},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/korba21a/korba21a.pdf},
url = {https://proceedings.mlr.press/v139/korba21a.html},
abstract = {Among dissimilarities between probability distributions, the Kernel Stein Discrepancy (KSD) has received much interest recently. We investigate the properties of its Wasserstein gradient flow to approximate a target probability distribution $\pi$ on $\mathbb{R}^d$, known up to a normalization constant. This leads to a straightforwardly implementable, deterministic score-based method to sample from $\pi$, named KSD Descent, which uses a set of particles to approximate $\pi$. Remarkably, owing to a tractable loss function, KSD Descent can leverage robust parameter-free optimization schemes such as L-BFGS; this contrasts with other popular particle-based schemes such as the Stein Variational Gradient Descent algorithm. We study the convergence properties of KSD Descent and demonstrate its practical relevance. However, we also highlight failure cases by showing that the algorithm can get stuck in spurious local minima.}
}
@InProceedings{pmlr-v139-kosaian21a,
title = {Boosting the Throughput and Accelerator Utilization of Specialized CNN Inference Beyond Increasing Batch Size},
author = {Kosaian, Jack and Phanishayee, Amar and Philipose, Matthai and Dey, Debadeepta and Vinayak, Rashmi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5731--5741},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kosaian21a/kosaian21a.pdf},
url = {https://proceedings.mlr.press/v139/kosaian21a.html},
abstract = {Datacenter vision systems widely use small, specialized convolutional neural networks (CNNs) trained on specific tasks for high-throughput inference. These settings employ accelerators with massive computational capacity, but which specialized CNNs underutilize due to having low arithmetic intensity. This results in suboptimal application-level throughput and poor returns on accelerator investment. Increasing batch size is the only known way to increase both application-level throughput and accelerator utilization for inference, but yields diminishing returns; specialized CNNs poorly utilize accelerators even with large batch size. We propose FoldedCNNs, a new approach to CNN design that increases inference throughput and utilization beyond large batch size. FoldedCNNs rethink the structure of inputs and layers of specialized CNNs to boost arithmetic intensity: in FoldedCNNs, f images with C channels each are concatenated into a single input with fC channels and jointly classified by a wider CNN. Increased arithmetic intensity in FoldedCNNs increases the throughput and GPU utilization of specialized CNN inference by up to 2.5x and 2.8x, with accuracy close to the original CNN in most cases.}
}
@InProceedings{pmlr-v139-kosiorek21a,
title = {NeRF-VAE: A Geometry Aware 3D Scene Generative Model},
author = {Kosiorek, Adam R and Strathmann, Heiko and Zoran, Daniel and Moreno, Pol and Schneider, Rosalia and Mokra, Sona and Rezende, Danilo Jimenez},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5742--5752},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kosiorek21a/kosiorek21a.pdf},
url = {https://proceedings.mlr.press/v139/kosiorek21a.html},
abstract = {We propose NeRF-VAE, a 3D scene generative model that incorporates geometric structure via Neural Radiance Fields (NeRF) and differentiable volume rendering. In contrast to NeRF, our model takes into account shared structure across scenes, and is able to infer the structure of a novel scene—without the need to re-train—using amortized inference. NeRF-VAE’s explicit 3D rendering process further contrasts previous generative models with convolution-based rendering which lacks geometric structure. Our model is a VAE that learns a distribution over radiance fields by conditioning them on a latent scene representation. We show that, once trained, NeRF-VAE is able to infer and render geometrically-consistent scenes from previously unseen 3D environments of synthetic scenes using very few input images. We further demonstrate that NeRF-VAE generalizes well to out-of-distribution cameras, while convolutional models do not. Finally, we introduce and study an attention-based conditioning mechanism of NeRF-VAE’s decoder, which improves model performance.}
}
@InProceedings{pmlr-v139-kossen21a,
title = {Active Testing: Sample-Efficient Model Evaluation},
author = {Kossen, Jannik and Farquhar, Sebastian and Gal, Yarin and Rainforth, Tom},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5753--5763},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kossen21a/kossen21a.pdf},
url = {https://proceedings.mlr.press/v139/kossen21a.html},
abstract = {We introduce a new framework for sample-efficient model evaluation that we call active testing. While approaches like active learning reduce the number of labels needed for model training, existing literature largely ignores the cost of labeling test data, typically unrealistically assuming large test sets for model evaluation. This creates a disconnect to real applications, where test labels are important and just as expensive, e.g. for optimizing hyperparameters. Active testing addresses this by carefully selecting the test points to label, ensuring model evaluation is sample-efficient. To this end, we derive theoretically-grounded and intuitive acquisition strategies that are specifically tailored to the goals of active testing, noting these are distinct to those of active learning. As actively selecting labels introduces a bias; we further show how to remove this bias while reducing the variance of the estimator at the same time. Active testing is easy to implement and can be applied to any supervised machine learning method. We demonstrate its effectiveness on models including WideResNets and Gaussian processes on datasets including Fashion-MNIST and CIFAR-100.}
}
@InProceedings{pmlr-v139-kostas21a,
title = {High Confidence Generalization for Reinforcement Learning},
author = {Kostas, James and Chandak, Yash and Jordan, Scott M and Theocharous, Georgios and Thomas, Philip},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5764--5773},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kostas21a/kostas21a.pdf},
url = {https://proceedings.mlr.press/v139/kostas21a.html},
abstract = {We present several classes of reinforcement learning algorithms that safely generalize to Markov decision processes (MDPs) not seen during training. Specifically, we study the setting in which some set of MDPs is accessible for training. The goal is to generalize safely to MDPs that are sampled from the same distribution, but which may not be in the set accessible for training. For various definitions of safety, our algorithms give probabilistic guarantees that agents can safely generalize to MDPs that are sampled from the same distribution but are not necessarily in the training set. These algorithms are a type of Seldonian algorithm (Thomas et al., 2019), which is a class of machine learning algorithms that return models with probabilistic safety guarantees for user-specified definitions of safety.}
}
@InProceedings{pmlr-v139-kostrikov21a,
title = {Offline Reinforcement Learning with Fisher Divergence Critic Regularization},
author = {Kostrikov, Ilya and Fergus, Rob and Tompson, Jonathan and Nachum, Ofir},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5774--5783},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kostrikov21a/kostrikov21a.pdf},
url = {https://proceedings.mlr.press/v139/kostrikov21a.html},
abstract = {Many modern approaches to offline Reinforcement Learning (RL) utilize behavior regularization, typically augmenting a model-free actor critic algorithm with a penalty measuring divergence of the policy from the offline data. In this work, we propose an alternative approach to encouraging the learned policy to stay close to the data, namely parameterizing the critic as the log-behavior-policy, which generated the offline data, plus a state-action value offset term, which can be learned using a neural network. Behavior regularization then corresponds to an appropriate regularizer on the offset term. We propose using a gradient penalty regularizer for the offset term and demonstrate its equivalence to Fisher divergence regularization, suggesting connections to the score matching and generative energy-based model literature. We thus term our resulting algorithm Fisher-BRC (Behavior Regularized Critic). On standard offline RL benchmarks, Fisher-BRC achieves both improved performance and faster convergence over existing state-of-the-art methods.}
}
@InProceedings{pmlr-v139-kovalev21a,
title = {ADOM: Accelerated Decentralized Optimization Method for Time-Varying Networks},
author = {Kovalev, Dmitry and Shulgin, Egor and Richtarik, Peter and Rogozin, Alexander V and Gasnikov, Alexander},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5784--5793},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kovalev21a/kovalev21a.pdf},
url = {https://proceedings.mlr.press/v139/kovalev21a.html},
abstract = {We propose ADOM – an accelerated method for smooth and strongly convex decentralized optimization over time-varying networks. ADOM uses a dual oracle, i.e., we assume access to the gradient of the Fenchel conjugate of the individual loss functions. Up to a constant factor, which depends on the network structure only, its communication complexity is the same as that of accelerated Nesterov gradient method. To the best of our knowledge, only the algorithm of Rogozin et al. (2019) has a convergence rate with similar properties. However, their algorithm converges under the very restrictive assumption that the number of network changes can not be greater than a tiny percentage of the number of iterations. This assumption is hard to satisfy in practice, as the network topology changes usually can not be controlled. In contrast, ADOM merely requires the network to stay connected throughout time.}
}
@InProceedings{pmlr-v139-kozuno21a,
title = {Revisiting Peng’s Q($λ$) for Modern Reinforcement Learning},
author = {Kozuno, Tadashi and Tang, Yunhao and Rowland, Mark and Munos, Remi and Kapturowski, Steven and Dabney, Will and Valko, Michal and Abel, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5794--5804},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kozuno21a/kozuno21a.pdf},
url = {https://proceedings.mlr.press/v139/kozuno21a.html},
abstract = {Off-policy multi-step reinforcement learning algorithms consist of conservative and non-conservative algorithms: the former actively cut traces, whereas the latter do not. Recently, Munos et al. (2016) proved the convergence of conservative algorithms to an optimal Q-function. In contrast, non-conservative algorithms are thought to be unsafe and have a limited or no theoretical guarantee. Nonetheless, recent studies have shown that non-conservative algorithms empirically outperform conservative ones. Motivated by the empirical results and the lack of theory, we carry out theoretical analyses of Peng’s Q($\lambda$), a representative example of non-conservative algorithms. We prove that \emph{it also converges to an optimal policy} provided that the behavior policy slowly tracks a greedy policy in a way similar to conservative policy iteration. Such a result has been conjectured to be true but has not been proven. We also experiment with Peng’s Q($\lambda$) in complex continuous control tasks, confirming that Peng’s Q($\lambda$) often outperforms conservative algorithms despite its simplicity. These results indicate that Peng’s Q($\lambda$), which was thought to be unsafe, is a theoretically-sound and practically effective algorithm.}
}
@InProceedings{pmlr-v139-krishnamurthy21a,
title = {Adapting to misspecification in contextual bandits with offline regression oracles},
author = {Krishnamurthy, Sanath Kumar and Hadad, Vitor and Athey, Susan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5805--5814},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/krishnamurthy21a/krishnamurthy21a.pdf},
url = {https://proceedings.mlr.press/v139/krishnamurthy21a.html},
abstract = {Computationally efficient contextual bandits are often based on estimating a predictive model of rewards given contexts and arms using past data. However, when the reward model is not well-specified, the bandit algorithm may incur unexpected regret, so recent work has focused on algorithms that are robust to misspecification. We propose a simple family of contextual bandit algorithms that adapt to misspecification error by reverting to a good safe policy when there is evidence that misspecification is causing a regret increase. Our algorithm requires only an offline regression oracle to ensure regret guarantees that gracefully degrade in terms of a measure of the average misspecification level. Compared to prior work, we attain similar regret guarantees, but we do no rely on a master algorithm, and do not require more robust oracles like online or constrained regression oracles (e.g., Foster et al. (2020), Krishnamurthy et al. (2020)). This allows us to design algorithms for more general function approximation classes.}
}
@InProceedings{pmlr-v139-krueger21a,
title = {Out-of-Distribution Generalization via Risk Extrapolation (REx)},
author = {Krueger, David and Caballero, Ethan and Jacobsen, Joern-Henrik and Zhang, Amy and Binas, Jonathan and Zhang, Dinghuai and Priol, Remi Le and Courville, Aaron},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5815--5826},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/krueger21a/krueger21a.pdf},
url = {https://proceedings.mlr.press/v139/krueger21a.html},
abstract = {Distributional shift is one of the major obstacles when transferring machine learning prediction systems from the lab to the real world. To tackle this problem, we assume that variation across training domains is representative of the variation we might encounter at test time, but also that shifts at test time may be more extreme in magnitude. In particular, we show that reducing differences in risk across training domains can reduce a model’s sensitivity to a wide range of extreme distributional shifts, including the challenging setting where the input contains both causal and anti-causal elements. We motivate this approach, Risk Extrapolation (REx), as a form of robust optimization over a perturbation set of extrapolated domains (MM-REx), and propose a penalty on the variance of training risks (V-REx) as a simpler variant. We prove that variants of REx can recover the causal mechanisms of the targets, while also providing robustness to changes in the input distribution (“covariate shift”). By appropriately trading-off robustness to causally induced distributional shifts and covariate shift, REx is able to outperform alternative methods such as Invariant Risk Minimization in situations where these types of shift co-occur.}
}
@InProceedings{pmlr-v139-kuchibhotla21a,
title = {Near-Optimal Confidence Sequences for Bounded Random Variables},
author = {Kuchibhotla, Arun K and Zheng, Qinqing},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5827--5837},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kuchibhotla21a/kuchibhotla21a.pdf},
url = {https://proceedings.mlr.press/v139/kuchibhotla21a.html},
abstract = {Many inference problems, such as sequential decision problems like A/B testing, adaptive sampling schemes like bandit selection, are often online in nature. The fundamental problem for online inference is to provide a sequence of confidence intervals that are valid uniformly over the growing-into-infinity sample sizes. To address this question, we provide a near-optimal confidence sequence for bounded random variables by utilizing Bentkus’ concentration results. We show that it improves on the existing approaches that use the Cram{é}r-Chernoff technique such as the Hoeffding, Bernstein, and Bennett inequalities. The resulting confidence sequence is confirmed to be favorable in synthetic coverage problems, adaptive stopping algorithms, and multi-armed bandit problems.}
}
@InProceedings{pmlr-v139-kulkarni21a,
title = {Differentially Private Bayesian Inference for Generalized Linear Models},
author = {Kulkarni, Tejas and J{\"a}lk{\"o}, Joonas and Koskela, Antti and Kaski, Samuel and Honkela, Antti},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5838--5849},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kulkarni21a/kulkarni21a.pdf},
url = {https://proceedings.mlr.press/v139/kulkarni21a.html},
abstract = {Generalized linear models (GLMs) such as logistic regression are among the most widely used arms in data analyst’s repertoire and often used on sensitive datasets. A large body of prior works that investigate GLMs under differential privacy (DP) constraints provide only private point estimates of the regression coefficients, and are not able to quantify parameter uncertainty. In this work, with logistic and Poisson regression as running examples, we introduce a generic noise-aware DP Bayesian inference method for a GLM at hand, given a noisy sum of summary statistics. Quantifying uncertainty allows us to determine which of the regression coefficients are statistically significantly different from zero. We provide a previously unknown tight privacy analysis and experimentally demonstrate that the posteriors obtained from our model, while adhering to strong privacy guarantees, are close to the non-private posteriors.}
}
@InProceedings{pmlr-v139-kumar21a,
title = {Bayesian Structural Adaptation for Continual Learning},
author = {Kumar, Abhishek and Chatterjee, Sunabha and Rai, Piyush},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5850--5860},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kumar21a/kumar21a.pdf},
url = {https://proceedings.mlr.press/v139/kumar21a.html},
abstract = {Continual Learning is a learning paradigm where learning systems are trained on a sequence of tasks. The goal here is to perform well on the current task without suffering from a performance drop on the previous tasks. Two notable directions among the recent advances in continual learning with neural networks are (1) variational Bayes based regularization by learning priors from previous tasks, and, (2) learning the structure of deep networks to adapt to new tasks. So far, these two approaches have been largely orthogonal. We present a novel Bayesian framework based on continually learning the structure of deep neural networks, to unify these distinct yet complementary approaches. The proposed framework learns the deep structure for each task by learning which weights to be used, and supports inter-task transfer through the overlapping of different sparse subsets of weights learned by different tasks. An appealing aspect of our proposed continual learning framework is that it is applicable to both discriminative (supervised) and generative (unsupervised) settings. Experimental results on supervised and unsupervised benchmarks demonstrate that our approach performs comparably or better than recent advances in continual learning.}
}
@InProceedings{pmlr-v139-kumar21b,
title = {Implicit rate-constrained optimization of non-decomposable objectives},
author = {Kumar, Abhishek and Narasimhan, Harikrishna and Cotter, Andrew},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5861--5871},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kumar21b/kumar21b.pdf},
url = {https://proceedings.mlr.press/v139/kumar21b.html},
abstract = {We consider a popular family of constrained optimization problems arising in machine learning that involve optimizing a non-decomposable evaluation metric with a certain thresholded form, while constraining another metric of interest. Examples of such problems include optimizing false negative rate at a fixed false positive rate, optimizing precision at a fixed recall, optimizing the area under the precision-recall or ROC curves, etc. Our key idea is to formulate a rate-constrained optimization that expresses the threshold parameter as a function of the model parameters via the Implicit Function theorem. We show how the resulting optimization problem can be solved using standard gradient based methods. Experiments on benchmark datasets demonstrate the effectiveness of our proposed method over existing state-of-the-art approaches for these problems.}
}
@InProceedings{pmlr-v139-kummerle21a,
title = {A Scalable Second Order Method for Ill-Conditioned Matrix Completion from Few Samples},
author = {K{\"u}mmerle, Christian and Verdun, Claudio M.},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5872--5883},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kummerle21a/kummerle21a.pdf},
url = {https://proceedings.mlr.press/v139/kummerle21a.html},
abstract = {We propose an iterative algorithm for low-rank matrix completion with that can be interpreted as an iteratively reweighted least squares (IRLS) algorithm, a saddle-escaping smoothing Newton method or a variable metric proximal gradient method applied to a non-convex rank surrogate. It combines the favorable data-efficiency of previous IRLS approaches with an improved scalability by several orders of magnitude. We establish the first local convergence guarantee from a minimal number of samples for that class of algorithms, showing that the method attains a local quadratic convergence rate. Furthermore, we show that the linear systems to be solved are well-conditioned even for very ill-conditioned ground truth matrices. We provide extensive experiments, indicating that unlike many state-of-the-art approaches, our method is able to complete very ill-conditioned matrices with a condition number of up to $10^{10}$ from few samples, while being competitive in its scalability.}
}
@InProceedings{pmlr-v139-kveton21a,
title = {Meta-Thompson Sampling},
author = {Kveton, Branislav and Konobeev, Mikhail and Zaheer, Manzil and Hsu, Chih-Wei and Mladenov, Martin and Boutilier, Craig and Szepesvari, Csaba},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5884--5893},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kveton21a/kveton21a.pdf},
url = {https://proceedings.mlr.press/v139/kveton21a.html},
abstract = {Efficient exploration in bandits is a fundamental online learning problem. We propose a variant of Thompson sampling that learns to explore better as it interacts with bandit instances drawn from an unknown prior. The algorithm meta-learns the prior and thus we call it MetaTS. We propose several efficient implementations of MetaTS and analyze it in Gaussian bandits. Our analysis shows the benefit of meta-learning and is of a broader interest, because we derive a novel prior-dependent Bayes regret bound for Thompson sampling. Our theory is complemented by empirical evaluation, which shows that MetaTS quickly adapts to the unknown prior.}
}
@InProceedings{pmlr-v139-kwon21a,
title = {Targeted Data Acquisition for Evolving Negotiation Agents},
author = {Kwon, Minae and Karamcheti, Siddharth and Cuellar, Mariano-Florentino and Sadigh, Dorsa},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5894--5904},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kwon21a/kwon21a.pdf},
url = {https://proceedings.mlr.press/v139/kwon21a.html},
abstract = {Successful negotiators must learn how to balance optimizing for self-interest and cooperation. Yet current artificial negotiation agents often heavily depend on the quality of the static datasets they were trained on, limiting their capacity to fashion an adaptive response balancing self-interest and cooperation. For this reason, we find that these agents can achieve either high utility or cooperation, but not both. To address this, we introduce a targeted data acquisition framework where we guide the exploration of a reinforcement learning agent using annotations from an expert oracle. The guided exploration incentivizes the learning agent to go beyond its static dataset and develop new negotiation strategies. We show that this enables our agents to obtain higher-reward and more Pareto-optimal solutions when negotiating with both simulated and human partners compared to standard supervised learning and reinforcement learning methods. This trend additionally holds when comparing agents using our targeted data acquisition framework to variants of agents trained with a mix of supervised learning and reinforcement learning, or to agents using tailored reward functions that explicitly optimize for utility and Pareto-optimality.}
}
@InProceedings{pmlr-v139-kwon21b,
title = {ASAM: Adaptive Sharpness-Aware Minimization for Scale-Invariant Learning of Deep Neural Networks},
author = {Kwon, Jungmin and Kim, Jeongseop and Park, Hyunseo and Choi, In Kwon},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5905--5914},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/kwon21b/kwon21b.pdf},
url = {https://proceedings.mlr.press/v139/kwon21b.html},
abstract = {Recently, learning algorithms motivated from sharpness of loss surface as an effective measure of generalization gap have shown state-of-the-art performances. Nevertheless, sharpness defined in a rigid region with a fixed radius, has a drawback in sensitivity to parameter re-scaling which leaves the loss unaffected, leading to weakening of the connection between sharpness and generalization gap. In this paper, we introduce the concept of adaptive sharpness which is scale-invariant and propose the corresponding generalization bound. We suggest a novel learning method, adaptive sharpness-aware minimization (ASAM), utilizing the proposed generalization bound. Experimental results in various benchmark datasets show that ASAM contributes to significant improvement of model generalization performance.}
}
@InProceedings{pmlr-v139-laber21a,
title = {On the price of explainability for some clustering problems},
author = {Laber, Eduardo S and Murtinho, Lucas},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5915--5925},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/laber21a/laber21a.pdf},
url = {https://proceedings.mlr.press/v139/laber21a.html},
abstract = {The price of explainability for a clustering task can be defined as the unavoidable loss, in terms of the objective function, if we force the final partition to be explainable. Here, we study this price for the following clustering problems: $k$-means, $k$-medians, $k$-centers and maximum-spacing. We provide upper and lower bounds for a natural model where explainability is achieved via decision trees. For the $k$-means and $k$-medians problems our upper bounds improve those obtained by [Dasgupta et. al, ICML 20] for low dimensions. Another contribution is a simple and efficient algorithm for building explainable clusterings for the $k$-means problem. We provide empirical evidence that its performance is better than the current state of the art for decision-tree based explainable clustering.}
}
@InProceedings{pmlr-v139-lacotte21a,
title = {Adaptive Newton Sketch: Linear-time Optimization with Quadratic Convergence and Effective Hessian Dimensionality},
author = {Lacotte, Jonathan and Wang, Yifei and Pilanci, Mert},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5926--5936},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lacotte21a/lacotte21a.pdf},
url = {https://proceedings.mlr.press/v139/lacotte21a.html},
abstract = {We propose a randomized algorithm with quadratic convergence rate for convex optimization problems with a self-concordant, composite, strongly convex objective function. Our method is based on performing an approximate Newton step using a random projection of the Hessian. Our first contribution is to show that, at each iteration, the embedding dimension (or sketch size) can be as small as the effective dimension of the Hessian matrix. Leveraging this novel fundamental result, we design an algorithm with a sketch size proportional to the effective dimension and which exhibits a quadratic rate of convergence. This result dramatically improves on the classical linear-quadratic convergence rates of state-of-the-art sub-sampled Newton methods. However, in most practical cases, the effective dimension is not known beforehand, and this raises the question of how to pick a sketch size as small as the effective dimension while preserving a quadratic convergence rate. Our second and main contribution is thus to propose an adaptive sketch size algorithm with quadratic convergence rate and which does not require prior knowledge or estimation of the effective dimension: at each iteration, it starts with a small sketch size, and increases it until quadratic progress is achieved. Importantly, we show that the embedding dimension remains proportional to the effective dimension throughout the entire path and that our method achieves state-of-the-art computational complexity for solving convex optimization programs with a strongly convex component. We discuss and illustrate applications to linear and quadratic programming, as well as logistic regression and other generalized linear models.}
}
@InProceedings{pmlr-v139-laforgue21a,
title = {Generalization Bounds in the Presence of Outliers: a Median-of-Means Study},
author = {Laforgue, Pierre and Staerman, Guillaume and Cl{\'e}men{\c{c}}on, Stephan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5937--5947},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/laforgue21a/laforgue21a.pdf},
url = {https://proceedings.mlr.press/v139/laforgue21a.html},
abstract = {In contrast to the empirical mean, the Median-of-Means (MoM) is an estimator of the mean $\theta$ of a square integrable r.v. Z, around which accurate nonasymptotic confidence bounds can be built, even when Z does not exhibit a sub-Gaussian tail behavior. Thanks to the high confidence it achieves on heavy-tailed data, MoM has found various applications in machine learning, where it is used to design training procedures that are not sensitive to atypical observations. More recently, a new line of work is now trying to characterize and leverage MoM’s ability to deal with corrupted data. In this context, the present work proposes a general study of MoM’s concentration properties under the contamination regime, that provides a clear understanding on the impact of the outlier proportion and the number of blocks chosen. The analysis is extended to (multisample) U-statistics, i.e. averages over tuples of observations, that raise additional challenges due to the dependence induced. Finally, we show that the latter bounds can be used in a straightforward fashion to derive generalization guarantees for pairwise learning in a contaminated setting, and propose an algorithm to compute provably reliable decision functions.}
}
@InProceedings{pmlr-v139-lam21a,
title = {Model Fusion for Personalized Learning},
author = {Lam, Thanh Chi and Hoang, Nghia and Low, Bryan Kian Hsiang and Jaillet, Patrick},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5948--5958},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lam21a/lam21a.pdf},
url = {https://proceedings.mlr.press/v139/lam21a.html},
abstract = {Production systems operating on a growing domain of analytic services often require generating warm-start solution models for emerging tasks with limited data. One potential approach to address this warm-start challenge is to adopt meta learning to generate a base model that can be adapted to solve unseen tasks with minimal fine-tuning. This however requires the training processes of previous solution models of existing tasks to be synchronized. This is not possible if these models were pre-trained separately on private data owned by different entities and cannot be synchronously re-trained. To accommodate for such scenarios, we develop a new personalized learning framework that synthesizes customized models for unseen tasks via fusion of independently pre-trained models of related tasks. We establish performance guarantee for the proposed framework and demonstrate its effectiveness on both synthetic and real datasets.}
}
@InProceedings{pmlr-v139-lam21b,
title = {Gradient Disaggregation: Breaking Privacy in Federated Learning by Reconstructing the User Participant Matrix},
author = {Lam, Maximilian and Wei, Gu-Yeon and Brooks, David and Reddi, Vijay Janapa and Mitzenmacher, Michael},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5959--5968},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lam21b/lam21b.pdf},
url = {https://proceedings.mlr.press/v139/lam21b.html},
abstract = {We show that aggregated model updates in federated learning may be insecure. An untrusted central server may disaggregate user updates from sums of updates across participants given repeated observations, enabling the server to recover privileged information about individual users’ private training data via traditional gradient inference attacks. Our method revolves around reconstructing participant information (e.g: which rounds of training users participated in) from aggregated model updates by leveraging summary information from device analytics commonly used to monitor, debug, and manage federated learning systems. Our attack is parallelizable and we successfully disaggregate user updates on settings with up to thousands of participants. We quantitatively and qualitatively demonstrate significant improvements in the capability of various inference attacks on the disaggregated updates. Our attack enables the attribution of learned properties to individual users, violating anonymity, and shows that a determined central server may undermine the secure aggregation protocol to break individual users’ data privacy in federated learning.}
}
@InProceedings{pmlr-v139-lancewicki21a,
title = {Stochastic Multi-Armed Bandits with Unrestricted Delay Distributions},
author = {Lancewicki, Tal and Segal, Shahar and Koren, Tomer and Mansour, Yishay},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5969--5978},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lancewicki21a/lancewicki21a.pdf},
url = {https://proceedings.mlr.press/v139/lancewicki21a.html},
abstract = {We study the stochastic Multi-Armed Bandit (MAB) problem with random delays in the feedback received by the algorithm. We consider two settings: the {\it reward dependent} delay setting, where realized delays may depend on the stochastic rewards, and the {\it reward-independent} delay setting. Our main contribution is algorithms that achieve near-optimal regret in each of the settings, with an additional additive dependence on the quantiles of the delay distribution. Our results do not make any assumptions on the delay distributions: in particular, we do not assume they come from any parametric family of distributions and allow for unbounded support and expectation; we further allow for the case of infinite delays where the algorithm might occasionally not observe any feedback.}
}
@InProceedings{pmlr-v139-landajuela21a,
title = {Discovering symbolic policies with deep reinforcement learning},
author = {Landajuela, Mikel and Petersen, Brenden K and Kim, Sookyung and Santiago, Claudio P and Glatt, Ruben and Mundhenk, Nathan and Pettit, Jacob F and Faissol, Daniel},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5979--5989},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/landajuela21a/landajuela21a.pdf},
url = {https://proceedings.mlr.press/v139/landajuela21a.html},
abstract = {Deep reinforcement learning (DRL) has proven successful for many difficult control problems by learning policies represented by neural networks. However, the complexity of neural network-based policies{—}involving thousands of composed non-linear operators{—}can render them problematic to understand, trust, and deploy. In contrast, simple policies comprising short symbolic expressions can facilitate human understanding, while also being transparent and exhibiting predictable behavior. To this end, we propose deep symbolic policy, a novel approach to directly search the space of symbolic policies. We use an autoregressive recurrent neural network to generate control policies represented by tractable mathematical expressions, employing a risk-seeking policy gradient to maximize performance of the generated policies. To scale to environments with multi-dimensional action spaces, we propose an "anchoring" algorithm that distills pre-trained neural network-based policies into fully symbolic policies, one action dimension at a time. We also introduce two novel methods to improve exploration in DRL-based combinatorial optimization, building on ideas of entropy regularization and distribution initialization. Despite their dramatically reduced complexity, we demonstrate that discovered symbolic policies outperform seven state-of-the-art DRL algorithms in terms of average rank and average normalized episodic reward across eight benchmark environments.}
}
@InProceedings{pmlr-v139-lang21a,
title = {Graph Cuts Always Find a Global Optimum for Potts Models (With a Catch)},
author = {Lang, Hunter and Sontag, David and Vijayaraghavan, Aravindan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {5990--5999},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lang21a/lang21a.pdf},
url = {https://proceedings.mlr.press/v139/lang21a.html},
abstract = {We prove that the alpha-expansion algorithm for MAP inference always returns a globally optimal assignment for Markov Random Fields with Potts pairwise potentials, with a catch: the returned assignment is only guaranteed to be optimal for an instance within a small perturbation of the original problem instance. In other words, all local minima with respect to expansion moves are global minima to slightly perturbed versions of the problem. On "real-world" instances, MAP assignments of small perturbations of the problem should be very similar to the MAP assignment(s) of the original problem instance. We design an algorithm that can certify whether this is the case in practice. On several MAP inference problem instances from computer vision, this algorithm certifies that MAP solutions to all of these perturbations are very close to solutions of the original instance. These results taken together give a cohesive explanation for the good performance of "graph cuts" algorithms in practice. Every local expansion minimum is a global minimum in a small perturbation of the problem, and all of these global minima are close to the original solution.}
}
@InProceedings{pmlr-v139-lange21a,
title = {Efficient Message Passing for 0{–}1 ILPs with Binary Decision Diagrams},
author = {Lange, Jan-Hendrik and Swoboda, Paul},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6000--6010},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lange21a/lange21a.pdf},
url = {https://proceedings.mlr.press/v139/lange21a.html},
abstract = {We present a message passing method for 0{–}1 integer linear programs. Our algorithm is based on a decomposition of the original problem into subproblems that are represented as binary deci- sion diagrams. The resulting Lagrangean dual is solved iteratively by a series of efficient block coordinate ascent steps. Our method has linear iteration complexity in the size of the decomposi- tion and can be effectively parallelized. The char- acteristics of our approach are desirable towards solving ever larger problems arising in structured prediction. We present experimental results on combinatorial problems from MAP inference for Markov Random Fields, quadratic assignment, discrete tomography and cell tracking for develop- mental biology and show promising performance.}
}
@InProceedings{pmlr-v139-larsen21a,
title = {CountSketches, Feature Hashing and the Median of Three},
author = {Larsen, Kasper Green and Pagh, Rasmus and T{\v{e}}tek, Jakub},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6011--6020},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/larsen21a/larsen21a.pdf},
url = {https://proceedings.mlr.press/v139/larsen21a.html},
abstract = {In this paper, we revisit the classic CountSketch method, which is a sparse, random projection that transforms a (high-dimensional) Euclidean vector $v$ to a vector of dimension $(2t-1) s$, where $t, s > 0$ are integer parameters. It is known that a CountSketch allows estimating coordinates of $v$ with variance bounded by $\|v\|_2^2/s$. For $t > 1$, the estimator takes the median of $2t-1$ independent estimates, and the probability that the estimate is off by more than $2 \|v\|_2/\sqrt{s}$ is exponentially small in $t$. This suggests choosing $t$ to be logarithmic in a desired inverse failure probability. However, implementations of CountSketch often use a small, constant $t$. Previous work only predicts a constant factor improvement in this setting. Our main contribution is a new analysis of CountSketch, showing an improvement in variance to $O(\min\{\|v\|_1^2/s^2,\|v\|_2^2/s\})$ when $t > 1$. That is, the variance decreases proportionally to $s^{-2}$, asymptotically for large enough $s$.}
}
@InProceedings{pmlr-v139-laturnus21a,
title = {MorphVAE: Generating Neural Morphologies from 3D-Walks using a Variational Autoencoder with Spherical Latent Space},
author = {Laturnus, Sophie C. and Berens, Philipp},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6021--6031},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/laturnus21a/laturnus21a.pdf},
url = {https://proceedings.mlr.press/v139/laturnus21a.html},
abstract = {For the past century, the anatomy of a neuron has been considered one of its defining features: The shape of a neuron’s dendrites and axon fundamentally determines what other neurons it can connect to. These neurites have been described using mathematical tools e.g. in the context of cell type classification, but generative models of these structures have only rarely been proposed and are often computationally inefficient. Here we propose MorphVAE, a sequence-to-sequence variational autoencoder with spherical latent space as a generative model for neural morphologies. The model operates on walks within the tree structure of a neuron and can incorporate expert annotations on a subset of the data using semi-supervised learning. We develop our model on artificially generated toy data and evaluate its performance on dendrites of excitatory cells and axons of inhibitory cells of mouse motor cortex (M1) and dendrites of retinal ganglion cells. We show that the learned latent feature space allows for better cell type discrimination than other commonly used features. By sampling new walks from the latent space we can easily construct new morphologies with a specified degree of similarity to their reference neuron, providing an efficient generative model for neural morphologies.}
}
@InProceedings{pmlr-v139-lazic21a,
title = {Improved Regret Bound and Experience Replay in Regularized Policy Iteration},
author = {Lazic, Nevena and Yin, Dong and Abbasi-Yadkori, Yasin and Szepesvari, Csaba},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6032--6042},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lazic21a/lazic21a.pdf},
url = {https://proceedings.mlr.press/v139/lazic21a.html},
abstract = {In this work, we study algorithms for learning in infinite-horizon undiscounted Markov decision processes (MDPs) with function approximation. We first show that the regret analysis of the Politex algorithm (a version of regularized policy iteration) can be sharpened from $O(T^{3/4})$ to $O(\sqrt{T})$ under nearly identical assumptions, and instantiate the bound with linear function approximation. Our result provides the first high-probability $O(\sqrt{T})$ regret bound for a computationally efficient algorithm in this setting. The exact implementation of Politex with neural network function approximation is inefficient in terms of memory and computation. Since our analysis suggests that we need to approximate the average of the action-value functions of past policies well, we propose a simple efficient implementation where we train a single Q-function on a replay buffer with past data. We show that this often leads to superior performance over other implementation choices, especially in terms of wall-clock time. Our work also provides a novel theoretical justification for using experience replay within policy iteration algorithms.}
}
@InProceedings{pmlr-v139-le21a,
title = {LAMDA: Label Matching Deep Domain Adaptation},
author = {Le, Trung and Nguyen, Tuan and Ho, Nhat and Bui, Hung and Phung, Dinh},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6043--6054},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/le21a/le21a.pdf},
url = {https://proceedings.mlr.press/v139/le21a.html},
abstract = {Deep domain adaptation (DDA) approaches have recently been shown to perform better than their shallow rivals with better modeling capacity on complex domains (e.g., image, structural data, and sequential data). The underlying idea is to learn domain invariant representations on a latent space that can bridge the gap between source and target domains. Several theoretical studies have established insightful understanding and the benefit of learning domain invariant features; however, they are usually limited to the case where there is no label shift, hence hindering its applicability. In this paper, we propose and study a new challenging setting that allows us to use a Wasserstein distance (WS) to not only quantify the data shift but also to define the label shift directly. We further develop a theory to demonstrate that minimizing the WS of the data shift leads to closing the gap between the source and target data distributions on the latent space (e.g., an intermediate layer of a deep net), while still being able to quantify the label shift with respect to this latent space. Interestingly, our theory can consequently explain certain drawbacks of learning domain invariant features on the latent space. Finally, grounded on the results and guidance of our developed theory, we propose the Label Matching Deep Domain Adaptation (LAMDA) approach that outperforms baselines on real-world datasets for DA problems.}
}
@InProceedings{pmlr-v139-lederer21a,
title = {Gaussian Process-Based Real-Time Learning for Safety Critical Applications},
author = {Lederer, Armin and Conejo, Alejandro J Ord{\'o}{\~n}ez and Maier, Korbinian A and Xiao, Wenxin and Umlauft, Jonas and Hirche, Sandra},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6055--6064},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lederer21a/lederer21a.pdf},
url = {https://proceedings.mlr.press/v139/lederer21a.html},
abstract = {The safe operation of physical systems typically relies on high-quality models. Since a continuous stream of data is generated during run-time, such models are often obtained through the application of Gaussian process regression because it provides guarantees on the prediction error. Due to its high computational complexity, Gaussian process regression must be used offline on batches of data, which prevents applications, where a fast adaptation through online learning is necessary to ensure safety. In order to overcome this issue, we propose the LoG-GP. It achieves a logarithmic update and prediction complexity in the number of training points through the aggregation of locally active Gaussian process models. Under weak assumptions on the aggregation scheme, it inherits safety guarantees from exact Gaussian process regression. These theoretical advantages are exemplarily exploited in the design of a safe and data-efficient, online-learning control policy. The efficiency and performance of the proposed real-time learning approach is demonstrated in a comparison to state-of-the-art methods.}
}
@InProceedings{pmlr-v139-lee21a,
title = {Sharing Less is More: Lifelong Learning in Deep Networks with Selective Layer Transfer},
author = {Lee, Seungwon and Behpour, Sima and Eaton, Eric},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6065--6075},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21a/lee21a.pdf},
url = {https://proceedings.mlr.press/v139/lee21a.html},
abstract = {Effective lifelong learning across diverse tasks requires the transfer of diverse knowledge, yet transferring irrelevant knowledge may lead to interference and catastrophic forgetting. In deep networks, transferring the appropriate granularity of knowledge is as important as the transfer mechanism, and must be driven by the relationships among tasks. We first show that the lifelong learning performance of several current deep learning architectures can be significantly improved by transfer at the appropriate layers. We then develop an expectation-maximization (EM) method to automatically select the appropriate transfer configuration and optimize the task network weights. This EM-based selective transfer is highly effective, balancing transfer performance on all tasks with avoiding catastrophic forgetting, as demonstrated on three algorithms in several lifelong object classification scenarios.}
}
@InProceedings{pmlr-v139-lee21b,
title = {Fair Selective Classification Via Sufficiency},
author = {Lee, Joshua K and Bu, Yuheng and Rajan, Deepta and Sattigeri, Prasanna and Panda, Rameswar and Das, Subhro and Wornell, Gregory W},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6076--6086},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21b/lee21b.pdf},
url = {https://proceedings.mlr.press/v139/lee21b.html},
abstract = {Selective classification is a powerful tool for decision-making in scenarios where mistakes are costly but abstentions are allowed. In general, by allowing a classifier to abstain, one can improve the performance of a model at the cost of reducing coverage and classifying fewer samples. However, recent work has shown, in some cases, that selective classification can magnify disparities between groups, and has illustrated this phenomenon on multiple real-world datasets. We prove that the sufficiency criterion can be used to mitigate these disparities by ensuring that selective classification increases performance on all groups, and introduce a method for mitigating the disparity in precision across the entire coverage scale based on this criterion. We then provide an upper bound on the conditional mutual information between the class label and sensitive attribute, conditioned on the learned features, which can be used as a regularizer to achieve fairer selective classification. The effectiveness of the method is demonstrated on the Adult, CelebA, Civil Comments, and CheXpert datasets.}
}
@InProceedings{pmlr-v139-lee21c,
title = {On-the-fly Rectification for Robust Large-Vocabulary Topic Inference},
author = {Lee, Moontae and Cho, Sungjun and Dong, Kun and Mimno, David and Bindel, David},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6087--6097},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21c/lee21c.pdf},
url = {https://proceedings.mlr.press/v139/lee21c.html},
abstract = {Across many data domains, co-occurrence statistics about the joint appearance of objects are powerfully informative. By transforming unsupervised learning problems into decompositions of co-occurrence statistics, spectral algorithms provide transparent and efficient algorithms for posterior inference such as latent topic analysis and community detection. As object vocabularies grow, however, it becomes rapidly more expensive to store and run inference algorithms on co-occurrence statistics. Rectifying co-occurrence, the key process to uphold model assumptions, becomes increasingly more vital in the presence of rare terms, but current techniques cannot scale to large vocabularies. We propose novel methods that simultaneously compress and rectify co-occurrence statistics, scaling gracefully with the size of vocabulary and the dimension of latent space. We also present new algorithms learning latent variables from the compressed statistics, and verify that our methods perform comparably to previous approaches on both textual and non-textual data.}
}
@InProceedings{pmlr-v139-lee21d,
title = {Unsupervised Embedding Adaptation via Early-Stage Feature Reconstruction for Few-Shot Classification},
author = {Lee, Dong Hoon and Chung, Sae-Young},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6098--6108},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21d/lee21d.pdf},
url = {https://proceedings.mlr.press/v139/lee21d.html},
abstract = {We propose unsupervised embedding adaptation for the downstream few-shot classification task. Based on findings that deep neural networks learn to generalize before memorizing, we develop Early-Stage Feature Reconstruction (ESFR) — a novel adaptation scheme with feature reconstruction and dimensionality-driven early stopping that finds generalizable features. Incorporating ESFR consistently improves the performance of baseline methods on all standard settings, including the recently proposed transductive method. ESFR used in conjunction with the transductive method further achieves state-of-the-art performance on mini-ImageNet, tiered-ImageNet, and CUB; especially with 1.2% 2.0% improvements in accuracy over the previous best performing method on 1-shot setting.}
}
@InProceedings{pmlr-v139-lee21e,
title = {Continual Learning in the Teacher-Student Setup: Impact of Task Similarity},
author = {Lee, Sebastian and Goldt, Sebastian and Saxe, Andrew},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6109--6119},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21e/lee21e.pdf},
url = {https://proceedings.mlr.press/v139/lee21e.html},
abstract = {Continual learning{—}the ability to learn many tasks in sequence{—}is critical for artificial learning systems. Yet standard training methods for deep networks often suffer from catastrophic forgetting, where learning new tasks erases knowledge of the earlier tasks. While catastrophic forgetting labels the problem, the theoretical reasons for interference between tasks remain unclear. Here, we attempt to narrow this gap between theory and practice by studying continual learning in the teacher-student setup. We extend previous analytical work on two-layer networks in the teacher-student setup to multiple teachers. Using each teacher to represent a different task, we investigate how the relationship between teachers affects the amount of forgetting and transfer exhibited by the student when the task switches. In line with recent work, we find that when tasks depend on similar features, intermediate task similarity leads to greatest forgetting. However, feature similarity is only one way in which tasks may be related. The teacher-student approach allows us to disentangle task similarity at the level of \emph{readouts} (hidden-to-output weights) as well as \emph{features} (input-to-hidden weights). We find a complex interplay between both types of similarity, initial transfer/forgetting rates, maximum transfer/forgetting, and the long-time (post-switch) amount of transfer/forgetting. Together, these results help illuminate the diverse factors contributing to catastrophic forgetting.}
}
@InProceedings{pmlr-v139-lee21f,
title = {OptiDICE: Offline Policy Optimization via Stationary Distribution Correction Estimation},
author = {Lee, Jongmin and Jeon, Wonseok and Lee, Byungjun and Pineau, Joelle and Kim, Kee-Eung},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6120--6130},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21f/lee21f.pdf},
url = {https://proceedings.mlr.press/v139/lee21f.html},
abstract = {We consider the offline reinforcement learning (RL) setting where the agent aims to optimize the policy solely from the data without further environment interactions. In offline RL, the distributional shift becomes the primary source of difficulty, which arises from the deviation of the target policy being optimized from the behavior policy used for data collection. This typically causes overestimation of action values, which poses severe problems for model-free algorithms that use bootstrapping. To mitigate the problem, prior offline RL algorithms often used sophisticated techniques that encourage underestimation of action values, which introduces an additional set of hyperparameters that need to be tuned properly. In this paper, we present an offline RL algorithm that prevents overestimation in a more principled way. Our algorithm, OptiDICE, directly estimates the stationary distribution corrections of the optimal policy and does not rely on policy-gradients, unlike previous offline RL algorithms. Using an extensive set of benchmark datasets for offline RL, we show that OptiDICE performs competitively with the state-of-the-art methods.}
}
@InProceedings{pmlr-v139-lee21g,
title = {SUNRISE: A Simple Unified Framework for Ensemble Learning in Deep Reinforcement Learning},
author = {Lee, Kimin and Laskin, Michael and Srinivas, Aravind and Abbeel, Pieter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6131--6141},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21g/lee21g.pdf},
url = {https://proceedings.mlr.press/v139/lee21g.html},
abstract = {Off-policy deep reinforcement learning (RL) has been successful in a range of challenging domains. However, standard off-policy RL algorithms can suffer from several issues, such as instability in Q-learning and balancing exploration and exploitation. To mitigate these issues, we present SUNRISE, a simple unified ensemble method, which is compatible with various off-policy RL algorithms. SUNRISE integrates two key ingredients: (a) ensemble-based weighted Bellman backups, which re-weight target Q-values based on uncertainty estimates from a Q-ensemble, and (b) an inference method that selects actions using the highest upper-confidence bounds for efficient exploration. By enforcing the diversity between agents using Bootstrap with random initialization, we show that these different ideas are largely orthogonal and can be fruitfully integrated, together further improving the performance of existing off-policy RL algorithms, such as Soft Actor-Critic and Rainbow DQN, for both continuous and discrete control tasks on both low-dimensional and high-dimensional environments.}
}
@InProceedings{pmlr-v139-lee21h,
title = {Achieving Near Instance-Optimality and Minimax-Optimality in Stochastic and Adversarial Linear Bandits Simultaneously},
author = {Lee, Chung-Wei and Luo, Haipeng and Wei, Chen-Yu and Zhang, Mengxiao and Zhang, Xiaojin},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6142--6151},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21h/lee21h.pdf},
url = {https://proceedings.mlr.press/v139/lee21h.html},
abstract = {In this work, we develop linear bandit algorithms that automatically adapt to different environments. By plugging a novel loss estimator into the optimization problem that characterizes the instance-optimal strategy, our first algorithm not only achieves nearly instance-optimal regret in stochastic environments, but also works in corrupted environments with additional regret being the amount of corruption, while the state-of-the-art (Li et al., 2019) achieves neither instance-optimality nor the optimal dependence on the corruption amount. Moreover, by equipping this algorithm with an adversarial component and carefully-designed testings, our second algorithm additionally enjoys minimax-optimal regret in completely adversarial environments, which is the first of this kind to our knowledge. Finally, all our guarantees hold with high probability, while existing instance-optimal guarantees only hold in expectation.}
}
@InProceedings{pmlr-v139-lee21i,
title = {PEBBLE: Feedback-Efficient Interactive Reinforcement Learning via Relabeling Experience and Unsupervised Pre-training},
author = {Lee, Kimin and Smith, Laura M and Abbeel, Pieter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6152--6163},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lee21i/lee21i.pdf},
url = {https://proceedings.mlr.press/v139/lee21i.html},
abstract = {Conveying complex objectives to reinforcement learning (RL) agents can often be difficult, involving meticulous design of reward functions that are sufficiently informative yet easy enough to provide. Human-in-the-loop RL methods allow practitioners to instead interactively teach agents through tailored feedback; however, such approaches have been challenging to scale since human feedback is very expensive. In this work, we aim to make this process more sample- and feedback-efficient. We present an off-policy, interactive RL algorithm that capitalizes on the strengths of both feedback and off-policy learning. Specifically, we learn a reward model by actively querying a teacher’s preferences between two clips of behavior and use it to train an agent. To enable off-policy learning, we relabel all the agent’s past experience when its reward model changes. We additionally show that pre-training our agents with unsupervised exploration substantially increases the mileage of its queries. We demonstrate that our approach is capable of learning tasks of higher complexity than previously considered by human-in-the-loop methods, including a variety of locomotion and robotic manipulation skills. We also show that our method is able to utilize real-time human feedback to effectively prevent reward exploitation and learn new behaviors that are difficult to specify with standard reward functions.}
}
@InProceedings{pmlr-v139-lei21a,
title = {Near-Optimal Linear Regression under Distribution Shift},
author = {Lei, Qi and Hu, Wei and Lee, Jason},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6164--6174},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lei21a/lei21a.pdf},
url = {https://proceedings.mlr.press/v139/lei21a.html},
abstract = {Transfer learning is essential when sufficient data comes from the source domain, with scarce labeled data from the target domain. We develop estimators that achieve minimax linear risk for linear regression problems under distribution shift. Our algorithms cover different transfer learning settings including covariate shift and model shift. We also consider when data are generated from either linear or general nonlinear models. We show that linear minimax estimators are within an absolute constant of the minimax risk even among nonlinear estimators for various source/target distributions.}
}
@InProceedings{pmlr-v139-lei21b,
title = {Stability and Generalization of Stochastic Gradient Methods for Minimax Problems},
author = {Lei, Yunwen and Yang, Zhenhuan and Yang, Tianbao and Ying, Yiming},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6175--6186},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lei21b/lei21b.pdf},
url = {https://proceedings.mlr.press/v139/lei21b.html},
abstract = {Many machine learning problems can be formulated as minimax problems such as Generative Adversarial Networks (GANs), AUC maximization and robust estimation, to mention but a few. A substantial amount of studies are devoted to studying the convergence behavior of their stochastic gradient-type algorithms. In contrast, there is relatively little work on understanding their generalization, i.e., how the learning models built from training examples would behave on test examples. In this paper, we provide a comprehensive generalization analysis of stochastic gradient methods for minimax problems under both convex-concave and nonconvex-nonconcave cases through the lens of algorithmic stability. We establish a quantitative connection between stability and several generalization measures both in expectation and with high probability. For the convex-concave setting, our stability analysis shows that stochastic gradient descent ascent attains optimal generalization bounds for both smooth and nonsmooth minimax problems. We also establish generalization bounds for both weakly-convex-weakly-concave and gradient-dominated problems. We report preliminary experimental results to verify our theory.}
}
@InProceedings{pmlr-v139-leibo21a,
title = {Scalable Evaluation of Multi-Agent Reinforcement Learning with Melting Pot},
author = {Leibo, Joel Z and Due{\~n}ez-Guzman, Edgar A and Vezhnevets, Alexander and Agapiou, John P and Sunehag, Peter and Koster, Raphael and Matyas, Jayd and Beattie, Charlie and Mordatch, Igor and Graepel, Thore},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6187--6199},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/leibo21a/leibo21a.pdf},
url = {https://proceedings.mlr.press/v139/leibo21a.html},
abstract = {Existing evaluation suites for multi-agent reinforcement learning (MARL) do not assess generalization to novel situations as their primary objective (unlike supervised learning benchmarks). Our contribution, Melting Pot, is a MARL evaluation suite that fills this gap and uses reinforcement learning to reduce the human labor required to create novel test scenarios. This works because one agent’s behavior constitutes (part of) another agent’s environment. To demonstrate scalability, we have created over 80 unique test scenarios covering a broad range of research topics such as social dilemmas, reciprocity, resource sharing, and task partitioning. We apply these test scenarios to standard MARL training algorithms, and demonstrate how Melting Pot reveals weaknesses not apparent from training performance alone.}
}
@InProceedings{pmlr-v139-leimkuhler21a,
title = {Better Training using Weight-Constrained Stochastic Dynamics},
author = {Leimkuhler, Benedict and Vlaar, Tiffany J and Pouchon, Timoth{\'e}e and Storkey, Amos},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6200--6211},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/leimkuhler21a/leimkuhler21a.pdf},
url = {https://proceedings.mlr.press/v139/leimkuhler21a.html},
abstract = {We employ constraints to control the parameter space of deep neural networks throughout training. The use of customised, appropriately designed constraints can reduce the vanishing/exploding gradients problem, improve smoothness of classification boundaries, control weight magnitudes and stabilize deep neural networks, and thus enhance the robustness of training algorithms and the generalization capabilities of neural networks. We provide a general approach to efficiently incorporate constraints into a stochastic gradient Langevin framework, allowing enhanced exploration of the loss landscape. We also present specific examples of constrained training methods motivated by orthogonality preservation for weight matrices and explicit weight normalizations. Discretization schemes are provided both for the overdamped formulation of Langevin dynamics and the underdamped form, in which momenta further improve sampling efficiency. These optimisation schemes can be used directly, without needing to adapt neural network architecture design choices or to modify the objective with regularization terms, and see performance improvements in classification tasks.}
}
@InProceedings{pmlr-v139-leino21a,
title = {Globally-Robust Neural Networks},
author = {Leino, Klas and Wang, Zifan and Fredrikson, Matt},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6212--6222},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/leino21a/leino21a.pdf},
url = {https://proceedings.mlr.press/v139/leino21a.html},
abstract = {The threat of adversarial examples has motivated work on training certifiably robust neural networks to facilitate efficient verification of local robustness at inference time. We formalize a notion of global robustness, which captures the operational properties of on-line local robustness certification while yielding a natural learning objective for robust training. We show that widely-used architectures can be easily adapted to this objective by incorporating efficient global Lipschitz bounds into the network, yielding certifiably-robust models by construction that achieve state-of-the-art verifiable accuracy. Notably, this approach requires significantly less time and memory than recent certifiable training methods, and leads to negligible costs when certifying points on-line; for example, our evaluation shows that it is possible to train a large robust Tiny-Imagenet model in a matter of hours. Our models effectively leverage inexpensive global Lipschitz bounds for real-time certification, despite prior suggestions that tighter local bounds are needed for good performance; we posit this is possible because our models are specifically trained to achieve tighter global bounds. Namely, we prove that the maximum achievable verifiable accuracy for a given dataset is not improved by using a local bound.}
}
@InProceedings{pmlr-v139-leme21a,
title = {Learning to Price Against a Moving Target},
author = {Leme, Renato Paes and Sivan, Balasubramanian and Teng, Yifeng and Worah, Pratik},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6223--6232},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/leme21a/leme21a.pdf},
url = {https://proceedings.mlr.press/v139/leme21a.html},
abstract = {In the Learning to Price setting, a seller posts prices over time with the goal of maximizing revenue while learning the buyer’s valuation. This problem is very well understood when values are stationary (fixed or iid). Here we study the problem where the buyer’s value is a moving target, i.e., they change over time either by a stochastic process or adversarially with bounded variation. In either case, we provide matching upper and lower bounds on the optimal revenue loss. Since the target is moving, any information learned soon becomes out-dated, which forces the algorithms to keep switching between exploring and exploiting phases.}
}
@InProceedings{pmlr-v139-lemercier21a,
title = {SigGPDE: Scaling Sparse Gaussian Processes on Sequential Data},
author = {Lemercier, Maud and Salvi, Cristopher and Cass, Thomas and Bonilla, Edwin V. and Damoulas, Theodoros and Lyons, Terry J},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6233--6242},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lemercier21a/lemercier21a.pdf},
url = {https://proceedings.mlr.press/v139/lemercier21a.html},
abstract = {Making predictions and quantifying their uncertainty when the input data is sequential is a fundamental learning challenge, recently attracting increasing attention. We develop SigGPDE, a new scalable sparse variational inference framework for Gaussian Processes (GPs) on sequential data. Our contribution is twofold. First, we construct inducing variables underpinning the sparse approximation so that the resulting evidence lower bound (ELBO) does not require any matrix inversion. Second, we show that the gradients of the GP signature kernel are solutions of a hyperbolic partial differential equation (PDE). This theoretical insight allows us to build an efficient back-propagation algorithm to optimize the ELBO. We showcase the significant computational gains of SigGPDE compared to existing methods, while achieving state-of-the-art performance for classification tasks on large datasets of up to 1 million multivariate time series.}
}
@InProceedings{pmlr-v139-levanon21a,
title = {Strategic Classification Made Practical},
author = {Levanon, Sagi and Rosenfeld, Nir},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6243--6253},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/levanon21a/levanon21a.pdf},
url = {https://proceedings.mlr.press/v139/levanon21a.html},
abstract = {Strategic classification regards the problem of learning in settings where users can strategically modify their features to improve outcomes. This setting applies broadly, and has received much recent attention. But despite its practical significance, work in this space has so far been predominantly theoretical. In this paper we present a learning framework for strategic classification that is practical. Our approach directly minimizes the “strategic” empirical risk, which we achieve by differentiating through the strategic response of users. This provides flexibility that allows us to extend beyond the original problem formulation and towards more realistic learning scenarios. A series of experiments demonstrates the effectiveness of our approach on various learning settings.}
}
@InProceedings{pmlr-v139-levine21a,
title = {Improved, Deterministic Smoothing for L_1 Certified Robustness},
author = {Levine, Alexander J and Feizi, Soheil},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6254--6264},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/levine21a/levine21a.pdf},
url = {https://proceedings.mlr.press/v139/levine21a.html},
abstract = {Randomized smoothing is a general technique for computing sample-dependent robustness guarantees against adversarial attacks for deep classifiers. Prior works on randomized smoothing against L_1 adversarial attacks use additive smoothing noise and provide probabilistic robustness guarantees. In this work, we propose a non-additive and deterministic smoothing method, Deterministic Smoothing with Splitting Noise (DSSN). To develop DSSN, we first develop SSN, a randomized method which involves generating each noisy smoothing sample by first randomly splitting the input space and then returning a representation of the center of the subdivision occupied by the input sample. In contrast to uniform additive smoothing, the SSN certification does not require the random noise components used to be independent. Thus, smoothing can be done effectively in just one dimension and can therefore be efficiently derandomized for quantized data (e.g., images). To the best of our knowledge, this is the first work to provide deterministic "randomized smoothing" for a norm-based adversarial threat model while allowing for an arbitrary classifier (i.e., a deep model) to be used as a base classifier and without requiring an exponential number of smoothing samples. On CIFAR-10 and ImageNet datasets, we provide substantially larger L_1 robustness certificates compared to prior works, establishing a new state-of-the-art. The determinism of our method also leads to significantly faster certificate computation. Code is available at: https://github.com/alevine0/smoothingSplittingNoise.}
}
@InProceedings{pmlr-v139-lewis21a,
title = {BASE Layers: Simplifying Training of Large, Sparse Models},
author = {Lewis, Mike and Bhosale, Shruti and Dettmers, Tim and Goyal, Naman and Zettlemoyer, Luke},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6265--6274},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lewis21a/lewis21a.pdf},
url = {https://proceedings.mlr.press/v139/lewis21a.html},
abstract = {We introduce a new balanced assignment of experts (BASE) layer for large language models that greatly simplifies existing high capacity sparse layers. Sparse layers can dramatically improve the efficiency of training and inference by routing each token to specialized expert modules that contain only a small fraction of the model parameters. However, it can be difficult to learn balanced routing functions that make full use of the available experts; existing approaches typically use routing heuristics or auxiliary expert-balancing loss functions. In contrast, we formulate token-to-expert allocation as a linear assignment problem, allowing an optimal assignment in which each expert receives an equal number of tokens. This optimal assignment scheme improves efficiency by guaranteeing balanced compute loads, and also simplifies training by not requiring any new hyperparameters or auxiliary losses. Code is publicly released.}
}
@InProceedings{pmlr-v139-lezama21a,
title = {Run-Sort-ReRun: Escaping Batch Size Limitations in Sliced Wasserstein Generative Models},
author = {Lezama, Jose and Chen, Wei and Qiu, Qiang},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6275--6285},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/lezama21a/lezama21a.pdf},
url = {https://proceedings.mlr.press/v139/lezama21a.html},
abstract = {When training an implicit generative model, ideally one would like the generator to reproduce all the different modes and subtleties of the target distribution. Naturally, when comparing two empirical distributions, the larger the sample population, the more these statistical nuances can be captured. However, existing objective functions are computationally constrained in the amount of samples they can consider by the memory required to process a batch of samples. In this paper, we build upon recent progress in sliced Wasserstein distances, a family of differentiable metrics for distribution discrepancy based on the Optimal Transport paradigm. We introduce a procedure to train these distances with virtually any batch size, allowing the discrepancy measure to capture richer statistics and better approximating the distance between the underlying continuous distributions. As an example, we demonstrate the matching of the distribution of Inception features with batches of tens of thousands of samples, achieving FID scores that outperform state-of-the-art implicit generative models.}
}
@InProceedings{pmlr-v139-li21a,
title = {PAGE: A Simple and Optimal Probabilistic Gradient Estimator for Nonconvex Optimization},
author = {Li, Zhize and Bao, Hongyan and Zhang, Xiangliang and Richtarik, Peter},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6286--6295},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21a/li21a.pdf},
url = {https://proceedings.mlr.press/v139/li21a.html},
abstract = {In this paper, we propose a novel stochastic gradient estimator—ProbAbilistic Gradient Estimator (PAGE)—for nonconvex optimization. PAGE is easy to implement as it is designed via a small adjustment to vanilla SGD: in each iteration, PAGE uses the vanilla minibatch SGD update with probability $p_t$ or reuses the previous gradient with a small adjustment, at a much lower computational cost, with probability $1-p_t$. We give a simple formula for the optimal choice of $p_t$. Moreover, we prove the first tight lower bound $\Omega(n+\frac{\sqrt{n}}{\epsilon^2})$ for nonconvex finite-sum problems, which also leads to a tight lower bound $\Omega(b+\frac{\sqrt{b}}{\epsilon^2})$ for nonconvex online problems, where $b:= \min\{\frac{\sigma^2}{\epsilon^2}, n\}$. Then, we show that PAGE obtains the optimal convergence results $O(n+\frac{\sqrt{n}}{\epsilon^2})$ (finite-sum) and $O(b+\frac{\sqrt{b}}{\epsilon^2})$ (online) matching our lower bounds for both nonconvex finite-sum and online problems. Besides, we also show that for nonconvex functions satisfying the Polyak-Ł{ojasiewicz} (PL) condition, PAGE can automatically switch to a faster linear convergence rate $O(\cdot\log \frac{1}{\epsilon})$. Finally, we conduct several deep learning experiments (e.g., LeNet, VGG, ResNet) on real datasets in PyTorch showing that PAGE not only converges much faster than SGD in training but also achieves the higher test accuracy, validating the optimal theoretical results and confirming the practical superiority of PAGE.}
}
@InProceedings{pmlr-v139-li21b,
title = {Tightening the Dependence on Horizon in the Sample Complexity of Q-Learning},
author = {Li, Gen and Cai, Changxiao and Chen, Yuxin and Gu, Yuantao and Wei, Yuting and Chi, Yuejie},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6296--6306},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21b/li21b.pdf},
url = {https://proceedings.mlr.press/v139/li21b.html},
abstract = {Q-learning, which seeks to learn the optimal Q-function of a Markov decision process (MDP) in a model-free fashion, lies at the heart of reinforcement learning. Focusing on the synchronous setting (such that independent samples for all state-action pairs are queried via a generative model in each iteration), substantial progress has been made recently towards understanding the sample efficiency of Q-learning. To yield an entrywise $\varepsilon$-accurate estimate of the optimal Q-function, state-of-the-art theory requires at least an order of $\frac{|S||A|}{(1-\gamma)^5\varepsilon^{2}}$ samples in the infinite-horizon $\gamma$-discounted setting. In this work, we sharpen the sample complexity of synchronous Q-learning to the order of $\frac{|S||A|}{(1-\gamma)^4\varepsilon^2}$ (up to some logarithmic factor) for any $0<\varepsilon <1$, leading to an order-wise improvement in $\frac{1}{1-\gamma}$. Analogous results are derived for finite-horizon MDPs as well. Notably, our sample complexity analysis unveils the effectiveness of vanilla Q-learning, which matches that of speedy Q-learning without requiring extra computation and storage. Our result is obtained by identifying novel error decompositions and recursion relations, which might shed light on how to study other variants of Q-learning.}
}
@InProceedings{pmlr-v139-li21c,
title = {Winograd Algorithm for AdderNet},
author = {Li, Wenshuo and Chen, Hanting and Huang, Mingqiang and Chen, Xinghao and Xu, Chunjing and Wang, Yunhe},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6307--6315},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21c/li21c.pdf},
url = {https://proceedings.mlr.press/v139/li21c.html},
abstract = {Adder neural network (AdderNet) is a new kind of deep model that replaces the original massive multiplications in convolutions by additions while preserving the high performance. Since the hardware complexity of additions is much lower than that of multiplications, the overall energy consumption is thus reduced significantly. To further optimize the hardware overhead of using AdderNet, this paper studies the winograd algorithm, which is a widely used fast algorithm for accelerating convolution and saving the computational costs. Unfortunately, the conventional Winograd algorithm cannot be directly applied to AdderNets since the distributive law in multiplication is not valid for the l1-norm. Therefore, we replace the element-wise multiplication in the Winograd equation by additions and then develop a new set of transform matrixes that can enhance the representation ability of output features to maintain the performance. Moreover, we propose the l2-to-l1 training strategy to mitigate the negative impacts caused by formal inconsistency. Experimental results on both FPGA and benchmarks show that the new method can further reduce the energy consumption without affecting the accuracy of the original AdderNet.}
}
@InProceedings{pmlr-v139-li21d,
title = {A Free Lunch From ANN: Towards Efficient, Accurate Spiking Neural Networks Calibration},
author = {Li, Yuhang and Deng, Shikuang and Dong, Xin and Gong, Ruihao and Gu, Shi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6316--6325},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21d/li21d.pdf},
url = {https://proceedings.mlr.press/v139/li21d.html},
abstract = {Spiking Neural Network (SNN) has been recognized as one of the next generation of neural networks. Conventionally, SNN can be converted from a pre-trained ANN by only replacing the ReLU activation to spike activation while keeping the parameters intact. Perhaps surprisingly, in this work we show that a proper way to calibrate the parameters during the conversion of ANN to SNN can bring significant improvements. We introduce SNN Calibration, a cheap but extraordinarily effective method by leveraging the knowledge within a pre-trained Artificial Neural Network (ANN). Starting by analyzing the conversion error and its propagation through layers theoretically, we propose the calibration algorithm that can correct the error layer-by-layer. The calibration only takes a handful number of training data and several minutes to finish. Moreover, our calibration algorithm can produce SNN with state-of-the-art architecture on the large-scale ImageNet dataset, including MobileNet and RegNet. Extensive experiments demonstrate the effectiveness and efficiency of our algorithm. For example, our advanced pipeline can increase up to 69% top-1 accuracy when converting MobileNet on ImageNet compared to baselines. Codes are released at https://github.com/yhhhli/SNN_Calibration.}
}
@InProceedings{pmlr-v139-li21e,
title = {Privacy-Preserving Feature Selection with Secure Multiparty Computation},
author = {Li, Xiling and Dowsley, Rafael and De Cock, Martine},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6326--6336},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21e/li21e.pdf},
url = {https://proceedings.mlr.press/v139/li21e.html},
abstract = {Existing work on privacy-preserving machine learning with Secure Multiparty Computation (MPC) is almost exclusively focused on model training and on inference with trained models, thereby overlooking the important data pre-processing stage. In this work, we propose the first MPC based protocol for private feature selection based on the filter method, which is independent of model training, and can be used in combination with any MPC protocol to rank features. We propose an efficient feature scoring protocol based on Gini impurity to this end. To demonstrate the feasibility of our approach for practical data science, we perform experiments with the proposed MPC protocols for feature selection in a commonly used machine-learning-as-a-service configuration where computations are outsourced to multiple servers, with semi-honest and with malicious adversaries. Regarding effectiveness, we show that secure feature selection with the proposed protocols improves the accuracy of classifiers on a variety of real-world data sets, without leaking information about the feature values or even which features were selected. Regarding efficiency, we document runtimes ranging from several seconds to an hour for our protocols to finish, depending on the size of the data set and the security settings.}
}
@InProceedings{pmlr-v139-li21f,
title = {Theory of Spectral Method for Union of Subspaces-Based Random Geometry Graph},
author = {Li, Gen and Gu, Yuantao},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6337--6345},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21f/li21f.pdf},
url = {https://proceedings.mlr.press/v139/li21f.html},
abstract = {Spectral method is a commonly used scheme to cluster data points lying close to Union of Subspaces, a task known as Subspace Clustering. The typical usage is to construct a Random Geometry Graph first and then apply spectral method to the graph to obtain clustering result. The latter step has been coined the name Spectral Clustering. As far as we know, in spite of the significance of both steps in spectral-method-based Subspace Clustering, all existing theoretical results focus on the first step of constructing the graph, but ignore the final step to correct false connections through spectral clustering. This paper establishes a theory to show the power of this method for the first time, in which we demonstrate the mechanism of spectral clustering by analyzing a simplified algorithm under the widely used semi-random model. Based on this theory, we prove the efficiency of Subspace Clustering in fairly broad conditions. The insights and analysis techniques developed in this paper might also have implications for other random graph problems.}
}
@InProceedings{pmlr-v139-li21g,
title = {MURAL: Meta-Learning Uncertainty-Aware Rewards for Outcome-Driven Reinforcement Learning},
author = {Li, Kevin and Gupta, Abhishek and Reddy, Ashwin and Pong, Vitchyr H and Zhou, Aurick and Yu, Justin and Levine, Sergey},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6346--6356},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21g/li21g.pdf},
url = {https://proceedings.mlr.press/v139/li21g.html},
abstract = {Exploration in reinforcement learning is, in general, a challenging problem. A common technique to make learning easier is providing demonstrations from a human supervisor, but such demonstrations can be expensive and time-consuming to acquire. In this work, we study a more tractable class of reinforcement learning problems defined simply by examples of successful outcome states, which can be much easier to provide while still making the exploration problem more tractable. In this problem setting, the reward function can be obtained automatically by training a classifier to categorize states as successful or not. However, as we will show, this requires the classifier to make uncertainty-aware predictions that are very difficult using standard techniques for training deep networks. To address this, we propose a novel mechanism for obtaining calibrated uncertainty based on an amortized technique for computing the normalized maximum likelihood (NML) distribution, leveraging tools from meta-learning to make this distribution tractable. We show that the resulting algorithm has a number of intriguing connections to both count-based exploration methods and prior algorithms for learning reward functions, while also providing more effective guidance towards the goal. We demonstrate that our algorithm solves a number of challenging navigation and robotic manipulation tasks which prove difficult or impossible for prior methods.}
}
@InProceedings{pmlr-v139-li21h,
title = {Ditto: Fair and Robust Federated Learning Through Personalization},
author = {Li, Tian and Hu, Shengyuan and Beirami, Ahmad and Smith, Virginia},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6357--6368},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21h/li21h.pdf},
url = {https://proceedings.mlr.press/v139/li21h.html},
abstract = {Fairness and robustness are two important concerns for federated learning systems. In this work, we identify that robustness to data and model poisoning attacks and fairness, measured as the uniformity of performance across devices, are competing constraints in statistically heterogeneous networks. To address these constraints, we propose employing a simple, general framework for personalized federated learning, Ditto, that can inherently provide fairness and robustness benefits, and develop a scalable solver for it. Theoretically, we analyze the ability of Ditto to achieve fairness and robustness simultaneously on a class of linear problems. Empirically, across a suite of federated datasets, we show that Ditto not only achieves competitive performance relative to recent personalization methods, but also enables more accurate, robust, and fair models relative to state-of-the-art fair or robust baselines.}
}
@InProceedings{pmlr-v139-li21i,
title = {Quantization Algorithms for Random Fourier Features},
author = {Li, Xiaoyun and Li, Ping},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6369--6380},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21i/li21i.pdf},
url = {https://proceedings.mlr.press/v139/li21i.html},
abstract = {The method of random projection (RP) is the standard technique for dimensionality reduction, approximate near neighbor search, compressed sensing, etc., which provides a simple and effective scheme for approximating pairwise inner products and Euclidean distances in massive data. Closely related to RP, the method of random Fourier features (RFF) has also become popular for approximating the (nonlinear) Gaussian kernel. RFF applies a specific nonlinear transformation on the projected data from RP. In practice, using the Gaussian kernel often leads to better performance than the linear kernel (inner product). After random projections, quantization is an important step for efficient data storage, computation and transmission. Quantization for RP has been extensively studied in the literature. In this paper, we focus on developing quantization algorithms for RFF. The task is in a sense challenging due to the tuning parameter $\gamma$ in the Gaussian kernel. For example, the quantizer and the quantized data might be tied to each specific Gaussian kernel parameter $\gamma$. Our contribution begins with the analysis on the probability distributions of RFF, and an interesting discovery that the marginal distribution of RFF is free of the parameter $\gamma$. This significantly simplifies the design of the Lloyd-Max (LM) quantization scheme for RFF in that there would be only one LM quantizer (regardless of $\gamma$). Detailed theoretical analysis is provided on the kernel estimators and approximation error, and experiments confirm the effectiveness and efficiency of the proposed method.}
}
@InProceedings{pmlr-v139-li21j,
title = {Approximate Group Fairness for Clustering},
author = {Li, Bo and Li, Lijun and Sun, Ankang and Wang, Chenhao and Wang, Yingfan},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6381--6391},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21j/li21j.pdf},
url = {https://proceedings.mlr.press/v139/li21j.html},
abstract = {We incorporate group fairness into the algorithmic centroid clustering problem, where $k$ centers are to be located to serve $n$ agents distributed in a metric space. We refine the notion of proportional fairness proposed in [Chen et al., ICML 2019] as {\em core fairness}. A $k$-clustering is in the core if no coalition containing at least $n/k$ agents can strictly decrease their total distance by deviating to a new center together. Our solution concept is motivated by the situation where agents are able to coordinate and utilities are transferable. A string of existence, hardness and approximability results is provided. Particularly, we propose two dimensions to relax core requirements: one is on the degree of distance improvement, and the other is on the size of deviating coalition. For both relaxations and their combination, we study the extent to which relaxed core fairness can be satisfied in metric spaces including line, tree and general metric space, and design approximation algorithms accordingly. We also conduct experiments on synthetic and real-world data to examine the performance of our algorithms.}
}
@InProceedings{pmlr-v139-li21k,
title = {Sharper Generalization Bounds for Clustering},
author = {Li, Shaojie and Liu, Yong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6392--6402},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21k/li21k.pdf},
url = {https://proceedings.mlr.press/v139/li21k.html},
abstract = {Existing generalization analysis of clustering mainly focuses on specific instantiations, such as (kernel) $k$-means, and a unified framework for studying clustering performance is still lacking. Besides, the existing excess clustering risk bounds are mostly of order $\mathcal{O}(K/\sqrt{n})$ provided that the underlying distribution has bounded support, where $n$ is the sample size and $K$ is the cluster numbers, or of order $\mathcal{O}(K^2/n)$ under strong assumptions on the underlying distribution, where these assumptions are hard to be verified in general. In this paper, we propose a unified clustering learning framework and investigate its excess risk bounds, obtaining state-of-the-art upper bounds under mild assumptions. Specifically, we derive sharper bounds of order $\mathcal{O}(K^2/n)$ under mild assumptions on the covering number of the hypothesis spaces, where these assumptions are easy to be verified. Moreover, for the hard clustering scheme, such as (kernel) $k$-means, if just assume the hypothesis functions to be bounded, we improve the upper bounds from the order $\mathcal{O}(K/\sqrt{n})$ to $\mathcal{O}(\sqrt{K}/\sqrt{n})$. Furthermore, state-of-the-art bounds of faster order $\mathcal{O}(K/n)$ are obtained with the covering number assumptions.}
}
@InProceedings{pmlr-v139-li21l,
title = {Provably End-to-end Label-noise Learning without Anchor Points},
author = {Li, Xuefeng and Liu, Tongliang and Han, Bo and Niu, Gang and Sugiyama, Masashi},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6403--6413},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21l/li21l.pdf},
url = {https://proceedings.mlr.press/v139/li21l.html},
abstract = {In label-noise learning, the transition matrix plays a key role in building statistically consistent classifiers. Existing consistent estimators for the transition matrix have been developed by exploiting anchor points. However, the anchor-point assumption is not always satisfied in real scenarios. In this paper, we propose an end-to-end framework for solving label-noise learning without anchor points, in which we simultaneously optimize two objectives: the cross entropy loss between the noisy label and the predicted probability by the neural network, and the volume of the simplex formed by the columns of the transition matrix. Our proposed framework can identify the transition matrix if the clean class-posterior probabilities are sufficiently scattered. This is by far the mildest assumption under which the transition matrix is provably identifiable and the learned classifier is statistically consistent. Experimental results on benchmark datasets demonstrate the effectiveness and robustness of the proposed method.}
}
@InProceedings{pmlr-v139-li21m,
title = {A Novel Method to Solve Neural Knapsack Problems},
author = {Li, Duanshun and Liu, Jing and Lee, Dongeun and Seyedmazloom, Ali and Kaushik, Giridhar and Lee, Kookjin and Park, Noseong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6414--6424},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21m/li21m.pdf},
url = {https://proceedings.mlr.press/v139/li21m.html},
abstract = {0-1 knapsack is of fundamental importance across many fields. In this paper, we present a game-theoretic method to solve 0-1 knapsack problems (KPs) where the number of items (products) is large and the values of items are not predetermined but decided by an external value assignment function (e.g., a neural network in our case) during the optimization process. While existing papers are interested in predicting solutions with neural networks for classical KPs whose objective functions are mostly linear functions, we are interested in solving KPs whose objective functions are neural networks. In other words, we choose a subset of items that maximize the sum of the values predicted by neural networks. Its key challenge is how to optimize the neural network-based non-linear KP objective with a budget constraint. Our solution is inspired by game-theoretic approaches in deep learning, e.g., generative adversarial networks. After formally defining our two-player game, we develop an adaptive gradient ascent method to solve it. In our experiments, our method successfully solves two neural network-based non-linear KPs and conventional linear KPs with 1 million items.}
}
@InProceedings{pmlr-v139-li21n,
title = {Mixed Cross Entropy Loss for Neural Machine Translation},
author = {Li, Haoran and Lu, Wei},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6425--6436},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21n/li21n.pdf},
url = {https://proceedings.mlr.press/v139/li21n.html},
abstract = {In neural machine translation, Cross Entropy loss (CE) is the standard loss function in two training methods of auto-regressive models, i.e., teacher forcing and scheduled sampling. In this paper, we propose mixed Cross Entropy loss (mixed CE) as a substitute for CE in both training approaches. In teacher forcing, the model trained with CE regards the translation problem as a one-to-one mapping process, while in mixed CE this process can be relaxed to one-to-many. In scheduled sampling, we show that mixed CE has the potential to encourage the training and testing behaviours to be similar to each other, more effectively mitigating the exposure bias problem. We demonstrate the superiority of mixed CE over CE on several machine translation datasets, WMT’16 Ro-En, WMT’16 Ru-En, and WMT’14 En-De in both teacher forcing and scheduled sampling setups. Furthermore, in WMT’14 En-De, we also find mixed CE consistently outperforms CE on a multi-reference set as well as a challenging paraphrased reference set. We also found the model trained with mixed CE is able to provide a better probability distribution defined over the translation output space. Our code is available at https://github.com/haorannlp/mix.}
}
@InProceedings{pmlr-v139-li21o,
title = {Training Graph Neural Networks with 1000 Layers},
author = {Li, Guohao and M{\"u}ller, Matthias and Ghanem, Bernard and Koltun, Vladlen},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6437--6449},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21o/li21o.pdf},
url = {https://proceedings.mlr.press/v139/li21o.html},
abstract = {Deep graph neural networks (GNNs) have achieved excellent results on various tasks on increasingly large graph datasets with millions of nodes and edges. However, memory complexity has become a major obstacle when training deep GNNs for practical applications due to the immense number of nodes, edges, and intermediate activations. To improve the scalability of GNNs, prior works propose smart graph sampling or partitioning strategies to train GNNs with a smaller set of nodes or sub-graphs. In this work, we study reversible connections, group convolutions, weight tying, and equilibrium models to advance the memory and parameter efficiency of GNNs. We find that reversible connections in combination with deep network architectures enable the training of overparameterized GNNs that significantly outperform existing methods on multiple datasets. Our models RevGNN-Deep (1001 layers with 80 channels each) and RevGNN-Wide (448 layers with 224 channels each) were both trained on a single commodity GPU and achieve an ROC-AUC of 87.74 $\pm$ 0.13 and 88.14 $\pm$ 0.15 on the ogbn-proteins dataset. To the best of our knowledge, RevGNN-Deep is the deepest GNN in the literature by one order of magnitude.}
}
@InProceedings{pmlr-v139-li21p,
title = {Active Feature Acquisition with Generative Surrogate Models},
author = {Li, Yang and Oliva, Junier},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6450--6459},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21p/li21p.pdf},
url = {https://proceedings.mlr.press/v139/li21p.html},
abstract = {Many real-world situations allow for the acquisition of additional relevant information when making an assessment with limited or uncertain data. However, traditional ML approaches either require all features to be acquired beforehand or regard part of them as missing data that cannot be acquired. In this work, we consider models that perform active feature acquisition (AFA) and query the environment for unobserved features to improve the prediction assessments at evaluation time. Our work reformulates the Markov decision process (MDP) that underlies the AFA problem as a generative modeling task and optimizes a policy via a novel model-based approach. We propose learning a generative surrogate model (GSM) that captures the dependencies among input features to assess potential information gain from acquisitions. The GSM is leveraged to provide intermediate rewards and auxiliary information to aid the agent navigate a complicated high-dimensional action space and sparse rewards. Furthermore, we extend AFA in a task we coin active instance recognition (AIR) for the unsupervised case where the target variables are the unobserved features themselves and the goal is to collect information for a particular instance in a cost-efficient way. Empirical results demonstrate that our approach achieves considerably better performance than previous state of the art methods on both supervised and unsupervised tasks.}
}
@InProceedings{pmlr-v139-li21q,
title = {Partially Observed Exchangeable Modeling},
author = {Li, Yang and Oliva, Junier},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6460--6470},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21q/li21q.pdf},
url = {https://proceedings.mlr.press/v139/li21q.html},
abstract = {Modeling dependencies among features is fundamental for many machine learning tasks. Although there are often multiple related instances that may be leveraged to inform conditional dependencies, typical approaches only model conditional dependencies over individual instances. In this work, we propose a novel framework, partially observed exchangeable modeling (POEx) that takes in a set of related partially observed instances and infers the conditional distribution for the unobserved dimensions over multiple elements. Our approach jointly models the intra-instance (among features in a point) and inter-instance (among multiple points in a set) dependencies in data. POEx is a general framework that encompasses many existing tasks such as point cloud expansion and few-shot generation, as well as new tasks like few-shot imputation. Despite its generality, extensive empirical evaluations show that our model achieves state-of-the-art performance across a range of applications.}
}
@InProceedings{pmlr-v139-li21r,
title = {Testing DNN-based Autonomous Driving Systems under Critical Environmental Conditions},
author = {Li, Zhong and Pan, Minxue and Zhang, Tian and Li, Xuandong},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {6471--6482},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/li21r/li21r.pdf},
url = {https://proceedings.mlr.press/v139/li21r.html},
abstract = {Due to the increasing usage of Deep Neural Network (DNN) based autonomous driving systems (ADS) where erroneous or unexpected behaviours can lead to catastrophic accidents, testing such systems is of growing importance. Existing approaches often just focus on finding erroneous behaviours