@Proceedings{AABI2024,
title = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
year = {2024},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = {253}
}
@InProceedings{pmlr-v253-ashman24a,
title = {In-Context In-Context Learning with {Transformer Neural Processes}},
author = {Ashman, Matthew and Diaconu, Cristiana and Weller, Adrian and Turner, Richard E.},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {1--29},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/ashman24a/ashman24a.pdf},
url = {https://proceedings.mlr.press/v253/ashman24a.html},
abstract = {Neural processes (NPs) are a powerful family of meta-learning models that seek to approximate the posterior predictive map of the ground-truth stochastic process from which each dataset in a meta-dataset is sampled. There are many cases in which practitioners, besides having access to the dataset of interest, may also have access to other datasets that share similarities with it. In this case, integrating these datasets into the NP can improve predictions. We equip NPs with this functionality and describe this paradigm as in-context in-context learning. Standard NP architectures, such as the convolutional conditional NP (ConvCNP) or the family of transformer neural processes (TNPs), are not capable of in-context in-context learning, as they are only able to condition on a single dataset. We address this shortcoming by developing the in-context in-context learning pseudo-token TNP (ICICL-TNP). The ICICL-TNP builds on the family of PT-TNPs, which utilise pseudo-token-based transformer architectures to sidestep the quadratic computational complexity associated with regular transformer architectures. Importantly, the ICICL-TNP is capable of conditioning on both sets of datapoints and sets of datasets, enabling it to perform in-context in-context learning. We demonstrate the importance of in-context in-context learning and the effectiveness of the ICICL-TNP in a number of experiments.}
}
@InProceedings{pmlr-v253-azam24a,
title = {{Bayesian} Optimization for Crop Genetics with Scalable Probabilistic Models},
author = {Azam, Ruhana and Truong, Sang T. and Fernandes, Samuel B. and Leakey, Andrew D. B. and Lipka, Alexander and El-Kebir, Mohammed and Koyejo, Sanmi},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {30--44},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/azam24a/azam24a.pdf},
url = {https://proceedings.mlr.press/v253/azam24a.html},
abstract = {An overarching goal of crop improvement is to select plants with desirable traits so that crops can provide sufficient food and nutrients for humanity in the face of climate change. To achieve such a goal, crop breeders utilize genomic prediction, in which genome-wide DNA marker information is used to predict breeding values for desirable traits. Genomic prediction is complemented by advancements in high-throughput phenotyping, in which robots and drones collect orders of magnitude higher amounts of trait information than in the past. Although such data are abundant and easy to collect, identifying the most biologically meaningful traits for use in genomic prediction is expensive. Bayesian optimization (BO) is a strong cost-effective solution to identify such meaningful traits. In this work, we quantified the performance of BO with a collection of acquisition functions and surrogate models for identifying good proxies, in a set of +4 million proxies. We found that BO achieves comparable sample efficiency to random search while requiring significantly less computation. Despite traditional BO and random search techniques performing sufficiently well, both search techniques fail to leverage information from related tasks. To this end, we propose a pre-trained model as a transfer learning method. Using this benchmark, we conduct an extensive empirical study and demonstrate promising results on the transfer learning effect, highlighting a core design principle for developing more parsimonious optimization algorithms for crop improvement.}
}
@InProceedings{pmlr-v253-bordino24a,
title = {Non-asymptotic approximations of {Gaussian} neural networks via second-order {Poincar{\'e}} inequalities},
author = {Bordino, Alberto and Favaro, Stefano and Fortini, Sandra},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {45--78},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/bordino24a/bordino24a.pdf},
url = {https://proceedings.mlr.press/v253/bordino24a.html},
abstract = {There is a recent and growing literature on large-width asymptotic and non-asymptotic properties of deep Gaussian neural networks (NNs), namely NNs with weights initialized as Gaussian distributions. For a Gaussian NN of depth $L\geq1$ and width $n\geq1$, it is well-known that, as $n\rightarrow+\infty$, the NN's output converges (in distribution) to a Gaussian process. Recently, some quantitative versions of this result, also known as quantitative central limit theorems (QCLTs), have been obtained, showing that the rate of convergence is $n^{-1}$, in the $2$-Wasserstein distance, and that such a rate is optimal. In this paper, we investigate the use of second-order Poincar{\'e} inequalities as an alternative approach to establish QCLTs for the NN's output. Previous approaches consist of a careful analysis of the NN, by combining non-trivial probabilistic tools with ad-hoc techniques that rely on the recursive definition of the network, typically by means of an induction argument over the layers, and it is unclear if and how they still apply to other NN's architectures. Instead, the use of second-order Poincar{\'e} inequalities rely only on the fact that the NN is a functional of a Gaussian process, reducing the problem of establishing QCLTs to the algebraic problem of computing the gradient and Hessian of the NN's output, which still applies to other NN's architectures. We show how our approach is effective in establishing QCLTs for the NN's output, though it leads to suboptimal rates of convergence. We argue that such a worsening in the rates is peculiar to second-order Poincar{\'e} inequalities, and it should be interpreted as the ``cost'' for having a straightforward, and general, procedure for obtaining QCLTs.}
}
@InProceedings{pmlr-v253-mlodozeniec24a,
title = {Implicitly {Bayesian} Prediction Rules in Deep Learning},
author = {Mlodozeniec, Bruno and Krueger, David and Turner, Richard},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {79--110},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/mlodozeniec24a/mlodozeniec24a.pdf},
url = {https://proceedings.mlr.press/v253/mlodozeniec24a.html},
abstract = {The Bayesian approach leads to coherent updates of predictions under new data, which makes adhering to Bayesian principles appealing in decision-making contexts. Traditionally, integrating Bayesian principles into models like deep neural networks involves setting priors on parameters and approximating posteriors. This is done despite the fact that, typically, priors on parameters reflect any prior beliefs only insofar as they dictate function space behaviour. In this paper, we rethink this approach and consider what properties characterise a prediction rule as being Bayesian. Algorithms meeting such criteria can be deemed implicitly Bayesian---they make the same predictions as some Bayesian model, without explicitly manifesting priors and posteriors. We argue this might be a more fruitful approach towards integrating Bayesian principles into deep learning. In this paper, we propose how to measure how close a general prediction rule is to being implicitly Bayesian, and empirically evaluate multiple prediction strategies using our approach. We also show theoretically that agents relying on non-implicitly Bayesian prediction rules can be easily exploited in adversarial betting settings.}
}
@InProceedings{pmlr-v253-robnik24a,
title = {Fluctuation without dissipation: {Microcanonical Langevin Monte Carlo}},
author = {Robnik, Jakob and Seljak, Uros},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {111--126},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/robnik24a/robnik24a.pdf},
url = {https://proceedings.mlr.press/v253/robnik24a.html},
abstract = {Stochastic sampling algorithms such as Langevin Monte Carlo are inspired by physical systems in a heat bath. Their equilibrium distribution is the canonical ensemble given by a prescribed target distribution, so they must balance fluctuation and dissipation as dictated by the fluctuation-dissipation theorem. We show that the fluctuation-dissipation theorem is not required because only the configuration space distribution, and not the full phase space distribution, needs to be canonical. We propose a continuous-time Microcanonical Langevin Monte Carlo (MCLMC) as a dissipation-free system of stochastic differential equations (SDE). We derive the corresponding Fokker-Planck equation and show that the stationary distribution is the microcanonical ensemble with the desired canonical distribution on configuration space. We prove that MCLMC is ergodic for any nonzero amount of stochasticity, and for smooth, convex potentials, the expectation values converge exponentially fast. Furthermore, the deterministic drift and the stochastic diffusion separately preserve the stationary distribution. This uncommon property is attractive for practical implementations as it implies that the drift-diffusion discretization schemes are bias-free, so the only source of bias is the discretization of the deterministic dynamics. We apply MCLMC to a $\phi^4$ model on a 2d lattice, where Hamiltonian Monte Carlo (HMC) is currently the state-of-the-art integrator. MCLMC converges 12 to 32 times faster than HMC on an $8\times8$ to $64\times64$ lattice, and we expect even higher improvements for larger lattice sizes, such as in large scale lattice quantum chromodynamics.}
}
@InProceedings{pmlr-v253-tasdighi24a,
title = {{PAC-Bayesian} {Soft Actor-Critic} Learning},
author = {Tasdighi, Bahareh and Akg{\"u}l, Abdullah and Haussmann, Manuel and Brink, Kenny Kazimirzak and Kandemir, Melih},
booktitle = {Proceedings of the 6th Symposium on Advances in Approximate {Bayesian} Inference},
pages = {127--145},
year = {2024},
editor = {Antor{\'a}n, Javier and Naesseth, Christian A.},
volume = {253},
series = {Proceedings of Machine Learning Research},
month = {21 Jul},
publisher = {PMLR},
pdf = {https://raw.githubusercontent.com/mlresearch/v253/main/assets/tasdighi24a/tasdighi24a.pdf},
url = {https://proceedings.mlr.press/v253/tasdighi24a.html},
abstract = {Actor-critic algorithms address the dual goals of reinforcement learning (RL), policy evaluation and improvement via two separate function approximators. The practicality of this approach comes at the expense of training instability, caused mainly by the destructive effect of the approximation errors of the critic on the actor. We tackle this bottleneck by employing an existing Probably Approximately Correct (PAC) Bayesian bound for the first time as the critic training objective of the Soft Actor-Critic (SAC) algorithm. We further demonstrate that online learning performance improves significantly when a stochastic actor explores multiple futures by critic-guided random search. We observe our resulting algorithm to compare favorably against the state-of-the-art SAC implementation on multiple classical control and locomotion tasks in terms of both sample efficiency and regret.}
}